| { |
| "best_metric": 0.0009937735740095377, |
| "best_model_checkpoint": "/home/paperspace/Data/models/restaurant365/llm3br256/checkpoint-1600", |
| "epoch": 5.575757575757576, |
| "eval_steps": 25, |
| "global_step": 1725, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0032323232323232323, |
| "grad_norm": 0.07792975008487701, |
| "learning_rate": 1.2936610608020701e-07, |
| "loss": 0.0278, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.006464646464646465, |
| "grad_norm": 0.08275039494037628, |
| "learning_rate": 2.5873221216041403e-07, |
| "loss": 0.0237, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.009696969696969697, |
| "grad_norm": 0.07888943701982498, |
| "learning_rate": 3.8809831824062096e-07, |
| "loss": 0.022, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01292929292929293, |
| "grad_norm": 0.0738615021109581, |
| "learning_rate": 5.174644243208281e-07, |
| "loss": 0.0202, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01616161616161616, |
| "grad_norm": 0.08640046417713165, |
| "learning_rate": 6.468305304010349e-07, |
| "loss": 0.0244, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.019393939393939394, |
| "grad_norm": 0.07830435782670975, |
| "learning_rate": 7.761966364812419e-07, |
| "loss": 0.0233, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.022626262626262626, |
| "grad_norm": 0.070709727704525, |
| "learning_rate": 9.055627425614489e-07, |
| "loss": 0.0248, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.02585858585858586, |
| "grad_norm": 0.08060784637928009, |
| "learning_rate": 1.0349288486416561e-06, |
| "loss": 0.0241, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.02909090909090909, |
| "grad_norm": 0.07824616879224777, |
| "learning_rate": 1.164294954721863e-06, |
| "loss": 0.0253, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03232323232323232, |
| "grad_norm": 0.07263046503067017, |
| "learning_rate": 1.2936610608020699e-06, |
| "loss": 0.0215, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.035555555555555556, |
| "grad_norm": 0.08094426989555359, |
| "learning_rate": 1.423027166882277e-06, |
| "loss": 0.0253, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03878787878787879, |
| "grad_norm": 0.07181934267282486, |
| "learning_rate": 1.5523932729624839e-06, |
| "loss": 0.0232, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.04202020202020202, |
| "grad_norm": 0.07519516348838806, |
| "learning_rate": 1.6817593790426907e-06, |
| "loss": 0.0262, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.04525252525252525, |
| "grad_norm": 0.07137555629014969, |
| "learning_rate": 1.8111254851228978e-06, |
| "loss": 0.0271, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.048484848484848485, |
| "grad_norm": 0.06666477769613266, |
| "learning_rate": 1.9404915912031045e-06, |
| "loss": 0.0218, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05171717171717172, |
| "grad_norm": 0.06290479004383087, |
| "learning_rate": 2.0698576972833122e-06, |
| "loss": 0.0204, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.05494949494949495, |
| "grad_norm": 0.06275156885385513, |
| "learning_rate": 2.199223803363519e-06, |
| "loss": 0.0272, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05818181818181818, |
| "grad_norm": 0.055082306265830994, |
| "learning_rate": 2.328589909443726e-06, |
| "loss": 0.0175, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.061414141414141414, |
| "grad_norm": 0.05565667152404785, |
| "learning_rate": 2.457956015523933e-06, |
| "loss": 0.0274, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06464646464646465, |
| "grad_norm": 0.055030979216098785, |
| "learning_rate": 2.5873221216041398e-06, |
| "loss": 0.02, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06787878787878789, |
| "grad_norm": 0.0537736713886261, |
| "learning_rate": 2.716688227684347e-06, |
| "loss": 0.0208, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.07111111111111111, |
| "grad_norm": 0.048372287303209305, |
| "learning_rate": 2.846054333764554e-06, |
| "loss": 0.0193, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07434343434343435, |
| "grad_norm": 0.045470114797353745, |
| "learning_rate": 2.975420439844761e-06, |
| "loss": 0.017, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07757575757575758, |
| "grad_norm": 0.05411680042743683, |
| "learning_rate": 3.1047865459249677e-06, |
| "loss": 0.0294, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08080808080808081, |
| "grad_norm": 0.038719285279512405, |
| "learning_rate": 3.234152652005175e-06, |
| "loss": 0.0179, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08080808080808081, |
| "eval_loss": 0.017858153209090233, |
| "eval_runtime": 18.233, |
| "eval_samples_per_second": 5.485, |
| "eval_steps_per_second": 1.371, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08404040404040404, |
| "grad_norm": 0.052320994436740875, |
| "learning_rate": 3.3635187580853815e-06, |
| "loss": 0.0246, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08727272727272728, |
| "grad_norm": 0.0435178168118, |
| "learning_rate": 3.492884864165589e-06, |
| "loss": 0.02, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0905050505050505, |
| "grad_norm": 0.042080093175172806, |
| "learning_rate": 3.6222509702457957e-06, |
| "loss": 0.018, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.09373737373737374, |
| "grad_norm": 0.04145395755767822, |
| "learning_rate": 3.751617076326003e-06, |
| "loss": 0.0219, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09696969696969697, |
| "grad_norm": 0.04255010560154915, |
| "learning_rate": 3.880983182406209e-06, |
| "loss": 0.0215, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10020202020202021, |
| "grad_norm": 0.03899993374943733, |
| "learning_rate": 4.010349288486417e-06, |
| "loss": 0.0182, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.10343434343434343, |
| "grad_norm": 0.035461559891700745, |
| "learning_rate": 4.1397153945666245e-06, |
| "loss": 0.0132, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.10666666666666667, |
| "grad_norm": 0.03928808495402336, |
| "learning_rate": 4.2690815006468305e-06, |
| "loss": 0.0172, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1098989898989899, |
| "grad_norm": 0.04023351892828941, |
| "learning_rate": 4.398447606727038e-06, |
| "loss": 0.0211, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.11313131313131314, |
| "grad_norm": 0.035943325608968735, |
| "learning_rate": 4.527813712807244e-06, |
| "loss": 0.0155, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11636363636363636, |
| "grad_norm": 0.03777831047773361, |
| "learning_rate": 4.657179818887452e-06, |
| "loss": 0.0204, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.1195959595959596, |
| "grad_norm": 0.028852691873908043, |
| "learning_rate": 4.786545924967659e-06, |
| "loss": 0.0152, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.12282828282828283, |
| "grad_norm": 0.03250971809029579, |
| "learning_rate": 4.915912031047866e-06, |
| "loss": 0.0167, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.12606060606060607, |
| "grad_norm": 0.03109503909945488, |
| "learning_rate": 5.045278137128073e-06, |
| "loss": 0.0169, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.1292929292929293, |
| "grad_norm": 0.027047201991081238, |
| "learning_rate": 5.1746442432082795e-06, |
| "loss": 0.0121, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13252525252525252, |
| "grad_norm": 0.029273182153701782, |
| "learning_rate": 5.304010349288486e-06, |
| "loss": 0.0211, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.13575757575757577, |
| "grad_norm": 0.026186689734458923, |
| "learning_rate": 5.433376455368694e-06, |
| "loss": 0.0121, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.138989898989899, |
| "grad_norm": 0.031041577458381653, |
| "learning_rate": 5.5627425614489e-06, |
| "loss": 0.0174, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.14222222222222222, |
| "grad_norm": 0.030576322227716446, |
| "learning_rate": 5.692108667529108e-06, |
| "loss": 0.0167, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.14545454545454545, |
| "grad_norm": 0.025865159928798676, |
| "learning_rate": 5.821474773609315e-06, |
| "loss": 0.015, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1486868686868687, |
| "grad_norm": 0.028177760541439056, |
| "learning_rate": 5.950840879689522e-06, |
| "loss": 0.0173, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.15191919191919193, |
| "grad_norm": 0.027181051671504974, |
| "learning_rate": 6.0802069857697286e-06, |
| "loss": 0.0141, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.15515151515151515, |
| "grad_norm": 0.025914832949638367, |
| "learning_rate": 6.2095730918499354e-06, |
| "loss": 0.0147, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.15838383838383838, |
| "grad_norm": 0.025943972170352936, |
| "learning_rate": 6.338939197930142e-06, |
| "loss": 0.0182, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.16161616161616163, |
| "grad_norm": 0.021595345810055733, |
| "learning_rate": 6.46830530401035e-06, |
| "loss": 0.0102, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16161616161616163, |
| "eval_loss": 0.0121736666187644, |
| "eval_runtime": 16.3381, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16484848484848486, |
| "grad_norm": 0.02361897937953472, |
| "learning_rate": 6.597671410090557e-06, |
| "loss": 0.0112, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.16808080808080808, |
| "grad_norm": 0.023629309609532356, |
| "learning_rate": 6.727037516170763e-06, |
| "loss": 0.0117, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.1713131313131313, |
| "grad_norm": 0.024288684129714966, |
| "learning_rate": 6.856403622250971e-06, |
| "loss": 0.018, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.17454545454545456, |
| "grad_norm": 0.025759516283869743, |
| "learning_rate": 6.985769728331178e-06, |
| "loss": 0.0129, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.17777777777777778, |
| "grad_norm": 0.02322845719754696, |
| "learning_rate": 7.115135834411385e-06, |
| "loss": 0.0145, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.181010101010101, |
| "grad_norm": 0.023300625383853912, |
| "learning_rate": 7.244501940491591e-06, |
| "loss": 0.0132, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.18424242424242424, |
| "grad_norm": 0.02563454397022724, |
| "learning_rate": 7.373868046571798e-06, |
| "loss": 0.0129, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.1874747474747475, |
| "grad_norm": 0.020489778369665146, |
| "learning_rate": 7.503234152652006e-06, |
| "loss": 0.0084, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.1907070707070707, |
| "grad_norm": 0.022039903327822685, |
| "learning_rate": 7.632600258732213e-06, |
| "loss": 0.0107, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.19393939393939394, |
| "grad_norm": 0.02409541979432106, |
| "learning_rate": 7.761966364812418e-06, |
| "loss": 0.0121, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.19717171717171716, |
| "grad_norm": 0.018786218017339706, |
| "learning_rate": 7.891332470892627e-06, |
| "loss": 0.0073, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.20040404040404042, |
| "grad_norm": 0.01937553659081459, |
| "learning_rate": 8.020698576972833e-06, |
| "loss": 0.0111, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.20363636363636364, |
| "grad_norm": 0.020772738382220268, |
| "learning_rate": 8.15006468305304e-06, |
| "loss": 0.0127, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.20686868686868687, |
| "grad_norm": 0.0203766617923975, |
| "learning_rate": 8.279430789133249e-06, |
| "loss": 0.0128, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.2101010101010101, |
| "grad_norm": 0.02146344818174839, |
| "learning_rate": 8.408796895213454e-06, |
| "loss": 0.0098, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.019886888563632965, |
| "learning_rate": 8.538163001293661e-06, |
| "loss": 0.0085, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.21656565656565657, |
| "grad_norm": 0.022427983582019806, |
| "learning_rate": 8.66752910737387e-06, |
| "loss": 0.0105, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.2197979797979798, |
| "grad_norm": 0.022696534171700478, |
| "learning_rate": 8.796895213454076e-06, |
| "loss": 0.0114, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.22303030303030302, |
| "grad_norm": 0.020293528214097023, |
| "learning_rate": 8.926261319534282e-06, |
| "loss": 0.0111, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.22626262626262628, |
| "grad_norm": 0.021718839183449745, |
| "learning_rate": 9.055627425614489e-06, |
| "loss": 0.0109, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2294949494949495, |
| "grad_norm": 0.0213741734623909, |
| "learning_rate": 9.184993531694697e-06, |
| "loss": 0.0118, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.23272727272727273, |
| "grad_norm": 0.019220391288399696, |
| "learning_rate": 9.314359637774904e-06, |
| "loss": 0.0068, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.23595959595959595, |
| "grad_norm": 0.023295527324080467, |
| "learning_rate": 9.44372574385511e-06, |
| "loss": 0.0109, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.2391919191919192, |
| "grad_norm": 0.018743356689810753, |
| "learning_rate": 9.573091849935318e-06, |
| "loss": 0.0097, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.24242424242424243, |
| "grad_norm": 0.023235196247696877, |
| "learning_rate": 9.702457956015525e-06, |
| "loss": 0.0117, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24242424242424243, |
| "eval_loss": 0.009722071699798107, |
| "eval_runtime": 16.3387, |
| "eval_samples_per_second": 6.12, |
| "eval_steps_per_second": 1.53, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24565656565656566, |
| "grad_norm": 0.02667049877345562, |
| "learning_rate": 9.831824062095732e-06, |
| "loss": 0.0145, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.24888888888888888, |
| "grad_norm": 0.023159362375736237, |
| "learning_rate": 9.961190168175938e-06, |
| "loss": 0.0129, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.25212121212121213, |
| "grad_norm": 0.01716494746506214, |
| "learning_rate": 1.0090556274256145e-05, |
| "loss": 0.0057, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.25535353535353533, |
| "grad_norm": 0.019957246258854866, |
| "learning_rate": 1.0219922380336352e-05, |
| "loss": 0.0106, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.2585858585858586, |
| "grad_norm": 0.020428957417607307, |
| "learning_rate": 1.0349288486416559e-05, |
| "loss": 0.0093, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.26181818181818184, |
| "grad_norm": 0.020134352147579193, |
| "learning_rate": 1.0478654592496766e-05, |
| "loss": 0.0085, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.26505050505050504, |
| "grad_norm": 0.023800766095519066, |
| "learning_rate": 1.0608020698576973e-05, |
| "loss": 0.0097, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2682828282828283, |
| "grad_norm": 0.022994179278612137, |
| "learning_rate": 1.073738680465718e-05, |
| "loss": 0.0127, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.27151515151515154, |
| "grad_norm": 0.018760228529572487, |
| "learning_rate": 1.0866752910737388e-05, |
| "loss": 0.0095, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.27474747474747474, |
| "grad_norm": 0.023791132494807243, |
| "learning_rate": 1.0996119016817593e-05, |
| "loss": 0.0114, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.277979797979798, |
| "grad_norm": 0.021235186606645584, |
| "learning_rate": 1.11254851228978e-05, |
| "loss": 0.0107, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.2812121212121212, |
| "grad_norm": 0.02389148250222206, |
| "learning_rate": 1.1254851228978009e-05, |
| "loss": 0.0098, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.28444444444444444, |
| "grad_norm": 0.023712515830993652, |
| "learning_rate": 1.1384217335058216e-05, |
| "loss": 0.01, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.2876767676767677, |
| "grad_norm": 0.023874476552009583, |
| "learning_rate": 1.1513583441138421e-05, |
| "loss": 0.0115, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.2909090909090909, |
| "grad_norm": 0.020052338019013405, |
| "learning_rate": 1.164294954721863e-05, |
| "loss": 0.0093, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.29414141414141415, |
| "grad_norm": 0.02070593275129795, |
| "learning_rate": 1.1772315653298836e-05, |
| "loss": 0.0093, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.2973737373737374, |
| "grad_norm": 0.020449379459023476, |
| "learning_rate": 1.1901681759379043e-05, |
| "loss": 0.0078, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.3006060606060606, |
| "grad_norm": 0.021471675485372543, |
| "learning_rate": 1.203104786545925e-05, |
| "loss": 0.0124, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.30383838383838385, |
| "grad_norm": 0.020752931013703346, |
| "learning_rate": 1.2160413971539457e-05, |
| "loss": 0.0085, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.30707070707070705, |
| "grad_norm": 0.01745028793811798, |
| "learning_rate": 1.2289780077619664e-05, |
| "loss": 0.0069, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.3103030303030303, |
| "grad_norm": 0.018052417784929276, |
| "learning_rate": 1.2419146183699871e-05, |
| "loss": 0.0065, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.31353535353535356, |
| "grad_norm": 0.021343907341361046, |
| "learning_rate": 1.254851228978008e-05, |
| "loss": 0.0078, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.31676767676767675, |
| "grad_norm": 0.021136371418833733, |
| "learning_rate": 1.2677878395860285e-05, |
| "loss": 0.0105, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.019849712029099464, |
| "learning_rate": 1.2807244501940493e-05, |
| "loss": 0.012, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.32323232323232326, |
| "grad_norm": 0.021437030285596848, |
| "learning_rate": 1.29366106080207e-05, |
| "loss": 0.0094, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.32323232323232326, |
| "eval_loss": 0.008297097869217396, |
| "eval_runtime": 16.3605, |
| "eval_samples_per_second": 6.112, |
| "eval_steps_per_second": 1.528, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.32646464646464646, |
| "grad_norm": 0.018191296607255936, |
| "learning_rate": 1.3065976714100905e-05, |
| "loss": 0.0065, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.3296969696969697, |
| "grad_norm": 0.01837129518389702, |
| "learning_rate": 1.3195342820181114e-05, |
| "loss": 0.0086, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.3329292929292929, |
| "grad_norm": 0.018085656687617302, |
| "learning_rate": 1.332470892626132e-05, |
| "loss": 0.007, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.33616161616161616, |
| "grad_norm": 0.022087739780545235, |
| "learning_rate": 1.3454075032341526e-05, |
| "loss": 0.0102, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.3393939393939394, |
| "grad_norm": 0.01995154283940792, |
| "learning_rate": 1.3583441138421735e-05, |
| "loss": 0.0096, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3426262626262626, |
| "grad_norm": 0.018545862287282944, |
| "learning_rate": 1.3712807244501941e-05, |
| "loss": 0.0062, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.34585858585858587, |
| "grad_norm": 0.020920872688293457, |
| "learning_rate": 1.384217335058215e-05, |
| "loss": 0.0078, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.3490909090909091, |
| "grad_norm": 0.023617206141352654, |
| "learning_rate": 1.3971539456662355e-05, |
| "loss": 0.0118, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.3523232323232323, |
| "grad_norm": 0.023416129872202873, |
| "learning_rate": 1.4100905562742562e-05, |
| "loss": 0.0098, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.35555555555555557, |
| "grad_norm": 0.02004130929708481, |
| "learning_rate": 1.423027166882277e-05, |
| "loss": 0.0096, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.35878787878787877, |
| "grad_norm": 0.019093789160251617, |
| "learning_rate": 1.4359637774902976e-05, |
| "loss": 0.0077, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.362020202020202, |
| "grad_norm": 0.021551866084337234, |
| "learning_rate": 1.4489003880983183e-05, |
| "loss": 0.0128, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.3652525252525253, |
| "grad_norm": 0.019720977172255516, |
| "learning_rate": 1.4618369987063391e-05, |
| "loss": 0.011, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.36848484848484847, |
| "grad_norm": 0.01680293306708336, |
| "learning_rate": 1.4747736093143596e-05, |
| "loss": 0.0058, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.3717171717171717, |
| "grad_norm": 0.01976117305457592, |
| "learning_rate": 1.4877102199223805e-05, |
| "loss": 0.0087, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.374949494949495, |
| "grad_norm": 0.01902499422430992, |
| "learning_rate": 1.5006468305304012e-05, |
| "loss": 0.0096, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.3781818181818182, |
| "grad_norm": 0.021109282970428467, |
| "learning_rate": 1.5135834411384217e-05, |
| "loss": 0.0092, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.3814141414141414, |
| "grad_norm": 0.020100753754377365, |
| "learning_rate": 1.5265200517464426e-05, |
| "loss": 0.0075, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.3846464646464646, |
| "grad_norm": 0.018564140424132347, |
| "learning_rate": 1.5394566623544633e-05, |
| "loss": 0.0053, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3878787878787879, |
| "grad_norm": 0.025290893390774727, |
| "learning_rate": 1.5523932729624836e-05, |
| "loss": 0.0099, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.39111111111111113, |
| "grad_norm": 0.015657838433980942, |
| "learning_rate": 1.5653298835705046e-05, |
| "loss": 0.0056, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.39434343434343433, |
| "grad_norm": 0.018975598737597466, |
| "learning_rate": 1.5782664941785253e-05, |
| "loss": 0.009, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.3975757575757576, |
| "grad_norm": 0.018999790772795677, |
| "learning_rate": 1.591203104786546e-05, |
| "loss": 0.0087, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.40080808080808084, |
| "grad_norm": 0.020040014758706093, |
| "learning_rate": 1.6041397153945667e-05, |
| "loss": 0.0076, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.40404040404040403, |
| "grad_norm": 0.020763296633958817, |
| "learning_rate": 1.6170763260025874e-05, |
| "loss": 0.0089, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.40404040404040403, |
| "eval_loss": 0.007393690291792154, |
| "eval_runtime": 16.3329, |
| "eval_samples_per_second": 6.123, |
| "eval_steps_per_second": 1.531, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.4072727272727273, |
| "grad_norm": 0.017707694321870804, |
| "learning_rate": 1.630012936610608e-05, |
| "loss": 0.0094, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.4105050505050505, |
| "grad_norm": 0.022696422412991524, |
| "learning_rate": 1.6429495472186288e-05, |
| "loss": 0.0095, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.41373737373737374, |
| "grad_norm": 0.01847272552549839, |
| "learning_rate": 1.6558861578266498e-05, |
| "loss": 0.0089, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.416969696969697, |
| "grad_norm": 0.021758975461125374, |
| "learning_rate": 1.66882276843467e-05, |
| "loss": 0.0083, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.4202020202020202, |
| "grad_norm": 0.019073544070124626, |
| "learning_rate": 1.6817593790426908e-05, |
| "loss": 0.0066, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.42343434343434344, |
| "grad_norm": 0.016551939770579338, |
| "learning_rate": 1.694695989650712e-05, |
| "loss": 0.0053, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.020770439878106117, |
| "learning_rate": 1.7076326002587322e-05, |
| "loss": 0.013, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.4298989898989899, |
| "grad_norm": 0.015350311063230038, |
| "learning_rate": 1.720569210866753e-05, |
| "loss": 0.0063, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.43313131313131314, |
| "grad_norm": 0.01807980053126812, |
| "learning_rate": 1.733505821474774e-05, |
| "loss": 0.0087, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.43636363636363634, |
| "grad_norm": 0.020209424197673798, |
| "learning_rate": 1.7464424320827943e-05, |
| "loss": 0.0091, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.4395959595959596, |
| "grad_norm": 0.01775312051177025, |
| "learning_rate": 1.7593790426908153e-05, |
| "loss": 0.0067, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.44282828282828285, |
| "grad_norm": 0.014917783439159393, |
| "learning_rate": 1.7723156532988356e-05, |
| "loss": 0.0052, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.44606060606060605, |
| "grad_norm": 0.017075974494218826, |
| "learning_rate": 1.7852522639068563e-05, |
| "loss": 0.0051, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4492929292929293, |
| "grad_norm": 0.015500731766223907, |
| "learning_rate": 1.7981888745148774e-05, |
| "loss": 0.0052, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.45252525252525255, |
| "grad_norm": 0.019736966118216515, |
| "learning_rate": 1.8111254851228977e-05, |
| "loss": 0.0057, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.45575757575757575, |
| "grad_norm": 0.016301702708005905, |
| "learning_rate": 1.8240620957309184e-05, |
| "loss": 0.0061, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.458989898989899, |
| "grad_norm": 0.021283110603690147, |
| "learning_rate": 1.8369987063389394e-05, |
| "loss": 0.0107, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4622222222222222, |
| "grad_norm": 0.01769009605050087, |
| "learning_rate": 1.8499353169469598e-05, |
| "loss": 0.0053, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.46545454545454545, |
| "grad_norm": 0.020896637812256813, |
| "learning_rate": 1.8628719275549808e-05, |
| "loss": 0.0071, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.4686868686868687, |
| "grad_norm": 0.022777913138270378, |
| "learning_rate": 1.8758085381630015e-05, |
| "loss": 0.0087, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.4719191919191919, |
| "grad_norm": 0.015831463038921356, |
| "learning_rate": 1.888745148771022e-05, |
| "loss": 0.0058, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.47515151515151516, |
| "grad_norm": 0.02245236746966839, |
| "learning_rate": 1.901681759379043e-05, |
| "loss": 0.008, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.4783838383838384, |
| "grad_norm": 0.017408763989806175, |
| "learning_rate": 1.9146183699870636e-05, |
| "loss": 0.0063, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.4816161616161616, |
| "grad_norm": 0.022278135642409325, |
| "learning_rate": 1.927554980595084e-05, |
| "loss": 0.0082, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "grad_norm": 0.017612891271710396, |
| "learning_rate": 1.940491591203105e-05, |
| "loss": 0.0071, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "eval_loss": 0.006740436423569918, |
| "eval_runtime": 16.3974, |
| "eval_samples_per_second": 6.099, |
| "eval_steps_per_second": 1.525, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.48808080808080806, |
| "grad_norm": 0.018722211942076683, |
| "learning_rate": 1.9534282018111256e-05, |
| "loss": 0.0082, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.4913131313131313, |
| "grad_norm": 0.01675565354526043, |
| "learning_rate": 1.9663648124191463e-05, |
| "loss": 0.0056, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.49454545454545457, |
| "grad_norm": 0.02161099575459957, |
| "learning_rate": 1.979301423027167e-05, |
| "loss": 0.0069, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.49777777777777776, |
| "grad_norm": 0.019918138161301613, |
| "learning_rate": 1.9922380336351877e-05, |
| "loss": 0.0071, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.501010101010101, |
| "grad_norm": 0.018196014687418938, |
| "learning_rate": 2.0051746442432084e-05, |
| "loss": 0.0079, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.5042424242424243, |
| "grad_norm": 0.019132189452648163, |
| "learning_rate": 2.018111254851229e-05, |
| "loss": 0.0076, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5074747474747475, |
| "grad_norm": 0.0169647466391325, |
| "learning_rate": 2.0310478654592497e-05, |
| "loss": 0.0048, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.5107070707070707, |
| "grad_norm": 0.020643379539251328, |
| "learning_rate": 2.0439844760672704e-05, |
| "loss": 0.0071, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5139393939393939, |
| "grad_norm": 0.020311389118433, |
| "learning_rate": 2.056921086675291e-05, |
| "loss": 0.0072, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.5171717171717172, |
| "grad_norm": 0.017437225207686424, |
| "learning_rate": 2.0698576972833118e-05, |
| "loss": 0.0047, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5204040404040404, |
| "grad_norm": 0.01759376935660839, |
| "learning_rate": 2.0827943078913325e-05, |
| "loss": 0.0078, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5236363636363637, |
| "grad_norm": 0.025967909023165703, |
| "learning_rate": 2.0957309184993532e-05, |
| "loss": 0.0087, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5268686868686868, |
| "grad_norm": 0.020681966096162796, |
| "learning_rate": 2.108667529107374e-05, |
| "loss": 0.0082, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.5301010101010101, |
| "grad_norm": 0.023083725944161415, |
| "learning_rate": 2.1216041397153946e-05, |
| "loss": 0.0093, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "grad_norm": 0.020430048927664757, |
| "learning_rate": 2.1345407503234156e-05, |
| "loss": 0.0057, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5365656565656566, |
| "grad_norm": 0.013458729721605778, |
| "learning_rate": 2.147477360931436e-05, |
| "loss": 0.0047, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5397979797979798, |
| "grad_norm": 0.018452608957886696, |
| "learning_rate": 2.1604139715394566e-05, |
| "loss": 0.0083, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5430303030303031, |
| "grad_norm": 0.01787393167614937, |
| "learning_rate": 2.1733505821474777e-05, |
| "loss": 0.0059, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5462626262626262, |
| "grad_norm": 0.019095640629529953, |
| "learning_rate": 2.186287192755498e-05, |
| "loss": 0.0091, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.5494949494949495, |
| "grad_norm": 0.025622636079788208, |
| "learning_rate": 2.1992238033635187e-05, |
| "loss": 0.0117, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5527272727272727, |
| "grad_norm": 0.01888425275683403, |
| "learning_rate": 2.2121604139715397e-05, |
| "loss": 0.0067, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.555959595959596, |
| "grad_norm": 0.020106947049498558, |
| "learning_rate": 2.22509702457956e-05, |
| "loss": 0.0101, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.5591919191919192, |
| "grad_norm": 0.018979275599122047, |
| "learning_rate": 2.238033635187581e-05, |
| "loss": 0.0069, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.5624242424242424, |
| "grad_norm": 0.016798708587884903, |
| "learning_rate": 2.2509702457956018e-05, |
| "loss": 0.0036, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.5656565656565656, |
| "grad_norm": 0.022024372592568398, |
| "learning_rate": 2.263906856403622e-05, |
| "loss": 0.0075, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5656565656565656, |
| "eval_loss": 0.006471461616456509, |
| "eval_runtime": 16.3713, |
| "eval_samples_per_second": 6.108, |
| "eval_steps_per_second": 1.527, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5688888888888889, |
| "grad_norm": 0.02207980863749981, |
| "learning_rate": 2.276843467011643e-05, |
| "loss": 0.0057, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.5721212121212121, |
| "grad_norm": 0.02284344658255577, |
| "learning_rate": 2.289780077619664e-05, |
| "loss": 0.0058, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.5753535353535354, |
| "grad_norm": 0.01840182952582836, |
| "learning_rate": 2.3027166882276842e-05, |
| "loss": 0.0046, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.5785858585858585, |
| "grad_norm": 0.01821177825331688, |
| "learning_rate": 2.3156532988357052e-05, |
| "loss": 0.0058, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.5818181818181818, |
| "grad_norm": 0.02426431141793728, |
| "learning_rate": 2.328589909443726e-05, |
| "loss": 0.007, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.585050505050505, |
| "grad_norm": 0.025767231360077858, |
| "learning_rate": 2.3415265200517466e-05, |
| "loss": 0.0071, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.5882828282828283, |
| "grad_norm": 0.016904814168810844, |
| "learning_rate": 2.3544631306597673e-05, |
| "loss": 0.0062, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.5915151515151515, |
| "grad_norm": 0.017592914402484894, |
| "learning_rate": 2.367399741267788e-05, |
| "loss": 0.0061, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.5947474747474748, |
| "grad_norm": 0.017775798216462135, |
| "learning_rate": 2.3803363518758087e-05, |
| "loss": 0.005, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.597979797979798, |
| "grad_norm": 0.017719866707921028, |
| "learning_rate": 2.3932729624838294e-05, |
| "loss": 0.005, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6012121212121212, |
| "grad_norm": 0.020644187927246094, |
| "learning_rate": 2.40620957309185e-05, |
| "loss": 0.0064, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.6044444444444445, |
| "grad_norm": 0.021930593997240067, |
| "learning_rate": 2.4191461836998707e-05, |
| "loss": 0.0059, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.6076767676767677, |
| "grad_norm": 0.022686578333377838, |
| "learning_rate": 2.4320827943078914e-05, |
| "loss": 0.0078, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.610909090909091, |
| "grad_norm": 0.019490506500005722, |
| "learning_rate": 2.445019404915912e-05, |
| "loss": 0.0057, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.6141414141414141, |
| "grad_norm": 0.023196866735816002, |
| "learning_rate": 2.4579560155239328e-05, |
| "loss": 0.0062, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6173737373737374, |
| "grad_norm": 0.024425329640507698, |
| "learning_rate": 2.4708926261319535e-05, |
| "loss": 0.0071, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6206060606060606, |
| "grad_norm": 0.021786116063594818, |
| "learning_rate": 2.4838292367399742e-05, |
| "loss": 0.0062, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6238383838383839, |
| "grad_norm": 0.024886364117264748, |
| "learning_rate": 2.496765847347995e-05, |
| "loss": 0.0064, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6270707070707071, |
| "grad_norm": 0.019750935956835747, |
| "learning_rate": 2.509702457956016e-05, |
| "loss": 0.0066, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.6303030303030303, |
| "grad_norm": 0.018560750409960747, |
| "learning_rate": 2.5226390685640362e-05, |
| "loss": 0.0065, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6335353535353535, |
| "grad_norm": 0.022106554359197617, |
| "learning_rate": 2.535575679172057e-05, |
| "loss": 0.0053, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6367676767676768, |
| "grad_norm": 0.018106624484062195, |
| "learning_rate": 2.548512289780078e-05, |
| "loss": 0.0067, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.021213680505752563, |
| "learning_rate": 2.5614489003880986e-05, |
| "loss": 0.0072, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6432323232323233, |
| "grad_norm": 0.018439847975969315, |
| "learning_rate": 2.574385510996119e-05, |
| "loss": 0.0056, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.6464646464646465, |
| "grad_norm": 0.021683279424905777, |
| "learning_rate": 2.58732212160414e-05, |
| "loss": 0.0055, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6464646464646465, |
| "eval_loss": 0.006109977141022682, |
| "eval_runtime": 16.3308, |
| "eval_samples_per_second": 6.123, |
| "eval_steps_per_second": 1.531, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6496969696969697, |
| "grad_norm": 0.025424517691135406, |
| "learning_rate": 2.6002587322121607e-05, |
| "loss": 0.0072, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.6529292929292929, |
| "grad_norm": 0.019010253250598907, |
| "learning_rate": 2.613195342820181e-05, |
| "loss": 0.0063, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6561616161616162, |
| "grad_norm": 0.027690382674336433, |
| "learning_rate": 2.626131953428202e-05, |
| "loss": 0.0073, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.6593939393939394, |
| "grad_norm": 0.029656609520316124, |
| "learning_rate": 2.6390685640362228e-05, |
| "loss": 0.0063, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.6626262626262627, |
| "grad_norm": 0.015361216850578785, |
| "learning_rate": 2.652005174644243e-05, |
| "loss": 0.0026, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6658585858585858, |
| "grad_norm": 0.015420653857290745, |
| "learning_rate": 2.664941785252264e-05, |
| "loss": 0.0038, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.6690909090909091, |
| "grad_norm": 0.021386191248893738, |
| "learning_rate": 2.677878395860285e-05, |
| "loss": 0.0069, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.6723232323232323, |
| "grad_norm": 0.022418996319174767, |
| "learning_rate": 2.6908150064683052e-05, |
| "loss": 0.0086, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.6755555555555556, |
| "grad_norm": 0.016804402694106102, |
| "learning_rate": 2.7037516170763262e-05, |
| "loss": 0.006, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.6787878787878788, |
| "grad_norm": 0.02547198161482811, |
| "learning_rate": 2.716688227684347e-05, |
| "loss": 0.006, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.682020202020202, |
| "grad_norm": 0.02453729696571827, |
| "learning_rate": 2.7296248382923673e-05, |
| "loss": 0.0067, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.6852525252525252, |
| "grad_norm": 0.017745474353432655, |
| "learning_rate": 2.7425614489003883e-05, |
| "loss": 0.0045, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.6884848484848485, |
| "grad_norm": 0.02601507492363453, |
| "learning_rate": 2.755498059508409e-05, |
| "loss": 0.0056, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.6917171717171717, |
| "grad_norm": 0.023203816264867783, |
| "learning_rate": 2.76843467011643e-05, |
| "loss": 0.0053, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.694949494949495, |
| "grad_norm": 0.021944254636764526, |
| "learning_rate": 2.7813712807244503e-05, |
| "loss": 0.0059, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6981818181818182, |
| "grad_norm": 0.022818047553300858, |
| "learning_rate": 2.794307891332471e-05, |
| "loss": 0.0063, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.7014141414141414, |
| "grad_norm": 0.020791085436940193, |
| "learning_rate": 2.807244501940492e-05, |
| "loss": 0.0052, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.7046464646464646, |
| "grad_norm": 0.020792318508028984, |
| "learning_rate": 2.8201811125485124e-05, |
| "loss": 0.0066, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.7078787878787879, |
| "grad_norm": 0.017618848010897636, |
| "learning_rate": 2.833117723156533e-05, |
| "loss": 0.0045, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.7111111111111111, |
| "grad_norm": 0.014517604373395443, |
| "learning_rate": 2.846054333764554e-05, |
| "loss": 0.006, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7143434343434344, |
| "grad_norm": 0.023496482521295547, |
| "learning_rate": 2.8589909443725745e-05, |
| "loss": 0.009, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.7175757575757575, |
| "grad_norm": 0.01889670267701149, |
| "learning_rate": 2.871927554980595e-05, |
| "loss": 0.0054, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7208080808080808, |
| "grad_norm": 0.014447416178882122, |
| "learning_rate": 2.8848641655886162e-05, |
| "loss": 0.0034, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.724040404040404, |
| "grad_norm": 0.020637504756450653, |
| "learning_rate": 2.8978007761966365e-05, |
| "loss": 0.0056, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.02608450874686241, |
| "learning_rate": 2.9107373868046572e-05, |
| "loss": 0.0089, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "eval_loss": 0.005723138339817524, |
| "eval_runtime": 16.3246, |
| "eval_samples_per_second": 6.126, |
| "eval_steps_per_second": 1.531, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7305050505050505, |
| "grad_norm": 0.02099795639514923, |
| "learning_rate": 2.9236739974126783e-05, |
| "loss": 0.0051, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7337373737373737, |
| "grad_norm": 0.027195405215024948, |
| "learning_rate": 2.936610608020699e-05, |
| "loss": 0.0083, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.7369696969696969, |
| "grad_norm": 0.0190272256731987, |
| "learning_rate": 2.9495472186287193e-05, |
| "loss": 0.0071, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7402020202020202, |
| "grad_norm": 0.02107142098248005, |
| "learning_rate": 2.9624838292367403e-05, |
| "loss": 0.0065, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.7434343434343434, |
| "grad_norm": 0.017162326723337173, |
| "learning_rate": 2.975420439844761e-05, |
| "loss": 0.0039, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7466666666666667, |
| "grad_norm": 0.01903739757835865, |
| "learning_rate": 2.9883570504527814e-05, |
| "loss": 0.0058, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.74989898989899, |
| "grad_norm": 0.018820738419890404, |
| "learning_rate": 3.0012936610608024e-05, |
| "loss": 0.0047, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.7531313131313131, |
| "grad_norm": 0.020023051649332047, |
| "learning_rate": 3.014230271668823e-05, |
| "loss": 0.0053, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.7563636363636363, |
| "grad_norm": 0.01760948821902275, |
| "learning_rate": 3.0271668822768434e-05, |
| "loss": 0.005, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.7595959595959596, |
| "grad_norm": 0.021413853392004967, |
| "learning_rate": 3.0401034928848644e-05, |
| "loss": 0.0064, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7628282828282829, |
| "grad_norm": 0.016296841204166412, |
| "learning_rate": 3.053040103492885e-05, |
| "loss": 0.0054, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.7660606060606061, |
| "grad_norm": 0.021530069410800934, |
| "learning_rate": 3.0659767141009055e-05, |
| "loss": 0.0056, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.7692929292929293, |
| "grad_norm": 0.023852026090025902, |
| "learning_rate": 3.0789133247089265e-05, |
| "loss": 0.0062, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.7725252525252525, |
| "grad_norm": 0.018425408750772476, |
| "learning_rate": 3.0918499353169475e-05, |
| "loss": 0.0061, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.7757575757575758, |
| "grad_norm": 0.01333833672106266, |
| "learning_rate": 3.104786545924967e-05, |
| "loss": 0.0028, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.778989898989899, |
| "grad_norm": 0.0206610094755888, |
| "learning_rate": 3.117723156532988e-05, |
| "loss": 0.0078, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.7822222222222223, |
| "grad_norm": 0.012977082282304764, |
| "learning_rate": 3.130659767141009e-05, |
| "loss": 0.0026, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.7854545454545454, |
| "grad_norm": 0.01584058254957199, |
| "learning_rate": 3.14359637774903e-05, |
| "loss": 0.0041, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.7886868686868687, |
| "grad_norm": 0.01646556332707405, |
| "learning_rate": 3.1565329883570506e-05, |
| "loss": 0.0046, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.7919191919191919, |
| "grad_norm": 0.020162571221590042, |
| "learning_rate": 3.169469598965072e-05, |
| "loss": 0.0059, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7951515151515152, |
| "grad_norm": 0.017648255452513695, |
| "learning_rate": 3.182406209573092e-05, |
| "loss": 0.0041, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.7983838383838384, |
| "grad_norm": 0.02181166596710682, |
| "learning_rate": 3.1953428201811124e-05, |
| "loss": 0.0068, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.8016161616161617, |
| "grad_norm": 0.0214746855199337, |
| "learning_rate": 3.2082794307891334e-05, |
| "loss": 0.0061, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.8048484848484848, |
| "grad_norm": 0.021200764924287796, |
| "learning_rate": 3.2212160413971544e-05, |
| "loss": 0.0071, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "grad_norm": 0.015399099327623844, |
| "learning_rate": 3.234152652005175e-05, |
| "loss": 0.0035, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "eval_loss": 0.005290315952152014, |
| "eval_runtime": 16.3833, |
| "eval_samples_per_second": 6.104, |
| "eval_steps_per_second": 1.526, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8113131313131313, |
| "grad_norm": 0.029362117871642113, |
| "learning_rate": 3.247089262613196e-05, |
| "loss": 0.0096, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.8145454545454546, |
| "grad_norm": 0.020502356812357903, |
| "learning_rate": 3.260025873221216e-05, |
| "loss": 0.0052, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8177777777777778, |
| "grad_norm": 0.016865013167262077, |
| "learning_rate": 3.2729624838292365e-05, |
| "loss": 0.0036, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.821010101010101, |
| "grad_norm": 0.020496444776654243, |
| "learning_rate": 3.2858990944372575e-05, |
| "loss": 0.0068, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.8242424242424242, |
| "grad_norm": 0.014448349364101887, |
| "learning_rate": 3.2988357050452786e-05, |
| "loss": 0.003, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.8274747474747475, |
| "grad_norm": 0.022134538739919662, |
| "learning_rate": 3.3117723156532996e-05, |
| "loss": 0.0072, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8307070707070707, |
| "grad_norm": 0.019236182793974876, |
| "learning_rate": 3.324708926261319e-05, |
| "loss": 0.0052, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.833939393939394, |
| "grad_norm": 0.018017595633864403, |
| "learning_rate": 3.33764553686934e-05, |
| "loss": 0.0061, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.8371717171717171, |
| "grad_norm": 0.01559491641819477, |
| "learning_rate": 3.350582147477361e-05, |
| "loss": 0.0047, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.8404040404040404, |
| "grad_norm": 0.023867258802056313, |
| "learning_rate": 3.3635187580853817e-05, |
| "loss": 0.0081, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8436363636363636, |
| "grad_norm": 0.01980549655854702, |
| "learning_rate": 3.376455368693403e-05, |
| "loss": 0.0069, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.8468686868686869, |
| "grad_norm": 0.017097562551498413, |
| "learning_rate": 3.389391979301424e-05, |
| "loss": 0.0046, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.8501010101010101, |
| "grad_norm": 0.017034752294421196, |
| "learning_rate": 3.4023285899094434e-05, |
| "loss": 0.0037, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.8533333333333334, |
| "grad_norm": 0.016199825331568718, |
| "learning_rate": 3.4152652005174644e-05, |
| "loss": 0.0033, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.8565656565656565, |
| "grad_norm": 0.01960723288357258, |
| "learning_rate": 3.4282018111254854e-05, |
| "loss": 0.0056, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8597979797979798, |
| "grad_norm": 0.024866629391908646, |
| "learning_rate": 3.441138421733506e-05, |
| "loss": 0.0076, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.863030303030303, |
| "grad_norm": 0.017310963943600655, |
| "learning_rate": 3.454075032341527e-05, |
| "loss": 0.0041, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.8662626262626263, |
| "grad_norm": 0.0179507564753294, |
| "learning_rate": 3.467011642949548e-05, |
| "loss": 0.005, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.8694949494949495, |
| "grad_norm": 0.022981947287917137, |
| "learning_rate": 3.4799482535575675e-05, |
| "loss": 0.0077, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "grad_norm": 0.018609842285513878, |
| "learning_rate": 3.4928848641655885e-05, |
| "loss": 0.005, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8759595959595959, |
| "grad_norm": 0.020172659307718277, |
| "learning_rate": 3.5058214747736096e-05, |
| "loss": 0.0063, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.8791919191919192, |
| "grad_norm": 0.01627444103360176, |
| "learning_rate": 3.5187580853816306e-05, |
| "loss": 0.0047, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.8824242424242424, |
| "grad_norm": 0.018499860540032387, |
| "learning_rate": 3.531694695989651e-05, |
| "loss": 0.0067, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.8856565656565657, |
| "grad_norm": 0.017454711720347404, |
| "learning_rate": 3.544631306597671e-05, |
| "loss": 0.0052, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.021992985159158707, |
| "learning_rate": 3.557567917205692e-05, |
| "loss": 0.0054, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "eval_loss": 0.005000718403607607, |
| "eval_runtime": 16.3443, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.53, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8921212121212121, |
| "grad_norm": 0.015971461310982704, |
| "learning_rate": 3.570504527813713e-05, |
| "loss": 0.0047, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.8953535353535353, |
| "grad_norm": 0.01679675467312336, |
| "learning_rate": 3.583441138421734e-05, |
| "loss": 0.0037, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.8985858585858586, |
| "grad_norm": 0.01664590835571289, |
| "learning_rate": 3.596377749029755e-05, |
| "loss": 0.0048, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.9018181818181819, |
| "grad_norm": 0.024900000542402267, |
| "learning_rate": 3.609314359637775e-05, |
| "loss": 0.006, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.9050505050505051, |
| "grad_norm": 0.017504746094346046, |
| "learning_rate": 3.6222509702457954e-05, |
| "loss": 0.0048, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9082828282828282, |
| "grad_norm": 0.02234167419373989, |
| "learning_rate": 3.6351875808538164e-05, |
| "loss": 0.0068, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.9115151515151515, |
| "grad_norm": 0.019042737782001495, |
| "learning_rate": 3.648124191461837e-05, |
| "loss": 0.0046, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.9147474747474748, |
| "grad_norm": 0.015853388234972954, |
| "learning_rate": 3.661060802069858e-05, |
| "loss": 0.0036, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.917979797979798, |
| "grad_norm": 0.0178302600979805, |
| "learning_rate": 3.673997412677879e-05, |
| "loss": 0.0051, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.9212121212121213, |
| "grad_norm": 0.02232186682522297, |
| "learning_rate": 3.6869340232859e-05, |
| "loss": 0.0052, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.9244444444444444, |
| "grad_norm": 0.020168444141745567, |
| "learning_rate": 3.6998706338939195e-05, |
| "loss": 0.0077, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.9276767676767677, |
| "grad_norm": 0.01861269772052765, |
| "learning_rate": 3.7128072445019406e-05, |
| "loss": 0.0052, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.9309090909090909, |
| "grad_norm": 0.02563636004924774, |
| "learning_rate": 3.7257438551099616e-05, |
| "loss": 0.0092, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9341414141414142, |
| "grad_norm": 0.019770383834838867, |
| "learning_rate": 3.738680465717982e-05, |
| "loss": 0.0042, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.9373737373737374, |
| "grad_norm": 0.016416925936937332, |
| "learning_rate": 3.751617076326003e-05, |
| "loss": 0.0036, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9406060606060606, |
| "grad_norm": 0.013349834829568863, |
| "learning_rate": 3.764553686934023e-05, |
| "loss": 0.0042, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.9438383838383838, |
| "grad_norm": 0.01671830751001835, |
| "learning_rate": 3.777490297542044e-05, |
| "loss": 0.0045, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.9470707070707071, |
| "grad_norm": 0.01900608092546463, |
| "learning_rate": 3.790426908150065e-05, |
| "loss": 0.0061, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.9503030303030303, |
| "grad_norm": 0.01652224175632, |
| "learning_rate": 3.803363518758086e-05, |
| "loss": 0.0042, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.9535353535353536, |
| "grad_norm": 0.014939991757273674, |
| "learning_rate": 3.816300129366106e-05, |
| "loss": 0.0035, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9567676767676768, |
| "grad_norm": 0.0184948593378067, |
| "learning_rate": 3.829236739974127e-05, |
| "loss": 0.0057, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 0.018070057034492493, |
| "learning_rate": 3.8421733505821475e-05, |
| "loss": 0.0042, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.9632323232323232, |
| "grad_norm": 0.015646759420633316, |
| "learning_rate": 3.855109961190168e-05, |
| "loss": 0.0038, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.9664646464646465, |
| "grad_norm": 0.01496923342347145, |
| "learning_rate": 3.868046571798189e-05, |
| "loss": 0.003, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "grad_norm": 0.01583954133093357, |
| "learning_rate": 3.88098318240621e-05, |
| "loss": 0.0031, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "eval_loss": 0.004727667197585106, |
| "eval_runtime": 16.3211, |
| "eval_samples_per_second": 6.127, |
| "eval_steps_per_second": 1.532, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.972929292929293, |
| "grad_norm": 0.019395483657717705, |
| "learning_rate": 3.893919793014231e-05, |
| "loss": 0.0042, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.9761616161616161, |
| "grad_norm": 0.01872323825955391, |
| "learning_rate": 3.906856403622251e-05, |
| "loss": 0.0047, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.9793939393939394, |
| "grad_norm": 0.019994398579001427, |
| "learning_rate": 3.9197930142302716e-05, |
| "loss": 0.0048, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.9826262626262626, |
| "grad_norm": 0.019639745354652405, |
| "learning_rate": 3.9327296248382926e-05, |
| "loss": 0.0064, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.9858585858585859, |
| "grad_norm": 0.01813843846321106, |
| "learning_rate": 3.945666235446313e-05, |
| "loss": 0.0062, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9890909090909091, |
| "grad_norm": 0.0215386301279068, |
| "learning_rate": 3.958602846054334e-05, |
| "loss": 0.0058, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.9923232323232323, |
| "grad_norm": 0.014398916624486446, |
| "learning_rate": 3.971539456662355e-05, |
| "loss": 0.003, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.9955555555555555, |
| "grad_norm": 0.011320858262479305, |
| "learning_rate": 3.9844760672703754e-05, |
| "loss": 0.0031, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.9987878787878788, |
| "grad_norm": 0.014571242034435272, |
| "learning_rate": 3.997412677878396e-05, |
| "loss": 0.0043, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.002020202020202, |
| "grad_norm": 0.03683099523186684, |
| "learning_rate": 4.010349288486417e-05, |
| "loss": 0.0101, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0052525252525253, |
| "grad_norm": 0.01816723309457302, |
| "learning_rate": 4.023285899094437e-05, |
| "loss": 0.0055, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.0084848484848485, |
| "grad_norm": 0.01920638605952263, |
| "learning_rate": 4.036222509702458e-05, |
| "loss": 0.0052, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.0117171717171718, |
| "grad_norm": 0.016668478026986122, |
| "learning_rate": 4.049159120310479e-05, |
| "loss": 0.0035, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.014949494949495, |
| "grad_norm": 0.019997665658593178, |
| "learning_rate": 4.0620957309184995e-05, |
| "loss": 0.0049, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.018181818181818, |
| "grad_norm": 0.01405120175331831, |
| "learning_rate": 4.07503234152652e-05, |
| "loss": 0.0032, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.0214141414141413, |
| "grad_norm": 0.0201172586530447, |
| "learning_rate": 4.087968952134541e-05, |
| "loss": 0.0074, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.0246464646464646, |
| "grad_norm": 0.0167316235601902, |
| "learning_rate": 4.100905562742562e-05, |
| "loss": 0.0044, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.0278787878787878, |
| "grad_norm": 0.02000577375292778, |
| "learning_rate": 4.113842173350582e-05, |
| "loss": 0.006, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.031111111111111, |
| "grad_norm": 0.015127904713153839, |
| "learning_rate": 4.126778783958603e-05, |
| "loss": 0.004, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.0343434343434343, |
| "grad_norm": 0.01947900466620922, |
| "learning_rate": 4.1397153945666236e-05, |
| "loss": 0.0058, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0375757575757576, |
| "grad_norm": 0.013327086344361305, |
| "learning_rate": 4.152652005174644e-05, |
| "loss": 0.0033, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.0408080808080808, |
| "grad_norm": 0.01852571964263916, |
| "learning_rate": 4.165588615782665e-05, |
| "loss": 0.0063, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.044040404040404, |
| "grad_norm": 0.018009457737207413, |
| "learning_rate": 4.178525226390686e-05, |
| "loss": 0.0034, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.0472727272727274, |
| "grad_norm": 0.013065322302281857, |
| "learning_rate": 4.1914618369987064e-05, |
| "loss": 0.0026, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.0505050505050506, |
| "grad_norm": 0.02070598676800728, |
| "learning_rate": 4.2043984476067274e-05, |
| "loss": 0.0048, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0505050505050506, |
| "eval_loss": 0.0044373939745128155, |
| "eval_runtime": 16.3426, |
| "eval_samples_per_second": 6.119, |
| "eval_steps_per_second": 1.53, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0537373737373736, |
| "grad_norm": 0.014944811351597309, |
| "learning_rate": 4.217335058214748e-05, |
| "loss": 0.0038, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.056969696969697, |
| "grad_norm": 0.01953217387199402, |
| "learning_rate": 4.230271668822768e-05, |
| "loss": 0.0039, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.0602020202020201, |
| "grad_norm": 0.014163571409881115, |
| "learning_rate": 4.243208279430789e-05, |
| "loss": 0.0032, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.0634343434343434, |
| "grad_norm": 0.01625542901456356, |
| "learning_rate": 4.25614489003881e-05, |
| "loss": 0.0029, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.0666666666666667, |
| "grad_norm": 0.018217438831925392, |
| "learning_rate": 4.269081500646831e-05, |
| "loss": 0.0045, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.06989898989899, |
| "grad_norm": 0.016029763966798782, |
| "learning_rate": 4.2820181112548515e-05, |
| "loss": 0.0046, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.0731313131313132, |
| "grad_norm": 0.01773686707019806, |
| "learning_rate": 4.294954721862872e-05, |
| "loss": 0.0058, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.0763636363636364, |
| "grad_norm": 0.0151143753901124, |
| "learning_rate": 4.307891332470893e-05, |
| "loss": 0.0035, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.0795959595959597, |
| "grad_norm": 0.01726050116121769, |
| "learning_rate": 4.320827943078913e-05, |
| "loss": 0.0045, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.082828282828283, |
| "grad_norm": 0.019141988828778267, |
| "learning_rate": 4.333764553686934e-05, |
| "loss": 0.0049, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.086060606060606, |
| "grad_norm": 0.020579032599925995, |
| "learning_rate": 4.346701164294955e-05, |
| "loss": 0.0054, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.0892929292929292, |
| "grad_norm": 0.017402032390236855, |
| "learning_rate": 4.359637774902976e-05, |
| "loss": 0.0029, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.0925252525252525, |
| "grad_norm": 0.01794702559709549, |
| "learning_rate": 4.372574385510996e-05, |
| "loss": 0.0036, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.0957575757575757, |
| "grad_norm": 0.014414181001484394, |
| "learning_rate": 4.385510996119017e-05, |
| "loss": 0.0045, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.098989898989899, |
| "grad_norm": 0.014286419376730919, |
| "learning_rate": 4.3984476067270374e-05, |
| "loss": 0.0033, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1022222222222222, |
| "grad_norm": 0.014535068534314632, |
| "learning_rate": 4.4113842173350584e-05, |
| "loss": 0.0027, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.1054545454545455, |
| "grad_norm": 0.025067241862416267, |
| "learning_rate": 4.4243208279430794e-05, |
| "loss": 0.0056, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.1086868686868687, |
| "grad_norm": 0.02371220663189888, |
| "learning_rate": 4.4372574385511e-05, |
| "loss": 0.0075, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.111919191919192, |
| "grad_norm": 0.014735649339854717, |
| "learning_rate": 4.45019404915912e-05, |
| "loss": 0.003, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.1151515151515152, |
| "grad_norm": 0.01727231964468956, |
| "learning_rate": 4.463130659767141e-05, |
| "loss": 0.005, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.1183838383838385, |
| "grad_norm": 0.01831323653459549, |
| "learning_rate": 4.476067270375162e-05, |
| "loss": 0.0047, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.1216161616161617, |
| "grad_norm": 0.015522800385951996, |
| "learning_rate": 4.4890038809831825e-05, |
| "loss": 0.0031, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.1248484848484848, |
| "grad_norm": 0.017642300575971603, |
| "learning_rate": 4.5019404915912036e-05, |
| "loss": 0.0039, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.128080808080808, |
| "grad_norm": 0.015553643926978111, |
| "learning_rate": 4.514877102199224e-05, |
| "loss": 0.003, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.1313131313131313, |
| "grad_norm": 0.014179611578583717, |
| "learning_rate": 4.527813712807244e-05, |
| "loss": 0.0027, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1313131313131313, |
| "eval_loss": 0.004338070284575224, |
| "eval_runtime": 16.3532, |
| "eval_samples_per_second": 6.115, |
| "eval_steps_per_second": 1.529, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1345454545454545, |
| "grad_norm": 0.014060231857001781, |
| "learning_rate": 4.540750323415265e-05, |
| "loss": 0.0029, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.1377777777777778, |
| "grad_norm": 0.023371202871203423, |
| "learning_rate": 4.553686934023286e-05, |
| "loss": 0.0053, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.141010101010101, |
| "grad_norm": 0.024268243461847305, |
| "learning_rate": 4.566623544631307e-05, |
| "loss": 0.0062, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.1442424242424243, |
| "grad_norm": 0.01984681747853756, |
| "learning_rate": 4.579560155239328e-05, |
| "loss": 0.005, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.1474747474747475, |
| "grad_norm": 0.01846770942211151, |
| "learning_rate": 4.592496765847348e-05, |
| "loss": 0.0047, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.1507070707070708, |
| "grad_norm": 0.01641472429037094, |
| "learning_rate": 4.6054333764553684e-05, |
| "loss": 0.0046, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.1539393939393938, |
| "grad_norm": 0.016151661053299904, |
| "learning_rate": 4.6183699870633894e-05, |
| "loss": 0.0037, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.157171717171717, |
| "grad_norm": 0.01714576967060566, |
| "learning_rate": 4.6313065976714105e-05, |
| "loss": 0.0037, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.1604040404040403, |
| "grad_norm": 0.01588398776948452, |
| "learning_rate": 4.6442432082794315e-05, |
| "loss": 0.0039, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.1636363636363636, |
| "grad_norm": 0.01592858135700226, |
| "learning_rate": 4.657179818887452e-05, |
| "loss": 0.0033, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1668686868686868, |
| "grad_norm": 0.018423041328787804, |
| "learning_rate": 4.670116429495472e-05, |
| "loss": 0.0048, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.17010101010101, |
| "grad_norm": 0.014986859634518623, |
| "learning_rate": 4.683053040103493e-05, |
| "loss": 0.0025, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.1733333333333333, |
| "grad_norm": 0.015355497598648071, |
| "learning_rate": 4.6959896507115136e-05, |
| "loss": 0.0035, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.1765656565656566, |
| "grad_norm": 0.02171158418059349, |
| "learning_rate": 4.7089262613195346e-05, |
| "loss": 0.0047, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.1797979797979798, |
| "grad_norm": 0.017679892480373383, |
| "learning_rate": 4.7218628719275556e-05, |
| "loss": 0.0031, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.183030303030303, |
| "grad_norm": 0.015987776219844818, |
| "learning_rate": 4.734799482535576e-05, |
| "loss": 0.0034, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.1862626262626264, |
| "grad_norm": 0.023376576602458954, |
| "learning_rate": 4.747736093143596e-05, |
| "loss": 0.0065, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.1894949494949496, |
| "grad_norm": 0.01579899713397026, |
| "learning_rate": 4.760672703751617e-05, |
| "loss": 0.0032, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.1927272727272726, |
| "grad_norm": 0.011925517581403255, |
| "learning_rate": 4.773609314359638e-05, |
| "loss": 0.0025, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.195959595959596, |
| "grad_norm": 0.02474330924451351, |
| "learning_rate": 4.786545924967659e-05, |
| "loss": 0.0078, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1991919191919191, |
| "grad_norm": 0.01529130432754755, |
| "learning_rate": 4.79948253557568e-05, |
| "loss": 0.0035, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.2024242424242424, |
| "grad_norm": 0.014695713296532631, |
| "learning_rate": 4.8124191461837e-05, |
| "loss": 0.0055, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.2056565656565656, |
| "grad_norm": 0.013606542721390724, |
| "learning_rate": 4.8253557567917204e-05, |
| "loss": 0.0028, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.208888888888889, |
| "grad_norm": 0.012038321234285831, |
| "learning_rate": 4.8382923673997415e-05, |
| "loss": 0.0027, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.2121212121212122, |
| "grad_norm": 0.01633366383612156, |
| "learning_rate": 4.8512289780077625e-05, |
| "loss": 0.0033, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2121212121212122, |
| "eval_loss": 0.004153456538915634, |
| "eval_runtime": 16.3409, |
| "eval_samples_per_second": 6.12, |
| "eval_steps_per_second": 1.53, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2153535353535354, |
| "grad_norm": 0.019631968811154366, |
| "learning_rate": 4.864165588615783e-05, |
| "loss": 0.0035, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.2185858585858587, |
| "grad_norm": 0.017855241894721985, |
| "learning_rate": 4.877102199223804e-05, |
| "loss": 0.0047, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.221818181818182, |
| "grad_norm": 0.016393886879086494, |
| "learning_rate": 4.890038809831824e-05, |
| "loss": 0.0027, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.225050505050505, |
| "grad_norm": 0.0160859115421772, |
| "learning_rate": 4.9029754204398446e-05, |
| "loss": 0.0037, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.2282828282828282, |
| "grad_norm": 0.015383926220238209, |
| "learning_rate": 4.9159120310478656e-05, |
| "loss": 0.0036, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2315151515151515, |
| "grad_norm": 0.018972352147102356, |
| "learning_rate": 4.9288486416558866e-05, |
| "loss": 0.004, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.2347474747474747, |
| "grad_norm": 0.015323465690016747, |
| "learning_rate": 4.941785252263907e-05, |
| "loss": 0.0033, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.237979797979798, |
| "grad_norm": 0.01524687185883522, |
| "learning_rate": 4.954721862871928e-05, |
| "loss": 0.0035, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.2412121212121212, |
| "grad_norm": 0.014032768085598946, |
| "learning_rate": 4.9676584734799483e-05, |
| "loss": 0.0021, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.2444444444444445, |
| "grad_norm": 0.020573202520608902, |
| "learning_rate": 4.980595084087969e-05, |
| "loss": 0.0063, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.2476767676767677, |
| "grad_norm": 0.01424959022551775, |
| "learning_rate": 4.99353169469599e-05, |
| "loss": 0.0035, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.250909090909091, |
| "grad_norm": 0.01842297427356243, |
| "learning_rate": 5.006468305304011e-05, |
| "loss": 0.0055, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.2541414141414142, |
| "grad_norm": 0.01563439331948757, |
| "learning_rate": 5.019404915912032e-05, |
| "loss": 0.0027, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.2573737373737375, |
| "grad_norm": 0.016885295510292053, |
| "learning_rate": 5.032341526520052e-05, |
| "loss": 0.0037, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.2606060606060607, |
| "grad_norm": 0.02013927884399891, |
| "learning_rate": 5.0452781371280725e-05, |
| "loss": 0.0053, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.2638383838383838, |
| "grad_norm": 0.019207125529646873, |
| "learning_rate": 5.058214747736093e-05, |
| "loss": 0.0034, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.267070707070707, |
| "grad_norm": 0.019889958202838898, |
| "learning_rate": 5.071151358344114e-05, |
| "loss": 0.0071, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.2703030303030303, |
| "grad_norm": 0.014296320267021656, |
| "learning_rate": 5.084087968952135e-05, |
| "loss": 0.0035, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.2735353535353535, |
| "grad_norm": 0.01841472089290619, |
| "learning_rate": 5.097024579560156e-05, |
| "loss": 0.0058, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.2767676767676768, |
| "grad_norm": 0.01906520500779152, |
| "learning_rate": 5.109961190168176e-05, |
| "loss": 0.0043, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 0.014472922310233116, |
| "learning_rate": 5.122897800776197e-05, |
| "loss": 0.0044, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.2832323232323233, |
| "grad_norm": 0.01328935194760561, |
| "learning_rate": 5.135834411384217e-05, |
| "loss": 0.0036, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.2864646464646465, |
| "grad_norm": 0.014197317883372307, |
| "learning_rate": 5.148771021992238e-05, |
| "loss": 0.0028, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.2896969696969696, |
| "grad_norm": 0.017615506425499916, |
| "learning_rate": 5.161707632600259e-05, |
| "loss": 0.0051, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.2929292929292928, |
| "grad_norm": 0.0213606059551239, |
| "learning_rate": 5.17464424320828e-05, |
| "loss": 0.004, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2929292929292928, |
| "eval_loss": 0.0040789199993014336, |
| "eval_runtime": 16.353, |
| "eval_samples_per_second": 6.115, |
| "eval_steps_per_second": 1.529, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.296161616161616, |
| "grad_norm": 0.012542161159217358, |
| "learning_rate": 5.1875808538163004e-05, |
| "loss": 0.003, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.2993939393939393, |
| "grad_norm": 0.013753628358244896, |
| "learning_rate": 5.2005174644243214e-05, |
| "loss": 0.0038, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.3026262626262626, |
| "grad_norm": 0.01603030040860176, |
| "learning_rate": 5.213454075032341e-05, |
| "loss": 0.0037, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.3058585858585858, |
| "grad_norm": 0.011316108517348766, |
| "learning_rate": 5.226390685640362e-05, |
| "loss": 0.0032, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.309090909090909, |
| "grad_norm": 0.015535411424934864, |
| "learning_rate": 5.239327296248383e-05, |
| "loss": 0.0037, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.3123232323232323, |
| "grad_norm": 0.02128685638308525, |
| "learning_rate": 5.252263906856404e-05, |
| "loss": 0.0078, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.3155555555555556, |
| "grad_norm": 0.012263530865311623, |
| "learning_rate": 5.2652005174644245e-05, |
| "loss": 0.0023, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.3187878787878788, |
| "grad_norm": 0.014120521955192089, |
| "learning_rate": 5.2781371280724455e-05, |
| "loss": 0.0031, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.322020202020202, |
| "grad_norm": 0.01697922684252262, |
| "learning_rate": 5.2910737386804666e-05, |
| "loss": 0.0037, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.3252525252525253, |
| "grad_norm": 0.015713240951299667, |
| "learning_rate": 5.304010349288486e-05, |
| "loss": 0.0032, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3284848484848486, |
| "grad_norm": 0.02035530097782612, |
| "learning_rate": 5.316946959896507e-05, |
| "loss": 0.0041, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.3317171717171719, |
| "grad_norm": 0.01861133798956871, |
| "learning_rate": 5.329883570504528e-05, |
| "loss": 0.0045, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.3349494949494949, |
| "grad_norm": 0.018983542919158936, |
| "learning_rate": 5.3428201811125486e-05, |
| "loss": 0.0044, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.3381818181818181, |
| "grad_norm": 0.014268258586525917, |
| "learning_rate": 5.35575679172057e-05, |
| "loss": 0.0026, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.3414141414141414, |
| "grad_norm": 0.01702299527823925, |
| "learning_rate": 5.368693402328591e-05, |
| "loss": 0.0036, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.3446464646464646, |
| "grad_norm": 0.01684991456568241, |
| "learning_rate": 5.3816300129366104e-05, |
| "loss": 0.0033, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.347878787878788, |
| "grad_norm": 0.01297234557569027, |
| "learning_rate": 5.3945666235446314e-05, |
| "loss": 0.0031, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.3511111111111112, |
| "grad_norm": 0.01872768998146057, |
| "learning_rate": 5.4075032341526524e-05, |
| "loss": 0.004, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.3543434343434344, |
| "grad_norm": 0.01233761291950941, |
| "learning_rate": 5.420439844760673e-05, |
| "loss": 0.0029, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.3575757575757577, |
| "grad_norm": 0.01894952915608883, |
| "learning_rate": 5.433376455368694e-05, |
| "loss": 0.0052, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3608080808080807, |
| "grad_norm": 0.015206394717097282, |
| "learning_rate": 5.446313065976715e-05, |
| "loss": 0.004, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.364040404040404, |
| "grad_norm": 0.017510687932372093, |
| "learning_rate": 5.4592496765847345e-05, |
| "loss": 0.0045, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.3672727272727272, |
| "grad_norm": 0.02136324532330036, |
| "learning_rate": 5.4721862871927555e-05, |
| "loss": 0.0045, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.3705050505050504, |
| "grad_norm": 0.018400510773062706, |
| "learning_rate": 5.4851228978007766e-05, |
| "loss": 0.0053, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.3737373737373737, |
| "grad_norm": 0.015479810535907745, |
| "learning_rate": 5.498059508408797e-05, |
| "loss": 0.0035, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.3737373737373737, |
| "eval_loss": 0.003997208550572395, |
| "eval_runtime": 16.3384, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.376969696969697, |
| "grad_norm": 0.01698392629623413, |
| "learning_rate": 5.510996119016818e-05, |
| "loss": 0.0042, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.3802020202020202, |
| "grad_norm": 0.016880692914128304, |
| "learning_rate": 5.523932729624839e-05, |
| "loss": 0.0035, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.3834343434343435, |
| "grad_norm": 0.017129207029938698, |
| "learning_rate": 5.53686934023286e-05, |
| "loss": 0.0035, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.3866666666666667, |
| "grad_norm": 0.011064956896007061, |
| "learning_rate": 5.5498059508408797e-05, |
| "loss": 0.002, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.38989898989899, |
| "grad_norm": 0.012862925417721272, |
| "learning_rate": 5.562742561448901e-05, |
| "loss": 0.0022, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3931313131313132, |
| "grad_norm": 0.01640648953616619, |
| "learning_rate": 5.575679172056921e-05, |
| "loss": 0.0035, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.3963636363636365, |
| "grad_norm": 0.01809324324131012, |
| "learning_rate": 5.588615782664942e-05, |
| "loss": 0.0045, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.3995959595959597, |
| "grad_norm": 0.014535348862409592, |
| "learning_rate": 5.601552393272963e-05, |
| "loss": 0.0038, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.4028282828282828, |
| "grad_norm": 0.01154601201415062, |
| "learning_rate": 5.614489003880984e-05, |
| "loss": 0.0027, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.406060606060606, |
| "grad_norm": 0.018451618030667305, |
| "learning_rate": 5.627425614489004e-05, |
| "loss": 0.0036, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.4092929292929293, |
| "grad_norm": 0.014590181410312653, |
| "learning_rate": 5.640362225097025e-05, |
| "loss": 0.0037, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.4125252525252525, |
| "grad_norm": 0.016967356204986572, |
| "learning_rate": 5.653298835705045e-05, |
| "loss": 0.0047, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.4157575757575758, |
| "grad_norm": 0.011046033352613449, |
| "learning_rate": 5.666235446313066e-05, |
| "loss": 0.0028, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.418989898989899, |
| "grad_norm": 0.01384658832103014, |
| "learning_rate": 5.679172056921087e-05, |
| "loss": 0.0036, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.4222222222222223, |
| "grad_norm": 0.01323483232408762, |
| "learning_rate": 5.692108667529108e-05, |
| "loss": 0.0037, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4254545454545455, |
| "grad_norm": 0.012249852530658245, |
| "learning_rate": 5.7050452781371286e-05, |
| "loss": 0.0028, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.4286868686868686, |
| "grad_norm": 0.013492387719452381, |
| "learning_rate": 5.717981888745149e-05, |
| "loss": 0.0035, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.4319191919191918, |
| "grad_norm": 0.012272081337869167, |
| "learning_rate": 5.730918499353169e-05, |
| "loss": 0.0024, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.435151515151515, |
| "grad_norm": 0.014237101189792156, |
| "learning_rate": 5.74385510996119e-05, |
| "loss": 0.0033, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.4383838383838383, |
| "grad_norm": 0.015182110480964184, |
| "learning_rate": 5.7567917205692113e-05, |
| "loss": 0.0036, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.4416161616161616, |
| "grad_norm": 0.016182074323296547, |
| "learning_rate": 5.7697283311772324e-05, |
| "loss": 0.0033, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.4448484848484848, |
| "grad_norm": 0.014590322971343994, |
| "learning_rate": 5.782664941785253e-05, |
| "loss": 0.0034, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.448080808080808, |
| "grad_norm": 0.01567489095032215, |
| "learning_rate": 5.795601552393273e-05, |
| "loss": 0.003, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.4513131313131313, |
| "grad_norm": 0.01376601867377758, |
| "learning_rate": 5.8085381630012934e-05, |
| "loss": 0.0026, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "grad_norm": 0.016052531078457832, |
| "learning_rate": 5.8214747736093145e-05, |
| "loss": 0.003, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "eval_loss": 0.0035364925861358643, |
| "eval_runtime": 16.3342, |
| "eval_samples_per_second": 6.122, |
| "eval_steps_per_second": 1.531, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4577777777777778, |
| "grad_norm": 0.01367964968085289, |
| "learning_rate": 5.8344113842173355e-05, |
| "loss": 0.0025, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.461010101010101, |
| "grad_norm": 0.021040860563516617, |
| "learning_rate": 5.8473479948253565e-05, |
| "loss": 0.0042, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.4642424242424243, |
| "grad_norm": 0.01780344359576702, |
| "learning_rate": 5.860284605433377e-05, |
| "loss": 0.0037, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.4674747474747476, |
| "grad_norm": 0.013049687258899212, |
| "learning_rate": 5.873221216041398e-05, |
| "loss": 0.0032, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.4707070707070706, |
| "grad_norm": 0.01268705539405346, |
| "learning_rate": 5.8861578266494176e-05, |
| "loss": 0.0033, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.4739393939393939, |
| "grad_norm": 0.013753107748925686, |
| "learning_rate": 5.8990944372574386e-05, |
| "loss": 0.0032, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.4771717171717171, |
| "grad_norm": 0.01435878872871399, |
| "learning_rate": 5.9120310478654596e-05, |
| "loss": 0.0029, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.4804040404040404, |
| "grad_norm": 0.015471439808607101, |
| "learning_rate": 5.9249676584734806e-05, |
| "loss": 0.0058, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.4836363636363636, |
| "grad_norm": 0.011495107784867287, |
| "learning_rate": 5.937904269081501e-05, |
| "loss": 0.0028, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.486868686868687, |
| "grad_norm": 0.013401095755398273, |
| "learning_rate": 5.950840879689522e-05, |
| "loss": 0.0018, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4901010101010101, |
| "grad_norm": 0.01646360009908676, |
| "learning_rate": 5.963777490297542e-05, |
| "loss": 0.0049, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.4933333333333334, |
| "grad_norm": 0.013629264198243618, |
| "learning_rate": 5.976714100905563e-05, |
| "loss": 0.0028, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.4965656565656564, |
| "grad_norm": 0.012427560985088348, |
| "learning_rate": 5.989650711513584e-05, |
| "loss": 0.0028, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.4997979797979797, |
| "grad_norm": 0.01451667957007885, |
| "learning_rate": 6.002587322121605e-05, |
| "loss": 0.0036, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.503030303030303, |
| "grad_norm": 0.02106543630361557, |
| "learning_rate": 6.015523932729625e-05, |
| "loss": 0.0032, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.5062626262626262, |
| "grad_norm": 0.02062024176120758, |
| "learning_rate": 6.028460543337646e-05, |
| "loss": 0.0047, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.5094949494949494, |
| "grad_norm": 0.01738291047513485, |
| "learning_rate": 6.041397153945667e-05, |
| "loss": 0.0041, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.5127272727272727, |
| "grad_norm": 0.018532967194914818, |
| "learning_rate": 6.054333764553687e-05, |
| "loss": 0.0049, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.515959595959596, |
| "grad_norm": 0.015022393316030502, |
| "learning_rate": 6.067270375161708e-05, |
| "loss": 0.0043, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.5191919191919192, |
| "grad_norm": 0.011915473267436028, |
| "learning_rate": 6.080206985769729e-05, |
| "loss": 0.0031, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.5224242424242425, |
| "grad_norm": 0.011362905614078045, |
| "learning_rate": 6.093143596377749e-05, |
| "loss": 0.0018, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.5256565656565657, |
| "grad_norm": 0.01683732494711876, |
| "learning_rate": 6.10608020698577e-05, |
| "loss": 0.0039, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.528888888888889, |
| "grad_norm": 0.01199402753263712, |
| "learning_rate": 6.119016817593791e-05, |
| "loss": 0.0028, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.5321212121212122, |
| "grad_norm": 0.019562341272830963, |
| "learning_rate": 6.131953428201811e-05, |
| "loss": 0.006, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.5353535353535355, |
| "grad_norm": 0.011094428598880768, |
| "learning_rate": 6.144890038809832e-05, |
| "loss": 0.0025, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.5353535353535355, |
| "eval_loss": 0.0035142824053764343, |
| "eval_runtime": 16.3643, |
| "eval_samples_per_second": 6.111, |
| "eval_steps_per_second": 1.528, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.5385858585858587, |
| "grad_norm": 0.01065843552350998, |
| "learning_rate": 6.157826649417853e-05, |
| "loss": 0.0021, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.541818181818182, |
| "grad_norm": 0.016556832939386368, |
| "learning_rate": 6.170763260025874e-05, |
| "loss": 0.0037, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.545050505050505, |
| "grad_norm": 0.014447256922721863, |
| "learning_rate": 6.183699870633895e-05, |
| "loss": 0.0038, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.5482828282828283, |
| "grad_norm": 0.01537930965423584, |
| "learning_rate": 6.196636481241915e-05, |
| "loss": 0.0039, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.5515151515151515, |
| "grad_norm": 0.012505353428423405, |
| "learning_rate": 6.209573091849934e-05, |
| "loss": 0.0025, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5547474747474748, |
| "grad_norm": 0.01292706374078989, |
| "learning_rate": 6.222509702457955e-05, |
| "loss": 0.003, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.557979797979798, |
| "grad_norm": 0.010675537399947643, |
| "learning_rate": 6.235446313065976e-05, |
| "loss": 0.0023, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.561212121212121, |
| "grad_norm": 0.014718054793775082, |
| "learning_rate": 6.248382923673998e-05, |
| "loss": 0.0037, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.5644444444444443, |
| "grad_norm": 0.016600441187620163, |
| "learning_rate": 6.261319534282019e-05, |
| "loss": 0.0041, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.5676767676767676, |
| "grad_norm": 0.0184367336332798, |
| "learning_rate": 6.27425614489004e-05, |
| "loss": 0.0031, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.5709090909090908, |
| "grad_norm": 0.014964444562792778, |
| "learning_rate": 6.28719275549806e-05, |
| "loss": 0.0028, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.574141414141414, |
| "grad_norm": 0.01758856140077114, |
| "learning_rate": 6.30012936610608e-05, |
| "loss": 0.0045, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.5773737373737373, |
| "grad_norm": 0.012081869877874851, |
| "learning_rate": 6.313065976714101e-05, |
| "loss": 0.0025, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.5806060606060606, |
| "grad_norm": 0.014401961117982864, |
| "learning_rate": 6.326002587322122e-05, |
| "loss": 0.0032, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.5838383838383838, |
| "grad_norm": 0.013464621268212795, |
| "learning_rate": 6.338939197930143e-05, |
| "loss": 0.0035, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.587070707070707, |
| "grad_norm": 0.008521300740540028, |
| "learning_rate": 6.351875808538163e-05, |
| "loss": 0.0016, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.5903030303030303, |
| "grad_norm": 0.01373705081641674, |
| "learning_rate": 6.364812419146184e-05, |
| "loss": 0.0032, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.5935353535353536, |
| "grad_norm": 0.010133868083357811, |
| "learning_rate": 6.377749029754204e-05, |
| "loss": 0.0028, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.5967676767676768, |
| "grad_norm": 0.012263374403119087, |
| "learning_rate": 6.390685640362225e-05, |
| "loss": 0.0025, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.01148121990263462, |
| "learning_rate": 6.403622250970246e-05, |
| "loss": 0.002, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.6032323232323233, |
| "grad_norm": 0.011973985470831394, |
| "learning_rate": 6.416558861578267e-05, |
| "loss": 0.0023, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.6064646464646466, |
| "grad_norm": 0.015984676778316498, |
| "learning_rate": 6.429495472186288e-05, |
| "loss": 0.0044, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.6096969696969698, |
| "grad_norm": 0.011556907556951046, |
| "learning_rate": 6.442432082794309e-05, |
| "loss": 0.002, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.6129292929292929, |
| "grad_norm": 0.016730768606066704, |
| "learning_rate": 6.45536869340233e-05, |
| "loss": 0.0035, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.6161616161616161, |
| "grad_norm": 0.015524023212492466, |
| "learning_rate": 6.46830530401035e-05, |
| "loss": 0.0042, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6161616161616161, |
| "eval_loss": 0.003171939868479967, |
| "eval_runtime": 16.3588, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6193939393939394, |
| "grad_norm": 0.014239253476262093, |
| "learning_rate": 6.48124191461837e-05, |
| "loss": 0.0021, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.6226262626262626, |
| "grad_norm": 0.010067800991237164, |
| "learning_rate": 6.494178525226392e-05, |
| "loss": 0.0027, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.625858585858586, |
| "grad_norm": 0.0146791972219944, |
| "learning_rate": 6.507115135834411e-05, |
| "loss": 0.0029, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.6290909090909091, |
| "grad_norm": 0.011821575462818146, |
| "learning_rate": 6.520051746442432e-05, |
| "loss": 0.0029, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.6323232323232322, |
| "grad_norm": 0.015757029876112938, |
| "learning_rate": 6.532988357050453e-05, |
| "loss": 0.0047, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.6355555555555554, |
| "grad_norm": 0.018067853525280952, |
| "learning_rate": 6.545924967658473e-05, |
| "loss": 0.0036, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.6387878787878787, |
| "grad_norm": 0.013980223797261715, |
| "learning_rate": 6.558861578266494e-05, |
| "loss": 0.0038, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.642020202020202, |
| "grad_norm": 0.013387070037424564, |
| "learning_rate": 6.571798188874515e-05, |
| "loss": 0.0032, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.6452525252525252, |
| "grad_norm": 0.011839921586215496, |
| "learning_rate": 6.584734799482536e-05, |
| "loss": 0.0028, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.6484848484848484, |
| "grad_norm": 0.012730401009321213, |
| "learning_rate": 6.597671410090557e-05, |
| "loss": 0.0033, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6517171717171717, |
| "grad_norm": 0.015643352642655373, |
| "learning_rate": 6.610608020698578e-05, |
| "loss": 0.0031, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.654949494949495, |
| "grad_norm": 0.014435230754315853, |
| "learning_rate": 6.623544631306599e-05, |
| "loss": 0.0026, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.6581818181818182, |
| "grad_norm": 0.011074685491621494, |
| "learning_rate": 6.636481241914619e-05, |
| "loss": 0.0022, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.6614141414141415, |
| "grad_norm": 0.015933366492390633, |
| "learning_rate": 6.649417852522638e-05, |
| "loss": 0.0031, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.6646464646464647, |
| "grad_norm": 0.01202269084751606, |
| "learning_rate": 6.66235446313066e-05, |
| "loss": 0.0024, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.667878787878788, |
| "grad_norm": 0.015371977351605892, |
| "learning_rate": 6.67529107373868e-05, |
| "loss": 0.003, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.6711111111111112, |
| "grad_norm": 0.016634363681077957, |
| "learning_rate": 6.688227684346702e-05, |
| "loss": 0.0033, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.6743434343434345, |
| "grad_norm": 0.01571839489042759, |
| "learning_rate": 6.701164294954723e-05, |
| "loss": 0.0035, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.6775757575757577, |
| "grad_norm": 0.012390735559165478, |
| "learning_rate": 6.714100905562742e-05, |
| "loss": 0.0025, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.680808080808081, |
| "grad_norm": 0.010502530261874199, |
| "learning_rate": 6.727037516170763e-05, |
| "loss": 0.0022, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.684040404040404, |
| "grad_norm": 0.014913235791027546, |
| "learning_rate": 6.739974126778784e-05, |
| "loss": 0.0043, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.6872727272727273, |
| "grad_norm": 0.01774546504020691, |
| "learning_rate": 6.752910737386805e-05, |
| "loss": 0.0051, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.6905050505050505, |
| "grad_norm": 0.010366516187787056, |
| "learning_rate": 6.765847347994826e-05, |
| "loss": 0.0026, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.6937373737373738, |
| "grad_norm": 0.016834240406751633, |
| "learning_rate": 6.778783958602847e-05, |
| "loss": 0.0033, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.696969696969697, |
| "grad_norm": 0.014722582884132862, |
| "learning_rate": 6.791720569210867e-05, |
| "loss": 0.0027, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.696969696969697, |
| "eval_loss": 0.003057735273614526, |
| "eval_runtime": 16.3577, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.70020202020202, |
| "grad_norm": 0.013499128632247448, |
| "learning_rate": 6.804657179818887e-05, |
| "loss": 0.0025, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.7034343434343433, |
| "grad_norm": 0.010805454105138779, |
| "learning_rate": 6.817593790426908e-05, |
| "loss": 0.0019, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.7066666666666666, |
| "grad_norm": 0.015221547335386276, |
| "learning_rate": 6.830530401034929e-05, |
| "loss": 0.0028, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.7098989898989898, |
| "grad_norm": 0.016968918964266777, |
| "learning_rate": 6.84346701164295e-05, |
| "loss": 0.0038, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.713131313131313, |
| "grad_norm": 0.014539297670125961, |
| "learning_rate": 6.856403622250971e-05, |
| "loss": 0.0032, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.7163636363636363, |
| "grad_norm": 0.01730852946639061, |
| "learning_rate": 6.869340232858992e-05, |
| "loss": 0.0048, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.7195959595959596, |
| "grad_norm": 0.012829813174903393, |
| "learning_rate": 6.882276843467012e-05, |
| "loss": 0.0027, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.7228282828282828, |
| "grad_norm": 0.01123909279704094, |
| "learning_rate": 6.895213454075033e-05, |
| "loss": 0.003, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.726060606060606, |
| "grad_norm": 0.009748244658112526, |
| "learning_rate": 6.908150064683054e-05, |
| "loss": 0.0018, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.7292929292929293, |
| "grad_norm": 0.01839064247906208, |
| "learning_rate": 6.921086675291075e-05, |
| "loss": 0.0049, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.7325252525252526, |
| "grad_norm": 0.011202250607311726, |
| "learning_rate": 6.934023285899096e-05, |
| "loss": 0.0027, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.7357575757575758, |
| "grad_norm": 0.009495667181909084, |
| "learning_rate": 6.946959896507115e-05, |
| "loss": 0.0019, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.738989898989899, |
| "grad_norm": 0.012967769987881184, |
| "learning_rate": 6.959896507115135e-05, |
| "loss": 0.0035, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.7422222222222223, |
| "grad_norm": 0.012850048951804638, |
| "learning_rate": 6.972833117723156e-05, |
| "loss": 0.0029, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.7454545454545456, |
| "grad_norm": 0.007810965646058321, |
| "learning_rate": 6.985769728331177e-05, |
| "loss": 0.0017, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7486868686868688, |
| "grad_norm": 0.009875420480966568, |
| "learning_rate": 6.998706338939198e-05, |
| "loss": 0.0022, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.7519191919191919, |
| "grad_norm": 0.012761441990733147, |
| "learning_rate": 7.011642949547219e-05, |
| "loss": 0.0027, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.7551515151515151, |
| "grad_norm": 0.011730210855603218, |
| "learning_rate": 7.02457956015524e-05, |
| "loss": 0.0021, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.7583838383838384, |
| "grad_norm": 0.010599935427308083, |
| "learning_rate": 7.037516170763261e-05, |
| "loss": 0.0024, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.7616161616161616, |
| "grad_norm": 0.014166628941893578, |
| "learning_rate": 7.050452781371281e-05, |
| "loss": 0.0035, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.7648484848484849, |
| "grad_norm": 0.011188055388629436, |
| "learning_rate": 7.063389391979302e-05, |
| "loss": 0.0025, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.768080808080808, |
| "grad_norm": 0.012701560743153095, |
| "learning_rate": 7.076326002587323e-05, |
| "loss": 0.0028, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.7713131313131312, |
| "grad_norm": 0.013478513807058334, |
| "learning_rate": 7.089262613195343e-05, |
| "loss": 0.0028, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.7745454545454544, |
| "grad_norm": 0.014012198895215988, |
| "learning_rate": 7.102199223803364e-05, |
| "loss": 0.0033, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 0.015581433661282063, |
| "learning_rate": 7.115135834411385e-05, |
| "loss": 0.0031, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "eval_loss": 0.0029114685021340847, |
| "eval_runtime": 16.3606, |
| "eval_samples_per_second": 6.112, |
| "eval_steps_per_second": 1.528, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.781010101010101, |
| "grad_norm": 0.010581339709460735, |
| "learning_rate": 7.128072445019404e-05, |
| "loss": 0.0018, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.7842424242424242, |
| "grad_norm": 0.012763739563524723, |
| "learning_rate": 7.141009055627425e-05, |
| "loss": 0.0025, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.7874747474747474, |
| "grad_norm": 0.013727094978094101, |
| "learning_rate": 7.153945666235446e-05, |
| "loss": 0.0029, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.7907070707070707, |
| "grad_norm": 0.010230779647827148, |
| "learning_rate": 7.166882276843467e-05, |
| "loss": 0.0018, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.793939393939394, |
| "grad_norm": 0.013886131346225739, |
| "learning_rate": 7.179818887451488e-05, |
| "loss": 0.0034, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.7971717171717172, |
| "grad_norm": 0.014412900432944298, |
| "learning_rate": 7.19275549805951e-05, |
| "loss": 0.0031, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.8004040404040405, |
| "grad_norm": 0.012488032691180706, |
| "learning_rate": 7.20569210866753e-05, |
| "loss": 0.0022, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.8036363636363637, |
| "grad_norm": 0.013285035267472267, |
| "learning_rate": 7.21862871927555e-05, |
| "loss": 0.0028, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.806868686868687, |
| "grad_norm": 0.01979801058769226, |
| "learning_rate": 7.231565329883571e-05, |
| "loss": 0.0045, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.8101010101010102, |
| "grad_norm": 0.013309924863278866, |
| "learning_rate": 7.244501940491591e-05, |
| "loss": 0.0028, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.8133333333333335, |
| "grad_norm": 0.011176072061061859, |
| "learning_rate": 7.257438551099612e-05, |
| "loss": 0.0019, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.8165656565656567, |
| "grad_norm": 0.01441121008247137, |
| "learning_rate": 7.270375161707633e-05, |
| "loss": 0.004, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.8197979797979797, |
| "grad_norm": 0.014490502886474133, |
| "learning_rate": 7.283311772315654e-05, |
| "loss": 0.0028, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.823030303030303, |
| "grad_norm": 0.010466169565916061, |
| "learning_rate": 7.296248382923674e-05, |
| "loss": 0.0022, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.8262626262626263, |
| "grad_norm": 0.012618757784366608, |
| "learning_rate": 7.309184993531695e-05, |
| "loss": 0.0019, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.8294949494949495, |
| "grad_norm": 0.015231903642416, |
| "learning_rate": 7.322121604139716e-05, |
| "loss": 0.004, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.8327272727272728, |
| "grad_norm": 0.010003426112234592, |
| "learning_rate": 7.335058214747737e-05, |
| "loss": 0.0018, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.835959595959596, |
| "grad_norm": 0.01798427663743496, |
| "learning_rate": 7.347994825355758e-05, |
| "loss": 0.0041, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.839191919191919, |
| "grad_norm": 0.013741639442741871, |
| "learning_rate": 7.360931435963779e-05, |
| "loss": 0.0029, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.8424242424242423, |
| "grad_norm": 0.009864463470876217, |
| "learning_rate": 7.3738680465718e-05, |
| "loss": 0.0017, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.8456565656565656, |
| "grad_norm": 0.010337362065911293, |
| "learning_rate": 7.38680465717982e-05, |
| "loss": 0.0018, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.8488888888888888, |
| "grad_norm": 0.011854260228574276, |
| "learning_rate": 7.399741267787839e-05, |
| "loss": 0.0025, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.852121212121212, |
| "grad_norm": 0.011286146938800812, |
| "learning_rate": 7.41267787839586e-05, |
| "loss": 0.0023, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.8553535353535353, |
| "grad_norm": 0.014115465804934502, |
| "learning_rate": 7.425614489003881e-05, |
| "loss": 0.003, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.8585858585858586, |
| "grad_norm": 0.009482797235250473, |
| "learning_rate": 7.438551099611902e-05, |
| "loss": 0.0015, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8585858585858586, |
| "eval_loss": 0.0029468617867678404, |
| "eval_runtime": 16.3473, |
| "eval_samples_per_second": 6.117, |
| "eval_steps_per_second": 1.529, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8618181818181818, |
| "grad_norm": 0.01927042193710804, |
| "learning_rate": 7.451487710219923e-05, |
| "loss": 0.0034, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.865050505050505, |
| "grad_norm": 0.018555158749222755, |
| "learning_rate": 7.464424320827943e-05, |
| "loss": 0.0051, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.8682828282828283, |
| "grad_norm": 0.012103302404284477, |
| "learning_rate": 7.477360931435964e-05, |
| "loss": 0.0026, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.8715151515151516, |
| "grad_norm": 0.008798914961516857, |
| "learning_rate": 7.490297542043985e-05, |
| "loss": 0.002, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.8747474747474748, |
| "grad_norm": 0.010500255040824413, |
| "learning_rate": 7.503234152652006e-05, |
| "loss": 0.0029, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.877979797979798, |
| "grad_norm": 0.012508660554885864, |
| "learning_rate": 7.516170763260027e-05, |
| "loss": 0.0025, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.8812121212121213, |
| "grad_norm": 0.009473022073507309, |
| "learning_rate": 7.529107373868047e-05, |
| "loss": 0.0022, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.8844444444444446, |
| "grad_norm": 0.019143516197800636, |
| "learning_rate": 7.542043984476068e-05, |
| "loss": 0.0025, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.8876767676767678, |
| "grad_norm": 0.012900716625154018, |
| "learning_rate": 7.554980595084087e-05, |
| "loss": 0.0025, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.8909090909090909, |
| "grad_norm": 0.012412898242473602, |
| "learning_rate": 7.567917205692108e-05, |
| "loss": 0.0018, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.8941414141414141, |
| "grad_norm": 0.017902657389640808, |
| "learning_rate": 7.58085381630013e-05, |
| "loss": 0.0042, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.8973737373737374, |
| "grad_norm": 0.01020246185362339, |
| "learning_rate": 7.59379042690815e-05, |
| "loss": 0.0022, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.9006060606060606, |
| "grad_norm": 0.009255877695977688, |
| "learning_rate": 7.606727037516171e-05, |
| "loss": 0.0021, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.9038383838383839, |
| "grad_norm": 0.01436795573681593, |
| "learning_rate": 7.619663648124192e-05, |
| "loss": 0.0035, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.907070707070707, |
| "grad_norm": 0.011169711127877235, |
| "learning_rate": 7.632600258732212e-05, |
| "loss": 0.0025, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9103030303030302, |
| "grad_norm": 0.011039819568395615, |
| "learning_rate": 7.645536869340233e-05, |
| "loss": 0.002, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.9135353535353534, |
| "grad_norm": 0.01015357207506895, |
| "learning_rate": 7.658473479948254e-05, |
| "loss": 0.0022, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.9167676767676767, |
| "grad_norm": 0.013857224024832249, |
| "learning_rate": 7.671410090556275e-05, |
| "loss": 0.0036, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 0.012443410232663155, |
| "learning_rate": 7.684346701164295e-05, |
| "loss": 0.0028, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.9232323232323232, |
| "grad_norm": 0.009128548204898834, |
| "learning_rate": 7.697283311772316e-05, |
| "loss": 0.0016, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.9264646464646464, |
| "grad_norm": 0.01139936875551939, |
| "learning_rate": 7.710219922380336e-05, |
| "loss": 0.0018, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.9296969696969697, |
| "grad_norm": 0.01219150424003601, |
| "learning_rate": 7.723156532988357e-05, |
| "loss": 0.0022, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.932929292929293, |
| "grad_norm": 0.013011794537305832, |
| "learning_rate": 7.736093143596378e-05, |
| "loss": 0.0032, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.9361616161616162, |
| "grad_norm": 0.014185096137225628, |
| "learning_rate": 7.749029754204399e-05, |
| "loss": 0.0025, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.9393939393939394, |
| "grad_norm": 0.023565856739878654, |
| "learning_rate": 7.76196636481242e-05, |
| "loss": 0.0032, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9393939393939394, |
| "eval_loss": 0.002943198662251234, |
| "eval_runtime": 16.3316, |
| "eval_samples_per_second": 6.123, |
| "eval_steps_per_second": 1.531, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9426262626262627, |
| "grad_norm": 0.018740547820925713, |
| "learning_rate": 7.774902975420441e-05, |
| "loss": 0.0032, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.945858585858586, |
| "grad_norm": 0.01407854538410902, |
| "learning_rate": 7.787839586028462e-05, |
| "loss": 0.0031, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.9490909090909092, |
| "grad_norm": 0.013722199015319347, |
| "learning_rate": 7.800776196636481e-05, |
| "loss": 0.0036, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.9523232323232325, |
| "grad_norm": 0.013010906986892223, |
| "learning_rate": 7.813712807244502e-05, |
| "loss": 0.0033, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.9555555555555557, |
| "grad_norm": 0.010232384316623211, |
| "learning_rate": 7.826649417852523e-05, |
| "loss": 0.0019, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.9587878787878787, |
| "grad_norm": 0.00989801436662674, |
| "learning_rate": 7.839586028460543e-05, |
| "loss": 0.0021, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.962020202020202, |
| "grad_norm": 0.011175636202096939, |
| "learning_rate": 7.852522639068564e-05, |
| "loss": 0.0026, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.9652525252525253, |
| "grad_norm": 0.013437672518193722, |
| "learning_rate": 7.865459249676585e-05, |
| "loss": 0.0037, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.9684848484848485, |
| "grad_norm": 0.011438854038715363, |
| "learning_rate": 7.878395860284605e-05, |
| "loss": 0.002, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.9717171717171718, |
| "grad_norm": 0.009777581319212914, |
| "learning_rate": 7.891332470892626e-05, |
| "loss": 0.0016, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.9749494949494948, |
| "grad_norm": 0.013843595050275326, |
| "learning_rate": 7.904269081500647e-05, |
| "loss": 0.0047, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.978181818181818, |
| "grad_norm": 0.011239755898714066, |
| "learning_rate": 7.917205692108668e-05, |
| "loss": 0.0028, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.9814141414141413, |
| "grad_norm": 0.013993465341627598, |
| "learning_rate": 7.930142302716689e-05, |
| "loss": 0.0025, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.9846464646464645, |
| "grad_norm": 0.009957659058272839, |
| "learning_rate": 7.94307891332471e-05, |
| "loss": 0.003, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.9878787878787878, |
| "grad_norm": 0.012244481593370438, |
| "learning_rate": 7.956015523932731e-05, |
| "loss": 0.0032, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.991111111111111, |
| "grad_norm": 0.011564073152840137, |
| "learning_rate": 7.968952134540751e-05, |
| "loss": 0.0019, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.9943434343434343, |
| "grad_norm": 0.009937223978340626, |
| "learning_rate": 7.981888745148772e-05, |
| "loss": 0.0022, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.9975757575757576, |
| "grad_norm": 0.01421060785651207, |
| "learning_rate": 7.994825355756791e-05, |
| "loss": 0.0028, |
| "step": 618 |
| }, |
| { |
| "epoch": 2.000808080808081, |
| "grad_norm": 0.022042306140065193, |
| "learning_rate": 8.007761966364812e-05, |
| "loss": 0.0038, |
| "step": 619 |
| }, |
| { |
| "epoch": 2.004040404040404, |
| "grad_norm": 0.012076015584170818, |
| "learning_rate": 8.020698576972833e-05, |
| "loss": 0.0019, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.0072727272727273, |
| "grad_norm": 0.010562622919678688, |
| "learning_rate": 8.033635187580855e-05, |
| "loss": 0.0014, |
| "step": 621 |
| }, |
| { |
| "epoch": 2.0105050505050506, |
| "grad_norm": 0.012629817239940166, |
| "learning_rate": 8.046571798188874e-05, |
| "loss": 0.0021, |
| "step": 622 |
| }, |
| { |
| "epoch": 2.013737373737374, |
| "grad_norm": 0.012920092791318893, |
| "learning_rate": 8.059508408796895e-05, |
| "loss": 0.0029, |
| "step": 623 |
| }, |
| { |
| "epoch": 2.016969696969697, |
| "grad_norm": 0.010951900854706764, |
| "learning_rate": 8.072445019404916e-05, |
| "loss": 0.0018, |
| "step": 624 |
| }, |
| { |
| "epoch": 2.0202020202020203, |
| "grad_norm": 0.013417042791843414, |
| "learning_rate": 8.085381630012937e-05, |
| "loss": 0.0018, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.0202020202020203, |
| "eval_loss": 0.003077675588428974, |
| "eval_runtime": 16.3513, |
| "eval_samples_per_second": 6.116, |
| "eval_steps_per_second": 1.529, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.0234343434343436, |
| "grad_norm": 0.015751207247376442, |
| "learning_rate": 8.098318240620958e-05, |
| "loss": 0.0028, |
| "step": 626 |
| }, |
| { |
| "epoch": 2.026666666666667, |
| "grad_norm": 0.010942324064671993, |
| "learning_rate": 8.111254851228979e-05, |
| "loss": 0.0015, |
| "step": 627 |
| }, |
| { |
| "epoch": 2.02989898989899, |
| "grad_norm": 0.014838308095932007, |
| "learning_rate": 8.124191461836999e-05, |
| "loss": 0.0024, |
| "step": 628 |
| }, |
| { |
| "epoch": 2.0331313131313133, |
| "grad_norm": 0.015047342516481876, |
| "learning_rate": 8.13712807244502e-05, |
| "loss": 0.0021, |
| "step": 629 |
| }, |
| { |
| "epoch": 2.036363636363636, |
| "grad_norm": 0.013887758366763592, |
| "learning_rate": 8.15006468305304e-05, |
| "loss": 0.0016, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.0395959595959594, |
| "grad_norm": 0.010356971994042397, |
| "learning_rate": 8.163001293661061e-05, |
| "loss": 0.002, |
| "step": 631 |
| }, |
| { |
| "epoch": 2.0428282828282827, |
| "grad_norm": 0.015105321072041988, |
| "learning_rate": 8.175937904269082e-05, |
| "loss": 0.0024, |
| "step": 632 |
| }, |
| { |
| "epoch": 2.046060606060606, |
| "grad_norm": 0.01421608217060566, |
| "learning_rate": 8.188874514877103e-05, |
| "loss": 0.0022, |
| "step": 633 |
| }, |
| { |
| "epoch": 2.049292929292929, |
| "grad_norm": 0.011619077064096928, |
| "learning_rate": 8.201811125485124e-05, |
| "loss": 0.0018, |
| "step": 634 |
| }, |
| { |
| "epoch": 2.0525252525252524, |
| "grad_norm": 0.012455479241907597, |
| "learning_rate": 8.214747736093143e-05, |
| "loss": 0.0016, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.0557575757575757, |
| "grad_norm": 0.011255488730967045, |
| "learning_rate": 8.227684346701164e-05, |
| "loss": 0.0013, |
| "step": 636 |
| }, |
| { |
| "epoch": 2.058989898989899, |
| "grad_norm": 0.012253155931830406, |
| "learning_rate": 8.240620957309186e-05, |
| "loss": 0.0017, |
| "step": 637 |
| }, |
| { |
| "epoch": 2.062222222222222, |
| "grad_norm": 0.015251807868480682, |
| "learning_rate": 8.253557567917207e-05, |
| "loss": 0.0035, |
| "step": 638 |
| }, |
| { |
| "epoch": 2.0654545454545454, |
| "grad_norm": 0.01149311289191246, |
| "learning_rate": 8.266494178525228e-05, |
| "loss": 0.0023, |
| "step": 639 |
| }, |
| { |
| "epoch": 2.0686868686868687, |
| "grad_norm": 0.011605948209762573, |
| "learning_rate": 8.279430789133247e-05, |
| "loss": 0.0029, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.071919191919192, |
| "grad_norm": 0.01147545874118805, |
| "learning_rate": 8.292367399741268e-05, |
| "loss": 0.0018, |
| "step": 641 |
| }, |
| { |
| "epoch": 2.075151515151515, |
| "grad_norm": 0.00932023860514164, |
| "learning_rate": 8.305304010349288e-05, |
| "loss": 0.0013, |
| "step": 642 |
| }, |
| { |
| "epoch": 2.0783838383838384, |
| "grad_norm": 0.009235905483365059, |
| "learning_rate": 8.318240620957309e-05, |
| "loss": 0.0012, |
| "step": 643 |
| }, |
| { |
| "epoch": 2.0816161616161617, |
| "grad_norm": 0.009316741488873959, |
| "learning_rate": 8.33117723156533e-05, |
| "loss": 0.0013, |
| "step": 644 |
| }, |
| { |
| "epoch": 2.084848484848485, |
| "grad_norm": 0.011550882831215858, |
| "learning_rate": 8.344113842173351e-05, |
| "loss": 0.0021, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.088080808080808, |
| "grad_norm": 0.011506769806146622, |
| "learning_rate": 8.357050452781372e-05, |
| "loss": 0.0019, |
| "step": 646 |
| }, |
| { |
| "epoch": 2.0913131313131315, |
| "grad_norm": 0.013658232986927032, |
| "learning_rate": 8.369987063389393e-05, |
| "loss": 0.0025, |
| "step": 647 |
| }, |
| { |
| "epoch": 2.0945454545454547, |
| "grad_norm": 0.012891994789242744, |
| "learning_rate": 8.382923673997413e-05, |
| "loss": 0.002, |
| "step": 648 |
| }, |
| { |
| "epoch": 2.097777777777778, |
| "grad_norm": 0.01157211884856224, |
| "learning_rate": 8.395860284605434e-05, |
| "loss": 0.0019, |
| "step": 649 |
| }, |
| { |
| "epoch": 2.101010101010101, |
| "grad_norm": 0.014404572546482086, |
| "learning_rate": 8.408796895213455e-05, |
| "loss": 0.0018, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.101010101010101, |
| "eval_loss": 0.0025548762641847134, |
| "eval_runtime": 16.344, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.53, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.1042424242424245, |
| "grad_norm": 0.014213437214493752, |
| "learning_rate": 8.421733505821476e-05, |
| "loss": 0.002, |
| "step": 651 |
| }, |
| { |
| "epoch": 2.1074747474747473, |
| "grad_norm": 0.013342815451323986, |
| "learning_rate": 8.434670116429496e-05, |
| "loss": 0.0019, |
| "step": 652 |
| }, |
| { |
| "epoch": 2.1107070707070705, |
| "grad_norm": 0.009655567817389965, |
| "learning_rate": 8.447606727037517e-05, |
| "loss": 0.0013, |
| "step": 653 |
| }, |
| { |
| "epoch": 2.113939393939394, |
| "grad_norm": 0.012099673971533775, |
| "learning_rate": 8.460543337645536e-05, |
| "loss": 0.002, |
| "step": 654 |
| }, |
| { |
| "epoch": 2.117171717171717, |
| "grad_norm": 0.009277356788516045, |
| "learning_rate": 8.473479948253557e-05, |
| "loss": 0.002, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.1204040404040403, |
| "grad_norm": 0.010116140358150005, |
| "learning_rate": 8.486416558861578e-05, |
| "loss": 0.002, |
| "step": 656 |
| }, |
| { |
| "epoch": 2.1236363636363635, |
| "grad_norm": 0.01329744327813387, |
| "learning_rate": 8.499353169469599e-05, |
| "loss": 0.0025, |
| "step": 657 |
| }, |
| { |
| "epoch": 2.126868686868687, |
| "grad_norm": 0.011432441882789135, |
| "learning_rate": 8.51228978007762e-05, |
| "loss": 0.0021, |
| "step": 658 |
| }, |
| { |
| "epoch": 2.13010101010101, |
| "grad_norm": 0.010472339577972889, |
| "learning_rate": 8.525226390685641e-05, |
| "loss": 0.0022, |
| "step": 659 |
| }, |
| { |
| "epoch": 2.1333333333333333, |
| "grad_norm": 0.00992510374635458, |
| "learning_rate": 8.538163001293662e-05, |
| "loss": 0.0019, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.1365656565656566, |
| "grad_norm": 0.008269916288554668, |
| "learning_rate": 8.551099611901682e-05, |
| "loss": 0.0016, |
| "step": 661 |
| }, |
| { |
| "epoch": 2.13979797979798, |
| "grad_norm": 0.014426509849727154, |
| "learning_rate": 8.564036222509703e-05, |
| "loss": 0.0024, |
| "step": 662 |
| }, |
| { |
| "epoch": 2.143030303030303, |
| "grad_norm": 0.011803694069385529, |
| "learning_rate": 8.576972833117724e-05, |
| "loss": 0.0019, |
| "step": 663 |
| }, |
| { |
| "epoch": 2.1462626262626263, |
| "grad_norm": 0.01087699830532074, |
| "learning_rate": 8.589909443725744e-05, |
| "loss": 0.0018, |
| "step": 664 |
| }, |
| { |
| "epoch": 2.1494949494949496, |
| "grad_norm": 0.00785099621862173, |
| "learning_rate": 8.602846054333765e-05, |
| "loss": 0.0013, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.152727272727273, |
| "grad_norm": 0.013654896058142185, |
| "learning_rate": 8.615782664941786e-05, |
| "loss": 0.0019, |
| "step": 666 |
| }, |
| { |
| "epoch": 2.155959595959596, |
| "grad_norm": 0.010862999595701694, |
| "learning_rate": 8.628719275549805e-05, |
| "loss": 0.0012, |
| "step": 667 |
| }, |
| { |
| "epoch": 2.1591919191919193, |
| "grad_norm": 0.01307867094874382, |
| "learning_rate": 8.641655886157827e-05, |
| "loss": 0.0024, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.1624242424242426, |
| "grad_norm": 0.01710178703069687, |
| "learning_rate": 8.654592496765848e-05, |
| "loss": 0.0018, |
| "step": 669 |
| }, |
| { |
| "epoch": 2.165656565656566, |
| "grad_norm": 0.011032214388251305, |
| "learning_rate": 8.667529107373869e-05, |
| "loss": 0.0017, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.168888888888889, |
| "grad_norm": 0.01086389645934105, |
| "learning_rate": 8.68046571798189e-05, |
| "loss": 0.0013, |
| "step": 671 |
| }, |
| { |
| "epoch": 2.172121212121212, |
| "grad_norm": 0.011431907303631306, |
| "learning_rate": 8.69340232858991e-05, |
| "loss": 0.002, |
| "step": 672 |
| }, |
| { |
| "epoch": 2.175353535353535, |
| "grad_norm": 0.013286993838846684, |
| "learning_rate": 8.706338939197932e-05, |
| "loss": 0.0027, |
| "step": 673 |
| }, |
| { |
| "epoch": 2.1785858585858584, |
| "grad_norm": 0.011653001420199871, |
| "learning_rate": 8.719275549805951e-05, |
| "loss": 0.0023, |
| "step": 674 |
| }, |
| { |
| "epoch": 2.1818181818181817, |
| "grad_norm": 0.011231204494833946, |
| "learning_rate": 8.732212160413972e-05, |
| "loss": 0.0017, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.1818181818181817, |
| "eval_loss": 0.0027512190863490105, |
| "eval_runtime": 16.3587, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.185050505050505, |
| "grad_norm": 0.010902061127126217, |
| "learning_rate": 8.745148771021992e-05, |
| "loss": 0.0017, |
| "step": 676 |
| }, |
| { |
| "epoch": 2.188282828282828, |
| "grad_norm": 0.012578235007822514, |
| "learning_rate": 8.758085381630013e-05, |
| "loss": 0.0017, |
| "step": 677 |
| }, |
| { |
| "epoch": 2.1915151515151514, |
| "grad_norm": 0.014864742755889893, |
| "learning_rate": 8.771021992238034e-05, |
| "loss": 0.0021, |
| "step": 678 |
| }, |
| { |
| "epoch": 2.1947474747474747, |
| "grad_norm": 0.01891174167394638, |
| "learning_rate": 8.783958602846055e-05, |
| "loss": 0.0049, |
| "step": 679 |
| }, |
| { |
| "epoch": 2.197979797979798, |
| "grad_norm": 0.009410976432263851, |
| "learning_rate": 8.796895213454075e-05, |
| "loss": 0.002, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.201212121212121, |
| "grad_norm": 0.011760878376662731, |
| "learning_rate": 8.809831824062096e-05, |
| "loss": 0.0022, |
| "step": 681 |
| }, |
| { |
| "epoch": 2.2044444444444444, |
| "grad_norm": 0.012635684572160244, |
| "learning_rate": 8.822768434670117e-05, |
| "loss": 0.0018, |
| "step": 682 |
| }, |
| { |
| "epoch": 2.2076767676767677, |
| "grad_norm": 0.010938025079667568, |
| "learning_rate": 8.835705045278138e-05, |
| "loss": 0.0018, |
| "step": 683 |
| }, |
| { |
| "epoch": 2.210909090909091, |
| "grad_norm": 0.013083130121231079, |
| "learning_rate": 8.848641655886159e-05, |
| "loss": 0.0023, |
| "step": 684 |
| }, |
| { |
| "epoch": 2.214141414141414, |
| "grad_norm": 0.009881705977022648, |
| "learning_rate": 8.86157826649418e-05, |
| "loss": 0.0015, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.2173737373737374, |
| "grad_norm": 0.01099600363522768, |
| "learning_rate": 8.8745148771022e-05, |
| "loss": 0.0016, |
| "step": 686 |
| }, |
| { |
| "epoch": 2.2206060606060607, |
| "grad_norm": 0.012169835157692432, |
| "learning_rate": 8.88745148771022e-05, |
| "loss": 0.002, |
| "step": 687 |
| }, |
| { |
| "epoch": 2.223838383838384, |
| "grad_norm": 0.007999802008271217, |
| "learning_rate": 8.90038809831824e-05, |
| "loss": 0.0016, |
| "step": 688 |
| }, |
| { |
| "epoch": 2.227070707070707, |
| "grad_norm": 0.011233525350689888, |
| "learning_rate": 8.913324708926261e-05, |
| "loss": 0.0013, |
| "step": 689 |
| }, |
| { |
| "epoch": 2.2303030303030305, |
| "grad_norm": 0.008273192681372166, |
| "learning_rate": 8.926261319534282e-05, |
| "loss": 0.001, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.2335353535353537, |
| "grad_norm": 0.012826783582568169, |
| "learning_rate": 8.939197930142303e-05, |
| "loss": 0.0029, |
| "step": 691 |
| }, |
| { |
| "epoch": 2.236767676767677, |
| "grad_norm": 0.016064029186964035, |
| "learning_rate": 8.952134540750324e-05, |
| "loss": 0.0029, |
| "step": 692 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 0.010056176222860813, |
| "learning_rate": 8.965071151358344e-05, |
| "loss": 0.0022, |
| "step": 693 |
| }, |
| { |
| "epoch": 2.2432323232323235, |
| "grad_norm": 0.013265883550047874, |
| "learning_rate": 8.978007761966365e-05, |
| "loss": 0.0025, |
| "step": 694 |
| }, |
| { |
| "epoch": 2.2464646464646463, |
| "grad_norm": 0.011746291071176529, |
| "learning_rate": 8.990944372574386e-05, |
| "loss": 0.0018, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.2496969696969695, |
| "grad_norm": 0.012033309787511826, |
| "learning_rate": 9.003880983182407e-05, |
| "loss": 0.0016, |
| "step": 696 |
| }, |
| { |
| "epoch": 2.252929292929293, |
| "grad_norm": 0.016966667026281357, |
| "learning_rate": 9.016817593790428e-05, |
| "loss": 0.0021, |
| "step": 697 |
| }, |
| { |
| "epoch": 2.256161616161616, |
| "grad_norm": 0.012350259348750114, |
| "learning_rate": 9.029754204398448e-05, |
| "loss": 0.0027, |
| "step": 698 |
| }, |
| { |
| "epoch": 2.2593939393939393, |
| "grad_norm": 0.013753430917859077, |
| "learning_rate": 9.042690815006469e-05, |
| "loss": 0.0015, |
| "step": 699 |
| }, |
| { |
| "epoch": 2.2626262626262625, |
| "grad_norm": 0.013047127053141594, |
| "learning_rate": 9.055627425614489e-05, |
| "loss": 0.0016, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.2626262626262625, |
| "eval_loss": 0.002442617667838931, |
| "eval_runtime": 16.3476, |
| "eval_samples_per_second": 6.117, |
| "eval_steps_per_second": 1.529, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.265858585858586, |
| "grad_norm": 0.011234000325202942, |
| "learning_rate": 9.06856403622251e-05, |
| "loss": 0.0018, |
| "step": 701 |
| }, |
| { |
| "epoch": 2.269090909090909, |
| "grad_norm": 0.011713809333741665, |
| "learning_rate": 9.08150064683053e-05, |
| "loss": 0.0017, |
| "step": 702 |
| }, |
| { |
| "epoch": 2.2723232323232323, |
| "grad_norm": 0.009527718648314476, |
| "learning_rate": 9.094437257438552e-05, |
| "loss": 0.0019, |
| "step": 703 |
| }, |
| { |
| "epoch": 2.2755555555555556, |
| "grad_norm": 0.011378354392945766, |
| "learning_rate": 9.107373868046573e-05, |
| "loss": 0.0019, |
| "step": 704 |
| }, |
| { |
| "epoch": 2.278787878787879, |
| "grad_norm": 0.011011448688805103, |
| "learning_rate": 9.120310478654594e-05, |
| "loss": 0.0018, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.282020202020202, |
| "grad_norm": 0.01137737650424242, |
| "learning_rate": 9.133247089262613e-05, |
| "loss": 0.0018, |
| "step": 706 |
| }, |
| { |
| "epoch": 2.2852525252525253, |
| "grad_norm": 0.012261508032679558, |
| "learning_rate": 9.146183699870634e-05, |
| "loss": 0.0024, |
| "step": 707 |
| }, |
| { |
| "epoch": 2.2884848484848486, |
| "grad_norm": 0.00794307328760624, |
| "learning_rate": 9.159120310478655e-05, |
| "loss": 0.0013, |
| "step": 708 |
| }, |
| { |
| "epoch": 2.291717171717172, |
| "grad_norm": 0.008616283535957336, |
| "learning_rate": 9.172056921086676e-05, |
| "loss": 0.0014, |
| "step": 709 |
| }, |
| { |
| "epoch": 2.294949494949495, |
| "grad_norm": 0.012844040058553219, |
| "learning_rate": 9.184993531694696e-05, |
| "loss": 0.002, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.2981818181818183, |
| "grad_norm": 0.01080672349780798, |
| "learning_rate": 9.197930142302717e-05, |
| "loss": 0.0017, |
| "step": 711 |
| }, |
| { |
| "epoch": 2.3014141414141416, |
| "grad_norm": 0.008765915408730507, |
| "learning_rate": 9.210866752910737e-05, |
| "loss": 0.0014, |
| "step": 712 |
| }, |
| { |
| "epoch": 2.304646464646465, |
| "grad_norm": 0.00891137681901455, |
| "learning_rate": 9.223803363518758e-05, |
| "loss": 0.0014, |
| "step": 713 |
| }, |
| { |
| "epoch": 2.3078787878787876, |
| "grad_norm": 0.014199002645909786, |
| "learning_rate": 9.236739974126779e-05, |
| "loss": 0.0014, |
| "step": 714 |
| }, |
| { |
| "epoch": 2.311111111111111, |
| "grad_norm": 0.010338046588003635, |
| "learning_rate": 9.2496765847348e-05, |
| "loss": 0.0012, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.314343434343434, |
| "grad_norm": 0.017939046025276184, |
| "learning_rate": 9.262613195342821e-05, |
| "loss": 0.0026, |
| "step": 716 |
| }, |
| { |
| "epoch": 2.3175757575757574, |
| "grad_norm": 0.016568968072533607, |
| "learning_rate": 9.275549805950842e-05, |
| "loss": 0.0025, |
| "step": 717 |
| }, |
| { |
| "epoch": 2.3208080808080807, |
| "grad_norm": 0.015017041936516762, |
| "learning_rate": 9.288486416558863e-05, |
| "loss": 0.0017, |
| "step": 718 |
| }, |
| { |
| "epoch": 2.324040404040404, |
| "grad_norm": 0.015940289944410324, |
| "learning_rate": 9.301423027166883e-05, |
| "loss": 0.0022, |
| "step": 719 |
| }, |
| { |
| "epoch": 2.327272727272727, |
| "grad_norm": 0.010147231630980968, |
| "learning_rate": 9.314359637774904e-05, |
| "loss": 0.0016, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.3305050505050504, |
| "grad_norm": 0.010845660232007504, |
| "learning_rate": 9.327296248382925e-05, |
| "loss": 0.002, |
| "step": 721 |
| }, |
| { |
| "epoch": 2.3337373737373737, |
| "grad_norm": 0.012357765808701515, |
| "learning_rate": 9.340232858990944e-05, |
| "loss": 0.0017, |
| "step": 722 |
| }, |
| { |
| "epoch": 2.336969696969697, |
| "grad_norm": 0.011769283562898636, |
| "learning_rate": 9.353169469598965e-05, |
| "loss": 0.0019, |
| "step": 723 |
| }, |
| { |
| "epoch": 2.34020202020202, |
| "grad_norm": 0.0064726341515779495, |
| "learning_rate": 9.366106080206986e-05, |
| "loss": 0.0011, |
| "step": 724 |
| }, |
| { |
| "epoch": 2.3434343434343434, |
| "grad_norm": 0.009450256824493408, |
| "learning_rate": 9.379042690815006e-05, |
| "loss": 0.0018, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.3434343434343434, |
| "eval_loss": 0.0025604518596082926, |
| "eval_runtime": 16.3442, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.53, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.3466666666666667, |
| "grad_norm": 0.012289798818528652, |
| "learning_rate": 9.391979301423027e-05, |
| "loss": 0.0027, |
| "step": 726 |
| }, |
| { |
| "epoch": 2.34989898989899, |
| "grad_norm": 0.009702212177217007, |
| "learning_rate": 9.404915912031048e-05, |
| "loss": 0.0015, |
| "step": 727 |
| }, |
| { |
| "epoch": 2.353131313131313, |
| "grad_norm": 0.009124784730374813, |
| "learning_rate": 9.417852522639069e-05, |
| "loss": 0.0014, |
| "step": 728 |
| }, |
| { |
| "epoch": 2.3563636363636364, |
| "grad_norm": 0.00956012587994337, |
| "learning_rate": 9.43078913324709e-05, |
| "loss": 0.0018, |
| "step": 729 |
| }, |
| { |
| "epoch": 2.3595959595959597, |
| "grad_norm": 0.008290289901196957, |
| "learning_rate": 9.443725743855111e-05, |
| "loss": 0.0014, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.362828282828283, |
| "grad_norm": 0.016345232725143433, |
| "learning_rate": 9.456662354463132e-05, |
| "loss": 0.0024, |
| "step": 731 |
| }, |
| { |
| "epoch": 2.366060606060606, |
| "grad_norm": 0.013285723514854908, |
| "learning_rate": 9.469598965071152e-05, |
| "loss": 0.0022, |
| "step": 732 |
| }, |
| { |
| "epoch": 2.3692929292929295, |
| "grad_norm": 0.015846766531467438, |
| "learning_rate": 9.482535575679173e-05, |
| "loss": 0.0026, |
| "step": 733 |
| }, |
| { |
| "epoch": 2.3725252525252527, |
| "grad_norm": 0.010950244031846523, |
| "learning_rate": 9.495472186287193e-05, |
| "loss": 0.0017, |
| "step": 734 |
| }, |
| { |
| "epoch": 2.375757575757576, |
| "grad_norm": 0.01026253029704094, |
| "learning_rate": 9.508408796895214e-05, |
| "loss": 0.0019, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.378989898989899, |
| "grad_norm": 0.009852707386016846, |
| "learning_rate": 9.521345407503235e-05, |
| "loss": 0.0016, |
| "step": 736 |
| }, |
| { |
| "epoch": 2.3822222222222225, |
| "grad_norm": 0.008770400658249855, |
| "learning_rate": 9.534282018111256e-05, |
| "loss": 0.0015, |
| "step": 737 |
| }, |
| { |
| "epoch": 2.3854545454545453, |
| "grad_norm": 0.008731266483664513, |
| "learning_rate": 9.547218628719275e-05, |
| "loss": 0.0015, |
| "step": 738 |
| }, |
| { |
| "epoch": 2.3886868686868685, |
| "grad_norm": 0.012001359835267067, |
| "learning_rate": 9.560155239327296e-05, |
| "loss": 0.0015, |
| "step": 739 |
| }, |
| { |
| "epoch": 2.391919191919192, |
| "grad_norm": 0.014997370541095734, |
| "learning_rate": 9.573091849935317e-05, |
| "loss": 0.003, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.395151515151515, |
| "grad_norm": 0.012686568312346935, |
| "learning_rate": 9.586028460543338e-05, |
| "loss": 0.0014, |
| "step": 741 |
| }, |
| { |
| "epoch": 2.3983838383838383, |
| "grad_norm": 0.012569274753332138, |
| "learning_rate": 9.59896507115136e-05, |
| "loss": 0.0022, |
| "step": 742 |
| }, |
| { |
| "epoch": 2.4016161616161615, |
| "grad_norm": 0.009259608574211597, |
| "learning_rate": 9.61190168175938e-05, |
| "loss": 0.0013, |
| "step": 743 |
| }, |
| { |
| "epoch": 2.404848484848485, |
| "grad_norm": 0.012454778887331486, |
| "learning_rate": 9.6248382923674e-05, |
| "loss": 0.0024, |
| "step": 744 |
| }, |
| { |
| "epoch": 2.408080808080808, |
| "grad_norm": 0.009780270047485828, |
| "learning_rate": 9.63777490297542e-05, |
| "loss": 0.0017, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.4113131313131313, |
| "grad_norm": 0.010736271739006042, |
| "learning_rate": 9.650711513583441e-05, |
| "loss": 0.0016, |
| "step": 746 |
| }, |
| { |
| "epoch": 2.4145454545454546, |
| "grad_norm": 0.01118789054453373, |
| "learning_rate": 9.663648124191462e-05, |
| "loss": 0.0024, |
| "step": 747 |
| }, |
| { |
| "epoch": 2.417777777777778, |
| "grad_norm": 0.00890735350549221, |
| "learning_rate": 9.676584734799483e-05, |
| "loss": 0.0017, |
| "step": 748 |
| }, |
| { |
| "epoch": 2.421010101010101, |
| "grad_norm": 0.010420437902212143, |
| "learning_rate": 9.689521345407504e-05, |
| "loss": 0.0018, |
| "step": 749 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "grad_norm": 0.012901760637760162, |
| "learning_rate": 9.702457956015525e-05, |
| "loss": 0.0025, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "eval_loss": 0.0022431614343076944, |
| "eval_runtime": 16.3284, |
| "eval_samples_per_second": 6.124, |
| "eval_steps_per_second": 1.531, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.4274747474747476, |
| "grad_norm": 0.011588324792683125, |
| "learning_rate": 9.715394566623545e-05, |
| "loss": 0.0017, |
| "step": 751 |
| }, |
| { |
| "epoch": 2.430707070707071, |
| "grad_norm": 0.014227008447051048, |
| "learning_rate": 9.728331177231566e-05, |
| "loss": 0.0021, |
| "step": 752 |
| }, |
| { |
| "epoch": 2.433939393939394, |
| "grad_norm": 0.010249685496091843, |
| "learning_rate": 9.741267787839587e-05, |
| "loss": 0.0015, |
| "step": 753 |
| }, |
| { |
| "epoch": 2.4371717171717173, |
| "grad_norm": 0.009854748845100403, |
| "learning_rate": 9.754204398447608e-05, |
| "loss": 0.0017, |
| "step": 754 |
| }, |
| { |
| "epoch": 2.4404040404040406, |
| "grad_norm": 0.011436713859438896, |
| "learning_rate": 9.767141009055629e-05, |
| "loss": 0.0018, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.443636363636364, |
| "grad_norm": 0.010603224858641624, |
| "learning_rate": 9.780077619663648e-05, |
| "loss": 0.0016, |
| "step": 756 |
| }, |
| { |
| "epoch": 2.4468686868686866, |
| "grad_norm": 0.012437733821570873, |
| "learning_rate": 9.793014230271668e-05, |
| "loss": 0.0022, |
| "step": 757 |
| }, |
| { |
| "epoch": 2.45010101010101, |
| "grad_norm": 0.008577422238886356, |
| "learning_rate": 9.805950840879689e-05, |
| "loss": 0.0014, |
| "step": 758 |
| }, |
| { |
| "epoch": 2.453333333333333, |
| "grad_norm": 0.012434886768460274, |
| "learning_rate": 9.81888745148771e-05, |
| "loss": 0.002, |
| "step": 759 |
| }, |
| { |
| "epoch": 2.4565656565656564, |
| "grad_norm": 0.015642991289496422, |
| "learning_rate": 9.831824062095731e-05, |
| "loss": 0.0033, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.4597979797979797, |
| "grad_norm": 0.01265759114176035, |
| "learning_rate": 9.844760672703752e-05, |
| "loss": 0.0017, |
| "step": 761 |
| }, |
| { |
| "epoch": 2.463030303030303, |
| "grad_norm": 0.013272153213620186, |
| "learning_rate": 9.857697283311773e-05, |
| "loss": 0.0025, |
| "step": 762 |
| }, |
| { |
| "epoch": 2.466262626262626, |
| "grad_norm": 0.013514629565179348, |
| "learning_rate": 9.870633893919794e-05, |
| "loss": 0.0017, |
| "step": 763 |
| }, |
| { |
| "epoch": 2.4694949494949494, |
| "grad_norm": 0.011923782527446747, |
| "learning_rate": 9.883570504527814e-05, |
| "loss": 0.0019, |
| "step": 764 |
| }, |
| { |
| "epoch": 2.4727272727272727, |
| "grad_norm": 0.01172129437327385, |
| "learning_rate": 9.896507115135835e-05, |
| "loss": 0.0026, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.475959595959596, |
| "grad_norm": 0.010015101172029972, |
| "learning_rate": 9.909443725743856e-05, |
| "loss": 0.0016, |
| "step": 766 |
| }, |
| { |
| "epoch": 2.479191919191919, |
| "grad_norm": 0.006037898361682892, |
| "learning_rate": 9.922380336351877e-05, |
| "loss": 0.0008, |
| "step": 767 |
| }, |
| { |
| "epoch": 2.4824242424242424, |
| "grad_norm": 0.02578366920351982, |
| "learning_rate": 9.935316946959897e-05, |
| "loss": 0.0021, |
| "step": 768 |
| }, |
| { |
| "epoch": 2.4856565656565657, |
| "grad_norm": 0.01217565406113863, |
| "learning_rate": 9.948253557567918e-05, |
| "loss": 0.0026, |
| "step": 769 |
| }, |
| { |
| "epoch": 2.488888888888889, |
| "grad_norm": 0.011332768015563488, |
| "learning_rate": 9.961190168175937e-05, |
| "loss": 0.0015, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.492121212121212, |
| "grad_norm": 0.013233241625130177, |
| "learning_rate": 9.974126778783958e-05, |
| "loss": 0.0019, |
| "step": 771 |
| }, |
| { |
| "epoch": 2.4953535353535354, |
| "grad_norm": 0.012045849114656448, |
| "learning_rate": 9.98706338939198e-05, |
| "loss": 0.0017, |
| "step": 772 |
| }, |
| { |
| "epoch": 2.4985858585858587, |
| "grad_norm": 0.010305625386536121, |
| "learning_rate": 0.0001, |
| "loss": 0.0018, |
| "step": 773 |
| }, |
| { |
| "epoch": 2.501818181818182, |
| "grad_norm": 0.011099765077233315, |
| "learning_rate": 9.999999489471233e-05, |
| "loss": 0.0019, |
| "step": 774 |
| }, |
| { |
| "epoch": 2.505050505050505, |
| "grad_norm": 0.014014177024364471, |
| "learning_rate": 9.99999795788503e-05, |
| "loss": 0.0029, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.505050505050505, |
| "eval_loss": 0.002437673741951585, |
| "eval_runtime": 16.3461, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.529, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.5082828282828284, |
| "grad_norm": 0.013955999165773392, |
| "learning_rate": 9.99999540524171e-05, |
| "loss": 0.0021, |
| "step": 776 |
| }, |
| { |
| "epoch": 2.5115151515151517, |
| "grad_norm": 0.012839140370488167, |
| "learning_rate": 9.999991831541789e-05, |
| "loss": 0.0014, |
| "step": 777 |
| }, |
| { |
| "epoch": 2.514747474747475, |
| "grad_norm": 0.008688856847584248, |
| "learning_rate": 9.999987236786e-05, |
| "loss": 0.0012, |
| "step": 778 |
| }, |
| { |
| "epoch": 2.517979797979798, |
| "grad_norm": 0.00880356039851904, |
| "learning_rate": 9.999981620975281e-05, |
| "loss": 0.002, |
| "step": 779 |
| }, |
| { |
| "epoch": 2.5212121212121215, |
| "grad_norm": 0.010685171000659466, |
| "learning_rate": 9.999974984110779e-05, |
| "loss": 0.0014, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.5244444444444447, |
| "grad_norm": 0.01713552698493004, |
| "learning_rate": 9.999967326193847e-05, |
| "loss": 0.0026, |
| "step": 781 |
| }, |
| { |
| "epoch": 2.5276767676767675, |
| "grad_norm": 0.010320529341697693, |
| "learning_rate": 9.999958647226049e-05, |
| "loss": 0.0016, |
| "step": 782 |
| }, |
| { |
| "epoch": 2.5309090909090908, |
| "grad_norm": 0.011808270588517189, |
| "learning_rate": 9.999948947209162e-05, |
| "loss": 0.0021, |
| "step": 783 |
| }, |
| { |
| "epoch": 2.534141414141414, |
| "grad_norm": 0.014705700799822807, |
| "learning_rate": 9.999938226145161e-05, |
| "loss": 0.0032, |
| "step": 784 |
| }, |
| { |
| "epoch": 2.5373737373737373, |
| "grad_norm": 0.016158709302544594, |
| "learning_rate": 9.999926484036237e-05, |
| "loss": 0.0021, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.5406060606060605, |
| "grad_norm": 0.011575833894312382, |
| "learning_rate": 9.999913720884791e-05, |
| "loss": 0.0021, |
| "step": 786 |
| }, |
| { |
| "epoch": 2.543838383838384, |
| "grad_norm": 0.012563912197947502, |
| "learning_rate": 9.999899936693426e-05, |
| "loss": 0.0023, |
| "step": 787 |
| }, |
| { |
| "epoch": 2.547070707070707, |
| "grad_norm": 0.015457016415894032, |
| "learning_rate": 9.99988513146496e-05, |
| "loss": 0.0031, |
| "step": 788 |
| }, |
| { |
| "epoch": 2.5503030303030303, |
| "grad_norm": 0.013360939919948578, |
| "learning_rate": 9.999869305202412e-05, |
| "loss": 0.002, |
| "step": 789 |
| }, |
| { |
| "epoch": 2.5535353535353535, |
| "grad_norm": 0.01096352655440569, |
| "learning_rate": 9.999852457909018e-05, |
| "loss": 0.0022, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.556767676767677, |
| "grad_norm": 0.016650492325425148, |
| "learning_rate": 9.999834589588217e-05, |
| "loss": 0.0023, |
| "step": 791 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 0.009199659340083599, |
| "learning_rate": 9.999815700243656e-05, |
| "loss": 0.001, |
| "step": 792 |
| }, |
| { |
| "epoch": 2.5632323232323233, |
| "grad_norm": 0.011376001872122288, |
| "learning_rate": 9.999795789879196e-05, |
| "loss": 0.0022, |
| "step": 793 |
| }, |
| { |
| "epoch": 2.5664646464646466, |
| "grad_norm": 0.014007000252604485, |
| "learning_rate": 9.9997748584989e-05, |
| "loss": 0.0022, |
| "step": 794 |
| }, |
| { |
| "epoch": 2.56969696969697, |
| "grad_norm": 0.013973750174045563, |
| "learning_rate": 9.999752906107042e-05, |
| "loss": 0.0018, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.572929292929293, |
| "grad_norm": 0.011831262148916721, |
| "learning_rate": 9.999729932708109e-05, |
| "loss": 0.002, |
| "step": 796 |
| }, |
| { |
| "epoch": 2.5761616161616163, |
| "grad_norm": 0.019147152081131935, |
| "learning_rate": 9.999705938306789e-05, |
| "loss": 0.0026, |
| "step": 797 |
| }, |
| { |
| "epoch": 2.579393939393939, |
| "grad_norm": 0.012269653379917145, |
| "learning_rate": 9.999680922907982e-05, |
| "loss": 0.0018, |
| "step": 798 |
| }, |
| { |
| "epoch": 2.5826262626262624, |
| "grad_norm": 0.018080374225974083, |
| "learning_rate": 9.999654886516798e-05, |
| "loss": 0.0047, |
| "step": 799 |
| }, |
| { |
| "epoch": 2.5858585858585856, |
| "grad_norm": 0.00920415110886097, |
| "learning_rate": 9.999627829138554e-05, |
| "loss": 0.0017, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.5858585858585856, |
| "eval_loss": 0.002303797984495759, |
| "eval_runtime": 16.3446, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.53, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.589090909090909, |
| "grad_norm": 0.009201723150908947, |
| "learning_rate": 9.999599750778772e-05, |
| "loss": 0.0023, |
| "step": 801 |
| }, |
| { |
| "epoch": 2.592323232323232, |
| "grad_norm": 0.011312730610370636, |
| "learning_rate": 9.999570651443191e-05, |
| "loss": 0.0022, |
| "step": 802 |
| }, |
| { |
| "epoch": 2.5955555555555554, |
| "grad_norm": 0.007410750258713961, |
| "learning_rate": 9.99954053113775e-05, |
| "loss": 0.0012, |
| "step": 803 |
| }, |
| { |
| "epoch": 2.5987878787878786, |
| "grad_norm": 0.010384928435087204, |
| "learning_rate": 9.9995093898686e-05, |
| "loss": 0.0017, |
| "step": 804 |
| }, |
| { |
| "epoch": 2.602020202020202, |
| "grad_norm": 0.010302623733878136, |
| "learning_rate": 9.999477227642103e-05, |
| "loss": 0.0019, |
| "step": 805 |
| }, |
| { |
| "epoch": 2.605252525252525, |
| "grad_norm": 0.012414009310305119, |
| "learning_rate": 9.999444044464823e-05, |
| "loss": 0.0017, |
| "step": 806 |
| }, |
| { |
| "epoch": 2.6084848484848484, |
| "grad_norm": 0.01266095694154501, |
| "learning_rate": 9.999409840343539e-05, |
| "loss": 0.0023, |
| "step": 807 |
| }, |
| { |
| "epoch": 2.6117171717171717, |
| "grad_norm": 0.009656663052737713, |
| "learning_rate": 9.999374615285236e-05, |
| "loss": 0.0017, |
| "step": 808 |
| }, |
| { |
| "epoch": 2.614949494949495, |
| "grad_norm": 0.007852123118937016, |
| "learning_rate": 9.999338369297106e-05, |
| "loss": 0.0013, |
| "step": 809 |
| }, |
| { |
| "epoch": 2.618181818181818, |
| "grad_norm": 0.011096924543380737, |
| "learning_rate": 9.999301102386553e-05, |
| "loss": 0.0018, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.6214141414141414, |
| "grad_norm": 0.010280488058924675, |
| "learning_rate": 9.999262814561185e-05, |
| "loss": 0.0013, |
| "step": 811 |
| }, |
| { |
| "epoch": 2.6246464646464647, |
| "grad_norm": 0.012525126338005066, |
| "learning_rate": 9.999223505828821e-05, |
| "loss": 0.0027, |
| "step": 812 |
| }, |
| { |
| "epoch": 2.627878787878788, |
| "grad_norm": 0.009808313101530075, |
| "learning_rate": 9.999183176197491e-05, |
| "loss": 0.0014, |
| "step": 813 |
| }, |
| { |
| "epoch": 2.631111111111111, |
| "grad_norm": 0.011017659679055214, |
| "learning_rate": 9.999141825675426e-05, |
| "loss": 0.0014, |
| "step": 814 |
| }, |
| { |
| "epoch": 2.6343434343434344, |
| "grad_norm": 0.012968490831553936, |
| "learning_rate": 9.999099454271074e-05, |
| "loss": 0.002, |
| "step": 815 |
| }, |
| { |
| "epoch": 2.6375757575757577, |
| "grad_norm": 0.015998907387256622, |
| "learning_rate": 9.999056061993089e-05, |
| "loss": 0.0029, |
| "step": 816 |
| }, |
| { |
| "epoch": 2.640808080808081, |
| "grad_norm": 0.01360913272947073, |
| "learning_rate": 9.999011648850329e-05, |
| "loss": 0.0024, |
| "step": 817 |
| }, |
| { |
| "epoch": 2.644040404040404, |
| "grad_norm": 0.01287829503417015, |
| "learning_rate": 9.998966214851864e-05, |
| "loss": 0.0025, |
| "step": 818 |
| }, |
| { |
| "epoch": 2.6472727272727274, |
| "grad_norm": 0.0104606244713068, |
| "learning_rate": 9.998919760006972e-05, |
| "loss": 0.0019, |
| "step": 819 |
| }, |
| { |
| "epoch": 2.6505050505050507, |
| "grad_norm": 0.009416928514838219, |
| "learning_rate": 9.998872284325142e-05, |
| "loss": 0.002, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.653737373737374, |
| "grad_norm": 0.006855939514935017, |
| "learning_rate": 9.998823787816066e-05, |
| "loss": 0.0013, |
| "step": 821 |
| }, |
| { |
| "epoch": 2.656969696969697, |
| "grad_norm": 0.0099884532392025, |
| "learning_rate": 9.99877427048965e-05, |
| "loss": 0.0019, |
| "step": 822 |
| }, |
| { |
| "epoch": 2.6602020202020205, |
| "grad_norm": 0.009265619330108166, |
| "learning_rate": 9.998723732356006e-05, |
| "loss": 0.0015, |
| "step": 823 |
| }, |
| { |
| "epoch": 2.6634343434343437, |
| "grad_norm": 0.01063575316220522, |
| "learning_rate": 9.998672173425452e-05, |
| "loss": 0.0018, |
| "step": 824 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.01031999010592699, |
| "learning_rate": 9.998619593708518e-05, |
| "loss": 0.0014, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "eval_loss": 0.0020872033201158047, |
| "eval_runtime": 16.3423, |
| "eval_samples_per_second": 6.119, |
| "eval_steps_per_second": 1.53, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.6698989898989898, |
| "grad_norm": 0.015460864640772343, |
| "learning_rate": 9.998565993215943e-05, |
| "loss": 0.0027, |
| "step": 826 |
| }, |
| { |
| "epoch": 2.673131313131313, |
| "grad_norm": 0.010955015197396278, |
| "learning_rate": 9.998511371958672e-05, |
| "loss": 0.0014, |
| "step": 827 |
| }, |
| { |
| "epoch": 2.6763636363636363, |
| "grad_norm": 0.01116356160491705, |
| "learning_rate": 9.998455729947858e-05, |
| "loss": 0.0019, |
| "step": 828 |
| }, |
| { |
| "epoch": 2.6795959595959595, |
| "grad_norm": 0.012614821083843708, |
| "learning_rate": 9.998399067194864e-05, |
| "loss": 0.0025, |
| "step": 829 |
| }, |
| { |
| "epoch": 2.682828282828283, |
| "grad_norm": 0.008147659711539745, |
| "learning_rate": 9.998341383711263e-05, |
| "loss": 0.0013, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.686060606060606, |
| "grad_norm": 0.011116529814898968, |
| "learning_rate": 9.998282679508835e-05, |
| "loss": 0.0021, |
| "step": 831 |
| }, |
| { |
| "epoch": 2.6892929292929293, |
| "grad_norm": 0.00877335760742426, |
| "learning_rate": 9.998222954599563e-05, |
| "loss": 0.0013, |
| "step": 832 |
| }, |
| { |
| "epoch": 2.6925252525252525, |
| "grad_norm": 0.00894878152757883, |
| "learning_rate": 9.99816220899565e-05, |
| "loss": 0.0013, |
| "step": 833 |
| }, |
| { |
| "epoch": 2.695757575757576, |
| "grad_norm": 0.0159080121666193, |
| "learning_rate": 9.998100442709497e-05, |
| "loss": 0.0027, |
| "step": 834 |
| }, |
| { |
| "epoch": 2.698989898989899, |
| "grad_norm": 0.010290171019732952, |
| "learning_rate": 9.998037655753717e-05, |
| "loss": 0.0018, |
| "step": 835 |
| }, |
| { |
| "epoch": 2.7022222222222223, |
| "grad_norm": 0.0123711246997118, |
| "learning_rate": 9.997973848141137e-05, |
| "loss": 0.002, |
| "step": 836 |
| }, |
| { |
| "epoch": 2.7054545454545456, |
| "grad_norm": 0.00988102424889803, |
| "learning_rate": 9.997909019884781e-05, |
| "loss": 0.0012, |
| "step": 837 |
| }, |
| { |
| "epoch": 2.708686868686869, |
| "grad_norm": 0.011489730328321457, |
| "learning_rate": 9.99784317099789e-05, |
| "loss": 0.002, |
| "step": 838 |
| }, |
| { |
| "epoch": 2.711919191919192, |
| "grad_norm": 0.011886958964169025, |
| "learning_rate": 9.997776301493914e-05, |
| "loss": 0.0017, |
| "step": 839 |
| }, |
| { |
| "epoch": 2.7151515151515153, |
| "grad_norm": 0.011621091514825821, |
| "learning_rate": 9.997708411386501e-05, |
| "loss": 0.0018, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.718383838383838, |
| "grad_norm": 0.010116496123373508, |
| "learning_rate": 9.997639500689523e-05, |
| "loss": 0.0021, |
| "step": 841 |
| }, |
| { |
| "epoch": 2.7216161616161614, |
| "grad_norm": 0.010914750397205353, |
| "learning_rate": 9.997569569417049e-05, |
| "loss": 0.0018, |
| "step": 842 |
| }, |
| { |
| "epoch": 2.7248484848484846, |
| "grad_norm": 0.011812432669103146, |
| "learning_rate": 9.997498617583358e-05, |
| "loss": 0.0023, |
| "step": 843 |
| }, |
| { |
| "epoch": 2.728080808080808, |
| "grad_norm": 0.015192636288702488, |
| "learning_rate": 9.997426645202943e-05, |
| "loss": 0.0027, |
| "step": 844 |
| }, |
| { |
| "epoch": 2.731313131313131, |
| "grad_norm": 0.008144897408783436, |
| "learning_rate": 9.9973536522905e-05, |
| "loss": 0.0014, |
| "step": 845 |
| }, |
| { |
| "epoch": 2.7345454545454544, |
| "grad_norm": 0.011417154222726822, |
| "learning_rate": 9.997279638860933e-05, |
| "loss": 0.0016, |
| "step": 846 |
| }, |
| { |
| "epoch": 2.7377777777777776, |
| "grad_norm": 0.011398770846426487, |
| "learning_rate": 9.99720460492936e-05, |
| "loss": 0.0022, |
| "step": 847 |
| }, |
| { |
| "epoch": 2.741010101010101, |
| "grad_norm": 0.013799142092466354, |
| "learning_rate": 9.997128550511099e-05, |
| "loss": 0.0023, |
| "step": 848 |
| }, |
| { |
| "epoch": 2.744242424242424, |
| "grad_norm": 0.010035104118287563, |
| "learning_rate": 9.997051475621687e-05, |
| "loss": 0.0015, |
| "step": 849 |
| }, |
| { |
| "epoch": 2.7474747474747474, |
| "grad_norm": 0.01012366358190775, |
| "learning_rate": 9.996973380276857e-05, |
| "loss": 0.0015, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.7474747474747474, |
| "eval_loss": 0.0020715424325317144, |
| "eval_runtime": 16.3608, |
| "eval_samples_per_second": 6.112, |
| "eval_steps_per_second": 1.528, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.7507070707070707, |
| "grad_norm": 0.015252598561346531, |
| "learning_rate": 9.996894264492563e-05, |
| "loss": 0.0025, |
| "step": 851 |
| }, |
| { |
| "epoch": 2.753939393939394, |
| "grad_norm": 0.013939411379396915, |
| "learning_rate": 9.99681412828496e-05, |
| "loss": 0.0022, |
| "step": 852 |
| }, |
| { |
| "epoch": 2.757171717171717, |
| "grad_norm": 0.012870310805737972, |
| "learning_rate": 9.996732971670408e-05, |
| "loss": 0.0019, |
| "step": 853 |
| }, |
| { |
| "epoch": 2.7604040404040404, |
| "grad_norm": 0.010139352641999722, |
| "learning_rate": 9.996650794665487e-05, |
| "loss": 0.0027, |
| "step": 854 |
| }, |
| { |
| "epoch": 2.7636363636363637, |
| "grad_norm": 0.011695247143507004, |
| "learning_rate": 9.996567597286974e-05, |
| "loss": 0.0019, |
| "step": 855 |
| }, |
| { |
| "epoch": 2.766868686868687, |
| "grad_norm": 0.009308499284088612, |
| "learning_rate": 9.996483379551861e-05, |
| "loss": 0.0012, |
| "step": 856 |
| }, |
| { |
| "epoch": 2.77010101010101, |
| "grad_norm": 0.009836779907345772, |
| "learning_rate": 9.996398141477344e-05, |
| "loss": 0.0012, |
| "step": 857 |
| }, |
| { |
| "epoch": 2.7733333333333334, |
| "grad_norm": 0.010676603764295578, |
| "learning_rate": 9.996311883080832e-05, |
| "loss": 0.0013, |
| "step": 858 |
| }, |
| { |
| "epoch": 2.7765656565656567, |
| "grad_norm": 0.008592822588980198, |
| "learning_rate": 9.996224604379938e-05, |
| "loss": 0.0012, |
| "step": 859 |
| }, |
| { |
| "epoch": 2.77979797979798, |
| "grad_norm": 0.008707406930625439, |
| "learning_rate": 9.996136305392487e-05, |
| "loss": 0.0016, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.783030303030303, |
| "grad_norm": 0.010407894849777222, |
| "learning_rate": 9.996046986136509e-05, |
| "loss": 0.0016, |
| "step": 861 |
| }, |
| { |
| "epoch": 2.7862626262626264, |
| "grad_norm": 0.009875278919935226, |
| "learning_rate": 9.995956646630246e-05, |
| "loss": 0.0017, |
| "step": 862 |
| }, |
| { |
| "epoch": 2.7894949494949497, |
| "grad_norm": 0.01143390778452158, |
| "learning_rate": 9.995865286892145e-05, |
| "loss": 0.0018, |
| "step": 863 |
| }, |
| { |
| "epoch": 2.792727272727273, |
| "grad_norm": 0.008934702724218369, |
| "learning_rate": 9.995772906940864e-05, |
| "loss": 0.0012, |
| "step": 864 |
| }, |
| { |
| "epoch": 2.795959595959596, |
| "grad_norm": 0.010277238674461842, |
| "learning_rate": 9.995679506795264e-05, |
| "loss": 0.0014, |
| "step": 865 |
| }, |
| { |
| "epoch": 2.7991919191919195, |
| "grad_norm": 0.009647834114730358, |
| "learning_rate": 9.995585086474424e-05, |
| "loss": 0.0018, |
| "step": 866 |
| }, |
| { |
| "epoch": 2.8024242424242423, |
| "grad_norm": 0.011817540973424911, |
| "learning_rate": 9.995489645997622e-05, |
| "loss": 0.0018, |
| "step": 867 |
| }, |
| { |
| "epoch": 2.8056565656565655, |
| "grad_norm": 0.011575527489185333, |
| "learning_rate": 9.99539318538435e-05, |
| "loss": 0.0019, |
| "step": 868 |
| }, |
| { |
| "epoch": 2.8088888888888888, |
| "grad_norm": 0.012380240485072136, |
| "learning_rate": 9.995295704654304e-05, |
| "loss": 0.0019, |
| "step": 869 |
| }, |
| { |
| "epoch": 2.812121212121212, |
| "grad_norm": 0.00826615747064352, |
| "learning_rate": 9.995197203827393e-05, |
| "loss": 0.0011, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.8153535353535353, |
| "grad_norm": 0.011454376392066479, |
| "learning_rate": 9.995097682923733e-05, |
| "loss": 0.0017, |
| "step": 871 |
| }, |
| { |
| "epoch": 2.8185858585858585, |
| "grad_norm": 0.009519209153950214, |
| "learning_rate": 9.994997141963644e-05, |
| "loss": 0.0017, |
| "step": 872 |
| }, |
| { |
| "epoch": 2.821818181818182, |
| "grad_norm": 0.008130215108394623, |
| "learning_rate": 9.994895580967658e-05, |
| "loss": 0.0013, |
| "step": 873 |
| }, |
| { |
| "epoch": 2.825050505050505, |
| "grad_norm": 0.010969970375299454, |
| "learning_rate": 9.994792999956518e-05, |
| "loss": 0.0022, |
| "step": 874 |
| }, |
| { |
| "epoch": 2.8282828282828283, |
| "grad_norm": 0.010993543080985546, |
| "learning_rate": 9.994689398951169e-05, |
| "loss": 0.0017, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.8282828282828283, |
| "eval_loss": 0.0020453694742172956, |
| "eval_runtime": 16.3593, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.8315151515151515, |
| "grad_norm": 0.011773471720516682, |
| "learning_rate": 9.994584777972769e-05, |
| "loss": 0.002, |
| "step": 876 |
| }, |
| { |
| "epoch": 2.834747474747475, |
| "grad_norm": 0.006767973769456148, |
| "learning_rate": 9.994479137042683e-05, |
| "loss": 0.001, |
| "step": 877 |
| }, |
| { |
| "epoch": 2.837979797979798, |
| "grad_norm": 0.015704987570643425, |
| "learning_rate": 9.994372476182484e-05, |
| "loss": 0.0023, |
| "step": 878 |
| }, |
| { |
| "epoch": 2.8412121212121213, |
| "grad_norm": 0.012366436421871185, |
| "learning_rate": 9.994264795413953e-05, |
| "loss": 0.0014, |
| "step": 879 |
| }, |
| { |
| "epoch": 2.8444444444444446, |
| "grad_norm": 0.010697831399738789, |
| "learning_rate": 9.99415609475908e-05, |
| "loss": 0.0024, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.847676767676768, |
| "grad_norm": 0.008919494226574898, |
| "learning_rate": 9.994046374240062e-05, |
| "loss": 0.0013, |
| "step": 881 |
| }, |
| { |
| "epoch": 2.850909090909091, |
| "grad_norm": 0.008485561236739159, |
| "learning_rate": 9.993935633879306e-05, |
| "loss": 0.0013, |
| "step": 882 |
| }, |
| { |
| "epoch": 2.854141414141414, |
| "grad_norm": 0.008861992508172989, |
| "learning_rate": 9.993823873699426e-05, |
| "loss": 0.0016, |
| "step": 883 |
| }, |
| { |
| "epoch": 2.857373737373737, |
| "grad_norm": 0.009203307330608368, |
| "learning_rate": 9.993711093723245e-05, |
| "loss": 0.0018, |
| "step": 884 |
| }, |
| { |
| "epoch": 2.8606060606060604, |
| "grad_norm": 0.008553597144782543, |
| "learning_rate": 9.993597293973796e-05, |
| "loss": 0.0012, |
| "step": 885 |
| }, |
| { |
| "epoch": 2.8638383838383836, |
| "grad_norm": 0.01069615874439478, |
| "learning_rate": 9.993482474474314e-05, |
| "loss": 0.002, |
| "step": 886 |
| }, |
| { |
| "epoch": 2.867070707070707, |
| "grad_norm": 0.007396090310066938, |
| "learning_rate": 9.99336663524825e-05, |
| "loss": 0.0013, |
| "step": 887 |
| }, |
| { |
| "epoch": 2.87030303030303, |
| "grad_norm": 0.012649512849748135, |
| "learning_rate": 9.993249776319258e-05, |
| "loss": 0.0017, |
| "step": 888 |
| }, |
| { |
| "epoch": 2.8735353535353534, |
| "grad_norm": 0.008179185912013054, |
| "learning_rate": 9.993131897711202e-05, |
| "loss": 0.0012, |
| "step": 889 |
| }, |
| { |
| "epoch": 2.8767676767676766, |
| "grad_norm": 0.014896427281200886, |
| "learning_rate": 9.993012999448154e-05, |
| "loss": 0.002, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 0.009167522192001343, |
| "learning_rate": 9.992893081554397e-05, |
| "loss": 0.0012, |
| "step": 891 |
| }, |
| { |
| "epoch": 2.883232323232323, |
| "grad_norm": 0.008768011815845966, |
| "learning_rate": 9.992772144054415e-05, |
| "loss": 0.0017, |
| "step": 892 |
| }, |
| { |
| "epoch": 2.8864646464646464, |
| "grad_norm": 0.012819679453969002, |
| "learning_rate": 9.992650186972909e-05, |
| "loss": 0.0016, |
| "step": 893 |
| }, |
| { |
| "epoch": 2.8896969696969697, |
| "grad_norm": 0.010624030604958534, |
| "learning_rate": 9.99252721033478e-05, |
| "loss": 0.0012, |
| "step": 894 |
| }, |
| { |
| "epoch": 2.892929292929293, |
| "grad_norm": 0.011288948357105255, |
| "learning_rate": 9.992403214165147e-05, |
| "loss": 0.0016, |
| "step": 895 |
| }, |
| { |
| "epoch": 2.896161616161616, |
| "grad_norm": 0.010915448889136314, |
| "learning_rate": 9.992278198489327e-05, |
| "loss": 0.0015, |
| "step": 896 |
| }, |
| { |
| "epoch": 2.8993939393939394, |
| "grad_norm": 0.00751241622492671, |
| "learning_rate": 9.99215216333285e-05, |
| "loss": 0.0011, |
| "step": 897 |
| }, |
| { |
| "epoch": 2.9026262626262627, |
| "grad_norm": 0.007899762131273746, |
| "learning_rate": 9.992025108721454e-05, |
| "loss": 0.001, |
| "step": 898 |
| }, |
| { |
| "epoch": 2.905858585858586, |
| "grad_norm": 0.01074175350368023, |
| "learning_rate": 9.991897034681087e-05, |
| "loss": 0.0015, |
| "step": 899 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 0.010675775818526745, |
| "learning_rate": 9.9917679412379e-05, |
| "loss": 0.0018, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "eval_loss": 0.0017840900691226125, |
| "eval_runtime": 16.3496, |
| "eval_samples_per_second": 6.116, |
| "eval_steps_per_second": 1.529, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.9123232323232324, |
| "grad_norm": 0.009357589296996593, |
| "learning_rate": 9.99163782841826e-05, |
| "loss": 0.0016, |
| "step": 901 |
| }, |
| { |
| "epoch": 2.9155555555555557, |
| "grad_norm": 0.008398734033107758, |
| "learning_rate": 9.991506696248731e-05, |
| "loss": 0.0014, |
| "step": 902 |
| }, |
| { |
| "epoch": 2.918787878787879, |
| "grad_norm": 0.008578549139201641, |
| "learning_rate": 9.991374544756098e-05, |
| "loss": 0.0015, |
| "step": 903 |
| }, |
| { |
| "epoch": 2.922020202020202, |
| "grad_norm": 0.006966270040720701, |
| "learning_rate": 9.991241373967344e-05, |
| "loss": 0.001, |
| "step": 904 |
| }, |
| { |
| "epoch": 2.9252525252525254, |
| "grad_norm": 0.009255737066268921, |
| "learning_rate": 9.991107183909664e-05, |
| "loss": 0.0018, |
| "step": 905 |
| }, |
| { |
| "epoch": 2.9284848484848487, |
| "grad_norm": 0.00997141469269991, |
| "learning_rate": 9.990971974610466e-05, |
| "loss": 0.0016, |
| "step": 906 |
| }, |
| { |
| "epoch": 2.931717171717172, |
| "grad_norm": 0.008654761128127575, |
| "learning_rate": 9.990835746097356e-05, |
| "loss": 0.0015, |
| "step": 907 |
| }, |
| { |
| "epoch": 2.934949494949495, |
| "grad_norm": 0.011613531969487667, |
| "learning_rate": 9.990698498398155e-05, |
| "loss": 0.0029, |
| "step": 908 |
| }, |
| { |
| "epoch": 2.9381818181818184, |
| "grad_norm": 0.012548528611660004, |
| "learning_rate": 9.990560231540889e-05, |
| "loss": 0.0011, |
| "step": 909 |
| }, |
| { |
| "epoch": 2.9414141414141413, |
| "grad_norm": 0.010035105049610138, |
| "learning_rate": 9.990420945553797e-05, |
| "loss": 0.0013, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.9446464646464645, |
| "grad_norm": 0.011190433986485004, |
| "learning_rate": 9.990280640465321e-05, |
| "loss": 0.0016, |
| "step": 911 |
| }, |
| { |
| "epoch": 2.9478787878787878, |
| "grad_norm": 0.01153119932860136, |
| "learning_rate": 9.990139316304112e-05, |
| "loss": 0.0018, |
| "step": 912 |
| }, |
| { |
| "epoch": 2.951111111111111, |
| "grad_norm": 0.010992007330060005, |
| "learning_rate": 9.989996973099032e-05, |
| "loss": 0.0015, |
| "step": 913 |
| }, |
| { |
| "epoch": 2.9543434343434343, |
| "grad_norm": 0.012004894204437733, |
| "learning_rate": 9.989853610879147e-05, |
| "loss": 0.0021, |
| "step": 914 |
| }, |
| { |
| "epoch": 2.9575757575757575, |
| "grad_norm": 0.008344232104718685, |
| "learning_rate": 9.989709229673736e-05, |
| "loss": 0.0014, |
| "step": 915 |
| }, |
| { |
| "epoch": 2.9608080808080808, |
| "grad_norm": 0.008308797143399715, |
| "learning_rate": 9.98956382951228e-05, |
| "loss": 0.0012, |
| "step": 916 |
| }, |
| { |
| "epoch": 2.964040404040404, |
| "grad_norm": 0.010753040201961994, |
| "learning_rate": 9.989417410424475e-05, |
| "loss": 0.0017, |
| "step": 917 |
| }, |
| { |
| "epoch": 2.9672727272727273, |
| "grad_norm": 0.009839852340519428, |
| "learning_rate": 9.98926997244022e-05, |
| "loss": 0.0016, |
| "step": 918 |
| }, |
| { |
| "epoch": 2.9705050505050505, |
| "grad_norm": 0.011538838967680931, |
| "learning_rate": 9.989121515589622e-05, |
| "loss": 0.002, |
| "step": 919 |
| }, |
| { |
| "epoch": 2.973737373737374, |
| "grad_norm": 0.008903412148356438, |
| "learning_rate": 9.988972039902997e-05, |
| "loss": 0.0013, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.976969696969697, |
| "grad_norm": 0.011637412011623383, |
| "learning_rate": 9.988821545410874e-05, |
| "loss": 0.0015, |
| "step": 921 |
| }, |
| { |
| "epoch": 2.9802020202020203, |
| "grad_norm": 0.008440854959189892, |
| "learning_rate": 9.988670032143981e-05, |
| "loss": 0.0015, |
| "step": 922 |
| }, |
| { |
| "epoch": 2.9834343434343435, |
| "grad_norm": 0.009967893362045288, |
| "learning_rate": 9.988517500133262e-05, |
| "loss": 0.0016, |
| "step": 923 |
| }, |
| { |
| "epoch": 2.986666666666667, |
| "grad_norm": 0.0078852828592062, |
| "learning_rate": 9.988363949409865e-05, |
| "loss": 0.0012, |
| "step": 924 |
| }, |
| { |
| "epoch": 2.98989898989899, |
| "grad_norm": 0.007426911499351263, |
| "learning_rate": 9.988209380005144e-05, |
| "loss": 0.0012, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.98989898989899, |
| "eval_loss": 0.001759253442287445, |
| "eval_runtime": 16.3381, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.993131313131313, |
| "grad_norm": 0.007714211009442806, |
| "learning_rate": 9.98805379195067e-05, |
| "loss": 0.001, |
| "step": 926 |
| }, |
| { |
| "epoch": 2.996363636363636, |
| "grad_norm": 0.009345155209302902, |
| "learning_rate": 9.987897185278208e-05, |
| "loss": 0.0012, |
| "step": 927 |
| }, |
| { |
| "epoch": 2.9995959595959594, |
| "grad_norm": 0.012734991498291492, |
| "learning_rate": 9.987739560019746e-05, |
| "loss": 0.0014, |
| "step": 928 |
| }, |
| { |
| "epoch": 3.0028282828282826, |
| "grad_norm": 0.011406145989894867, |
| "learning_rate": 9.987580916207468e-05, |
| "loss": 0.0016, |
| "step": 929 |
| }, |
| { |
| "epoch": 3.006060606060606, |
| "grad_norm": 0.01067868061363697, |
| "learning_rate": 9.987421253873775e-05, |
| "loss": 0.0011, |
| "step": 930 |
| }, |
| { |
| "epoch": 3.009292929292929, |
| "grad_norm": 0.01311782281845808, |
| "learning_rate": 9.987260573051269e-05, |
| "loss": 0.0013, |
| "step": 931 |
| }, |
| { |
| "epoch": 3.0125252525252524, |
| "grad_norm": 0.014876693487167358, |
| "learning_rate": 9.987098873772763e-05, |
| "loss": 0.0013, |
| "step": 932 |
| }, |
| { |
| "epoch": 3.0157575757575756, |
| "grad_norm": 0.011760083958506584, |
| "learning_rate": 9.986936156071278e-05, |
| "loss": 0.0009, |
| "step": 933 |
| }, |
| { |
| "epoch": 3.018989898989899, |
| "grad_norm": 0.012099731713533401, |
| "learning_rate": 9.986772419980044e-05, |
| "loss": 0.0015, |
| "step": 934 |
| }, |
| { |
| "epoch": 3.022222222222222, |
| "grad_norm": 0.008160330355167389, |
| "learning_rate": 9.986607665532497e-05, |
| "loss": 0.0009, |
| "step": 935 |
| }, |
| { |
| "epoch": 3.0254545454545454, |
| "grad_norm": 0.0057359496131539345, |
| "learning_rate": 9.986441892762281e-05, |
| "loss": 0.0009, |
| "step": 936 |
| }, |
| { |
| "epoch": 3.0286868686868686, |
| "grad_norm": 0.011564587242901325, |
| "learning_rate": 9.98627510170325e-05, |
| "loss": 0.0014, |
| "step": 937 |
| }, |
| { |
| "epoch": 3.031919191919192, |
| "grad_norm": 0.008928325958549976, |
| "learning_rate": 9.986107292389464e-05, |
| "loss": 0.0013, |
| "step": 938 |
| }, |
| { |
| "epoch": 3.035151515151515, |
| "grad_norm": 0.007116743829101324, |
| "learning_rate": 9.985938464855191e-05, |
| "loss": 0.001, |
| "step": 939 |
| }, |
| { |
| "epoch": 3.0383838383838384, |
| "grad_norm": 0.00824098102748394, |
| "learning_rate": 9.985768619134909e-05, |
| "loss": 0.0009, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.0416161616161617, |
| "grad_norm": 0.010526294820010662, |
| "learning_rate": 9.985597755263302e-05, |
| "loss": 0.0014, |
| "step": 941 |
| }, |
| { |
| "epoch": 3.044848484848485, |
| "grad_norm": 0.008329221978783607, |
| "learning_rate": 9.985425873275263e-05, |
| "loss": 0.0011, |
| "step": 942 |
| }, |
| { |
| "epoch": 3.048080808080808, |
| "grad_norm": 0.00860538613051176, |
| "learning_rate": 9.98525297320589e-05, |
| "loss": 0.0012, |
| "step": 943 |
| }, |
| { |
| "epoch": 3.0513131313131314, |
| "grad_norm": 0.008362906984984875, |
| "learning_rate": 9.985079055090493e-05, |
| "loss": 0.0013, |
| "step": 944 |
| }, |
| { |
| "epoch": 3.0545454545454547, |
| "grad_norm": 0.008007494732737541, |
| "learning_rate": 9.984904118964588e-05, |
| "loss": 0.001, |
| "step": 945 |
| }, |
| { |
| "epoch": 3.057777777777778, |
| "grad_norm": 0.008498580195009708, |
| "learning_rate": 9.984728164863898e-05, |
| "loss": 0.0009, |
| "step": 946 |
| }, |
| { |
| "epoch": 3.061010101010101, |
| "grad_norm": 0.009447433985769749, |
| "learning_rate": 9.984551192824355e-05, |
| "loss": 0.001, |
| "step": 947 |
| }, |
| { |
| "epoch": 3.0642424242424244, |
| "grad_norm": 0.012203999795019627, |
| "learning_rate": 9.9843732028821e-05, |
| "loss": 0.0009, |
| "step": 948 |
| }, |
| { |
| "epoch": 3.0674747474747477, |
| "grad_norm": 0.009113363921642303, |
| "learning_rate": 9.98419419507348e-05, |
| "loss": 0.0009, |
| "step": 949 |
| }, |
| { |
| "epoch": 3.0707070707070705, |
| "grad_norm": 0.008994522504508495, |
| "learning_rate": 9.98401416943505e-05, |
| "loss": 0.0008, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.0707070707070705, |
| "eval_loss": 0.0017512531485408545, |
| "eval_runtime": 16.3377, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.0739393939393937, |
| "grad_norm": 0.009916388429701328, |
| "learning_rate": 9.983833126003572e-05, |
| "loss": 0.001, |
| "step": 951 |
| }, |
| { |
| "epoch": 3.077171717171717, |
| "grad_norm": 0.012694688513875008, |
| "learning_rate": 9.98365106481602e-05, |
| "loss": 0.0016, |
| "step": 952 |
| }, |
| { |
| "epoch": 3.0804040404040403, |
| "grad_norm": 0.010637965053319931, |
| "learning_rate": 9.983467985909573e-05, |
| "loss": 0.0012, |
| "step": 953 |
| }, |
| { |
| "epoch": 3.0836363636363635, |
| "grad_norm": 0.01486038789153099, |
| "learning_rate": 9.983283889321615e-05, |
| "loss": 0.0011, |
| "step": 954 |
| }, |
| { |
| "epoch": 3.0868686868686868, |
| "grad_norm": 0.011312978342175484, |
| "learning_rate": 9.983098775089742e-05, |
| "loss": 0.0014, |
| "step": 955 |
| }, |
| { |
| "epoch": 3.09010101010101, |
| "grad_norm": 0.01307759527117014, |
| "learning_rate": 9.982912643251757e-05, |
| "loss": 0.0015, |
| "step": 956 |
| }, |
| { |
| "epoch": 3.0933333333333333, |
| "grad_norm": 0.007377214729785919, |
| "learning_rate": 9.98272549384567e-05, |
| "loss": 0.0011, |
| "step": 957 |
| }, |
| { |
| "epoch": 3.0965656565656565, |
| "grad_norm": 0.007143994327634573, |
| "learning_rate": 9.982537326909697e-05, |
| "loss": 0.0006, |
| "step": 958 |
| }, |
| { |
| "epoch": 3.0997979797979798, |
| "grad_norm": 0.010922285728156567, |
| "learning_rate": 9.982348142482269e-05, |
| "loss": 0.0012, |
| "step": 959 |
| }, |
| { |
| "epoch": 3.103030303030303, |
| "grad_norm": 0.009535728953778744, |
| "learning_rate": 9.982157940602014e-05, |
| "loss": 0.0012, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.1062626262626263, |
| "grad_norm": 0.00803733803331852, |
| "learning_rate": 9.981966721307778e-05, |
| "loss": 0.0013, |
| "step": 961 |
| }, |
| { |
| "epoch": 3.1094949494949495, |
| "grad_norm": 0.008351441472768784, |
| "learning_rate": 9.981774484638606e-05, |
| "loss": 0.0012, |
| "step": 962 |
| }, |
| { |
| "epoch": 3.112727272727273, |
| "grad_norm": 0.012005380354821682, |
| "learning_rate": 9.981581230633758e-05, |
| "loss": 0.0011, |
| "step": 963 |
| }, |
| { |
| "epoch": 3.115959595959596, |
| "grad_norm": 0.010061024688184261, |
| "learning_rate": 9.981386959332697e-05, |
| "loss": 0.001, |
| "step": 964 |
| }, |
| { |
| "epoch": 3.1191919191919193, |
| "grad_norm": 0.008992062881588936, |
| "learning_rate": 9.981191670775097e-05, |
| "loss": 0.0012, |
| "step": 965 |
| }, |
| { |
| "epoch": 3.1224242424242425, |
| "grad_norm": 0.006035625468939543, |
| "learning_rate": 9.980995365000836e-05, |
| "loss": 0.0007, |
| "step": 966 |
| }, |
| { |
| "epoch": 3.125656565656566, |
| "grad_norm": 0.013019300997257233, |
| "learning_rate": 9.980798042050004e-05, |
| "loss": 0.0012, |
| "step": 967 |
| }, |
| { |
| "epoch": 3.128888888888889, |
| "grad_norm": 0.014314939267933369, |
| "learning_rate": 9.980599701962896e-05, |
| "loss": 0.0015, |
| "step": 968 |
| }, |
| { |
| "epoch": 3.1321212121212123, |
| "grad_norm": 0.009003909304738045, |
| "learning_rate": 9.980400344780015e-05, |
| "loss": 0.001, |
| "step": 969 |
| }, |
| { |
| "epoch": 3.1353535353535356, |
| "grad_norm": 0.009461263194680214, |
| "learning_rate": 9.98019997054207e-05, |
| "loss": 0.0011, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.1385858585858584, |
| "grad_norm": 0.007095228880643845, |
| "learning_rate": 9.979998579289984e-05, |
| "loss": 0.0009, |
| "step": 971 |
| }, |
| { |
| "epoch": 3.1418181818181816, |
| "grad_norm": 0.012431626208126545, |
| "learning_rate": 9.979796171064881e-05, |
| "loss": 0.0019, |
| "step": 972 |
| }, |
| { |
| "epoch": 3.145050505050505, |
| "grad_norm": 0.00869319960474968, |
| "learning_rate": 9.979592745908095e-05, |
| "loss": 0.0009, |
| "step": 973 |
| }, |
| { |
| "epoch": 3.148282828282828, |
| "grad_norm": 0.013157929293811321, |
| "learning_rate": 9.979388303861169e-05, |
| "loss": 0.0011, |
| "step": 974 |
| }, |
| { |
| "epoch": 3.1515151515151514, |
| "grad_norm": 0.009458480402827263, |
| "learning_rate": 9.97918284496585e-05, |
| "loss": 0.0011, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.1515151515151514, |
| "eval_loss": 0.0017218769062310457, |
| "eval_runtime": 16.4046, |
| "eval_samples_per_second": 6.096, |
| "eval_steps_per_second": 1.524, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.1547474747474746, |
| "grad_norm": 0.007865170948207378, |
| "learning_rate": 9.978976369264098e-05, |
| "loss": 0.0008, |
| "step": 976 |
| }, |
| { |
| "epoch": 3.157979797979798, |
| "grad_norm": 0.009995974600315094, |
| "learning_rate": 9.978768876798075e-05, |
| "loss": 0.0011, |
| "step": 977 |
| }, |
| { |
| "epoch": 3.161212121212121, |
| "grad_norm": 0.009956127032637596, |
| "learning_rate": 9.978560367610156e-05, |
| "loss": 0.0011, |
| "step": 978 |
| }, |
| { |
| "epoch": 3.1644444444444444, |
| "grad_norm": 0.008594691753387451, |
| "learning_rate": 9.978350841742919e-05, |
| "loss": 0.0011, |
| "step": 979 |
| }, |
| { |
| "epoch": 3.1676767676767676, |
| "grad_norm": 0.011499758809804916, |
| "learning_rate": 9.978140299239152e-05, |
| "loss": 0.0011, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.170909090909091, |
| "grad_norm": 0.006824036594480276, |
| "learning_rate": 9.977928740141851e-05, |
| "loss": 0.001, |
| "step": 981 |
| }, |
| { |
| "epoch": 3.174141414141414, |
| "grad_norm": 0.006031815893948078, |
| "learning_rate": 9.977716164494217e-05, |
| "loss": 0.0007, |
| "step": 982 |
| }, |
| { |
| "epoch": 3.1773737373737374, |
| "grad_norm": 0.00841270387172699, |
| "learning_rate": 9.977502572339664e-05, |
| "loss": 0.0015, |
| "step": 983 |
| }, |
| { |
| "epoch": 3.1806060606060607, |
| "grad_norm": 0.008552856743335724, |
| "learning_rate": 9.977287963721804e-05, |
| "loss": 0.001, |
| "step": 984 |
| }, |
| { |
| "epoch": 3.183838383838384, |
| "grad_norm": 0.007405848242342472, |
| "learning_rate": 9.977072338684469e-05, |
| "loss": 0.001, |
| "step": 985 |
| }, |
| { |
| "epoch": 3.187070707070707, |
| "grad_norm": 0.006996997632086277, |
| "learning_rate": 9.976855697271689e-05, |
| "loss": 0.0008, |
| "step": 986 |
| }, |
| { |
| "epoch": 3.1903030303030304, |
| "grad_norm": 0.009966832585632801, |
| "learning_rate": 9.976638039527704e-05, |
| "loss": 0.0014, |
| "step": 987 |
| }, |
| { |
| "epoch": 3.1935353535353537, |
| "grad_norm": 0.00842028297483921, |
| "learning_rate": 9.976419365496963e-05, |
| "loss": 0.001, |
| "step": 988 |
| }, |
| { |
| "epoch": 3.196767676767677, |
| "grad_norm": 0.008361369371414185, |
| "learning_rate": 9.976199675224123e-05, |
| "loss": 0.0008, |
| "step": 989 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 0.008185483515262604, |
| "learning_rate": 9.975978968754045e-05, |
| "loss": 0.0008, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.2032323232323234, |
| "grad_norm": 0.018663762137293816, |
| "learning_rate": 9.975757246131803e-05, |
| "loss": 0.002, |
| "step": 991 |
| }, |
| { |
| "epoch": 3.2064646464646467, |
| "grad_norm": 0.011232037097215652, |
| "learning_rate": 9.975534507402671e-05, |
| "loss": 0.0013, |
| "step": 992 |
| }, |
| { |
| "epoch": 3.2096969696969695, |
| "grad_norm": 0.008030761033296585, |
| "learning_rate": 9.975310752612137e-05, |
| "loss": 0.0008, |
| "step": 993 |
| }, |
| { |
| "epoch": 3.2129292929292927, |
| "grad_norm": 0.008287128992378712, |
| "learning_rate": 9.975085981805897e-05, |
| "loss": 0.0008, |
| "step": 994 |
| }, |
| { |
| "epoch": 3.216161616161616, |
| "grad_norm": 0.007731509394943714, |
| "learning_rate": 9.974860195029847e-05, |
| "loss": 0.0009, |
| "step": 995 |
| }, |
| { |
| "epoch": 3.2193939393939393, |
| "grad_norm": 0.007766639348119497, |
| "learning_rate": 9.974633392330097e-05, |
| "loss": 0.0009, |
| "step": 996 |
| }, |
| { |
| "epoch": 3.2226262626262625, |
| "grad_norm": 0.00740783428773284, |
| "learning_rate": 9.974405573752965e-05, |
| "loss": 0.0009, |
| "step": 997 |
| }, |
| { |
| "epoch": 3.2258585858585858, |
| "grad_norm": 0.011892711743712425, |
| "learning_rate": 9.974176739344971e-05, |
| "loss": 0.0013, |
| "step": 998 |
| }, |
| { |
| "epoch": 3.229090909090909, |
| "grad_norm": 0.01170049887150526, |
| "learning_rate": 9.973946889152847e-05, |
| "loss": 0.0015, |
| "step": 999 |
| }, |
| { |
| "epoch": 3.2323232323232323, |
| "grad_norm": 0.010566040873527527, |
| "learning_rate": 9.973716023223531e-05, |
| "loss": 0.0012, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.2323232323232323, |
| "eval_loss": 0.0016789839137345552, |
| "eval_runtime": 16.3486, |
| "eval_samples_per_second": 6.117, |
| "eval_steps_per_second": 1.529, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.2355555555555555, |
| "grad_norm": 0.008946800604462624, |
| "learning_rate": 9.97348414160417e-05, |
| "loss": 0.0011, |
| "step": 1001 |
| }, |
| { |
| "epoch": 3.2387878787878788, |
| "grad_norm": 0.00717885559424758, |
| "learning_rate": 9.973251244342114e-05, |
| "loss": 0.0007, |
| "step": 1002 |
| }, |
| { |
| "epoch": 3.242020202020202, |
| "grad_norm": 0.008244099095463753, |
| "learning_rate": 9.973017331484926e-05, |
| "loss": 0.0008, |
| "step": 1003 |
| }, |
| { |
| "epoch": 3.2452525252525253, |
| "grad_norm": 0.009603862650692463, |
| "learning_rate": 9.972782403080372e-05, |
| "loss": 0.0011, |
| "step": 1004 |
| }, |
| { |
| "epoch": 3.2484848484848485, |
| "grad_norm": 0.00887330248951912, |
| "learning_rate": 9.972546459176425e-05, |
| "loss": 0.0015, |
| "step": 1005 |
| }, |
| { |
| "epoch": 3.251717171717172, |
| "grad_norm": 0.009757821448147297, |
| "learning_rate": 9.972309499821273e-05, |
| "loss": 0.0013, |
| "step": 1006 |
| }, |
| { |
| "epoch": 3.254949494949495, |
| "grad_norm": 0.010830886662006378, |
| "learning_rate": 9.972071525063303e-05, |
| "loss": 0.0014, |
| "step": 1007 |
| }, |
| { |
| "epoch": 3.2581818181818183, |
| "grad_norm": 0.0073830136097967625, |
| "learning_rate": 9.971832534951108e-05, |
| "loss": 0.0009, |
| "step": 1008 |
| }, |
| { |
| "epoch": 3.2614141414141415, |
| "grad_norm": 0.010923303663730621, |
| "learning_rate": 9.9715925295335e-05, |
| "loss": 0.0014, |
| "step": 1009 |
| }, |
| { |
| "epoch": 3.264646464646465, |
| "grad_norm": 0.011959812603890896, |
| "learning_rate": 9.971351508859488e-05, |
| "loss": 0.0014, |
| "step": 1010 |
| }, |
| { |
| "epoch": 3.267878787878788, |
| "grad_norm": 0.00867361482232809, |
| "learning_rate": 9.971109472978288e-05, |
| "loss": 0.0008, |
| "step": 1011 |
| }, |
| { |
| "epoch": 3.2711111111111113, |
| "grad_norm": 0.008474764414131641, |
| "learning_rate": 9.97086642193933e-05, |
| "loss": 0.0007, |
| "step": 1012 |
| }, |
| { |
| "epoch": 3.274343434343434, |
| "grad_norm": 0.007730542682111263, |
| "learning_rate": 9.970622355792247e-05, |
| "loss": 0.0008, |
| "step": 1013 |
| }, |
| { |
| "epoch": 3.2775757575757574, |
| "grad_norm": 0.005717351101338863, |
| "learning_rate": 9.970377274586879e-05, |
| "loss": 0.0007, |
| "step": 1014 |
| }, |
| { |
| "epoch": 3.2808080808080806, |
| "grad_norm": 0.006671001203358173, |
| "learning_rate": 9.970131178373277e-05, |
| "loss": 0.0009, |
| "step": 1015 |
| }, |
| { |
| "epoch": 3.284040404040404, |
| "grad_norm": 0.007357608526945114, |
| "learning_rate": 9.969884067201695e-05, |
| "loss": 0.0008, |
| "step": 1016 |
| }, |
| { |
| "epoch": 3.287272727272727, |
| "grad_norm": 0.008754865266382694, |
| "learning_rate": 9.969635941122595e-05, |
| "loss": 0.0007, |
| "step": 1017 |
| }, |
| { |
| "epoch": 3.2905050505050504, |
| "grad_norm": 0.010542918927967548, |
| "learning_rate": 9.969386800186649e-05, |
| "loss": 0.001, |
| "step": 1018 |
| }, |
| { |
| "epoch": 3.2937373737373736, |
| "grad_norm": 0.010549299418926239, |
| "learning_rate": 9.969136644444731e-05, |
| "loss": 0.0015, |
| "step": 1019 |
| }, |
| { |
| "epoch": 3.296969696969697, |
| "grad_norm": 0.009419445879757404, |
| "learning_rate": 9.968885473947932e-05, |
| "loss": 0.001, |
| "step": 1020 |
| }, |
| { |
| "epoch": 3.30020202020202, |
| "grad_norm": 0.009973052889108658, |
| "learning_rate": 9.968633288747539e-05, |
| "loss": 0.0011, |
| "step": 1021 |
| }, |
| { |
| "epoch": 3.3034343434343434, |
| "grad_norm": 0.012304474599659443, |
| "learning_rate": 9.968380088895052e-05, |
| "loss": 0.0011, |
| "step": 1022 |
| }, |
| { |
| "epoch": 3.3066666666666666, |
| "grad_norm": 0.009277598932385445, |
| "learning_rate": 9.968125874442179e-05, |
| "loss": 0.0012, |
| "step": 1023 |
| }, |
| { |
| "epoch": 3.30989898989899, |
| "grad_norm": 0.007734949700534344, |
| "learning_rate": 9.96787064544083e-05, |
| "loss": 0.0007, |
| "step": 1024 |
| }, |
| { |
| "epoch": 3.313131313131313, |
| "grad_norm": 0.007729734294116497, |
| "learning_rate": 9.96761440194313e-05, |
| "loss": 0.0008, |
| "step": 1025 |
| }, |
| { |
| "epoch": 3.313131313131313, |
| "eval_loss": 0.0016529730055481195, |
| "eval_runtime": 16.4101, |
| "eval_samples_per_second": 6.094, |
| "eval_steps_per_second": 1.523, |
| "step": 1025 |
| }, |
| { |
| "epoch": 3.3163636363636364, |
| "grad_norm": 0.010236666537821293, |
| "learning_rate": 9.967357144001403e-05, |
| "loss": 0.0012, |
| "step": 1026 |
| }, |
| { |
| "epoch": 3.3195959595959597, |
| "grad_norm": 0.00807194970548153, |
| "learning_rate": 9.967098871668186e-05, |
| "loss": 0.0009, |
| "step": 1027 |
| }, |
| { |
| "epoch": 3.322828282828283, |
| "grad_norm": 0.005198963917791843, |
| "learning_rate": 9.966839584996222e-05, |
| "loss": 0.0007, |
| "step": 1028 |
| }, |
| { |
| "epoch": 3.326060606060606, |
| "grad_norm": 0.008235493674874306, |
| "learning_rate": 9.96657928403846e-05, |
| "loss": 0.0009, |
| "step": 1029 |
| }, |
| { |
| "epoch": 3.3292929292929294, |
| "grad_norm": 0.008224842138588428, |
| "learning_rate": 9.966317968848054e-05, |
| "loss": 0.0011, |
| "step": 1030 |
| }, |
| { |
| "epoch": 3.3325252525252527, |
| "grad_norm": 0.009314566850662231, |
| "learning_rate": 9.966055639478369e-05, |
| "loss": 0.001, |
| "step": 1031 |
| }, |
| { |
| "epoch": 3.335757575757576, |
| "grad_norm": 0.007725287228822708, |
| "learning_rate": 9.965792295982978e-05, |
| "loss": 0.001, |
| "step": 1032 |
| }, |
| { |
| "epoch": 3.338989898989899, |
| "grad_norm": 0.009058133698999882, |
| "learning_rate": 9.965527938415655e-05, |
| "loss": 0.0012, |
| "step": 1033 |
| }, |
| { |
| "epoch": 3.3422222222222224, |
| "grad_norm": 0.0107278972864151, |
| "learning_rate": 9.965262566830388e-05, |
| "loss": 0.0013, |
| "step": 1034 |
| }, |
| { |
| "epoch": 3.3454545454545457, |
| "grad_norm": 0.00889227632433176, |
| "learning_rate": 9.964996181281367e-05, |
| "loss": 0.001, |
| "step": 1035 |
| }, |
| { |
| "epoch": 3.348686868686869, |
| "grad_norm": 0.01083658542484045, |
| "learning_rate": 9.964728781822992e-05, |
| "loss": 0.001, |
| "step": 1036 |
| }, |
| { |
| "epoch": 3.3519191919191917, |
| "grad_norm": 0.009385057725012302, |
| "learning_rate": 9.964460368509867e-05, |
| "loss": 0.0008, |
| "step": 1037 |
| }, |
| { |
| "epoch": 3.355151515151515, |
| "grad_norm": 0.008289688266813755, |
| "learning_rate": 9.964190941396808e-05, |
| "loss": 0.0011, |
| "step": 1038 |
| }, |
| { |
| "epoch": 3.3583838383838382, |
| "grad_norm": 0.009917953051626682, |
| "learning_rate": 9.963920500538834e-05, |
| "loss": 0.0013, |
| "step": 1039 |
| }, |
| { |
| "epoch": 3.3616161616161615, |
| "grad_norm": 0.008948505856096745, |
| "learning_rate": 9.963649045991173e-05, |
| "loss": 0.0014, |
| "step": 1040 |
| }, |
| { |
| "epoch": 3.3648484848484848, |
| "grad_norm": 0.007556795608252287, |
| "learning_rate": 9.963376577809256e-05, |
| "loss": 0.0012, |
| "step": 1041 |
| }, |
| { |
| "epoch": 3.368080808080808, |
| "grad_norm": 0.008106544613838196, |
| "learning_rate": 9.963103096048728e-05, |
| "loss": 0.001, |
| "step": 1042 |
| }, |
| { |
| "epoch": 3.3713131313131313, |
| "grad_norm": 0.00776927825063467, |
| "learning_rate": 9.962828600765433e-05, |
| "loss": 0.0011, |
| "step": 1043 |
| }, |
| { |
| "epoch": 3.3745454545454545, |
| "grad_norm": 0.007516583893448114, |
| "learning_rate": 9.96255309201543e-05, |
| "loss": 0.0008, |
| "step": 1044 |
| }, |
| { |
| "epoch": 3.3777777777777778, |
| "grad_norm": 0.007022594567388296, |
| "learning_rate": 9.962276569854977e-05, |
| "loss": 0.0007, |
| "step": 1045 |
| }, |
| { |
| "epoch": 3.381010101010101, |
| "grad_norm": 0.006233508232980967, |
| "learning_rate": 9.96199903434055e-05, |
| "loss": 0.0008, |
| "step": 1046 |
| }, |
| { |
| "epoch": 3.3842424242424243, |
| "grad_norm": 0.009586937725543976, |
| "learning_rate": 9.961720485528819e-05, |
| "loss": 0.0012, |
| "step": 1047 |
| }, |
| { |
| "epoch": 3.3874747474747475, |
| "grad_norm": 0.013609235174953938, |
| "learning_rate": 9.961440923476666e-05, |
| "loss": 0.0014, |
| "step": 1048 |
| }, |
| { |
| "epoch": 3.390707070707071, |
| "grad_norm": 0.009207941591739655, |
| "learning_rate": 9.961160348241185e-05, |
| "loss": 0.0012, |
| "step": 1049 |
| }, |
| { |
| "epoch": 3.393939393939394, |
| "grad_norm": 0.01084598433226347, |
| "learning_rate": 9.96087875987967e-05, |
| "loss": 0.0013, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.393939393939394, |
| "eval_loss": 0.0015429488848894835, |
| "eval_runtime": 16.3827, |
| "eval_samples_per_second": 6.104, |
| "eval_steps_per_second": 1.526, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.3971717171717173, |
| "grad_norm": 0.006414481904357672, |
| "learning_rate": 9.960596158449627e-05, |
| "loss": 0.0007, |
| "step": 1051 |
| }, |
| { |
| "epoch": 3.4004040404040405, |
| "grad_norm": 0.006256110034883022, |
| "learning_rate": 9.960312544008763e-05, |
| "loss": 0.0008, |
| "step": 1052 |
| }, |
| { |
| "epoch": 3.403636363636364, |
| "grad_norm": 0.008881442248821259, |
| "learning_rate": 9.960027916614998e-05, |
| "loss": 0.0017, |
| "step": 1053 |
| }, |
| { |
| "epoch": 3.406868686868687, |
| "grad_norm": 0.006901131477206945, |
| "learning_rate": 9.959742276326456e-05, |
| "loss": 0.0007, |
| "step": 1054 |
| }, |
| { |
| "epoch": 3.41010101010101, |
| "grad_norm": 0.008546717464923859, |
| "learning_rate": 9.959455623201465e-05, |
| "loss": 0.001, |
| "step": 1055 |
| }, |
| { |
| "epoch": 3.413333333333333, |
| "grad_norm": 0.010368199087679386, |
| "learning_rate": 9.959167957298568e-05, |
| "loss": 0.0013, |
| "step": 1056 |
| }, |
| { |
| "epoch": 3.4165656565656564, |
| "grad_norm": 0.010289453901350498, |
| "learning_rate": 9.958879278676506e-05, |
| "loss": 0.0011, |
| "step": 1057 |
| }, |
| { |
| "epoch": 3.4197979797979796, |
| "grad_norm": 0.009570143185555935, |
| "learning_rate": 9.958589587394231e-05, |
| "loss": 0.001, |
| "step": 1058 |
| }, |
| { |
| "epoch": 3.423030303030303, |
| "grad_norm": 0.00869888998568058, |
| "learning_rate": 9.958298883510903e-05, |
| "loss": 0.0008, |
| "step": 1059 |
| }, |
| { |
| "epoch": 3.426262626262626, |
| "grad_norm": 0.007902804762125015, |
| "learning_rate": 9.958007167085886e-05, |
| "loss": 0.0012, |
| "step": 1060 |
| }, |
| { |
| "epoch": 3.4294949494949494, |
| "grad_norm": 0.010951451025903225, |
| "learning_rate": 9.95771443817875e-05, |
| "loss": 0.0008, |
| "step": 1061 |
| }, |
| { |
| "epoch": 3.4327272727272726, |
| "grad_norm": 0.008102809078991413, |
| "learning_rate": 9.957420696849275e-05, |
| "loss": 0.0009, |
| "step": 1062 |
| }, |
| { |
| "epoch": 3.435959595959596, |
| "grad_norm": 0.010727401822805405, |
| "learning_rate": 9.957125943157448e-05, |
| "loss": 0.0011, |
| "step": 1063 |
| }, |
| { |
| "epoch": 3.439191919191919, |
| "grad_norm": 0.007969755679368973, |
| "learning_rate": 9.956830177163461e-05, |
| "loss": 0.001, |
| "step": 1064 |
| }, |
| { |
| "epoch": 3.4424242424242424, |
| "grad_norm": 0.007324006408452988, |
| "learning_rate": 9.95653339892771e-05, |
| "loss": 0.0007, |
| "step": 1065 |
| }, |
| { |
| "epoch": 3.4456565656565656, |
| "grad_norm": 0.0077498964965343475, |
| "learning_rate": 9.956235608510802e-05, |
| "loss": 0.0009, |
| "step": 1066 |
| }, |
| { |
| "epoch": 3.448888888888889, |
| "grad_norm": 0.008977367542684078, |
| "learning_rate": 9.95593680597355e-05, |
| "loss": 0.0009, |
| "step": 1067 |
| }, |
| { |
| "epoch": 3.452121212121212, |
| "grad_norm": 0.009699931368231773, |
| "learning_rate": 9.955636991376971e-05, |
| "loss": 0.0014, |
| "step": 1068 |
| }, |
| { |
| "epoch": 3.4553535353535354, |
| "grad_norm": 0.00786769948899746, |
| "learning_rate": 9.955336164782292e-05, |
| "loss": 0.0008, |
| "step": 1069 |
| }, |
| { |
| "epoch": 3.4585858585858587, |
| "grad_norm": 0.00882691703736782, |
| "learning_rate": 9.955034326250946e-05, |
| "loss": 0.001, |
| "step": 1070 |
| }, |
| { |
| "epoch": 3.461818181818182, |
| "grad_norm": 0.009861079044640064, |
| "learning_rate": 9.954731475844571e-05, |
| "loss": 0.001, |
| "step": 1071 |
| }, |
| { |
| "epoch": 3.465050505050505, |
| "grad_norm": 0.007571252528578043, |
| "learning_rate": 9.954427613625013e-05, |
| "loss": 0.001, |
| "step": 1072 |
| }, |
| { |
| "epoch": 3.4682828282828284, |
| "grad_norm": 0.008296932093799114, |
| "learning_rate": 9.95412273965432e-05, |
| "loss": 0.0011, |
| "step": 1073 |
| }, |
| { |
| "epoch": 3.4715151515151517, |
| "grad_norm": 0.008980454877018929, |
| "learning_rate": 9.953816853994759e-05, |
| "loss": 0.0011, |
| "step": 1074 |
| }, |
| { |
| "epoch": 3.474747474747475, |
| "grad_norm": 0.010265195742249489, |
| "learning_rate": 9.953509956708789e-05, |
| "loss": 0.0015, |
| "step": 1075 |
| }, |
| { |
| "epoch": 3.474747474747475, |
| "eval_loss": 0.0014913254417479038, |
| "eval_runtime": 16.4259, |
| "eval_samples_per_second": 6.088, |
| "eval_steps_per_second": 1.522, |
| "step": 1075 |
| }, |
| { |
| "epoch": 3.477979797979798, |
| "grad_norm": 0.010150940157473087, |
| "learning_rate": 9.953202047859085e-05, |
| "loss": 0.0013, |
| "step": 1076 |
| }, |
| { |
| "epoch": 3.4812121212121214, |
| "grad_norm": 0.012889894656836987, |
| "learning_rate": 9.952893127508522e-05, |
| "loss": 0.0014, |
| "step": 1077 |
| }, |
| { |
| "epoch": 3.4844444444444447, |
| "grad_norm": 0.009226840920746326, |
| "learning_rate": 9.95258319572019e-05, |
| "loss": 0.001, |
| "step": 1078 |
| }, |
| { |
| "epoch": 3.4876767676767675, |
| "grad_norm": 0.007165413349866867, |
| "learning_rate": 9.952272252557378e-05, |
| "loss": 0.001, |
| "step": 1079 |
| }, |
| { |
| "epoch": 3.4909090909090907, |
| "grad_norm": 0.008806322701275349, |
| "learning_rate": 9.951960298083583e-05, |
| "loss": 0.001, |
| "step": 1080 |
| }, |
| { |
| "epoch": 3.494141414141414, |
| "grad_norm": 0.009387836791574955, |
| "learning_rate": 9.95164733236251e-05, |
| "loss": 0.0012, |
| "step": 1081 |
| }, |
| { |
| "epoch": 3.4973737373737372, |
| "grad_norm": 0.008622254244983196, |
| "learning_rate": 9.951333355458072e-05, |
| "loss": 0.001, |
| "step": 1082 |
| }, |
| { |
| "epoch": 3.5006060606060605, |
| "grad_norm": 0.011083496734499931, |
| "learning_rate": 9.951018367434386e-05, |
| "loss": 0.0014, |
| "step": 1083 |
| }, |
| { |
| "epoch": 3.5038383838383838, |
| "grad_norm": 0.00632804911583662, |
| "learning_rate": 9.950702368355775e-05, |
| "loss": 0.0007, |
| "step": 1084 |
| }, |
| { |
| "epoch": 3.507070707070707, |
| "grad_norm": 0.007108396850526333, |
| "learning_rate": 9.950385358286772e-05, |
| "loss": 0.0008, |
| "step": 1085 |
| }, |
| { |
| "epoch": 3.5103030303030303, |
| "grad_norm": 0.006121058017015457, |
| "learning_rate": 9.950067337292112e-05, |
| "loss": 0.0006, |
| "step": 1086 |
| }, |
| { |
| "epoch": 3.5135353535353535, |
| "grad_norm": 0.023006780073046684, |
| "learning_rate": 9.949748305436741e-05, |
| "loss": 0.0006, |
| "step": 1087 |
| }, |
| { |
| "epoch": 3.5167676767676768, |
| "grad_norm": 0.01592613197863102, |
| "learning_rate": 9.949428262785805e-05, |
| "loss": 0.0013, |
| "step": 1088 |
| }, |
| { |
| "epoch": 3.52, |
| "grad_norm": 0.012256249785423279, |
| "learning_rate": 9.949107209404665e-05, |
| "loss": 0.0013, |
| "step": 1089 |
| }, |
| { |
| "epoch": 3.5232323232323233, |
| "grad_norm": 0.007439706940203905, |
| "learning_rate": 9.948785145358879e-05, |
| "loss": 0.0007, |
| "step": 1090 |
| }, |
| { |
| "epoch": 3.5264646464646465, |
| "grad_norm": 0.014797743409872055, |
| "learning_rate": 9.948462070714219e-05, |
| "loss": 0.0012, |
| "step": 1091 |
| }, |
| { |
| "epoch": 3.5296969696969698, |
| "grad_norm": 0.010126681998372078, |
| "learning_rate": 9.948137985536662e-05, |
| "loss": 0.001, |
| "step": 1092 |
| }, |
| { |
| "epoch": 3.532929292929293, |
| "grad_norm": 0.007417519111186266, |
| "learning_rate": 9.947812889892387e-05, |
| "loss": 0.0008, |
| "step": 1093 |
| }, |
| { |
| "epoch": 3.5361616161616163, |
| "grad_norm": 0.007076946087181568, |
| "learning_rate": 9.947486783847784e-05, |
| "loss": 0.0009, |
| "step": 1094 |
| }, |
| { |
| "epoch": 3.5393939393939395, |
| "grad_norm": 0.00807663332670927, |
| "learning_rate": 9.947159667469446e-05, |
| "loss": 0.001, |
| "step": 1095 |
| }, |
| { |
| "epoch": 3.542626262626263, |
| "grad_norm": 0.006275820545852184, |
| "learning_rate": 9.946831540824175e-05, |
| "loss": 0.0008, |
| "step": 1096 |
| }, |
| { |
| "epoch": 3.5458585858585856, |
| "grad_norm": 0.006674634292721748, |
| "learning_rate": 9.94650240397898e-05, |
| "loss": 0.0007, |
| "step": 1097 |
| }, |
| { |
| "epoch": 3.549090909090909, |
| "grad_norm": 0.009223060682415962, |
| "learning_rate": 9.946172257001069e-05, |
| "loss": 0.0009, |
| "step": 1098 |
| }, |
| { |
| "epoch": 3.552323232323232, |
| "grad_norm": 0.006842616479843855, |
| "learning_rate": 9.945841099957869e-05, |
| "loss": 0.0009, |
| "step": 1099 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "grad_norm": 0.006222919095307589, |
| "learning_rate": 9.945508932917001e-05, |
| "loss": 0.0006, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "eval_loss": 0.0015258953208103776, |
| "eval_runtime": 16.3837, |
| "eval_samples_per_second": 6.104, |
| "eval_steps_per_second": 1.526, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.5587878787878786, |
| "grad_norm": 0.007528199348598719, |
| "learning_rate": 9.9451757559463e-05, |
| "loss": 0.0007, |
| "step": 1101 |
| }, |
| { |
| "epoch": 3.562020202020202, |
| "grad_norm": 0.009182704612612724, |
| "learning_rate": 9.944841569113803e-05, |
| "loss": 0.0011, |
| "step": 1102 |
| }, |
| { |
| "epoch": 3.565252525252525, |
| "grad_norm": 0.011688272468745708, |
| "learning_rate": 9.944506372487754e-05, |
| "loss": 0.0009, |
| "step": 1103 |
| }, |
| { |
| "epoch": 3.5684848484848484, |
| "grad_norm": 0.010002604685723782, |
| "learning_rate": 9.944170166136607e-05, |
| "loss": 0.0008, |
| "step": 1104 |
| }, |
| { |
| "epoch": 3.5717171717171716, |
| "grad_norm": 0.011699234135448933, |
| "learning_rate": 9.943832950129018e-05, |
| "loss": 0.0011, |
| "step": 1105 |
| }, |
| { |
| "epoch": 3.574949494949495, |
| "grad_norm": 0.013462217524647713, |
| "learning_rate": 9.943494724533848e-05, |
| "loss": 0.0015, |
| "step": 1106 |
| }, |
| { |
| "epoch": 3.578181818181818, |
| "grad_norm": 0.009158116765320301, |
| "learning_rate": 9.943155489420169e-05, |
| "loss": 0.0011, |
| "step": 1107 |
| }, |
| { |
| "epoch": 3.5814141414141414, |
| "grad_norm": 0.0077377245761454105, |
| "learning_rate": 9.942815244857256e-05, |
| "loss": 0.0008, |
| "step": 1108 |
| }, |
| { |
| "epoch": 3.5846464646464646, |
| "grad_norm": 0.014242468401789665, |
| "learning_rate": 9.942473990914593e-05, |
| "loss": 0.0011, |
| "step": 1109 |
| }, |
| { |
| "epoch": 3.587878787878788, |
| "grad_norm": 0.007221339736133814, |
| "learning_rate": 9.942131727661863e-05, |
| "loss": 0.0007, |
| "step": 1110 |
| }, |
| { |
| "epoch": 3.591111111111111, |
| "grad_norm": 0.00929784495383501, |
| "learning_rate": 9.941788455168965e-05, |
| "loss": 0.001, |
| "step": 1111 |
| }, |
| { |
| "epoch": 3.5943434343434344, |
| "grad_norm": 0.010798311792314053, |
| "learning_rate": 9.941444173505997e-05, |
| "loss": 0.0014, |
| "step": 1112 |
| }, |
| { |
| "epoch": 3.5975757575757576, |
| "grad_norm": 0.011390223167836666, |
| "learning_rate": 9.941098882743267e-05, |
| "loss": 0.0011, |
| "step": 1113 |
| }, |
| { |
| "epoch": 3.600808080808081, |
| "grad_norm": 0.010920228436589241, |
| "learning_rate": 9.940752582951283e-05, |
| "loss": 0.0014, |
| "step": 1114 |
| }, |
| { |
| "epoch": 3.604040404040404, |
| "grad_norm": 0.011515631340444088, |
| "learning_rate": 9.940405274200769e-05, |
| "loss": 0.0013, |
| "step": 1115 |
| }, |
| { |
| "epoch": 3.6072727272727274, |
| "grad_norm": 0.0058773891068995, |
| "learning_rate": 9.940056956562645e-05, |
| "loss": 0.0007, |
| "step": 1116 |
| }, |
| { |
| "epoch": 3.6105050505050507, |
| "grad_norm": 0.010255025699734688, |
| "learning_rate": 9.939707630108044e-05, |
| "loss": 0.0011, |
| "step": 1117 |
| }, |
| { |
| "epoch": 3.613737373737374, |
| "grad_norm": 0.008029567077755928, |
| "learning_rate": 9.939357294908301e-05, |
| "loss": 0.0011, |
| "step": 1118 |
| }, |
| { |
| "epoch": 3.616969696969697, |
| "grad_norm": 0.00854676403105259, |
| "learning_rate": 9.939005951034959e-05, |
| "loss": 0.001, |
| "step": 1119 |
| }, |
| { |
| "epoch": 3.6202020202020204, |
| "grad_norm": 0.00881232414394617, |
| "learning_rate": 9.938653598559769e-05, |
| "loss": 0.0013, |
| "step": 1120 |
| }, |
| { |
| "epoch": 3.6234343434343437, |
| "grad_norm": 0.009200935252010822, |
| "learning_rate": 9.93830023755468e-05, |
| "loss": 0.0017, |
| "step": 1121 |
| }, |
| { |
| "epoch": 3.626666666666667, |
| "grad_norm": 0.006003112066537142, |
| "learning_rate": 9.937945868091856e-05, |
| "loss": 0.0007, |
| "step": 1122 |
| }, |
| { |
| "epoch": 3.6298989898989897, |
| "grad_norm": 0.008456099778413773, |
| "learning_rate": 9.937590490243665e-05, |
| "loss": 0.0008, |
| "step": 1123 |
| }, |
| { |
| "epoch": 3.633131313131313, |
| "grad_norm": 0.00595937529578805, |
| "learning_rate": 9.937234104082676e-05, |
| "loss": 0.0006, |
| "step": 1124 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 0.008380000479519367, |
| "learning_rate": 9.936876709681668e-05, |
| "loss": 0.0011, |
| "step": 1125 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "eval_loss": 0.0013611949980258942, |
| "eval_runtime": 16.4095, |
| "eval_samples_per_second": 6.094, |
| "eval_steps_per_second": 1.524, |
| "step": 1125 |
| }, |
| { |
| "epoch": 3.6395959595959595, |
| "grad_norm": 0.01206933706998825, |
| "learning_rate": 9.936518307113625e-05, |
| "loss": 0.0011, |
| "step": 1126 |
| }, |
| { |
| "epoch": 3.6428282828282827, |
| "grad_norm": 0.008933588862419128, |
| "learning_rate": 9.936158896451737e-05, |
| "loss": 0.0011, |
| "step": 1127 |
| }, |
| { |
| "epoch": 3.646060606060606, |
| "grad_norm": 0.01173634547740221, |
| "learning_rate": 9.9357984777694e-05, |
| "loss": 0.0014, |
| "step": 1128 |
| }, |
| { |
| "epoch": 3.6492929292929293, |
| "grad_norm": 0.010516511276364326, |
| "learning_rate": 9.935437051140216e-05, |
| "loss": 0.001, |
| "step": 1129 |
| }, |
| { |
| "epoch": 3.6525252525252525, |
| "grad_norm": 0.011558487080037594, |
| "learning_rate": 9.935074616637992e-05, |
| "loss": 0.001, |
| "step": 1130 |
| }, |
| { |
| "epoch": 3.6557575757575758, |
| "grad_norm": 0.011997685767710209, |
| "learning_rate": 9.934711174336742e-05, |
| "loss": 0.0016, |
| "step": 1131 |
| }, |
| { |
| "epoch": 3.658989898989899, |
| "grad_norm": 0.008594452403485775, |
| "learning_rate": 9.934346724310684e-05, |
| "loss": 0.0008, |
| "step": 1132 |
| }, |
| { |
| "epoch": 3.6622222222222223, |
| "grad_norm": 0.009548204019665718, |
| "learning_rate": 9.933981266634243e-05, |
| "loss": 0.001, |
| "step": 1133 |
| }, |
| { |
| "epoch": 3.6654545454545455, |
| "grad_norm": 0.006191310007125139, |
| "learning_rate": 9.93361480138205e-05, |
| "loss": 0.0008, |
| "step": 1134 |
| }, |
| { |
| "epoch": 3.6686868686868688, |
| "grad_norm": 0.007743754889816046, |
| "learning_rate": 9.933247328628944e-05, |
| "loss": 0.0011, |
| "step": 1135 |
| }, |
| { |
| "epoch": 3.671919191919192, |
| "grad_norm": 0.00836245995014906, |
| "learning_rate": 9.93287884844996e-05, |
| "loss": 0.0008, |
| "step": 1136 |
| }, |
| { |
| "epoch": 3.6751515151515153, |
| "grad_norm": 0.009158983826637268, |
| "learning_rate": 9.932509360920353e-05, |
| "loss": 0.0014, |
| "step": 1137 |
| }, |
| { |
| "epoch": 3.6783838383838385, |
| "grad_norm": 0.006849655881524086, |
| "learning_rate": 9.932138866115574e-05, |
| "loss": 0.0006, |
| "step": 1138 |
| }, |
| { |
| "epoch": 3.6816161616161613, |
| "grad_norm": 0.005113608669489622, |
| "learning_rate": 9.931767364111283e-05, |
| "loss": 0.0007, |
| "step": 1139 |
| }, |
| { |
| "epoch": 3.6848484848484846, |
| "grad_norm": 0.007342294789850712, |
| "learning_rate": 9.931394854983345e-05, |
| "loss": 0.0008, |
| "step": 1140 |
| }, |
| { |
| "epoch": 3.688080808080808, |
| "grad_norm": 0.007378540001809597, |
| "learning_rate": 9.931021338807828e-05, |
| "loss": 0.0008, |
| "step": 1141 |
| }, |
| { |
| "epoch": 3.691313131313131, |
| "grad_norm": 0.00792661588639021, |
| "learning_rate": 9.93064681566101e-05, |
| "loss": 0.0008, |
| "step": 1142 |
| }, |
| { |
| "epoch": 3.6945454545454544, |
| "grad_norm": 0.011528728529810905, |
| "learning_rate": 9.930271285619376e-05, |
| "loss": 0.0013, |
| "step": 1143 |
| }, |
| { |
| "epoch": 3.6977777777777776, |
| "grad_norm": 0.00503445602953434, |
| "learning_rate": 9.92989474875961e-05, |
| "loss": 0.0008, |
| "step": 1144 |
| }, |
| { |
| "epoch": 3.701010101010101, |
| "grad_norm": 0.006888690870255232, |
| "learning_rate": 9.929517205158605e-05, |
| "loss": 0.0007, |
| "step": 1145 |
| }, |
| { |
| "epoch": 3.704242424242424, |
| "grad_norm": 0.012218410149216652, |
| "learning_rate": 9.929138654893462e-05, |
| "loss": 0.0015, |
| "step": 1146 |
| }, |
| { |
| "epoch": 3.7074747474747474, |
| "grad_norm": 0.007437384687364101, |
| "learning_rate": 9.928759098041482e-05, |
| "loss": 0.0005, |
| "step": 1147 |
| }, |
| { |
| "epoch": 3.7107070707070706, |
| "grad_norm": 0.007831266149878502, |
| "learning_rate": 9.928378534680178e-05, |
| "loss": 0.0008, |
| "step": 1148 |
| }, |
| { |
| "epoch": 3.713939393939394, |
| "grad_norm": 0.007762413937598467, |
| "learning_rate": 9.927996964887265e-05, |
| "loss": 0.001, |
| "step": 1149 |
| }, |
| { |
| "epoch": 3.717171717171717, |
| "grad_norm": 0.011595054529607296, |
| "learning_rate": 9.927614388740663e-05, |
| "loss": 0.0014, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.717171717171717, |
| "eval_loss": 0.0014385798713192344, |
| "eval_runtime": 16.3516, |
| "eval_samples_per_second": 6.116, |
| "eval_steps_per_second": 1.529, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.7204040404040404, |
| "grad_norm": 0.010485325008630753, |
| "learning_rate": 9.9272308063185e-05, |
| "loss": 0.0013, |
| "step": 1151 |
| }, |
| { |
| "epoch": 3.7236363636363636, |
| "grad_norm": 0.011887076310813427, |
| "learning_rate": 9.926846217699104e-05, |
| "loss": 0.0013, |
| "step": 1152 |
| }, |
| { |
| "epoch": 3.726868686868687, |
| "grad_norm": 0.009834819473326206, |
| "learning_rate": 9.926460622961016e-05, |
| "loss": 0.001, |
| "step": 1153 |
| }, |
| { |
| "epoch": 3.73010101010101, |
| "grad_norm": 0.006702498998492956, |
| "learning_rate": 9.926074022182979e-05, |
| "loss": 0.0004, |
| "step": 1154 |
| }, |
| { |
| "epoch": 3.7333333333333334, |
| "grad_norm": 0.011306805536150932, |
| "learning_rate": 9.92568641544394e-05, |
| "loss": 0.0011, |
| "step": 1155 |
| }, |
| { |
| "epoch": 3.7365656565656566, |
| "grad_norm": 0.010399502702057362, |
| "learning_rate": 9.925297802823054e-05, |
| "loss": 0.0011, |
| "step": 1156 |
| }, |
| { |
| "epoch": 3.73979797979798, |
| "grad_norm": 0.010177621617913246, |
| "learning_rate": 9.924908184399677e-05, |
| "loss": 0.001, |
| "step": 1157 |
| }, |
| { |
| "epoch": 3.743030303030303, |
| "grad_norm": 0.00788433663547039, |
| "learning_rate": 9.924517560253378e-05, |
| "loss": 0.001, |
| "step": 1158 |
| }, |
| { |
| "epoch": 3.7462626262626264, |
| "grad_norm": 0.005668324884027243, |
| "learning_rate": 9.924125930463924e-05, |
| "loss": 0.0004, |
| "step": 1159 |
| }, |
| { |
| "epoch": 3.7494949494949497, |
| "grad_norm": 0.011663157492876053, |
| "learning_rate": 9.92373329511129e-05, |
| "loss": 0.0016, |
| "step": 1160 |
| }, |
| { |
| "epoch": 3.752727272727273, |
| "grad_norm": 0.007720078341662884, |
| "learning_rate": 9.92333965427566e-05, |
| "loss": 0.001, |
| "step": 1161 |
| }, |
| { |
| "epoch": 3.755959595959596, |
| "grad_norm": 0.011692766100168228, |
| "learning_rate": 9.922945008037417e-05, |
| "loss": 0.0017, |
| "step": 1162 |
| }, |
| { |
| "epoch": 3.7591919191919194, |
| "grad_norm": 0.007470360025763512, |
| "learning_rate": 9.922549356477152e-05, |
| "loss": 0.0008, |
| "step": 1163 |
| }, |
| { |
| "epoch": 3.7624242424242427, |
| "grad_norm": 0.007125111296772957, |
| "learning_rate": 9.922152699675664e-05, |
| "loss": 0.001, |
| "step": 1164 |
| }, |
| { |
| "epoch": 3.765656565656566, |
| "grad_norm": 0.009858732111752033, |
| "learning_rate": 9.921755037713952e-05, |
| "loss": 0.0009, |
| "step": 1165 |
| }, |
| { |
| "epoch": 3.7688888888888887, |
| "grad_norm": 0.005097588524222374, |
| "learning_rate": 9.921356370673225e-05, |
| "loss": 0.0006, |
| "step": 1166 |
| }, |
| { |
| "epoch": 3.772121212121212, |
| "grad_norm": 0.005794189404696226, |
| "learning_rate": 9.920956698634896e-05, |
| "loss": 0.0007, |
| "step": 1167 |
| }, |
| { |
| "epoch": 3.7753535353535352, |
| "grad_norm": 0.00921961385756731, |
| "learning_rate": 9.92055602168058e-05, |
| "loss": 0.0009, |
| "step": 1168 |
| }, |
| { |
| "epoch": 3.7785858585858585, |
| "grad_norm": 0.010736838914453983, |
| "learning_rate": 9.920154339892104e-05, |
| "loss": 0.0012, |
| "step": 1169 |
| }, |
| { |
| "epoch": 3.7818181818181817, |
| "grad_norm": 0.01014452613890171, |
| "learning_rate": 9.919751653351493e-05, |
| "loss": 0.0011, |
| "step": 1170 |
| }, |
| { |
| "epoch": 3.785050505050505, |
| "grad_norm": 0.011449403129518032, |
| "learning_rate": 9.919347962140979e-05, |
| "loss": 0.001, |
| "step": 1171 |
| }, |
| { |
| "epoch": 3.7882828282828283, |
| "grad_norm": 0.009003501385450363, |
| "learning_rate": 9.918943266343004e-05, |
| "loss": 0.0009, |
| "step": 1172 |
| }, |
| { |
| "epoch": 3.7915151515151515, |
| "grad_norm": 0.007624128367751837, |
| "learning_rate": 9.91853756604021e-05, |
| "loss": 0.0011, |
| "step": 1173 |
| }, |
| { |
| "epoch": 3.7947474747474748, |
| "grad_norm": 0.008245960809290409, |
| "learning_rate": 9.918130861315444e-05, |
| "loss": 0.0012, |
| "step": 1174 |
| }, |
| { |
| "epoch": 3.797979797979798, |
| "grad_norm": 0.008305901661515236, |
| "learning_rate": 9.91772315225176e-05, |
| "loss": 0.0011, |
| "step": 1175 |
| }, |
| { |
| "epoch": 3.797979797979798, |
| "eval_loss": 0.0014257283182814717, |
| "eval_runtime": 16.3483, |
| "eval_samples_per_second": 6.117, |
| "eval_steps_per_second": 1.529, |
| "step": 1175 |
| }, |
| { |
| "epoch": 3.8012121212121213, |
| "grad_norm": 0.0067170909605920315, |
| "learning_rate": 9.917314438932421e-05, |
| "loss": 0.0009, |
| "step": 1176 |
| }, |
| { |
| "epoch": 3.8044444444444445, |
| "grad_norm": 0.009056408889591694, |
| "learning_rate": 9.916904721440887e-05, |
| "loss": 0.0009, |
| "step": 1177 |
| }, |
| { |
| "epoch": 3.8076767676767678, |
| "grad_norm": 0.007501684594899416, |
| "learning_rate": 9.916493999860828e-05, |
| "loss": 0.001, |
| "step": 1178 |
| }, |
| { |
| "epoch": 3.810909090909091, |
| "grad_norm": 0.007683174684643745, |
| "learning_rate": 9.916082274276117e-05, |
| "loss": 0.0006, |
| "step": 1179 |
| }, |
| { |
| "epoch": 3.8141414141414143, |
| "grad_norm": 0.009569701738655567, |
| "learning_rate": 9.915669544770836e-05, |
| "loss": 0.0011, |
| "step": 1180 |
| }, |
| { |
| "epoch": 3.8173737373737375, |
| "grad_norm": 0.006904344540089369, |
| "learning_rate": 9.915255811429267e-05, |
| "loss": 0.0007, |
| "step": 1181 |
| }, |
| { |
| "epoch": 3.8206060606060603, |
| "grad_norm": 0.009351781569421291, |
| "learning_rate": 9.914841074335898e-05, |
| "loss": 0.0011, |
| "step": 1182 |
| }, |
| { |
| "epoch": 3.8238383838383836, |
| "grad_norm": 0.010896628722548485, |
| "learning_rate": 9.914425333575426e-05, |
| "loss": 0.001, |
| "step": 1183 |
| }, |
| { |
| "epoch": 3.827070707070707, |
| "grad_norm": 0.010172506794333458, |
| "learning_rate": 9.914008589232749e-05, |
| "loss": 0.0013, |
| "step": 1184 |
| }, |
| { |
| "epoch": 3.83030303030303, |
| "grad_norm": 0.009605888277292252, |
| "learning_rate": 9.91359084139297e-05, |
| "loss": 0.0008, |
| "step": 1185 |
| }, |
| { |
| "epoch": 3.8335353535353534, |
| "grad_norm": 0.006624049041420221, |
| "learning_rate": 9.913172090141399e-05, |
| "loss": 0.0006, |
| "step": 1186 |
| }, |
| { |
| "epoch": 3.8367676767676766, |
| "grad_norm": 0.008812621235847473, |
| "learning_rate": 9.912752335563548e-05, |
| "loss": 0.0009, |
| "step": 1187 |
| }, |
| { |
| "epoch": 3.84, |
| "grad_norm": 0.013925976119935513, |
| "learning_rate": 9.912331577745138e-05, |
| "loss": 0.0011, |
| "step": 1188 |
| }, |
| { |
| "epoch": 3.843232323232323, |
| "grad_norm": 0.007836563512682915, |
| "learning_rate": 9.911909816772091e-05, |
| "loss": 0.0009, |
| "step": 1189 |
| }, |
| { |
| "epoch": 3.8464646464646464, |
| "grad_norm": 0.007102631032466888, |
| "learning_rate": 9.911487052730537e-05, |
| "loss": 0.0009, |
| "step": 1190 |
| }, |
| { |
| "epoch": 3.8496969696969696, |
| "grad_norm": 0.011060229502618313, |
| "learning_rate": 9.911063285706808e-05, |
| "loss": 0.001, |
| "step": 1191 |
| }, |
| { |
| "epoch": 3.852929292929293, |
| "grad_norm": 0.006159051787108183, |
| "learning_rate": 9.910638515787442e-05, |
| "loss": 0.0006, |
| "step": 1192 |
| }, |
| { |
| "epoch": 3.856161616161616, |
| "grad_norm": 0.005718485452234745, |
| "learning_rate": 9.910212743059182e-05, |
| "loss": 0.0006, |
| "step": 1193 |
| }, |
| { |
| "epoch": 3.8593939393939394, |
| "grad_norm": 0.007066367659717798, |
| "learning_rate": 9.909785967608977e-05, |
| "loss": 0.0008, |
| "step": 1194 |
| }, |
| { |
| "epoch": 3.8626262626262626, |
| "grad_norm": 0.009436920285224915, |
| "learning_rate": 9.909358189523978e-05, |
| "loss": 0.0011, |
| "step": 1195 |
| }, |
| { |
| "epoch": 3.865858585858586, |
| "grad_norm": 0.011869143694639206, |
| "learning_rate": 9.908929408891542e-05, |
| "loss": 0.0017, |
| "step": 1196 |
| }, |
| { |
| "epoch": 3.869090909090909, |
| "grad_norm": 0.008457844145596027, |
| "learning_rate": 9.908499625799235e-05, |
| "loss": 0.0009, |
| "step": 1197 |
| }, |
| { |
| "epoch": 3.8723232323232324, |
| "grad_norm": 0.008997390978038311, |
| "learning_rate": 9.908068840334818e-05, |
| "loss": 0.0011, |
| "step": 1198 |
| }, |
| { |
| "epoch": 3.8755555555555556, |
| "grad_norm": 0.008231529034674168, |
| "learning_rate": 9.907637052586265e-05, |
| "loss": 0.0008, |
| "step": 1199 |
| }, |
| { |
| "epoch": 3.878787878787879, |
| "grad_norm": 0.010332757607102394, |
| "learning_rate": 9.907204262641751e-05, |
| "loss": 0.0009, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.878787878787879, |
| "eval_loss": 0.0013901249039918184, |
| "eval_runtime": 16.4073, |
| "eval_samples_per_second": 6.095, |
| "eval_steps_per_second": 1.524, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.882020202020202, |
| "grad_norm": 0.008306692354381084, |
| "learning_rate": 9.906770470589657e-05, |
| "loss": 0.0009, |
| "step": 1201 |
| }, |
| { |
| "epoch": 3.8852525252525254, |
| "grad_norm": 0.007444894872605801, |
| "learning_rate": 9.90633567651857e-05, |
| "loss": 0.0009, |
| "step": 1202 |
| }, |
| { |
| "epoch": 3.8884848484848487, |
| "grad_norm": 0.009011060930788517, |
| "learning_rate": 9.905899880517278e-05, |
| "loss": 0.0011, |
| "step": 1203 |
| }, |
| { |
| "epoch": 3.891717171717172, |
| "grad_norm": 0.007725950796157122, |
| "learning_rate": 9.905463082674778e-05, |
| "loss": 0.0009, |
| "step": 1204 |
| }, |
| { |
| "epoch": 3.894949494949495, |
| "grad_norm": 0.008126996457576752, |
| "learning_rate": 9.905025283080265e-05, |
| "loss": 0.001, |
| "step": 1205 |
| }, |
| { |
| "epoch": 3.8981818181818184, |
| "grad_norm": 0.010366525501012802, |
| "learning_rate": 9.904586481823146e-05, |
| "loss": 0.0011, |
| "step": 1206 |
| }, |
| { |
| "epoch": 3.9014141414141417, |
| "grad_norm": 0.003636955050751567, |
| "learning_rate": 9.904146678993027e-05, |
| "loss": 0.0005, |
| "step": 1207 |
| }, |
| { |
| "epoch": 3.904646464646465, |
| "grad_norm": 0.0062943738885223866, |
| "learning_rate": 9.903705874679724e-05, |
| "loss": 0.0006, |
| "step": 1208 |
| }, |
| { |
| "epoch": 3.9078787878787877, |
| "grad_norm": 0.007669993210583925, |
| "learning_rate": 9.903264068973252e-05, |
| "loss": 0.0008, |
| "step": 1209 |
| }, |
| { |
| "epoch": 3.911111111111111, |
| "grad_norm": 0.010288223624229431, |
| "learning_rate": 9.902821261963833e-05, |
| "loss": 0.0008, |
| "step": 1210 |
| }, |
| { |
| "epoch": 3.9143434343434342, |
| "grad_norm": 0.007312690373510122, |
| "learning_rate": 9.902377453741893e-05, |
| "loss": 0.0008, |
| "step": 1211 |
| }, |
| { |
| "epoch": 3.9175757575757575, |
| "grad_norm": 0.007710176985710859, |
| "learning_rate": 9.901932644398065e-05, |
| "loss": 0.0009, |
| "step": 1212 |
| }, |
| { |
| "epoch": 3.9208080808080807, |
| "grad_norm": 0.010064015164971352, |
| "learning_rate": 9.901486834023182e-05, |
| "loss": 0.001, |
| "step": 1213 |
| }, |
| { |
| "epoch": 3.924040404040404, |
| "grad_norm": 0.006745761260390282, |
| "learning_rate": 9.901040022708283e-05, |
| "loss": 0.0007, |
| "step": 1214 |
| }, |
| { |
| "epoch": 3.9272727272727272, |
| "grad_norm": 0.00955828558653593, |
| "learning_rate": 9.900592210544614e-05, |
| "loss": 0.0011, |
| "step": 1215 |
| }, |
| { |
| "epoch": 3.9305050505050505, |
| "grad_norm": 0.01317381951957941, |
| "learning_rate": 9.900143397623622e-05, |
| "loss": 0.0013, |
| "step": 1216 |
| }, |
| { |
| "epoch": 3.9337373737373738, |
| "grad_norm": 0.00733515340834856, |
| "learning_rate": 9.899693584036959e-05, |
| "loss": 0.0007, |
| "step": 1217 |
| }, |
| { |
| "epoch": 3.936969696969697, |
| "grad_norm": 0.008123023435473442, |
| "learning_rate": 9.899242769876486e-05, |
| "loss": 0.0008, |
| "step": 1218 |
| }, |
| { |
| "epoch": 3.9402020202020203, |
| "grad_norm": 0.009008335880935192, |
| "learning_rate": 9.89879095523426e-05, |
| "loss": 0.0009, |
| "step": 1219 |
| }, |
| { |
| "epoch": 3.9434343434343435, |
| "grad_norm": 0.005812053102999926, |
| "learning_rate": 9.898338140202549e-05, |
| "loss": 0.0008, |
| "step": 1220 |
| }, |
| { |
| "epoch": 3.9466666666666668, |
| "grad_norm": 0.00885814055800438, |
| "learning_rate": 9.897884324873824e-05, |
| "loss": 0.0009, |
| "step": 1221 |
| }, |
| { |
| "epoch": 3.94989898989899, |
| "grad_norm": 0.00969020463526249, |
| "learning_rate": 9.897429509340756e-05, |
| "loss": 0.0008, |
| "step": 1222 |
| }, |
| { |
| "epoch": 3.9531313131313133, |
| "grad_norm": 0.006836381740868092, |
| "learning_rate": 9.896973693696228e-05, |
| "loss": 0.0007, |
| "step": 1223 |
| }, |
| { |
| "epoch": 3.9563636363636365, |
| "grad_norm": 0.007620660122483969, |
| "learning_rate": 9.896516878033317e-05, |
| "loss": 0.0005, |
| "step": 1224 |
| }, |
| { |
| "epoch": 3.9595959595959593, |
| "grad_norm": 0.007473440375179052, |
| "learning_rate": 9.896059062445315e-05, |
| "loss": 0.0005, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.9595959595959593, |
| "eval_loss": 0.0014186676125973463, |
| "eval_runtime": 16.3439, |
| "eval_samples_per_second": 6.118, |
| "eval_steps_per_second": 1.53, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.9628282828282826, |
| "grad_norm": 0.009552820585668087, |
| "learning_rate": 9.895600247025712e-05, |
| "loss": 0.0008, |
| "step": 1226 |
| }, |
| { |
| "epoch": 3.966060606060606, |
| "grad_norm": 0.013574999757111073, |
| "learning_rate": 9.895140431868203e-05, |
| "loss": 0.0007, |
| "step": 1227 |
| }, |
| { |
| "epoch": 3.969292929292929, |
| "grad_norm": 0.00827439408749342, |
| "learning_rate": 9.894679617066687e-05, |
| "loss": 0.0011, |
| "step": 1228 |
| }, |
| { |
| "epoch": 3.9725252525252523, |
| "grad_norm": 0.010331332683563232, |
| "learning_rate": 9.894217802715266e-05, |
| "loss": 0.0011, |
| "step": 1229 |
| }, |
| { |
| "epoch": 3.9757575757575756, |
| "grad_norm": 0.011210299097001553, |
| "learning_rate": 9.893754988908253e-05, |
| "loss": 0.0013, |
| "step": 1230 |
| }, |
| { |
| "epoch": 3.978989898989899, |
| "grad_norm": 0.006434638053178787, |
| "learning_rate": 9.893291175740156e-05, |
| "loss": 0.0007, |
| "step": 1231 |
| }, |
| { |
| "epoch": 3.982222222222222, |
| "grad_norm": 0.00667984364554286, |
| "learning_rate": 9.892826363305691e-05, |
| "loss": 0.0007, |
| "step": 1232 |
| }, |
| { |
| "epoch": 3.9854545454545454, |
| "grad_norm": 0.007211148273199797, |
| "learning_rate": 9.892360551699779e-05, |
| "loss": 0.0008, |
| "step": 1233 |
| }, |
| { |
| "epoch": 3.9886868686868686, |
| "grad_norm": 0.004201942123472691, |
| "learning_rate": 9.891893741017546e-05, |
| "loss": 0.0006, |
| "step": 1234 |
| }, |
| { |
| "epoch": 3.991919191919192, |
| "grad_norm": 0.007570468354970217, |
| "learning_rate": 9.891425931354317e-05, |
| "loss": 0.001, |
| "step": 1235 |
| }, |
| { |
| "epoch": 3.995151515151515, |
| "grad_norm": 0.005565334111452103, |
| "learning_rate": 9.890957122805624e-05, |
| "loss": 0.0007, |
| "step": 1236 |
| }, |
| { |
| "epoch": 3.9983838383838384, |
| "grad_norm": 0.008423715829849243, |
| "learning_rate": 9.890487315467205e-05, |
| "loss": 0.0012, |
| "step": 1237 |
| }, |
| { |
| "epoch": 4.001616161616162, |
| "grad_norm": 0.01644114963710308, |
| "learning_rate": 9.890016509434998e-05, |
| "loss": 0.0013, |
| "step": 1238 |
| }, |
| { |
| "epoch": 4.004848484848485, |
| "grad_norm": 0.01001213863492012, |
| "learning_rate": 9.889544704805151e-05, |
| "loss": 0.0007, |
| "step": 1239 |
| }, |
| { |
| "epoch": 4.008080808080808, |
| "grad_norm": 0.004935890436172485, |
| "learning_rate": 9.889071901674008e-05, |
| "loss": 0.0004, |
| "step": 1240 |
| }, |
| { |
| "epoch": 4.011313131313131, |
| "grad_norm": 0.005576086696237326, |
| "learning_rate": 9.888598100138123e-05, |
| "loss": 0.0005, |
| "step": 1241 |
| }, |
| { |
| "epoch": 4.014545454545455, |
| "grad_norm": 0.005199114326387644, |
| "learning_rate": 9.888123300294249e-05, |
| "loss": 0.0005, |
| "step": 1242 |
| }, |
| { |
| "epoch": 4.017777777777778, |
| "grad_norm": 0.006733989343047142, |
| "learning_rate": 9.887647502239348e-05, |
| "loss": 0.0006, |
| "step": 1243 |
| }, |
| { |
| "epoch": 4.021010101010101, |
| "grad_norm": 0.004577841609716415, |
| "learning_rate": 9.887170706070584e-05, |
| "loss": 0.0005, |
| "step": 1244 |
| }, |
| { |
| "epoch": 4.024242424242424, |
| "grad_norm": 0.004053604323416948, |
| "learning_rate": 9.886692911885322e-05, |
| "loss": 0.0004, |
| "step": 1245 |
| }, |
| { |
| "epoch": 4.027474747474748, |
| "grad_norm": 0.008622344583272934, |
| "learning_rate": 9.886214119781137e-05, |
| "loss": 0.0009, |
| "step": 1246 |
| }, |
| { |
| "epoch": 4.030707070707071, |
| "grad_norm": 0.01063740998506546, |
| "learning_rate": 9.885734329855798e-05, |
| "loss": 0.0007, |
| "step": 1247 |
| }, |
| { |
| "epoch": 4.033939393939394, |
| "grad_norm": 0.008718481287360191, |
| "learning_rate": 9.885253542207288e-05, |
| "loss": 0.0008, |
| "step": 1248 |
| }, |
| { |
| "epoch": 4.037171717171717, |
| "grad_norm": 0.012205623090267181, |
| "learning_rate": 9.884771756933788e-05, |
| "loss": 0.001, |
| "step": 1249 |
| }, |
| { |
| "epoch": 4.040404040404041, |
| "grad_norm": 0.007628642953932285, |
| "learning_rate": 9.884288974133684e-05, |
| "loss": 0.0005, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.040404040404041, |
| "eval_loss": 0.001357471919618547, |
| "eval_runtime": 16.3579, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.043636363636364, |
| "grad_norm": 0.00829236675053835, |
| "learning_rate": 9.883805193905567e-05, |
| "loss": 0.0005, |
| "step": 1251 |
| }, |
| { |
| "epoch": 4.046868686868687, |
| "grad_norm": 0.007074315100908279, |
| "learning_rate": 9.883320416348227e-05, |
| "loss": 0.0007, |
| "step": 1252 |
| }, |
| { |
| "epoch": 4.05010101010101, |
| "grad_norm": 0.005583590362221003, |
| "learning_rate": 9.882834641560666e-05, |
| "loss": 0.0004, |
| "step": 1253 |
| }, |
| { |
| "epoch": 4.053333333333334, |
| "grad_norm": 0.005131459794938564, |
| "learning_rate": 9.882347869642081e-05, |
| "loss": 0.0006, |
| "step": 1254 |
| }, |
| { |
| "epoch": 4.056565656565657, |
| "grad_norm": 0.005253889597952366, |
| "learning_rate": 9.881860100691878e-05, |
| "loss": 0.0004, |
| "step": 1255 |
| }, |
| { |
| "epoch": 4.05979797979798, |
| "grad_norm": 0.01153517421334982, |
| "learning_rate": 9.881371334809666e-05, |
| "loss": 0.0011, |
| "step": 1256 |
| }, |
| { |
| "epoch": 4.063030303030303, |
| "grad_norm": 0.005751041695475578, |
| "learning_rate": 9.880881572095256e-05, |
| "loss": 0.0006, |
| "step": 1257 |
| }, |
| { |
| "epoch": 4.066262626262627, |
| "grad_norm": 0.006720800418406725, |
| "learning_rate": 9.880390812648662e-05, |
| "loss": 0.0006, |
| "step": 1258 |
| }, |
| { |
| "epoch": 4.069494949494949, |
| "grad_norm": 0.007284671068191528, |
| "learning_rate": 9.879899056570101e-05, |
| "loss": 0.0008, |
| "step": 1259 |
| }, |
| { |
| "epoch": 4.072727272727272, |
| "grad_norm": 0.010002688504755497, |
| "learning_rate": 9.879406303959999e-05, |
| "loss": 0.0007, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.075959595959596, |
| "grad_norm": 0.011124187149107456, |
| "learning_rate": 9.878912554918982e-05, |
| "loss": 0.0006, |
| "step": 1261 |
| }, |
| { |
| "epoch": 4.079191919191919, |
| "grad_norm": 0.004590832628309727, |
| "learning_rate": 9.878417809547878e-05, |
| "loss": 0.0004, |
| "step": 1262 |
| }, |
| { |
| "epoch": 4.082424242424242, |
| "grad_norm": 0.011564413085579872, |
| "learning_rate": 9.877922067947717e-05, |
| "loss": 0.0009, |
| "step": 1263 |
| }, |
| { |
| "epoch": 4.085656565656565, |
| "grad_norm": 0.008163025602698326, |
| "learning_rate": 9.877425330219739e-05, |
| "loss": 0.001, |
| "step": 1264 |
| }, |
| { |
| "epoch": 4.088888888888889, |
| "grad_norm": 0.011917063035070896, |
| "learning_rate": 9.876927596465381e-05, |
| "loss": 0.0009, |
| "step": 1265 |
| }, |
| { |
| "epoch": 4.092121212121212, |
| "grad_norm": 0.004871271550655365, |
| "learning_rate": 9.876428866786288e-05, |
| "loss": 0.0005, |
| "step": 1266 |
| }, |
| { |
| "epoch": 4.095353535353535, |
| "grad_norm": 0.010824841447174549, |
| "learning_rate": 9.875929141284306e-05, |
| "loss": 0.001, |
| "step": 1267 |
| }, |
| { |
| "epoch": 4.098585858585858, |
| "grad_norm": 0.009914460591971874, |
| "learning_rate": 9.875428420061483e-05, |
| "loss": 0.0008, |
| "step": 1268 |
| }, |
| { |
| "epoch": 4.101818181818182, |
| "grad_norm": 0.007953731343150139, |
| "learning_rate": 9.874926703220073e-05, |
| "loss": 0.0007, |
| "step": 1269 |
| }, |
| { |
| "epoch": 4.105050505050505, |
| "grad_norm": 0.011278949677944183, |
| "learning_rate": 9.874423990862533e-05, |
| "loss": 0.0007, |
| "step": 1270 |
| }, |
| { |
| "epoch": 4.108282828282828, |
| "grad_norm": 0.004450497217476368, |
| "learning_rate": 9.873920283091521e-05, |
| "loss": 0.0004, |
| "step": 1271 |
| }, |
| { |
| "epoch": 4.111515151515151, |
| "grad_norm": 0.007889357395470142, |
| "learning_rate": 9.873415580009901e-05, |
| "loss": 0.0008, |
| "step": 1272 |
| }, |
| { |
| "epoch": 4.114747474747475, |
| "grad_norm": 0.012178676202893257, |
| "learning_rate": 9.872909881720741e-05, |
| "loss": 0.001, |
| "step": 1273 |
| }, |
| { |
| "epoch": 4.117979797979798, |
| "grad_norm": 0.00808215606957674, |
| "learning_rate": 9.872403188327308e-05, |
| "loss": 0.0007, |
| "step": 1274 |
| }, |
| { |
| "epoch": 4.121212121212121, |
| "grad_norm": 0.00894109159708023, |
| "learning_rate": 9.871895499933075e-05, |
| "loss": 0.001, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.121212121212121, |
| "eval_loss": 0.0013170680031180382, |
| "eval_runtime": 16.341, |
| "eval_samples_per_second": 6.12, |
| "eval_steps_per_second": 1.53, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.124444444444444, |
| "grad_norm": 0.005990320350974798, |
| "learning_rate": 9.871386816641718e-05, |
| "loss": 0.0005, |
| "step": 1276 |
| }, |
| { |
| "epoch": 4.127676767676768, |
| "grad_norm": 0.005295825656503439, |
| "learning_rate": 9.870877138557116e-05, |
| "loss": 0.0005, |
| "step": 1277 |
| }, |
| { |
| "epoch": 4.130909090909091, |
| "grad_norm": 0.006675821729004383, |
| "learning_rate": 9.870366465783351e-05, |
| "loss": 0.0008, |
| "step": 1278 |
| }, |
| { |
| "epoch": 4.134141414141414, |
| "grad_norm": 0.010094426572322845, |
| "learning_rate": 9.869854798424709e-05, |
| "loss": 0.0009, |
| "step": 1279 |
| }, |
| { |
| "epoch": 4.137373737373737, |
| "grad_norm": 0.005339372903108597, |
| "learning_rate": 9.869342136585677e-05, |
| "loss": 0.0006, |
| "step": 1280 |
| }, |
| { |
| "epoch": 4.140606060606061, |
| "grad_norm": 0.008103127591311932, |
| "learning_rate": 9.868828480370948e-05, |
| "loss": 0.0007, |
| "step": 1281 |
| }, |
| { |
| "epoch": 4.143838383838384, |
| "grad_norm": 0.010563495568931103, |
| "learning_rate": 9.868313829885414e-05, |
| "loss": 0.0009, |
| "step": 1282 |
| }, |
| { |
| "epoch": 4.147070707070707, |
| "grad_norm": 0.0077997660264372826, |
| "learning_rate": 9.867798185234176e-05, |
| "loss": 0.0006, |
| "step": 1283 |
| }, |
| { |
| "epoch": 4.15030303030303, |
| "grad_norm": 0.010151439346373081, |
| "learning_rate": 9.867281546522533e-05, |
| "loss": 0.0007, |
| "step": 1284 |
| }, |
| { |
| "epoch": 4.153535353535354, |
| "grad_norm": 0.005344895180314779, |
| "learning_rate": 9.866763913855988e-05, |
| "loss": 0.0005, |
| "step": 1285 |
| }, |
| { |
| "epoch": 4.156767676767677, |
| "grad_norm": 0.007908494211733341, |
| "learning_rate": 9.866245287340247e-05, |
| "loss": 0.0006, |
| "step": 1286 |
| }, |
| { |
| "epoch": 4.16, |
| "grad_norm": 0.0050413538701832294, |
| "learning_rate": 9.865725667081221e-05, |
| "loss": 0.0005, |
| "step": 1287 |
| }, |
| { |
| "epoch": 4.163232323232323, |
| "grad_norm": 0.010074461810290813, |
| "learning_rate": 9.865205053185023e-05, |
| "loss": 0.0007, |
| "step": 1288 |
| }, |
| { |
| "epoch": 4.166464646464647, |
| "grad_norm": 0.005633131600916386, |
| "learning_rate": 9.864683445757966e-05, |
| "loss": 0.0005, |
| "step": 1289 |
| }, |
| { |
| "epoch": 4.16969696969697, |
| "grad_norm": 0.0041339644230902195, |
| "learning_rate": 9.86416084490657e-05, |
| "loss": 0.0005, |
| "step": 1290 |
| }, |
| { |
| "epoch": 4.172929292929293, |
| "grad_norm": 0.009557208977639675, |
| "learning_rate": 9.863637250737557e-05, |
| "loss": 0.001, |
| "step": 1291 |
| }, |
| { |
| "epoch": 4.176161616161616, |
| "grad_norm": 0.009876096621155739, |
| "learning_rate": 9.863112663357847e-05, |
| "loss": 0.0013, |
| "step": 1292 |
| }, |
| { |
| "epoch": 4.17939393939394, |
| "grad_norm": 0.007773978635668755, |
| "learning_rate": 9.86258708287457e-05, |
| "loss": 0.0006, |
| "step": 1293 |
| }, |
| { |
| "epoch": 4.182626262626263, |
| "grad_norm": 0.008549852296710014, |
| "learning_rate": 9.862060509395056e-05, |
| "loss": 0.0006, |
| "step": 1294 |
| }, |
| { |
| "epoch": 4.185858585858586, |
| "grad_norm": 0.0066665527410805225, |
| "learning_rate": 9.861532943026836e-05, |
| "loss": 0.0005, |
| "step": 1295 |
| }, |
| { |
| "epoch": 4.189090909090909, |
| "grad_norm": 0.007346499711275101, |
| "learning_rate": 9.861004383877645e-05, |
| "loss": 0.0004, |
| "step": 1296 |
| }, |
| { |
| "epoch": 4.192323232323233, |
| "grad_norm": 0.008795664645731449, |
| "learning_rate": 9.860474832055421e-05, |
| "loss": 0.0009, |
| "step": 1297 |
| }, |
| { |
| "epoch": 4.195555555555556, |
| "grad_norm": 0.008401592262089252, |
| "learning_rate": 9.859944287668306e-05, |
| "loss": 0.0008, |
| "step": 1298 |
| }, |
| { |
| "epoch": 4.198787878787879, |
| "grad_norm": 0.009794225916266441, |
| "learning_rate": 9.85941275082464e-05, |
| "loss": 0.0009, |
| "step": 1299 |
| }, |
| { |
| "epoch": 4.202020202020202, |
| "grad_norm": 0.012242306023836136, |
| "learning_rate": 9.858880221632973e-05, |
| "loss": 0.0011, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.202020202020202, |
| "eval_loss": 0.0013084042584523559, |
| "eval_runtime": 16.3554, |
| "eval_samples_per_second": 6.114, |
| "eval_steps_per_second": 1.529, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.205252525252526, |
| "grad_norm": 0.011090662330389023, |
| "learning_rate": 9.85834670020205e-05, |
| "loss": 0.001, |
| "step": 1301 |
| }, |
| { |
| "epoch": 4.208484848484849, |
| "grad_norm": 0.006621975917369127, |
| "learning_rate": 9.857812186640824e-05, |
| "loss": 0.0005, |
| "step": 1302 |
| }, |
| { |
| "epoch": 4.211717171717171, |
| "grad_norm": 0.009116945788264275, |
| "learning_rate": 9.857276681058452e-05, |
| "loss": 0.0008, |
| "step": 1303 |
| }, |
| { |
| "epoch": 4.214949494949495, |
| "grad_norm": 0.005176109727472067, |
| "learning_rate": 9.856740183564284e-05, |
| "loss": 0.0005, |
| "step": 1304 |
| }, |
| { |
| "epoch": 4.218181818181818, |
| "grad_norm": 0.005892486311495304, |
| "learning_rate": 9.856202694267882e-05, |
| "loss": 0.0007, |
| "step": 1305 |
| }, |
| { |
| "epoch": 4.221414141414141, |
| "grad_norm": 0.005805740132927895, |
| "learning_rate": 9.855664213279008e-05, |
| "loss": 0.0006, |
| "step": 1306 |
| }, |
| { |
| "epoch": 4.224646464646464, |
| "grad_norm": 0.00955948606133461, |
| "learning_rate": 9.855124740707627e-05, |
| "loss": 0.0006, |
| "step": 1307 |
| }, |
| { |
| "epoch": 4.227878787878788, |
| "grad_norm": 0.007011524401605129, |
| "learning_rate": 9.854584276663903e-05, |
| "loss": 0.0008, |
| "step": 1308 |
| }, |
| { |
| "epoch": 4.231111111111111, |
| "grad_norm": 0.004449961706995964, |
| "learning_rate": 9.854042821258205e-05, |
| "loss": 0.0004, |
| "step": 1309 |
| }, |
| { |
| "epoch": 4.234343434343434, |
| "grad_norm": 0.006270397454500198, |
| "learning_rate": 9.853500374601106e-05, |
| "loss": 0.0006, |
| "step": 1310 |
| }, |
| { |
| "epoch": 4.237575757575757, |
| "grad_norm": 0.007936222478747368, |
| "learning_rate": 9.85295693680338e-05, |
| "loss": 0.0008, |
| "step": 1311 |
| }, |
| { |
| "epoch": 4.240808080808081, |
| "grad_norm": 0.006835588254034519, |
| "learning_rate": 9.852412507976002e-05, |
| "loss": 0.0006, |
| "step": 1312 |
| }, |
| { |
| "epoch": 4.244040404040404, |
| "grad_norm": 0.00826151855289936, |
| "learning_rate": 9.851867088230152e-05, |
| "loss": 0.0007, |
| "step": 1313 |
| }, |
| { |
| "epoch": 4.247272727272727, |
| "grad_norm": 0.006757290102541447, |
| "learning_rate": 9.85132067767721e-05, |
| "loss": 0.0007, |
| "step": 1314 |
| }, |
| { |
| "epoch": 4.25050505050505, |
| "grad_norm": 0.005502210929989815, |
| "learning_rate": 9.850773276428759e-05, |
| "loss": 0.0005, |
| "step": 1315 |
| }, |
| { |
| "epoch": 4.253737373737374, |
| "grad_norm": 0.007804628927260637, |
| "learning_rate": 9.850224884596585e-05, |
| "loss": 0.0007, |
| "step": 1316 |
| }, |
| { |
| "epoch": 4.256969696969697, |
| "grad_norm": 0.0061260005459189415, |
| "learning_rate": 9.849675502292676e-05, |
| "loss": 0.0005, |
| "step": 1317 |
| }, |
| { |
| "epoch": 4.26020202020202, |
| "grad_norm": 0.007784692570567131, |
| "learning_rate": 9.849125129629224e-05, |
| "loss": 0.0006, |
| "step": 1318 |
| }, |
| { |
| "epoch": 4.263434343434343, |
| "grad_norm": 0.005905506666749716, |
| "learning_rate": 9.848573766718617e-05, |
| "loss": 0.0006, |
| "step": 1319 |
| }, |
| { |
| "epoch": 4.266666666666667, |
| "grad_norm": 0.006844382267445326, |
| "learning_rate": 9.848021413673454e-05, |
| "loss": 0.0005, |
| "step": 1320 |
| }, |
| { |
| "epoch": 4.26989898989899, |
| "grad_norm": 0.013326677493751049, |
| "learning_rate": 9.847468070606529e-05, |
| "loss": 0.0012, |
| "step": 1321 |
| }, |
| { |
| "epoch": 4.273131313131313, |
| "grad_norm": 0.005161616485565901, |
| "learning_rate": 9.846913737630843e-05, |
| "loss": 0.0006, |
| "step": 1322 |
| }, |
| { |
| "epoch": 4.276363636363636, |
| "grad_norm": 0.007726424839347601, |
| "learning_rate": 9.846358414859597e-05, |
| "loss": 0.0006, |
| "step": 1323 |
| }, |
| { |
| "epoch": 4.27959595959596, |
| "grad_norm": 0.008142873644828796, |
| "learning_rate": 9.845802102406192e-05, |
| "loss": 0.0006, |
| "step": 1324 |
| }, |
| { |
| "epoch": 4.282828282828283, |
| "grad_norm": 0.008318033069372177, |
| "learning_rate": 9.845244800384237e-05, |
| "loss": 0.0007, |
| "step": 1325 |
| }, |
| { |
| "epoch": 4.282828282828283, |
| "eval_loss": 0.0013507588300853968, |
| "eval_runtime": 16.3379, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 1325 |
| }, |
| { |
| "epoch": 4.286060606060606, |
| "grad_norm": 0.008951600641012192, |
| "learning_rate": 9.844686508907537e-05, |
| "loss": 0.0007, |
| "step": 1326 |
| }, |
| { |
| "epoch": 4.289292929292929, |
| "grad_norm": 0.00513639347627759, |
| "learning_rate": 9.844127228090102e-05, |
| "loss": 0.0005, |
| "step": 1327 |
| }, |
| { |
| "epoch": 4.292525252525253, |
| "grad_norm": 0.008280271664261818, |
| "learning_rate": 9.843566958046145e-05, |
| "loss": 0.0008, |
| "step": 1328 |
| }, |
| { |
| "epoch": 4.295757575757576, |
| "grad_norm": 0.006896166130900383, |
| "learning_rate": 9.843005698890076e-05, |
| "loss": 0.0005, |
| "step": 1329 |
| }, |
| { |
| "epoch": 4.298989898989899, |
| "grad_norm": 0.006979741621762514, |
| "learning_rate": 9.842443450736514e-05, |
| "loss": 0.0006, |
| "step": 1330 |
| }, |
| { |
| "epoch": 4.302222222222222, |
| "grad_norm": 0.010027660056948662, |
| "learning_rate": 9.841880213700274e-05, |
| "loss": 0.0008, |
| "step": 1331 |
| }, |
| { |
| "epoch": 4.305454545454546, |
| "grad_norm": 0.007248376961797476, |
| "learning_rate": 9.841315987896378e-05, |
| "loss": 0.0008, |
| "step": 1332 |
| }, |
| { |
| "epoch": 4.308686868686869, |
| "grad_norm": 0.009557382203638554, |
| "learning_rate": 9.840750773440046e-05, |
| "loss": 0.0009, |
| "step": 1333 |
| }, |
| { |
| "epoch": 4.311919191919192, |
| "grad_norm": 0.007266751490533352, |
| "learning_rate": 9.840184570446702e-05, |
| "loss": 0.0008, |
| "step": 1334 |
| }, |
| { |
| "epoch": 4.315151515151515, |
| "grad_norm": 0.005888795014470816, |
| "learning_rate": 9.839617379031971e-05, |
| "loss": 0.0005, |
| "step": 1335 |
| }, |
| { |
| "epoch": 4.318383838383839, |
| "grad_norm": 0.005488747730851173, |
| "learning_rate": 9.839049199311679e-05, |
| "loss": 0.0005, |
| "step": 1336 |
| }, |
| { |
| "epoch": 4.321616161616162, |
| "grad_norm": 0.010251648724079132, |
| "learning_rate": 9.838480031401856e-05, |
| "loss": 0.0008, |
| "step": 1337 |
| }, |
| { |
| "epoch": 4.324848484848485, |
| "grad_norm": 0.007355945184826851, |
| "learning_rate": 9.837909875418732e-05, |
| "loss": 0.0007, |
| "step": 1338 |
| }, |
| { |
| "epoch": 4.328080808080808, |
| "grad_norm": 0.0043625845573842525, |
| "learning_rate": 9.837338731478741e-05, |
| "loss": 0.0005, |
| "step": 1339 |
| }, |
| { |
| "epoch": 4.331313131313132, |
| "grad_norm": 0.0071039339527487755, |
| "learning_rate": 9.836766599698514e-05, |
| "loss": 0.0005, |
| "step": 1340 |
| }, |
| { |
| "epoch": 4.334545454545455, |
| "grad_norm": 0.007601459976285696, |
| "learning_rate": 9.83619348019489e-05, |
| "loss": 0.0008, |
| "step": 1341 |
| }, |
| { |
| "epoch": 4.337777777777778, |
| "grad_norm": 0.009080283343791962, |
| "learning_rate": 9.835619373084905e-05, |
| "loss": 0.0011, |
| "step": 1342 |
| }, |
| { |
| "epoch": 4.3410101010101005, |
| "grad_norm": 0.011209072545170784, |
| "learning_rate": 9.835044278485799e-05, |
| "loss": 0.0008, |
| "step": 1343 |
| }, |
| { |
| "epoch": 4.344242424242424, |
| "grad_norm": 0.004815065301954746, |
| "learning_rate": 9.834468196515011e-05, |
| "loss": 0.0004, |
| "step": 1344 |
| }, |
| { |
| "epoch": 4.347474747474747, |
| "grad_norm": 0.009188278578221798, |
| "learning_rate": 9.833891127290187e-05, |
| "loss": 0.0007, |
| "step": 1345 |
| }, |
| { |
| "epoch": 4.35070707070707, |
| "grad_norm": 0.008015873841941357, |
| "learning_rate": 9.833313070929167e-05, |
| "loss": 0.0005, |
| "step": 1346 |
| }, |
| { |
| "epoch": 4.3539393939393936, |
| "grad_norm": 0.00927853025496006, |
| "learning_rate": 9.83273402755e-05, |
| "loss": 0.0006, |
| "step": 1347 |
| }, |
| { |
| "epoch": 4.357171717171717, |
| "grad_norm": 0.004810738377273083, |
| "learning_rate": 9.832153997270934e-05, |
| "loss": 0.0005, |
| "step": 1348 |
| }, |
| { |
| "epoch": 4.36040404040404, |
| "grad_norm": 0.0066960956901311874, |
| "learning_rate": 9.831572980210413e-05, |
| "loss": 0.0006, |
| "step": 1349 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "grad_norm": 0.005726319272071123, |
| "learning_rate": 9.830990976487094e-05, |
| "loss": 0.0007, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "eval_loss": 0.0014153890078887343, |
| "eval_runtime": 16.3594, |
| "eval_samples_per_second": 6.113, |
| "eval_steps_per_second": 1.528, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.366868686868687, |
| "grad_norm": 0.008256395347416401, |
| "learning_rate": 9.830407986219823e-05, |
| "loss": 0.0007, |
| "step": 1351 |
| }, |
| { |
| "epoch": 4.37010101010101, |
| "grad_norm": 0.006172611843794584, |
| "learning_rate": 9.829824009527657e-05, |
| "loss": 0.0005, |
| "step": 1352 |
| }, |
| { |
| "epoch": 4.373333333333333, |
| "grad_norm": 0.005505148787051439, |
| "learning_rate": 9.82923904652985e-05, |
| "loss": 0.0005, |
| "step": 1353 |
| }, |
| { |
| "epoch": 4.376565656565656, |
| "grad_norm": 0.007830656133592129, |
| "learning_rate": 9.828653097345857e-05, |
| "loss": 0.0005, |
| "step": 1354 |
| }, |
| { |
| "epoch": 4.37979797979798, |
| "grad_norm": 0.008136449381709099, |
| "learning_rate": 9.828066162095335e-05, |
| "loss": 0.0008, |
| "step": 1355 |
| }, |
| { |
| "epoch": 4.383030303030303, |
| "grad_norm": 0.007075618952512741, |
| "learning_rate": 9.827478240898145e-05, |
| "loss": 0.0005, |
| "step": 1356 |
| }, |
| { |
| "epoch": 4.386262626262626, |
| "grad_norm": 0.006704601459205151, |
| "learning_rate": 9.826889333874348e-05, |
| "loss": 0.0006, |
| "step": 1357 |
| }, |
| { |
| "epoch": 4.389494949494949, |
| "grad_norm": 0.029812738299369812, |
| "learning_rate": 9.826299441144203e-05, |
| "loss": 0.001, |
| "step": 1358 |
| }, |
| { |
| "epoch": 4.392727272727273, |
| "grad_norm": 0.007639670744538307, |
| "learning_rate": 9.825708562828173e-05, |
| "loss": 0.0007, |
| "step": 1359 |
| }, |
| { |
| "epoch": 4.395959595959596, |
| "grad_norm": 0.006942340638488531, |
| "learning_rate": 9.825116699046923e-05, |
| "loss": 0.0005, |
| "step": 1360 |
| }, |
| { |
| "epoch": 4.399191919191919, |
| "grad_norm": 0.005940373986959457, |
| "learning_rate": 9.82452384992132e-05, |
| "loss": 0.0006, |
| "step": 1361 |
| }, |
| { |
| "epoch": 4.402424242424242, |
| "grad_norm": 0.010452733375132084, |
| "learning_rate": 9.823930015572428e-05, |
| "loss": 0.0009, |
| "step": 1362 |
| }, |
| { |
| "epoch": 4.405656565656566, |
| "grad_norm": 0.007077785674482584, |
| "learning_rate": 9.823335196121517e-05, |
| "loss": 0.0005, |
| "step": 1363 |
| }, |
| { |
| "epoch": 4.408888888888889, |
| "grad_norm": 0.004983243998140097, |
| "learning_rate": 9.822739391690054e-05, |
| "loss": 0.0005, |
| "step": 1364 |
| }, |
| { |
| "epoch": 4.412121212121212, |
| "grad_norm": 0.00897799339145422, |
| "learning_rate": 9.82214260239971e-05, |
| "loss": 0.0008, |
| "step": 1365 |
| }, |
| { |
| "epoch": 4.415353535353535, |
| "grad_norm": 0.004031193442642689, |
| "learning_rate": 9.821544828372357e-05, |
| "loss": 0.0006, |
| "step": 1366 |
| }, |
| { |
| "epoch": 4.418585858585859, |
| "grad_norm": 0.006824927404522896, |
| "learning_rate": 9.820946069730066e-05, |
| "loss": 0.0007, |
| "step": 1367 |
| }, |
| { |
| "epoch": 4.421818181818182, |
| "grad_norm": 0.010958773083984852, |
| "learning_rate": 9.820346326595111e-05, |
| "loss": 0.0013, |
| "step": 1368 |
| }, |
| { |
| "epoch": 4.425050505050505, |
| "grad_norm": 0.009391525760293007, |
| "learning_rate": 9.819745599089967e-05, |
| "loss": 0.0009, |
| "step": 1369 |
| }, |
| { |
| "epoch": 4.428282828282828, |
| "grad_norm": 0.005469952709972858, |
| "learning_rate": 9.819143887337308e-05, |
| "loss": 0.0006, |
| "step": 1370 |
| }, |
| { |
| "epoch": 4.431515151515152, |
| "grad_norm": 0.009335340932011604, |
| "learning_rate": 9.818541191460011e-05, |
| "loss": 0.001, |
| "step": 1371 |
| }, |
| { |
| "epoch": 4.434747474747475, |
| "grad_norm": 0.004970667418092489, |
| "learning_rate": 9.817937511581155e-05, |
| "loss": 0.0005, |
| "step": 1372 |
| }, |
| { |
| "epoch": 4.437979797979798, |
| "grad_norm": 0.007790463976562023, |
| "learning_rate": 9.817332847824017e-05, |
| "loss": 0.0008, |
| "step": 1373 |
| }, |
| { |
| "epoch": 4.441212121212121, |
| "grad_norm": 0.008447278290987015, |
| "learning_rate": 9.816727200312076e-05, |
| "loss": 0.0009, |
| "step": 1374 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.005252547562122345, |
| "learning_rate": 9.816120569169013e-05, |
| "loss": 0.0006, |
| "step": 1375 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "eval_loss": 0.0013391395332291722, |
| "eval_runtime": 16.418, |
| "eval_samples_per_second": 6.091, |
| "eval_steps_per_second": 1.523, |
| "step": 1375 |
| }, |
| { |
| "epoch": 4.447676767676768, |
| "grad_norm": 0.0075187585316598415, |
| "learning_rate": 9.815512954518709e-05, |
| "loss": 0.0009, |
| "step": 1376 |
| }, |
| { |
| "epoch": 4.450909090909091, |
| "grad_norm": 0.008715563453733921, |
| "learning_rate": 9.814904356485245e-05, |
| "loss": 0.0012, |
| "step": 1377 |
| }, |
| { |
| "epoch": 4.454141414141414, |
| "grad_norm": 0.009248634800314903, |
| "learning_rate": 9.814294775192904e-05, |
| "loss": 0.0005, |
| "step": 1378 |
| }, |
| { |
| "epoch": 4.457373737373738, |
| "grad_norm": 0.005856595933437347, |
| "learning_rate": 9.81368421076617e-05, |
| "loss": 0.0008, |
| "step": 1379 |
| }, |
| { |
| "epoch": 4.460606060606061, |
| "grad_norm": 0.009122004732489586, |
| "learning_rate": 9.813072663329728e-05, |
| "loss": 0.0007, |
| "step": 1380 |
| }, |
| { |
| "epoch": 4.463838383838384, |
| "grad_norm": 0.007110130041837692, |
| "learning_rate": 9.812460133008463e-05, |
| "loss": 0.0005, |
| "step": 1381 |
| }, |
| { |
| "epoch": 4.467070707070707, |
| "grad_norm": 0.004328586161136627, |
| "learning_rate": 9.811846619927459e-05, |
| "loss": 0.0003, |
| "step": 1382 |
| }, |
| { |
| "epoch": 4.470303030303031, |
| "grad_norm": 0.008717586286365986, |
| "learning_rate": 9.811232124212002e-05, |
| "loss": 0.0007, |
| "step": 1383 |
| }, |
| { |
| "epoch": 4.473535353535354, |
| "grad_norm": 0.005248973146080971, |
| "learning_rate": 9.810616645987582e-05, |
| "loss": 0.0004, |
| "step": 1384 |
| }, |
| { |
| "epoch": 4.476767676767677, |
| "grad_norm": 0.00873158685863018, |
| "learning_rate": 9.810000185379884e-05, |
| "loss": 0.0006, |
| "step": 1385 |
| }, |
| { |
| "epoch": 4.48, |
| "grad_norm": 0.011159945279359818, |
| "learning_rate": 9.8093827425148e-05, |
| "loss": 0.0005, |
| "step": 1386 |
| }, |
| { |
| "epoch": 4.483232323232324, |
| "grad_norm": 0.014001946896314621, |
| "learning_rate": 9.808764317518413e-05, |
| "loss": 0.001, |
| "step": 1387 |
| }, |
| { |
| "epoch": 4.486464646464647, |
| "grad_norm": 0.012009704485535622, |
| "learning_rate": 9.808144910517019e-05, |
| "loss": 0.0013, |
| "step": 1388 |
| }, |
| { |
| "epoch": 4.489696969696969, |
| "grad_norm": 0.009032601490616798, |
| "learning_rate": 9.807524521637102e-05, |
| "loss": 0.0007, |
| "step": 1389 |
| }, |
| { |
| "epoch": 4.4929292929292926, |
| "grad_norm": 0.010897103697061539, |
| "learning_rate": 9.806903151005357e-05, |
| "loss": 0.0007, |
| "step": 1390 |
| }, |
| { |
| "epoch": 4.496161616161616, |
| "grad_norm": 0.008785312063992023, |
| "learning_rate": 9.806280798748674e-05, |
| "loss": 0.0009, |
| "step": 1391 |
| }, |
| { |
| "epoch": 4.499393939393939, |
| "grad_norm": 0.012614482082426548, |
| "learning_rate": 9.805657464994144e-05, |
| "loss": 0.0007, |
| "step": 1392 |
| }, |
| { |
| "epoch": 4.502626262626262, |
| "grad_norm": 0.011149034835398197, |
| "learning_rate": 9.80503314986906e-05, |
| "loss": 0.001, |
| "step": 1393 |
| }, |
| { |
| "epoch": 4.505858585858586, |
| "grad_norm": 0.006704424507915974, |
| "learning_rate": 9.804407853500912e-05, |
| "loss": 0.0006, |
| "step": 1394 |
| }, |
| { |
| "epoch": 4.509090909090909, |
| "grad_norm": 0.007612358313053846, |
| "learning_rate": 9.803781576017394e-05, |
| "loss": 0.0007, |
| "step": 1395 |
| }, |
| { |
| "epoch": 4.512323232323232, |
| "grad_norm": 0.006742058787494898, |
| "learning_rate": 9.8031543175464e-05, |
| "loss": 0.0008, |
| "step": 1396 |
| }, |
| { |
| "epoch": 4.515555555555555, |
| "grad_norm": 0.007213367614895105, |
| "learning_rate": 9.802526078216021e-05, |
| "loss": 0.0008, |
| "step": 1397 |
| }, |
| { |
| "epoch": 4.518787878787879, |
| "grad_norm": 0.007504299283027649, |
| "learning_rate": 9.801896858154553e-05, |
| "loss": 0.0006, |
| "step": 1398 |
| }, |
| { |
| "epoch": 4.522020202020202, |
| "grad_norm": 0.009704144671559334, |
| "learning_rate": 9.80126665749049e-05, |
| "loss": 0.0008, |
| "step": 1399 |
| }, |
| { |
| "epoch": 4.525252525252525, |
| "grad_norm": 0.008799389936029911, |
| "learning_rate": 9.800635476352525e-05, |
| "loss": 0.0008, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.525252525252525, |
| "eval_loss": 0.0011541817802935839, |
| "eval_runtime": 16.3316, |
| "eval_samples_per_second": 6.123, |
| "eval_steps_per_second": 1.531, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.528484848484848, |
| "grad_norm": 0.00995288323611021, |
| "learning_rate": 9.800003314869552e-05, |
| "loss": 0.0009, |
| "step": 1401 |
| }, |
| { |
| "epoch": 4.531717171717172, |
| "grad_norm": 0.007712456863373518, |
| "learning_rate": 9.799370173170667e-05, |
| "loss": 0.0006, |
| "step": 1402 |
| }, |
| { |
| "epoch": 4.534949494949495, |
| "grad_norm": 0.008242973126471043, |
| "learning_rate": 9.798736051385165e-05, |
| "loss": 0.0007, |
| "step": 1403 |
| }, |
| { |
| "epoch": 4.538181818181818, |
| "grad_norm": 0.0056484718807041645, |
| "learning_rate": 9.79810094964254e-05, |
| "loss": 0.0006, |
| "step": 1404 |
| }, |
| { |
| "epoch": 4.541414141414141, |
| "grad_norm": 0.008368073962628841, |
| "learning_rate": 9.797464868072488e-05, |
| "loss": 0.0007, |
| "step": 1405 |
| }, |
| { |
| "epoch": 4.544646464646465, |
| "grad_norm": 0.00682596443220973, |
| "learning_rate": 9.796827806804902e-05, |
| "loss": 0.0006, |
| "step": 1406 |
| }, |
| { |
| "epoch": 4.547878787878788, |
| "grad_norm": 0.008641253225505352, |
| "learning_rate": 9.79618976596988e-05, |
| "loss": 0.0009, |
| "step": 1407 |
| }, |
| { |
| "epoch": 4.551111111111111, |
| "grad_norm": 0.015729064121842384, |
| "learning_rate": 9.795550745697716e-05, |
| "loss": 0.0007, |
| "step": 1408 |
| }, |
| { |
| "epoch": 4.554343434343434, |
| "grad_norm": 0.010178539901971817, |
| "learning_rate": 9.794910746118904e-05, |
| "loss": 0.001, |
| "step": 1409 |
| }, |
| { |
| "epoch": 4.557575757575758, |
| "grad_norm": 0.007269847672432661, |
| "learning_rate": 9.794269767364143e-05, |
| "loss": 0.0006, |
| "step": 1410 |
| }, |
| { |
| "epoch": 4.560808080808081, |
| "grad_norm": 0.012646600604057312, |
| "learning_rate": 9.793627809564324e-05, |
| "loss": 0.0015, |
| "step": 1411 |
| }, |
| { |
| "epoch": 4.564040404040404, |
| "grad_norm": 0.008321951143443584, |
| "learning_rate": 9.792984872850546e-05, |
| "loss": 0.001, |
| "step": 1412 |
| }, |
| { |
| "epoch": 4.567272727272727, |
| "grad_norm": 0.00516975112259388, |
| "learning_rate": 9.792340957354101e-05, |
| "loss": 0.0007, |
| "step": 1413 |
| }, |
| { |
| "epoch": 4.570505050505051, |
| "grad_norm": 0.008603175170719624, |
| "learning_rate": 9.791696063206484e-05, |
| "loss": 0.0008, |
| "step": 1414 |
| }, |
| { |
| "epoch": 4.573737373737374, |
| "grad_norm": 0.008575500920414925, |
| "learning_rate": 9.791050190539393e-05, |
| "loss": 0.0007, |
| "step": 1415 |
| }, |
| { |
| "epoch": 4.576969696969697, |
| "grad_norm": 0.006796093191951513, |
| "learning_rate": 9.790403339484718e-05, |
| "loss": 0.0005, |
| "step": 1416 |
| }, |
| { |
| "epoch": 4.58020202020202, |
| "grad_norm": 0.006996788550168276, |
| "learning_rate": 9.789755510174559e-05, |
| "loss": 0.0005, |
| "step": 1417 |
| }, |
| { |
| "epoch": 4.583434343434344, |
| "grad_norm": 0.0074277641251683235, |
| "learning_rate": 9.789106702741204e-05, |
| "loss": 0.0006, |
| "step": 1418 |
| }, |
| { |
| "epoch": 4.586666666666667, |
| "grad_norm": 0.00950582791119814, |
| "learning_rate": 9.788456917317153e-05, |
| "loss": 0.0008, |
| "step": 1419 |
| }, |
| { |
| "epoch": 4.58989898989899, |
| "grad_norm": 0.008638204075396061, |
| "learning_rate": 9.787806154035095e-05, |
| "loss": 0.0006, |
| "step": 1420 |
| }, |
| { |
| "epoch": 4.593131313131313, |
| "grad_norm": 0.00890905037522316, |
| "learning_rate": 9.787154413027926e-05, |
| "loss": 0.0006, |
| "step": 1421 |
| }, |
| { |
| "epoch": 4.596363636363637, |
| "grad_norm": 0.01198984868824482, |
| "learning_rate": 9.786501694428738e-05, |
| "loss": 0.001, |
| "step": 1422 |
| }, |
| { |
| "epoch": 4.59959595959596, |
| "grad_norm": 0.006485726218670607, |
| "learning_rate": 9.785847998370823e-05, |
| "loss": 0.0006, |
| "step": 1423 |
| }, |
| { |
| "epoch": 4.602828282828283, |
| "grad_norm": 0.01143205352127552, |
| "learning_rate": 9.785193324987673e-05, |
| "loss": 0.0008, |
| "step": 1424 |
| }, |
| { |
| "epoch": 4.606060606060606, |
| "grad_norm": 0.00786137767136097, |
| "learning_rate": 9.784537674412984e-05, |
| "loss": 0.0008, |
| "step": 1425 |
| }, |
| { |
| "epoch": 4.606060606060606, |
| "eval_loss": 0.001231278060004115, |
| "eval_runtime": 16.3543, |
| "eval_samples_per_second": 6.115, |
| "eval_steps_per_second": 1.529, |
| "step": 1425 |
| }, |
| { |
| "epoch": 4.60929292929293, |
| "grad_norm": 0.009320951998233795, |
| "learning_rate": 9.783881046780643e-05, |
| "loss": 0.0008, |
| "step": 1426 |
| }, |
| { |
| "epoch": 4.612525252525252, |
| "grad_norm": 0.007053622510284185, |
| "learning_rate": 9.783223442224741e-05, |
| "loss": 0.0008, |
| "step": 1427 |
| }, |
| { |
| "epoch": 4.615757575757575, |
| "grad_norm": 0.006885794457048178, |
| "learning_rate": 9.782564860879571e-05, |
| "loss": 0.0006, |
| "step": 1428 |
| }, |
| { |
| "epoch": 4.6189898989898985, |
| "grad_norm": 0.009176738560199738, |
| "learning_rate": 9.781905302879622e-05, |
| "loss": 0.0006, |
| "step": 1429 |
| }, |
| { |
| "epoch": 4.622222222222222, |
| "grad_norm": 0.005100234877318144, |
| "learning_rate": 9.78124476835958e-05, |
| "loss": 0.0005, |
| "step": 1430 |
| }, |
| { |
| "epoch": 4.625454545454545, |
| "grad_norm": 0.009323365055024624, |
| "learning_rate": 9.78058325745434e-05, |
| "loss": 0.0007, |
| "step": 1431 |
| }, |
| { |
| "epoch": 4.628686868686868, |
| "grad_norm": 0.008040891028940678, |
| "learning_rate": 9.779920770298985e-05, |
| "loss": 0.0007, |
| "step": 1432 |
| }, |
| { |
| "epoch": 4.6319191919191915, |
| "grad_norm": 0.005325017962604761, |
| "learning_rate": 9.779257307028804e-05, |
| "loss": 0.0005, |
| "step": 1433 |
| }, |
| { |
| "epoch": 4.635151515151515, |
| "grad_norm": 0.006852106191217899, |
| "learning_rate": 9.778592867779284e-05, |
| "loss": 0.0006, |
| "step": 1434 |
| }, |
| { |
| "epoch": 4.638383838383838, |
| "grad_norm": 0.008187889121472836, |
| "learning_rate": 9.777927452686114e-05, |
| "loss": 0.0006, |
| "step": 1435 |
| }, |
| { |
| "epoch": 4.641616161616161, |
| "grad_norm": 0.0061571612022817135, |
| "learning_rate": 9.777261061885172e-05, |
| "loss": 0.0006, |
| "step": 1436 |
| }, |
| { |
| "epoch": 4.644848484848485, |
| "grad_norm": 0.011466188356280327, |
| "learning_rate": 9.77659369551255e-05, |
| "loss": 0.0009, |
| "step": 1437 |
| }, |
| { |
| "epoch": 4.648080808080808, |
| "grad_norm": 0.012491497211158276, |
| "learning_rate": 9.775925353704529e-05, |
| "loss": 0.0011, |
| "step": 1438 |
| }, |
| { |
| "epoch": 4.651313131313131, |
| "grad_norm": 0.01179596409201622, |
| "learning_rate": 9.775256036597593e-05, |
| "loss": 0.0012, |
| "step": 1439 |
| }, |
| { |
| "epoch": 4.654545454545454, |
| "grad_norm": 0.011520985513925552, |
| "learning_rate": 9.774585744328421e-05, |
| "loss": 0.001, |
| "step": 1440 |
| }, |
| { |
| "epoch": 4.657777777777778, |
| "grad_norm": 0.0077523160725831985, |
| "learning_rate": 9.7739144770339e-05, |
| "loss": 0.0008, |
| "step": 1441 |
| }, |
| { |
| "epoch": 4.661010101010101, |
| "grad_norm": 0.006166713312268257, |
| "learning_rate": 9.773242234851106e-05, |
| "loss": 0.0006, |
| "step": 1442 |
| }, |
| { |
| "epoch": 4.664242424242424, |
| "grad_norm": 0.008120764046907425, |
| "learning_rate": 9.772569017917319e-05, |
| "loss": 0.0008, |
| "step": 1443 |
| }, |
| { |
| "epoch": 4.667474747474747, |
| "grad_norm": 0.007862396538257599, |
| "learning_rate": 9.77189482637002e-05, |
| "loss": 0.0009, |
| "step": 1444 |
| }, |
| { |
| "epoch": 4.670707070707071, |
| "grad_norm": 0.008648447692394257, |
| "learning_rate": 9.771219660346886e-05, |
| "loss": 0.0009, |
| "step": 1445 |
| }, |
| { |
| "epoch": 4.673939393939394, |
| "grad_norm": 0.008738266304135323, |
| "learning_rate": 9.770543519985791e-05, |
| "loss": 0.0009, |
| "step": 1446 |
| }, |
| { |
| "epoch": 4.677171717171717, |
| "grad_norm": 0.009699663147330284, |
| "learning_rate": 9.769866405424815e-05, |
| "loss": 0.0009, |
| "step": 1447 |
| }, |
| { |
| "epoch": 4.68040404040404, |
| "grad_norm": 0.007863318547606468, |
| "learning_rate": 9.769188316802229e-05, |
| "loss": 0.0007, |
| "step": 1448 |
| }, |
| { |
| "epoch": 4.683636363636364, |
| "grad_norm": 0.008350216783583164, |
| "learning_rate": 9.768509254256507e-05, |
| "loss": 0.0008, |
| "step": 1449 |
| }, |
| { |
| "epoch": 4.686868686868687, |
| "grad_norm": 0.007823821157217026, |
| "learning_rate": 9.767829217926321e-05, |
| "loss": 0.0006, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.686868686868687, |
| "eval_loss": 0.0012428788468241692, |
| "eval_runtime": 16.3526, |
| "eval_samples_per_second": 6.115, |
| "eval_steps_per_second": 1.529, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.69010101010101, |
| "grad_norm": 0.007611010689288378, |
| "learning_rate": 9.767148207950546e-05, |
| "loss": 0.0009, |
| "step": 1451 |
| }, |
| { |
| "epoch": 4.693333333333333, |
| "grad_norm": 0.007722372189164162, |
| "learning_rate": 9.766466224468249e-05, |
| "loss": 0.0006, |
| "step": 1452 |
| }, |
| { |
| "epoch": 4.696565656565657, |
| "grad_norm": 0.006537162233144045, |
| "learning_rate": 9.765783267618698e-05, |
| "loss": 0.0006, |
| "step": 1453 |
| }, |
| { |
| "epoch": 4.69979797979798, |
| "grad_norm": 0.007506988011300564, |
| "learning_rate": 9.765099337541364e-05, |
| "loss": 0.0008, |
| "step": 1454 |
| }, |
| { |
| "epoch": 4.703030303030303, |
| "grad_norm": 0.007848326116800308, |
| "learning_rate": 9.764414434375909e-05, |
| "loss": 0.0007, |
| "step": 1455 |
| }, |
| { |
| "epoch": 4.706262626262626, |
| "grad_norm": 0.009280819445848465, |
| "learning_rate": 9.763728558262202e-05, |
| "loss": 0.0007, |
| "step": 1456 |
| }, |
| { |
| "epoch": 4.70949494949495, |
| "grad_norm": 0.011445286683738232, |
| "learning_rate": 9.763041709340305e-05, |
| "loss": 0.001, |
| "step": 1457 |
| }, |
| { |
| "epoch": 4.712727272727273, |
| "grad_norm": 0.009234776720404625, |
| "learning_rate": 9.76235388775048e-05, |
| "loss": 0.0007, |
| "step": 1458 |
| }, |
| { |
| "epoch": 4.715959595959596, |
| "grad_norm": 0.0059725018218159676, |
| "learning_rate": 9.76166509363319e-05, |
| "loss": 0.0005, |
| "step": 1459 |
| }, |
| { |
| "epoch": 4.719191919191919, |
| "grad_norm": 0.006415718700736761, |
| "learning_rate": 9.760975327129093e-05, |
| "loss": 0.0006, |
| "step": 1460 |
| }, |
| { |
| "epoch": 4.722424242424243, |
| "grad_norm": 0.006273853592574596, |
| "learning_rate": 9.760284588379047e-05, |
| "loss": 0.0007, |
| "step": 1461 |
| }, |
| { |
| "epoch": 4.725656565656566, |
| "grad_norm": 0.005407524295151234, |
| "learning_rate": 9.759592877524111e-05, |
| "loss": 0.0005, |
| "step": 1462 |
| }, |
| { |
| "epoch": 4.728888888888889, |
| "grad_norm": 0.008380726911127567, |
| "learning_rate": 9.758900194705537e-05, |
| "loss": 0.0007, |
| "step": 1463 |
| }, |
| { |
| "epoch": 4.732121212121212, |
| "grad_norm": 0.006552028935402632, |
| "learning_rate": 9.758206540064782e-05, |
| "loss": 0.0007, |
| "step": 1464 |
| }, |
| { |
| "epoch": 4.735353535353536, |
| "grad_norm": 0.00797539483755827, |
| "learning_rate": 9.757511913743496e-05, |
| "loss": 0.0008, |
| "step": 1465 |
| }, |
| { |
| "epoch": 4.738585858585859, |
| "grad_norm": 0.009962940588593483, |
| "learning_rate": 9.756816315883532e-05, |
| "loss": 0.0007, |
| "step": 1466 |
| }, |
| { |
| "epoch": 4.741818181818182, |
| "grad_norm": 0.007465899921953678, |
| "learning_rate": 9.756119746626937e-05, |
| "loss": 0.0006, |
| "step": 1467 |
| }, |
| { |
| "epoch": 4.745050505050505, |
| "grad_norm": 0.006115617696195841, |
| "learning_rate": 9.755422206115959e-05, |
| "loss": 0.0005, |
| "step": 1468 |
| }, |
| { |
| "epoch": 4.748282828282829, |
| "grad_norm": 0.007522049359977245, |
| "learning_rate": 9.754723694493043e-05, |
| "loss": 0.0006, |
| "step": 1469 |
| }, |
| { |
| "epoch": 4.751515151515152, |
| "grad_norm": 0.006444310769438744, |
| "learning_rate": 9.754024211900835e-05, |
| "loss": 0.0008, |
| "step": 1470 |
| }, |
| { |
| "epoch": 4.754747474747475, |
| "grad_norm": 0.004826118238270283, |
| "learning_rate": 9.753323758482176e-05, |
| "loss": 0.0005, |
| "step": 1471 |
| }, |
| { |
| "epoch": 4.757979797979798, |
| "grad_norm": 0.008243625983595848, |
| "learning_rate": 9.752622334380108e-05, |
| "loss": 0.0005, |
| "step": 1472 |
| }, |
| { |
| "epoch": 4.761212121212122, |
| "grad_norm": 0.007873287424445152, |
| "learning_rate": 9.751919939737868e-05, |
| "loss": 0.0009, |
| "step": 1473 |
| }, |
| { |
| "epoch": 4.764444444444445, |
| "grad_norm": 0.007708546239882708, |
| "learning_rate": 9.751216574698895e-05, |
| "loss": 0.0006, |
| "step": 1474 |
| }, |
| { |
| "epoch": 4.767676767676767, |
| "grad_norm": 0.007383065298199654, |
| "learning_rate": 9.750512239406822e-05, |
| "loss": 0.0006, |
| "step": 1475 |
| }, |
| { |
| "epoch": 4.767676767676767, |
| "eval_loss": 0.0011928840540349483, |
| "eval_runtime": 16.3406, |
| "eval_samples_per_second": 6.12, |
| "eval_steps_per_second": 1.53, |
| "step": 1475 |
| }, |
| { |
| "epoch": 4.7709090909090905, |
| "grad_norm": 0.008480955846607685, |
| "learning_rate": 9.749806934005485e-05, |
| "loss": 0.0007, |
| "step": 1476 |
| }, |
| { |
| "epoch": 4.774141414141414, |
| "grad_norm": 0.007100993767380714, |
| "learning_rate": 9.749100658638914e-05, |
| "loss": 0.0005, |
| "step": 1477 |
| }, |
| { |
| "epoch": 4.777373737373737, |
| "grad_norm": 0.008701356127858162, |
| "learning_rate": 9.748393413451338e-05, |
| "loss": 0.0007, |
| "step": 1478 |
| }, |
| { |
| "epoch": 4.78060606060606, |
| "grad_norm": 0.008532720617949963, |
| "learning_rate": 9.747685198587187e-05, |
| "loss": 0.0009, |
| "step": 1479 |
| }, |
| { |
| "epoch": 4.783838383838384, |
| "grad_norm": 0.004517727065831423, |
| "learning_rate": 9.746976014191083e-05, |
| "loss": 0.0005, |
| "step": 1480 |
| }, |
| { |
| "epoch": 4.787070707070707, |
| "grad_norm": 0.008706235326826572, |
| "learning_rate": 9.746265860407851e-05, |
| "loss": 0.0005, |
| "step": 1481 |
| }, |
| { |
| "epoch": 4.79030303030303, |
| "grad_norm": 0.008171387016773224, |
| "learning_rate": 9.745554737382513e-05, |
| "loss": 0.0008, |
| "step": 1482 |
| }, |
| { |
| "epoch": 4.793535353535353, |
| "grad_norm": 0.005098575726151466, |
| "learning_rate": 9.74484264526029e-05, |
| "loss": 0.0004, |
| "step": 1483 |
| }, |
| { |
| "epoch": 4.796767676767677, |
| "grad_norm": 0.0059188841842114925, |
| "learning_rate": 9.744129584186598e-05, |
| "loss": 0.0005, |
| "step": 1484 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 0.005888924468308687, |
| "learning_rate": 9.743415554307053e-05, |
| "loss": 0.0006, |
| "step": 1485 |
| }, |
| { |
| "epoch": 4.803232323232323, |
| "grad_norm": 0.01280274335294962, |
| "learning_rate": 9.742700555767466e-05, |
| "loss": 0.0008, |
| "step": 1486 |
| }, |
| { |
| "epoch": 4.806464646464646, |
| "grad_norm": 0.011109513230621815, |
| "learning_rate": 9.74198458871385e-05, |
| "loss": 0.0008, |
| "step": 1487 |
| }, |
| { |
| "epoch": 4.80969696969697, |
| "grad_norm": 0.007763589732348919, |
| "learning_rate": 9.741267653292413e-05, |
| "loss": 0.0006, |
| "step": 1488 |
| }, |
| { |
| "epoch": 4.812929292929293, |
| "grad_norm": 0.009517017751932144, |
| "learning_rate": 9.740549749649561e-05, |
| "loss": 0.0007, |
| "step": 1489 |
| }, |
| { |
| "epoch": 4.816161616161616, |
| "grad_norm": 0.009175878949463367, |
| "learning_rate": 9.739830877931899e-05, |
| "loss": 0.0011, |
| "step": 1490 |
| }, |
| { |
| "epoch": 4.819393939393939, |
| "grad_norm": 0.007311566732823849, |
| "learning_rate": 9.739111038286228e-05, |
| "loss": 0.0008, |
| "step": 1491 |
| }, |
| { |
| "epoch": 4.822626262626263, |
| "grad_norm": 0.007254389580339193, |
| "learning_rate": 9.73839023085955e-05, |
| "loss": 0.0005, |
| "step": 1492 |
| }, |
| { |
| "epoch": 4.825858585858586, |
| "grad_norm": 0.0115917157381773, |
| "learning_rate": 9.737668455799059e-05, |
| "loss": 0.0007, |
| "step": 1493 |
| }, |
| { |
| "epoch": 4.829090909090909, |
| "grad_norm": 0.007447521202266216, |
| "learning_rate": 9.73694571325215e-05, |
| "loss": 0.0006, |
| "step": 1494 |
| }, |
| { |
| "epoch": 4.832323232323232, |
| "grad_norm": 0.010047023184597492, |
| "learning_rate": 9.736222003366417e-05, |
| "loss": 0.001, |
| "step": 1495 |
| }, |
| { |
| "epoch": 4.835555555555556, |
| "grad_norm": 0.008117943070828915, |
| "learning_rate": 9.735497326289651e-05, |
| "loss": 0.0008, |
| "step": 1496 |
| }, |
| { |
| "epoch": 4.838787878787879, |
| "grad_norm": 0.007704007439315319, |
| "learning_rate": 9.734771682169837e-05, |
| "loss": 0.0007, |
| "step": 1497 |
| }, |
| { |
| "epoch": 4.842020202020202, |
| "grad_norm": 0.007732712663710117, |
| "learning_rate": 9.73404507115516e-05, |
| "loss": 0.0006, |
| "step": 1498 |
| }, |
| { |
| "epoch": 4.845252525252525, |
| "grad_norm": 0.007416680920869112, |
| "learning_rate": 9.733317493394003e-05, |
| "loss": 0.0006, |
| "step": 1499 |
| }, |
| { |
| "epoch": 4.848484848484849, |
| "grad_norm": 0.008525696583092213, |
| "learning_rate": 9.732588949034945e-05, |
| "loss": 0.0007, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.848484848484849, |
| "eval_loss": 0.0012222006916999817, |
| "eval_runtime": 16.3659, |
| "eval_samples_per_second": 6.11, |
| "eval_steps_per_second": 1.528, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.851717171717172, |
| "grad_norm": 0.008345399983227253, |
| "learning_rate": 9.731859438226765e-05, |
| "loss": 0.0007, |
| "step": 1501 |
| }, |
| { |
| "epoch": 4.854949494949495, |
| "grad_norm": 0.00756820198148489, |
| "learning_rate": 9.731128961118435e-05, |
| "loss": 0.0008, |
| "step": 1502 |
| }, |
| { |
| "epoch": 4.858181818181818, |
| "grad_norm": 0.005215240642428398, |
| "learning_rate": 9.73039751785913e-05, |
| "loss": 0.0006, |
| "step": 1503 |
| }, |
| { |
| "epoch": 4.861414141414142, |
| "grad_norm": 0.008432602509856224, |
| "learning_rate": 9.729665108598216e-05, |
| "loss": 0.0008, |
| "step": 1504 |
| }, |
| { |
| "epoch": 4.864646464646465, |
| "grad_norm": 0.005354681517928839, |
| "learning_rate": 9.72893173348526e-05, |
| "loss": 0.0005, |
| "step": 1505 |
| }, |
| { |
| "epoch": 4.867878787878788, |
| "grad_norm": 0.006146441213786602, |
| "learning_rate": 9.728197392670027e-05, |
| "loss": 0.0006, |
| "step": 1506 |
| }, |
| { |
| "epoch": 4.871111111111111, |
| "grad_norm": 0.006235864013433456, |
| "learning_rate": 9.727462086302477e-05, |
| "loss": 0.0007, |
| "step": 1507 |
| }, |
| { |
| "epoch": 4.874343434343435, |
| "grad_norm": 0.006023712456226349, |
| "learning_rate": 9.72672581453277e-05, |
| "loss": 0.0006, |
| "step": 1508 |
| }, |
| { |
| "epoch": 4.877575757575758, |
| "grad_norm": 0.005139671731740236, |
| "learning_rate": 9.725988577511256e-05, |
| "loss": 0.0005, |
| "step": 1509 |
| }, |
| { |
| "epoch": 4.880808080808081, |
| "grad_norm": 0.006575802806764841, |
| "learning_rate": 9.725250375388493e-05, |
| "loss": 0.0006, |
| "step": 1510 |
| }, |
| { |
| "epoch": 4.884040404040404, |
| "grad_norm": 0.0070196036249399185, |
| "learning_rate": 9.724511208315227e-05, |
| "loss": 0.0009, |
| "step": 1511 |
| }, |
| { |
| "epoch": 4.887272727272728, |
| "grad_norm": 0.008368157781660557, |
| "learning_rate": 9.723771076442405e-05, |
| "loss": 0.0007, |
| "step": 1512 |
| }, |
| { |
| "epoch": 4.89050505050505, |
| "grad_norm": 0.006412284914404154, |
| "learning_rate": 9.723029979921172e-05, |
| "loss": 0.0007, |
| "step": 1513 |
| }, |
| { |
| "epoch": 4.893737373737373, |
| "grad_norm": 0.006129475310444832, |
| "learning_rate": 9.722287918902867e-05, |
| "loss": 0.0004, |
| "step": 1514 |
| }, |
| { |
| "epoch": 4.8969696969696965, |
| "grad_norm": 0.008330601267516613, |
| "learning_rate": 9.721544893539027e-05, |
| "loss": 0.0007, |
| "step": 1515 |
| }, |
| { |
| "epoch": 4.90020202020202, |
| "grad_norm": 0.007703237701207399, |
| "learning_rate": 9.720800903981388e-05, |
| "loss": 0.0008, |
| "step": 1516 |
| }, |
| { |
| "epoch": 4.903434343434343, |
| "grad_norm": 0.006720873061567545, |
| "learning_rate": 9.72005595038188e-05, |
| "loss": 0.0007, |
| "step": 1517 |
| }, |
| { |
| "epoch": 4.906666666666666, |
| "grad_norm": 0.0058440230786800385, |
| "learning_rate": 9.71931003289263e-05, |
| "loss": 0.0006, |
| "step": 1518 |
| }, |
| { |
| "epoch": 4.9098989898989895, |
| "grad_norm": 0.007237616926431656, |
| "learning_rate": 9.718563151665966e-05, |
| "loss": 0.0005, |
| "step": 1519 |
| }, |
| { |
| "epoch": 4.913131313131313, |
| "grad_norm": 0.007109343074262142, |
| "learning_rate": 9.717815306854407e-05, |
| "loss": 0.0006, |
| "step": 1520 |
| }, |
| { |
| "epoch": 4.916363636363636, |
| "grad_norm": 0.009684831835329533, |
| "learning_rate": 9.717066498610673e-05, |
| "loss": 0.0008, |
| "step": 1521 |
| }, |
| { |
| "epoch": 4.919595959595959, |
| "grad_norm": 0.00765199726447463, |
| "learning_rate": 9.716316727087679e-05, |
| "loss": 0.0006, |
| "step": 1522 |
| }, |
| { |
| "epoch": 4.9228282828282826, |
| "grad_norm": 0.008175229653716087, |
| "learning_rate": 9.715565992438536e-05, |
| "loss": 0.0005, |
| "step": 1523 |
| }, |
| { |
| "epoch": 4.926060606060606, |
| "grad_norm": 0.005975640844553709, |
| "learning_rate": 9.714814294816554e-05, |
| "loss": 0.0005, |
| "step": 1524 |
| }, |
| { |
| "epoch": 4.929292929292929, |
| "grad_norm": 0.010827175341546535, |
| "learning_rate": 9.714061634375238e-05, |
| "loss": 0.0009, |
| "step": 1525 |
| }, |
| { |
| "epoch": 4.929292929292929, |
| "eval_loss": 0.0010831886902451515, |
| "eval_runtime": 16.3427, |
| "eval_samples_per_second": 6.119, |
| "eval_steps_per_second": 1.53, |
| "step": 1525 |
| }, |
| { |
| "epoch": 4.932525252525252, |
| "grad_norm": 0.007128015626221895, |
| "learning_rate": 9.713308011268289e-05, |
| "loss": 0.0006, |
| "step": 1526 |
| }, |
| { |
| "epoch": 4.935757575757576, |
| "grad_norm": 0.007858987897634506, |
| "learning_rate": 9.712553425649606e-05, |
| "loss": 0.0008, |
| "step": 1527 |
| }, |
| { |
| "epoch": 4.938989898989899, |
| "grad_norm": 0.007131603546440601, |
| "learning_rate": 9.711797877673285e-05, |
| "loss": 0.0006, |
| "step": 1528 |
| }, |
| { |
| "epoch": 4.942222222222222, |
| "grad_norm": 0.00486780097708106, |
| "learning_rate": 9.711041367493617e-05, |
| "loss": 0.0004, |
| "step": 1529 |
| }, |
| { |
| "epoch": 4.945454545454545, |
| "grad_norm": 0.006875573191791773, |
| "learning_rate": 9.710283895265089e-05, |
| "loss": 0.0008, |
| "step": 1530 |
| }, |
| { |
| "epoch": 4.948686868686869, |
| "grad_norm": 0.00844037625938654, |
| "learning_rate": 9.709525461142387e-05, |
| "loss": 0.0009, |
| "step": 1531 |
| }, |
| { |
| "epoch": 4.951919191919192, |
| "grad_norm": 0.008449964225292206, |
| "learning_rate": 9.708766065280391e-05, |
| "loss": 0.0006, |
| "step": 1532 |
| }, |
| { |
| "epoch": 4.955151515151515, |
| "grad_norm": 0.009186098352074623, |
| "learning_rate": 9.70800570783418e-05, |
| "loss": 0.001, |
| "step": 1533 |
| }, |
| { |
| "epoch": 4.958383838383838, |
| "grad_norm": 0.007868651300668716, |
| "learning_rate": 9.707244388959025e-05, |
| "loss": 0.0007, |
| "step": 1534 |
| }, |
| { |
| "epoch": 4.961616161616162, |
| "grad_norm": 0.007710058242082596, |
| "learning_rate": 9.706482108810398e-05, |
| "loss": 0.0005, |
| "step": 1535 |
| }, |
| { |
| "epoch": 4.964848484848485, |
| "grad_norm": 0.008696713484823704, |
| "learning_rate": 9.705718867543964e-05, |
| "loss": 0.0006, |
| "step": 1536 |
| }, |
| { |
| "epoch": 4.968080808080808, |
| "grad_norm": 0.006613404490053654, |
| "learning_rate": 9.704954665315589e-05, |
| "loss": 0.0006, |
| "step": 1537 |
| }, |
| { |
| "epoch": 4.971313131313131, |
| "grad_norm": 0.004911929834634066, |
| "learning_rate": 9.704189502281329e-05, |
| "loss": 0.0006, |
| "step": 1538 |
| }, |
| { |
| "epoch": 4.974545454545455, |
| "grad_norm": 0.007607612758874893, |
| "learning_rate": 9.703423378597438e-05, |
| "loss": 0.001, |
| "step": 1539 |
| }, |
| { |
| "epoch": 4.977777777777778, |
| "grad_norm": 0.008079234510660172, |
| "learning_rate": 9.70265629442037e-05, |
| "loss": 0.0008, |
| "step": 1540 |
| }, |
| { |
| "epoch": 4.981010101010101, |
| "grad_norm": 0.00701570138335228, |
| "learning_rate": 9.701888249906772e-05, |
| "loss": 0.0006, |
| "step": 1541 |
| }, |
| { |
| "epoch": 4.984242424242424, |
| "grad_norm": 0.0105473967269063, |
| "learning_rate": 9.701119245213486e-05, |
| "loss": 0.0007, |
| "step": 1542 |
| }, |
| { |
| "epoch": 4.987474747474748, |
| "grad_norm": 0.005615720059722662, |
| "learning_rate": 9.700349280497552e-05, |
| "loss": 0.0005, |
| "step": 1543 |
| }, |
| { |
| "epoch": 4.990707070707071, |
| "grad_norm": 0.005315994843840599, |
| "learning_rate": 9.699578355916206e-05, |
| "loss": 0.0006, |
| "step": 1544 |
| }, |
| { |
| "epoch": 4.993939393939394, |
| "grad_norm": 0.0044459691271185875, |
| "learning_rate": 9.69880647162688e-05, |
| "loss": 0.0004, |
| "step": 1545 |
| }, |
| { |
| "epoch": 4.997171717171717, |
| "grad_norm": 0.005914855748414993, |
| "learning_rate": 9.698033627787201e-05, |
| "loss": 0.0006, |
| "step": 1546 |
| }, |
| { |
| "epoch": 5.000404040404041, |
| "grad_norm": 0.012615597806870937, |
| "learning_rate": 9.697259824554996e-05, |
| "loss": 0.0012, |
| "step": 1547 |
| }, |
| { |
| "epoch": 5.003636363636364, |
| "grad_norm": 0.005166727118194103, |
| "learning_rate": 9.696485062088279e-05, |
| "loss": 0.0005, |
| "step": 1548 |
| }, |
| { |
| "epoch": 5.006868686868687, |
| "grad_norm": 0.004762967582792044, |
| "learning_rate": 9.695709340545268e-05, |
| "loss": 0.0005, |
| "step": 1549 |
| }, |
| { |
| "epoch": 5.01010101010101, |
| "grad_norm": 0.006077293772250414, |
| "learning_rate": 9.694932660084375e-05, |
| "loss": 0.0004, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.01010101010101, |
| "eval_loss": 0.0011314748553559184, |
| "eval_runtime": 16.3383, |
| "eval_samples_per_second": 6.121, |
| "eval_steps_per_second": 1.53, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.013333333333334, |
| "grad_norm": 0.006806342862546444, |
| "learning_rate": 9.694155020864207e-05, |
| "loss": 0.0004, |
| "step": 1551 |
| }, |
| { |
| "epoch": 5.016565656565657, |
| "grad_norm": 0.004313282202929258, |
| "learning_rate": 9.693376423043564e-05, |
| "loss": 0.0004, |
| "step": 1552 |
| }, |
| { |
| "epoch": 5.01979797979798, |
| "grad_norm": 0.005981628783047199, |
| "learning_rate": 9.692596866781447e-05, |
| "loss": 0.0004, |
| "step": 1553 |
| }, |
| { |
| "epoch": 5.023030303030303, |
| "grad_norm": 0.009414016269147396, |
| "learning_rate": 9.691816352237053e-05, |
| "loss": 0.0006, |
| "step": 1554 |
| }, |
| { |
| "epoch": 5.026262626262627, |
| "grad_norm": 0.005016586743295193, |
| "learning_rate": 9.691034879569767e-05, |
| "loss": 0.0005, |
| "step": 1555 |
| }, |
| { |
| "epoch": 5.02949494949495, |
| "grad_norm": 0.010389821603894234, |
| "learning_rate": 9.690252448939177e-05, |
| "loss": 0.0006, |
| "step": 1556 |
| }, |
| { |
| "epoch": 5.032727272727273, |
| "grad_norm": 0.010016842745244503, |
| "learning_rate": 9.689469060505064e-05, |
| "loss": 0.0006, |
| "step": 1557 |
| }, |
| { |
| "epoch": 5.0359595959595955, |
| "grad_norm": 0.005418992601335049, |
| "learning_rate": 9.688684714427406e-05, |
| "loss": 0.0004, |
| "step": 1558 |
| }, |
| { |
| "epoch": 5.039191919191919, |
| "grad_norm": 0.010485680773854256, |
| "learning_rate": 9.687899410866374e-05, |
| "loss": 0.001, |
| "step": 1559 |
| }, |
| { |
| "epoch": 5.042424242424242, |
| "grad_norm": 0.0053639658726751804, |
| "learning_rate": 9.687113149982339e-05, |
| "loss": 0.0003, |
| "step": 1560 |
| }, |
| { |
| "epoch": 5.045656565656565, |
| "grad_norm": 0.008133779279887676, |
| "learning_rate": 9.68632593193586e-05, |
| "loss": 0.0006, |
| "step": 1561 |
| }, |
| { |
| "epoch": 5.0488888888888885, |
| "grad_norm": 0.005261108744889498, |
| "learning_rate": 9.6855377568877e-05, |
| "loss": 0.0003, |
| "step": 1562 |
| }, |
| { |
| "epoch": 5.052121212121212, |
| "grad_norm": 0.0037532998248934746, |
| "learning_rate": 9.68474862499881e-05, |
| "loss": 0.0003, |
| "step": 1563 |
| }, |
| { |
| "epoch": 5.055353535353535, |
| "grad_norm": 0.006041843444108963, |
| "learning_rate": 9.683958536430341e-05, |
| "loss": 0.0004, |
| "step": 1564 |
| }, |
| { |
| "epoch": 5.058585858585858, |
| "grad_norm": 0.005084019619971514, |
| "learning_rate": 9.68316749134364e-05, |
| "loss": 0.0004, |
| "step": 1565 |
| }, |
| { |
| "epoch": 5.0618181818181816, |
| "grad_norm": 0.006829909980297089, |
| "learning_rate": 9.682375489900246e-05, |
| "loss": 0.0007, |
| "step": 1566 |
| }, |
| { |
| "epoch": 5.065050505050505, |
| "grad_norm": 0.007718116044998169, |
| "learning_rate": 9.681582532261895e-05, |
| "loss": 0.0006, |
| "step": 1567 |
| }, |
| { |
| "epoch": 5.068282828282828, |
| "grad_norm": 0.006197106558829546, |
| "learning_rate": 9.680788618590517e-05, |
| "loss": 0.0006, |
| "step": 1568 |
| }, |
| { |
| "epoch": 5.071515151515151, |
| "grad_norm": 0.0031667128205299377, |
| "learning_rate": 9.679993749048241e-05, |
| "loss": 0.0003, |
| "step": 1569 |
| }, |
| { |
| "epoch": 5.074747474747475, |
| "grad_norm": 0.00838505569845438, |
| "learning_rate": 9.679197923797385e-05, |
| "loss": 0.0007, |
| "step": 1570 |
| }, |
| { |
| "epoch": 5.077979797979798, |
| "grad_norm": 0.0062113236635923386, |
| "learning_rate": 9.678401143000469e-05, |
| "loss": 0.0004, |
| "step": 1571 |
| }, |
| { |
| "epoch": 5.081212121212121, |
| "grad_norm": 0.008421992883086205, |
| "learning_rate": 9.677603406820203e-05, |
| "loss": 0.0005, |
| "step": 1572 |
| }, |
| { |
| "epoch": 5.084444444444444, |
| "grad_norm": 0.007388005498796701, |
| "learning_rate": 9.676804715419494e-05, |
| "loss": 0.0006, |
| "step": 1573 |
| }, |
| { |
| "epoch": 5.087676767676768, |
| "grad_norm": 0.00611676461994648, |
| "learning_rate": 9.676005068961444e-05, |
| "loss": 0.0005, |
| "step": 1574 |
| }, |
| { |
| "epoch": 5.090909090909091, |
| "grad_norm": 0.005580159369856119, |
| "learning_rate": 9.675204467609351e-05, |
| "loss": 0.0003, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.090909090909091, |
| "eval_loss": 0.0011133438674733043, |
| "eval_runtime": 16.3751, |
| "eval_samples_per_second": 6.107, |
| "eval_steps_per_second": 1.527, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.094141414141414, |
| "grad_norm": 0.008742348290979862, |
| "learning_rate": 9.674402911526706e-05, |
| "loss": 0.0008, |
| "step": 1576 |
| }, |
| { |
| "epoch": 5.097373737373737, |
| "grad_norm": 0.006156270392239094, |
| "learning_rate": 9.673600400877197e-05, |
| "loss": 0.0004, |
| "step": 1577 |
| }, |
| { |
| "epoch": 5.100606060606061, |
| "grad_norm": 0.004946860484778881, |
| "learning_rate": 9.672796935824704e-05, |
| "loss": 0.0004, |
| "step": 1578 |
| }, |
| { |
| "epoch": 5.103838383838384, |
| "grad_norm": 0.004557866603136063, |
| "learning_rate": 9.671992516533306e-05, |
| "loss": 0.0004, |
| "step": 1579 |
| }, |
| { |
| "epoch": 5.107070707070707, |
| "grad_norm": 0.006562303751707077, |
| "learning_rate": 9.671187143167273e-05, |
| "loss": 0.0005, |
| "step": 1580 |
| }, |
| { |
| "epoch": 5.11030303030303, |
| "grad_norm": 0.0077783651649951935, |
| "learning_rate": 9.670380815891071e-05, |
| "loss": 0.0006, |
| "step": 1581 |
| }, |
| { |
| "epoch": 5.113535353535354, |
| "grad_norm": 0.00654107891023159, |
| "learning_rate": 9.669573534869363e-05, |
| "loss": 0.0005, |
| "step": 1582 |
| }, |
| { |
| "epoch": 5.116767676767677, |
| "grad_norm": 0.0059667970053851604, |
| "learning_rate": 9.668765300267006e-05, |
| "loss": 0.0005, |
| "step": 1583 |
| }, |
| { |
| "epoch": 5.12, |
| "grad_norm": 0.005632336251437664, |
| "learning_rate": 9.667956112249049e-05, |
| "loss": 0.0005, |
| "step": 1584 |
| }, |
| { |
| "epoch": 5.123232323232323, |
| "grad_norm": 0.00457612844184041, |
| "learning_rate": 9.667145970980735e-05, |
| "loss": 0.0005, |
| "step": 1585 |
| }, |
| { |
| "epoch": 5.126464646464647, |
| "grad_norm": 0.0033729569986462593, |
| "learning_rate": 9.66633487662751e-05, |
| "loss": 0.0003, |
| "step": 1586 |
| }, |
| { |
| "epoch": 5.12969696969697, |
| "grad_norm": 0.007405257783830166, |
| "learning_rate": 9.665522829355005e-05, |
| "loss": 0.0005, |
| "step": 1587 |
| }, |
| { |
| "epoch": 5.132929292929293, |
| "grad_norm": 0.007421514950692654, |
| "learning_rate": 9.664709829329051e-05, |
| "loss": 0.0005, |
| "step": 1588 |
| }, |
| { |
| "epoch": 5.136161616161616, |
| "grad_norm": 0.008489915169775486, |
| "learning_rate": 9.66389587671567e-05, |
| "loss": 0.0005, |
| "step": 1589 |
| }, |
| { |
| "epoch": 5.13939393939394, |
| "grad_norm": 0.006871935911476612, |
| "learning_rate": 9.663080971681081e-05, |
| "loss": 0.0005, |
| "step": 1590 |
| }, |
| { |
| "epoch": 5.142626262626263, |
| "grad_norm": 0.002959504025056958, |
| "learning_rate": 9.662265114391698e-05, |
| "loss": 0.0003, |
| "step": 1591 |
| }, |
| { |
| "epoch": 5.145858585858586, |
| "grad_norm": 0.008853099308907986, |
| "learning_rate": 9.661448305014129e-05, |
| "loss": 0.0006, |
| "step": 1592 |
| }, |
| { |
| "epoch": 5.149090909090909, |
| "grad_norm": 0.010997905395925045, |
| "learning_rate": 9.660630543715174e-05, |
| "loss": 0.0007, |
| "step": 1593 |
| }, |
| { |
| "epoch": 5.152323232323233, |
| "grad_norm": 0.011188059113919735, |
| "learning_rate": 9.659811830661831e-05, |
| "loss": 0.0007, |
| "step": 1594 |
| }, |
| { |
| "epoch": 5.155555555555556, |
| "grad_norm": 0.008755922317504883, |
| "learning_rate": 9.65899216602129e-05, |
| "loss": 0.0008, |
| "step": 1595 |
| }, |
| { |
| "epoch": 5.158787878787879, |
| "grad_norm": 0.004408243112266064, |
| "learning_rate": 9.658171549960935e-05, |
| "loss": 0.0005, |
| "step": 1596 |
| }, |
| { |
| "epoch": 5.162020202020202, |
| "grad_norm": 0.006124711595475674, |
| "learning_rate": 9.657349982648348e-05, |
| "loss": 0.0005, |
| "step": 1597 |
| }, |
| { |
| "epoch": 5.165252525252526, |
| "grad_norm": 0.00437605194747448, |
| "learning_rate": 9.656527464251298e-05, |
| "loss": 0.0004, |
| "step": 1598 |
| }, |
| { |
| "epoch": 5.168484848484849, |
| "grad_norm": 0.0057185799814760685, |
| "learning_rate": 9.655703994937756e-05, |
| "loss": 0.0005, |
| "step": 1599 |
| }, |
| { |
| "epoch": 5.171717171717171, |
| "grad_norm": 0.008006410673260689, |
| "learning_rate": 9.654879574875884e-05, |
| "loss": 0.0007, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.171717171717171, |
| "eval_loss": 0.0009937735740095377, |
| "eval_runtime": 16.3715, |
| "eval_samples_per_second": 6.108, |
| "eval_steps_per_second": 1.527, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.1749494949494945, |
| "grad_norm": 0.005054215434938669, |
| "learning_rate": 9.654054204234035e-05, |
| "loss": 0.0004, |
| "step": 1601 |
| }, |
| { |
| "epoch": 5.178181818181818, |
| "grad_norm": 0.005975916516035795, |
| "learning_rate": 9.653227883180765e-05, |
| "loss": 0.0007, |
| "step": 1602 |
| }, |
| { |
| "epoch": 5.181414141414141, |
| "grad_norm": 0.006343401037156582, |
| "learning_rate": 9.65240061188481e-05, |
| "loss": 0.0005, |
| "step": 1603 |
| }, |
| { |
| "epoch": 5.184646464646464, |
| "grad_norm": 0.008079957216978073, |
| "learning_rate": 9.651572390515118e-05, |
| "loss": 0.0005, |
| "step": 1604 |
| }, |
| { |
| "epoch": 5.1878787878787875, |
| "grad_norm": 0.005014673341065645, |
| "learning_rate": 9.650743219240813e-05, |
| "loss": 0.0004, |
| "step": 1605 |
| }, |
| { |
| "epoch": 5.191111111111111, |
| "grad_norm": 0.008054779842495918, |
| "learning_rate": 9.649913098231226e-05, |
| "loss": 0.0007, |
| "step": 1606 |
| }, |
| { |
| "epoch": 5.194343434343434, |
| "grad_norm": 0.006263330578804016, |
| "learning_rate": 9.649082027655876e-05, |
| "loss": 0.0005, |
| "step": 1607 |
| }, |
| { |
| "epoch": 5.197575757575757, |
| "grad_norm": 0.0038268077187240124, |
| "learning_rate": 9.648250007684476e-05, |
| "loss": 0.0004, |
| "step": 1608 |
| }, |
| { |
| "epoch": 5.2008080808080805, |
| "grad_norm": 0.010677930898964405, |
| "learning_rate": 9.647417038486935e-05, |
| "loss": 0.0006, |
| "step": 1609 |
| }, |
| { |
| "epoch": 5.204040404040404, |
| "grad_norm": 0.005933019332587719, |
| "learning_rate": 9.646583120233357e-05, |
| "loss": 0.0005, |
| "step": 1610 |
| }, |
| { |
| "epoch": 5.207272727272727, |
| "grad_norm": 0.013155707158148289, |
| "learning_rate": 9.645748253094034e-05, |
| "loss": 0.0005, |
| "step": 1611 |
| }, |
| { |
| "epoch": 5.21050505050505, |
| "grad_norm": 0.008887146599590778, |
| "learning_rate": 9.644912437239457e-05, |
| "loss": 0.0007, |
| "step": 1612 |
| }, |
| { |
| "epoch": 5.213737373737374, |
| "grad_norm": 0.003811226924881339, |
| "learning_rate": 9.644075672840312e-05, |
| "loss": 0.0003, |
| "step": 1613 |
| }, |
| { |
| "epoch": 5.216969696969697, |
| "grad_norm": 0.004529445432126522, |
| "learning_rate": 9.64323796006747e-05, |
| "loss": 0.0005, |
| "step": 1614 |
| }, |
| { |
| "epoch": 5.22020202020202, |
| "grad_norm": 0.008375285193324089, |
| "learning_rate": 9.642399299092006e-05, |
| "loss": 0.0006, |
| "step": 1615 |
| }, |
| { |
| "epoch": 5.223434343434343, |
| "grad_norm": 0.006074436008930206, |
| "learning_rate": 9.641559690085184e-05, |
| "loss": 0.0006, |
| "step": 1616 |
| }, |
| { |
| "epoch": 5.226666666666667, |
| "grad_norm": 0.011987969279289246, |
| "learning_rate": 9.640719133218461e-05, |
| "loss": 0.0005, |
| "step": 1617 |
| }, |
| { |
| "epoch": 5.22989898989899, |
| "grad_norm": 0.006978850811719894, |
| "learning_rate": 9.639877628663489e-05, |
| "loss": 0.0006, |
| "step": 1618 |
| }, |
| { |
| "epoch": 5.233131313131313, |
| "grad_norm": 0.008396327495574951, |
| "learning_rate": 9.639035176592111e-05, |
| "loss": 0.0007, |
| "step": 1619 |
| }, |
| { |
| "epoch": 5.236363636363636, |
| "grad_norm": 0.006504751276224852, |
| "learning_rate": 9.638191777176367e-05, |
| "loss": 0.0005, |
| "step": 1620 |
| }, |
| { |
| "epoch": 5.23959595959596, |
| "grad_norm": 0.005509985610842705, |
| "learning_rate": 9.637347430588489e-05, |
| "loss": 0.0003, |
| "step": 1621 |
| }, |
| { |
| "epoch": 5.242828282828283, |
| "grad_norm": 0.004340514540672302, |
| "learning_rate": 9.6365021370009e-05, |
| "loss": 0.0005, |
| "step": 1622 |
| }, |
| { |
| "epoch": 5.246060606060606, |
| "grad_norm": 0.00605999818071723, |
| "learning_rate": 9.635655896586221e-05, |
| "loss": 0.0004, |
| "step": 1623 |
| }, |
| { |
| "epoch": 5.249292929292929, |
| "grad_norm": 0.003610120853409171, |
| "learning_rate": 9.634808709517267e-05, |
| "loss": 0.0003, |
| "step": 1624 |
| }, |
| { |
| "epoch": 5.252525252525253, |
| "grad_norm": 0.00852506048977375, |
| "learning_rate": 9.633960575967036e-05, |
| "loss": 0.0005, |
| "step": 1625 |
| }, |
| { |
| "epoch": 5.252525252525253, |
| "eval_loss": 0.0011495515936985612, |
| "eval_runtime": 16.39, |
| "eval_samples_per_second": 6.101, |
| "eval_steps_per_second": 1.525, |
| "step": 1625 |
| }, |
| { |
| "epoch": 5.255757575757576, |
| "grad_norm": 0.0051088398322463036, |
| "learning_rate": 9.633111496108733e-05, |
| "loss": 0.0004, |
| "step": 1626 |
| }, |
| { |
| "epoch": 5.258989898989899, |
| "grad_norm": 0.005684142466634512, |
| "learning_rate": 9.632261470115746e-05, |
| "loss": 0.0006, |
| "step": 1627 |
| }, |
| { |
| "epoch": 5.262222222222222, |
| "grad_norm": 0.007544459775090218, |
| "learning_rate": 9.631410498161662e-05, |
| "loss": 0.0004, |
| "step": 1628 |
| }, |
| { |
| "epoch": 5.265454545454546, |
| "grad_norm": 0.006928099785000086, |
| "learning_rate": 9.630558580420258e-05, |
| "loss": 0.0005, |
| "step": 1629 |
| }, |
| { |
| "epoch": 5.268686868686869, |
| "grad_norm": 0.011976206675171852, |
| "learning_rate": 9.629705717065507e-05, |
| "loss": 0.0007, |
| "step": 1630 |
| }, |
| { |
| "epoch": 5.271919191919192, |
| "grad_norm": 0.007364520337432623, |
| "learning_rate": 9.628851908271572e-05, |
| "loss": 0.0006, |
| "step": 1631 |
| }, |
| { |
| "epoch": 5.275151515151515, |
| "grad_norm": 0.008781514130532742, |
| "learning_rate": 9.627997154212812e-05, |
| "loss": 0.0005, |
| "step": 1632 |
| }, |
| { |
| "epoch": 5.278383838383839, |
| "grad_norm": 0.007252220064401627, |
| "learning_rate": 9.627141455063777e-05, |
| "loss": 0.0005, |
| "step": 1633 |
| }, |
| { |
| "epoch": 5.281616161616162, |
| "grad_norm": 0.0061063519679009914, |
| "learning_rate": 9.626284810999209e-05, |
| "loss": 0.0004, |
| "step": 1634 |
| }, |
| { |
| "epoch": 5.284848484848485, |
| "grad_norm": 0.006363790947943926, |
| "learning_rate": 9.625427222194048e-05, |
| "loss": 0.0005, |
| "step": 1635 |
| }, |
| { |
| "epoch": 5.288080808080808, |
| "grad_norm": 0.0046599674969911575, |
| "learning_rate": 9.624568688823421e-05, |
| "loss": 0.0003, |
| "step": 1636 |
| }, |
| { |
| "epoch": 5.291313131313132, |
| "grad_norm": 0.007841712795197964, |
| "learning_rate": 9.62370921106265e-05, |
| "loss": 0.0006, |
| "step": 1637 |
| }, |
| { |
| "epoch": 5.294545454545455, |
| "grad_norm": 0.00554584851488471, |
| "learning_rate": 9.622848789087252e-05, |
| "loss": 0.0004, |
| "step": 1638 |
| }, |
| { |
| "epoch": 5.297777777777778, |
| "grad_norm": 0.012035543099045753, |
| "learning_rate": 9.621987423072933e-05, |
| "loss": 0.0008, |
| "step": 1639 |
| }, |
| { |
| "epoch": 5.301010101010101, |
| "grad_norm": 0.008659287355840206, |
| "learning_rate": 9.621125113195597e-05, |
| "loss": 0.0003, |
| "step": 1640 |
| }, |
| { |
| "epoch": 5.304242424242425, |
| "grad_norm": 0.006295360624790192, |
| "learning_rate": 9.620261859631336e-05, |
| "loss": 0.0006, |
| "step": 1641 |
| }, |
| { |
| "epoch": 5.307474747474748, |
| "grad_norm": 0.008987173438072205, |
| "learning_rate": 9.619397662556435e-05, |
| "loss": 0.0007, |
| "step": 1642 |
| }, |
| { |
| "epoch": 5.310707070707071, |
| "grad_norm": 0.009222692809998989, |
| "learning_rate": 9.618532522147374e-05, |
| "loss": 0.0007, |
| "step": 1643 |
| }, |
| { |
| "epoch": 5.313939393939394, |
| "grad_norm": 0.003340350231155753, |
| "learning_rate": 9.617666438580823e-05, |
| "loss": 0.0003, |
| "step": 1644 |
| }, |
| { |
| "epoch": 5.317171717171717, |
| "grad_norm": 0.006181197706609964, |
| "learning_rate": 9.61679941203365e-05, |
| "loss": 0.0007, |
| "step": 1645 |
| }, |
| { |
| "epoch": 5.32040404040404, |
| "grad_norm": 0.0098760686814785, |
| "learning_rate": 9.615931442682911e-05, |
| "loss": 0.0008, |
| "step": 1646 |
| }, |
| { |
| "epoch": 5.323636363636363, |
| "grad_norm": 0.007327872794121504, |
| "learning_rate": 9.615062530705851e-05, |
| "loss": 0.0005, |
| "step": 1647 |
| }, |
| { |
| "epoch": 5.3268686868686865, |
| "grad_norm": 0.011561273597180843, |
| "learning_rate": 9.614192676279917e-05, |
| "loss": 0.0009, |
| "step": 1648 |
| }, |
| { |
| "epoch": 5.33010101010101, |
| "grad_norm": 0.006350391544401646, |
| "learning_rate": 9.61332187958274e-05, |
| "loss": 0.0005, |
| "step": 1649 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "grad_norm": 0.007870431058108807, |
| "learning_rate": 9.612450140792148e-05, |
| "loss": 0.0005, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "eval_loss": 0.0010055277962237597, |
| "eval_runtime": 16.3732, |
| "eval_samples_per_second": 6.108, |
| "eval_steps_per_second": 1.527, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.336565656565656, |
| "grad_norm": 0.006069686263799667, |
| "learning_rate": 9.61157746008616e-05, |
| "loss": 0.0004, |
| "step": 1651 |
| }, |
| { |
| "epoch": 5.3397979797979795, |
| "grad_norm": 0.007049136329442263, |
| "learning_rate": 9.610703837642988e-05, |
| "loss": 0.0008, |
| "step": 1652 |
| }, |
| { |
| "epoch": 5.343030303030303, |
| "grad_norm": 0.00800009910017252, |
| "learning_rate": 9.609829273641034e-05, |
| "loss": 0.001, |
| "step": 1653 |
| }, |
| { |
| "epoch": 5.346262626262626, |
| "grad_norm": 0.008147126995027065, |
| "learning_rate": 9.608953768258894e-05, |
| "loss": 0.0007, |
| "step": 1654 |
| }, |
| { |
| "epoch": 5.349494949494949, |
| "grad_norm": 0.004491726867854595, |
| "learning_rate": 9.60807732167536e-05, |
| "loss": 0.0004, |
| "step": 1655 |
| }, |
| { |
| "epoch": 5.352727272727273, |
| "grad_norm": 0.0029130817856639624, |
| "learning_rate": 9.60719993406941e-05, |
| "loss": 0.0003, |
| "step": 1656 |
| }, |
| { |
| "epoch": 5.355959595959596, |
| "grad_norm": 0.009365669451653957, |
| "learning_rate": 9.606321605620215e-05, |
| "loss": 0.0008, |
| "step": 1657 |
| }, |
| { |
| "epoch": 5.359191919191919, |
| "grad_norm": 0.005305276717990637, |
| "learning_rate": 9.605442336507142e-05, |
| "loss": 0.0005, |
| "step": 1658 |
| }, |
| { |
| "epoch": 5.362424242424242, |
| "grad_norm": 0.006028775591403246, |
| "learning_rate": 9.604562126909748e-05, |
| "loss": 0.0004, |
| "step": 1659 |
| }, |
| { |
| "epoch": 5.365656565656566, |
| "grad_norm": 0.005635719280689955, |
| "learning_rate": 9.60368097700778e-05, |
| "loss": 0.0006, |
| "step": 1660 |
| }, |
| { |
| "epoch": 5.368888888888889, |
| "grad_norm": 0.005850364454090595, |
| "learning_rate": 9.60279888698118e-05, |
| "loss": 0.0004, |
| "step": 1661 |
| }, |
| { |
| "epoch": 5.372121212121212, |
| "grad_norm": 0.008651700802147388, |
| "learning_rate": 9.60191585701008e-05, |
| "loss": 0.0005, |
| "step": 1662 |
| }, |
| { |
| "epoch": 5.375353535353535, |
| "grad_norm": 0.007407501805573702, |
| "learning_rate": 9.601031887274808e-05, |
| "loss": 0.0005, |
| "step": 1663 |
| }, |
| { |
| "epoch": 5.378585858585859, |
| "grad_norm": 0.006025203503668308, |
| "learning_rate": 9.600146977955878e-05, |
| "loss": 0.0007, |
| "step": 1664 |
| }, |
| { |
| "epoch": 5.381818181818182, |
| "grad_norm": 0.008005495183169842, |
| "learning_rate": 9.599261129234e-05, |
| "loss": 0.0006, |
| "step": 1665 |
| }, |
| { |
| "epoch": 5.385050505050505, |
| "grad_norm": 0.005814394447952509, |
| "learning_rate": 9.598374341290073e-05, |
| "loss": 0.0006, |
| "step": 1666 |
| }, |
| { |
| "epoch": 5.388282828282828, |
| "grad_norm": 0.005213486962020397, |
| "learning_rate": 9.59748661430519e-05, |
| "loss": 0.0005, |
| "step": 1667 |
| }, |
| { |
| "epoch": 5.391515151515152, |
| "grad_norm": 0.008626583032310009, |
| "learning_rate": 9.596597948460634e-05, |
| "loss": 0.0007, |
| "step": 1668 |
| }, |
| { |
| "epoch": 5.394747474747475, |
| "grad_norm": 0.0064296601340174675, |
| "learning_rate": 9.595708343937885e-05, |
| "loss": 0.0007, |
| "step": 1669 |
| }, |
| { |
| "epoch": 5.397979797979798, |
| "grad_norm": 0.005717860069125891, |
| "learning_rate": 9.594817800918606e-05, |
| "loss": 0.0006, |
| "step": 1670 |
| }, |
| { |
| "epoch": 5.401212121212121, |
| "grad_norm": 0.006591845769435167, |
| "learning_rate": 9.593926319584658e-05, |
| "loss": 0.0006, |
| "step": 1671 |
| }, |
| { |
| "epoch": 5.404444444444445, |
| "grad_norm": 0.005862986668944359, |
| "learning_rate": 9.59303390011809e-05, |
| "loss": 0.0005, |
| "step": 1672 |
| }, |
| { |
| "epoch": 5.407676767676768, |
| "grad_norm": 0.004463293123990297, |
| "learning_rate": 9.592140542701147e-05, |
| "loss": 0.0003, |
| "step": 1673 |
| }, |
| { |
| "epoch": 5.410909090909091, |
| "grad_norm": 0.007692611776292324, |
| "learning_rate": 9.591246247516262e-05, |
| "loss": 0.0005, |
| "step": 1674 |
| }, |
| { |
| "epoch": 5.414141414141414, |
| "grad_norm": 0.006440794561058283, |
| "learning_rate": 9.590351014746059e-05, |
| "loss": 0.0004, |
| "step": 1675 |
| }, |
| { |
| "epoch": 5.414141414141414, |
| "eval_loss": 0.0010094997705891728, |
| "eval_runtime": 16.3714, |
| "eval_samples_per_second": 6.108, |
| "eval_steps_per_second": 1.527, |
| "step": 1675 |
| }, |
| { |
| "epoch": 5.417373737373738, |
| "grad_norm": 0.005647646728903055, |
| "learning_rate": 9.589454844573356e-05, |
| "loss": 0.0005, |
| "step": 1676 |
| }, |
| { |
| "epoch": 5.420606060606061, |
| "grad_norm": 0.004992474801838398, |
| "learning_rate": 9.588557737181161e-05, |
| "loss": 0.0003, |
| "step": 1677 |
| }, |
| { |
| "epoch": 5.423838383838384, |
| "grad_norm": 0.007492889184504747, |
| "learning_rate": 9.587659692752672e-05, |
| "loss": 0.0005, |
| "step": 1678 |
| }, |
| { |
| "epoch": 5.427070707070707, |
| "grad_norm": 0.006619749125093222, |
| "learning_rate": 9.586760711471284e-05, |
| "loss": 0.0005, |
| "step": 1679 |
| }, |
| { |
| "epoch": 5.430303030303031, |
| "grad_norm": 0.00738243293017149, |
| "learning_rate": 9.585860793520577e-05, |
| "loss": 0.0007, |
| "step": 1680 |
| }, |
| { |
| "epoch": 5.433535353535354, |
| "grad_norm": 0.010203221812844276, |
| "learning_rate": 9.584959939084323e-05, |
| "loss": 0.0007, |
| "step": 1681 |
| }, |
| { |
| "epoch": 5.436767676767677, |
| "grad_norm": 0.006132564041763544, |
| "learning_rate": 9.584058148346491e-05, |
| "loss": 0.0005, |
| "step": 1682 |
| }, |
| { |
| "epoch": 5.44, |
| "grad_norm": 0.006064686458557844, |
| "learning_rate": 9.583155421491232e-05, |
| "loss": 0.0005, |
| "step": 1683 |
| }, |
| { |
| "epoch": 5.443232323232324, |
| "grad_norm": 0.007469397038221359, |
| "learning_rate": 9.582251758702897e-05, |
| "loss": 0.0004, |
| "step": 1684 |
| }, |
| { |
| "epoch": 5.446464646464646, |
| "grad_norm": 0.007607720792293549, |
| "learning_rate": 9.581347160166023e-05, |
| "loss": 0.0005, |
| "step": 1685 |
| }, |
| { |
| "epoch": 5.449696969696969, |
| "grad_norm": 0.004066807217895985, |
| "learning_rate": 9.580441626065338e-05, |
| "loss": 0.0004, |
| "step": 1686 |
| }, |
| { |
| "epoch": 5.4529292929292925, |
| "grad_norm": 0.006586882285773754, |
| "learning_rate": 9.579535156585766e-05, |
| "loss": 0.0005, |
| "step": 1687 |
| }, |
| { |
| "epoch": 5.456161616161616, |
| "grad_norm": 0.008844728581607342, |
| "learning_rate": 9.578627751912414e-05, |
| "loss": 0.0004, |
| "step": 1688 |
| }, |
| { |
| "epoch": 5.459393939393939, |
| "grad_norm": 0.005195740610361099, |
| "learning_rate": 9.57771941223059e-05, |
| "loss": 0.0004, |
| "step": 1689 |
| }, |
| { |
| "epoch": 5.462626262626262, |
| "grad_norm": 0.004741475451737642, |
| "learning_rate": 9.576810137725782e-05, |
| "loss": 0.0005, |
| "step": 1690 |
| }, |
| { |
| "epoch": 5.4658585858585855, |
| "grad_norm": 0.007326944265514612, |
| "learning_rate": 9.575899928583678e-05, |
| "loss": 0.0004, |
| "step": 1691 |
| }, |
| { |
| "epoch": 5.469090909090909, |
| "grad_norm": 0.007390595972537994, |
| "learning_rate": 9.574988784990152e-05, |
| "loss": 0.0006, |
| "step": 1692 |
| }, |
| { |
| "epoch": 5.472323232323232, |
| "grad_norm": 0.004650065675377846, |
| "learning_rate": 9.574076707131269e-05, |
| "loss": 0.0005, |
| "step": 1693 |
| }, |
| { |
| "epoch": 5.475555555555555, |
| "grad_norm": 0.0051221102476119995, |
| "learning_rate": 9.573163695193287e-05, |
| "loss": 0.0004, |
| "step": 1694 |
| }, |
| { |
| "epoch": 5.4787878787878785, |
| "grad_norm": 0.007180133368819952, |
| "learning_rate": 9.572249749362652e-05, |
| "loss": 0.0006, |
| "step": 1695 |
| }, |
| { |
| "epoch": 5.482020202020202, |
| "grad_norm": 0.005039013456553221, |
| "learning_rate": 9.571334869826006e-05, |
| "loss": 0.0005, |
| "step": 1696 |
| }, |
| { |
| "epoch": 5.485252525252525, |
| "grad_norm": 0.0040956344455480576, |
| "learning_rate": 9.570419056770173e-05, |
| "loss": 0.0005, |
| "step": 1697 |
| }, |
| { |
| "epoch": 5.488484848484848, |
| "grad_norm": 0.006832249462604523, |
| "learning_rate": 9.569502310382176e-05, |
| "loss": 0.0004, |
| "step": 1698 |
| }, |
| { |
| "epoch": 5.4917171717171716, |
| "grad_norm": 0.005776044446974993, |
| "learning_rate": 9.568584630849224e-05, |
| "loss": 0.0006, |
| "step": 1699 |
| }, |
| { |
| "epoch": 5.494949494949495, |
| "grad_norm": 0.0057342336513102055, |
| "learning_rate": 9.567666018358718e-05, |
| "loss": 0.0005, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.494949494949495, |
| "eval_loss": 0.0010332745732739568, |
| "eval_runtime": 16.3211, |
| "eval_samples_per_second": 6.127, |
| "eval_steps_per_second": 1.532, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.498181818181818, |
| "grad_norm": 0.004787544719874859, |
| "learning_rate": 9.566746473098249e-05, |
| "loss": 0.0005, |
| "step": 1701 |
| }, |
| { |
| "epoch": 5.501414141414141, |
| "grad_norm": 0.005645860452204943, |
| "learning_rate": 9.565825995255599e-05, |
| "loss": 0.0005, |
| "step": 1702 |
| }, |
| { |
| "epoch": 5.504646464646465, |
| "grad_norm": 0.0058639501221477985, |
| "learning_rate": 9.56490458501874e-05, |
| "loss": 0.0005, |
| "step": 1703 |
| }, |
| { |
| "epoch": 5.507878787878788, |
| "grad_norm": 0.006462681572884321, |
| "learning_rate": 9.563982242575834e-05, |
| "loss": 0.0004, |
| "step": 1704 |
| }, |
| { |
| "epoch": 5.511111111111111, |
| "grad_norm": 0.005385414231568575, |
| "learning_rate": 9.563058968115235e-05, |
| "loss": 0.0004, |
| "step": 1705 |
| }, |
| { |
| "epoch": 5.514343434343434, |
| "grad_norm": 0.009829330258071423, |
| "learning_rate": 9.562134761825486e-05, |
| "loss": 0.0004, |
| "step": 1706 |
| }, |
| { |
| "epoch": 5.517575757575758, |
| "grad_norm": 0.004105670377612114, |
| "learning_rate": 9.56120962389532e-05, |
| "loss": 0.0003, |
| "step": 1707 |
| }, |
| { |
| "epoch": 5.520808080808081, |
| "grad_norm": 0.007424734067171812, |
| "learning_rate": 9.56028355451366e-05, |
| "loss": 0.0006, |
| "step": 1708 |
| }, |
| { |
| "epoch": 5.524040404040404, |
| "grad_norm": 0.00929220300167799, |
| "learning_rate": 9.559356553869623e-05, |
| "loss": 0.0006, |
| "step": 1709 |
| }, |
| { |
| "epoch": 5.527272727272727, |
| "grad_norm": 0.00830656848847866, |
| "learning_rate": 9.558428622152509e-05, |
| "loss": 0.0007, |
| "step": 1710 |
| }, |
| { |
| "epoch": 5.530505050505051, |
| "grad_norm": 0.007593775168061256, |
| "learning_rate": 9.557499759551816e-05, |
| "loss": 0.0007, |
| "step": 1711 |
| }, |
| { |
| "epoch": 5.533737373737374, |
| "grad_norm": 0.006107932887971401, |
| "learning_rate": 9.556569966257227e-05, |
| "loss": 0.0006, |
| "step": 1712 |
| }, |
| { |
| "epoch": 5.536969696969697, |
| "grad_norm": 0.006120285019278526, |
| "learning_rate": 9.555639242458617e-05, |
| "loss": 0.0007, |
| "step": 1713 |
| }, |
| { |
| "epoch": 5.54020202020202, |
| "grad_norm": 0.006700373720377684, |
| "learning_rate": 9.55470758834605e-05, |
| "loss": 0.0006, |
| "step": 1714 |
| }, |
| { |
| "epoch": 5.543434343434344, |
| "grad_norm": 0.008279255591332912, |
| "learning_rate": 9.553775004109781e-05, |
| "loss": 0.0007, |
| "step": 1715 |
| }, |
| { |
| "epoch": 5.546666666666667, |
| "grad_norm": 0.005448332987725735, |
| "learning_rate": 9.552841489940252e-05, |
| "loss": 0.0006, |
| "step": 1716 |
| }, |
| { |
| "epoch": 5.54989898989899, |
| "grad_norm": 0.007369070313870907, |
| "learning_rate": 9.5519070460281e-05, |
| "loss": 0.0006, |
| "step": 1717 |
| }, |
| { |
| "epoch": 5.553131313131313, |
| "grad_norm": 0.00597840640693903, |
| "learning_rate": 9.55097167256415e-05, |
| "loss": 0.0004, |
| "step": 1718 |
| }, |
| { |
| "epoch": 5.556363636363637, |
| "grad_norm": 0.003714159829542041, |
| "learning_rate": 9.550035369739416e-05, |
| "loss": 0.0004, |
| "step": 1719 |
| }, |
| { |
| "epoch": 5.55959595959596, |
| "grad_norm": 0.005237117875367403, |
| "learning_rate": 9.549098137745098e-05, |
| "loss": 0.0004, |
| "step": 1720 |
| }, |
| { |
| "epoch": 5.562828282828283, |
| "grad_norm": 0.0072378539480268955, |
| "learning_rate": 9.548159976772592e-05, |
| "loss": 0.0004, |
| "step": 1721 |
| }, |
| { |
| "epoch": 5.566060606060606, |
| "grad_norm": 0.004684271290898323, |
| "learning_rate": 9.547220887013482e-05, |
| "loss": 0.0005, |
| "step": 1722 |
| }, |
| { |
| "epoch": 5.56929292929293, |
| "grad_norm": 0.0064405822195112705, |
| "learning_rate": 9.546280868659539e-05, |
| "loss": 0.0004, |
| "step": 1723 |
| }, |
| { |
| "epoch": 5.572525252525253, |
| "grad_norm": 0.007867126725614071, |
| "learning_rate": 9.545339921902728e-05, |
| "loss": 0.0006, |
| "step": 1724 |
| }, |
| { |
| "epoch": 5.575757575757576, |
| "grad_norm": 0.006734086666256189, |
| "learning_rate": 9.544398046935199e-05, |
| "loss": 0.0004, |
| "step": 1725 |
| }, |
| { |
| "epoch": 5.575757575757576, |
| "eval_loss": 0.0012909317156299949, |
| "eval_runtime": 16.3569, |
| "eval_samples_per_second": 6.114, |
| "eval_steps_per_second": 1.528, |
| "step": 1725 |
| }, |
| { |
| "epoch": 5.575757575757576, |
| "step": 1725, |
| "total_flos": 3.477509107502285e+18, |
| "train_loss": 0.0029730239298457846, |
| "train_runtime": 26390.1229, |
| "train_samples_per_second": 9.379, |
| "train_steps_per_second": 0.293 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 7725, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 25, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 4 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.477509107502285e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|