diff --git "a/checkpoint-600/trainer_state.json" "b/checkpoint-600/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-600/trainer_state.json" @@ -0,0 +1,4233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6522625356706074, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001087104226117679, + "grad_norm": 3.4784371852874756, + "learning_rate": 8.000000000000001e-07, + "loss": 4.4008, + "step": 1 + }, + { + "epoch": 0.002174208452235358, + "grad_norm": 3.746831178665161, + "learning_rate": 1.6000000000000001e-06, + "loss": 4.5225, + "step": 2 + }, + { + "epoch": 0.003261312678353037, + "grad_norm": 3.8642401695251465, + "learning_rate": 2.4000000000000003e-06, + "loss": 4.6885, + "step": 3 + }, + { + "epoch": 0.004348416904470716, + "grad_norm": 3.6917271614074707, + "learning_rate": 3.2000000000000003e-06, + "loss": 4.5109, + "step": 4 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 3.797416925430298, + "learning_rate": 4.000000000000001e-06, + "loss": 4.57, + "step": 5 + }, + { + "epoch": 0.006522625356706074, + "grad_norm": 4.3334856033325195, + "learning_rate": 4.800000000000001e-06, + "loss": 4.9443, + "step": 6 + }, + { + "epoch": 0.007609729582823754, + "grad_norm": 2.8812649250030518, + "learning_rate": 5.600000000000001e-06, + "loss": 3.9337, + "step": 7 + }, + { + "epoch": 0.008696833808941432, + "grad_norm": 3.2256581783294678, + "learning_rate": 6.4000000000000006e-06, + "loss": 4.221, + "step": 8 + }, + { + "epoch": 0.00978393803505911, + "grad_norm": 3.7202064990997314, + "learning_rate": 7.2000000000000005e-06, + "loss": 4.0528, + "step": 9 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 4.757291793823242, + "learning_rate": 8.000000000000001e-06, + "loss": 4.7898, + "step": 10 + }, + { + "epoch": 0.01195814648729447, + "grad_norm": 3.3146753311157227, + "learning_rate": 8.8e-06, + "loss": 4.1329, + "step": 11 + }, + { + "epoch": 0.013045250713412148, + "grad_norm": 3.2993814945220947, + "learning_rate": 9.600000000000001e-06, + "loss": 4.3881, + "step": 12 + }, + { + "epoch": 0.014132354939529827, + "grad_norm": 2.802339792251587, + "learning_rate": 1.04e-05, + "loss": 4.5441, + "step": 13 + }, + { + "epoch": 0.015219459165647507, + "grad_norm": 1.727351188659668, + "learning_rate": 1.1200000000000001e-05, + "loss": 3.7082, + "step": 14 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 1.9265928268432617, + "learning_rate": 1.2e-05, + "loss": 4.1921, + "step": 15 + }, + { + "epoch": 0.017393667617882864, + "grad_norm": 1.4355801343917847, + "learning_rate": 1.2800000000000001e-05, + "loss": 3.645, + "step": 16 + }, + { + "epoch": 0.018480771844000543, + "grad_norm": 1.3381849527359009, + "learning_rate": 1.3600000000000002e-05, + "loss": 3.6469, + "step": 17 + }, + { + "epoch": 0.01956787607011822, + "grad_norm": 1.0700595378875732, + "learning_rate": 1.4400000000000001e-05, + "loss": 3.2366, + "step": 18 + }, + { + "epoch": 0.0206549802962359, + "grad_norm": 1.5226552486419678, + "learning_rate": 1.5200000000000002e-05, + "loss": 3.9604, + "step": 19 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 0.8243942260742188, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.1254, + "step": 20 + }, + { + "epoch": 0.02282918874847126, + "grad_norm": 0.9753342866897583, + "learning_rate": 1.6800000000000002e-05, + "loss": 3.2602, + "step": 21 + }, + { + "epoch": 0.02391629297458894, + "grad_norm": 0.9511851072311401, + "learning_rate": 1.76e-05, + "loss": 3.3339, + "step": 22 + }, + { + "epoch": 0.025003397200706618, + "grad_norm": 0.8845148682594299, + "learning_rate": 1.8400000000000003e-05, + "loss": 3.262, + "step": 23 + }, + { + "epoch": 0.026090501426824297, + "grad_norm": 1.0817408561706543, + "learning_rate": 1.9200000000000003e-05, + "loss": 3.5982, + "step": 24 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 0.8234590888023376, + "learning_rate": 2e-05, + "loss": 3.2171, + "step": 25 + }, + { + "epoch": 0.028264709879059654, + "grad_norm": 1.066890001296997, + "learning_rate": 2.08e-05, + "loss": 3.6648, + "step": 26 + }, + { + "epoch": 0.029351814105177332, + "grad_norm": 0.817592442035675, + "learning_rate": 2.1600000000000003e-05, + "loss": 3.5252, + "step": 27 + }, + { + "epoch": 0.030438918331295015, + "grad_norm": 0.7376441955566406, + "learning_rate": 2.2400000000000002e-05, + "loss": 3.3304, + "step": 28 + }, + { + "epoch": 0.03152602255741269, + "grad_norm": 0.6767750382423401, + "learning_rate": 2.32e-05, + "loss": 3.1561, + "step": 29 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 0.7858631014823914, + "learning_rate": 2.4e-05, + "loss": 3.1554, + "step": 30 + }, + { + "epoch": 0.03370023100964805, + "grad_norm": 0.6538259387016296, + "learning_rate": 2.4800000000000003e-05, + "loss": 3.0387, + "step": 31 + }, + { + "epoch": 0.03478733523576573, + "grad_norm": 0.36822834610939026, + "learning_rate": 2.5600000000000002e-05, + "loss": 2.8779, + "step": 32 + }, + { + "epoch": 0.03587443946188341, + "grad_norm": 0.5310895442962646, + "learning_rate": 2.6400000000000005e-05, + "loss": 3.2782, + "step": 33 + }, + { + "epoch": 0.036961543688001086, + "grad_norm": 0.43023616075515747, + "learning_rate": 2.7200000000000004e-05, + "loss": 2.8278, + "step": 34 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 0.6091771125793457, + "learning_rate": 2.8e-05, + "loss": 3.3641, + "step": 35 + }, + { + "epoch": 0.03913575214023644, + "grad_norm": 0.528182327747345, + "learning_rate": 2.8800000000000002e-05, + "loss": 3.1322, + "step": 36 + }, + { + "epoch": 0.04022285636635412, + "grad_norm": 0.3346516489982605, + "learning_rate": 2.96e-05, + "loss": 2.8377, + "step": 37 + }, + { + "epoch": 0.0413099605924718, + "grad_norm": 0.4978286325931549, + "learning_rate": 3.0400000000000004e-05, + "loss": 3.1188, + "step": 38 + }, + { + "epoch": 0.04239706481858948, + "grad_norm": 0.47854042053222656, + "learning_rate": 3.1200000000000006e-05, + "loss": 3.124, + "step": 39 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 0.35611164569854736, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.1871, + "step": 40 + }, + { + "epoch": 0.04457127327082484, + "grad_norm": 0.3448915481567383, + "learning_rate": 3.28e-05, + "loss": 3.0723, + "step": 41 + }, + { + "epoch": 0.04565837749694252, + "grad_norm": 0.4089977741241455, + "learning_rate": 3.3600000000000004e-05, + "loss": 3.086, + "step": 42 + }, + { + "epoch": 0.0467454817230602, + "grad_norm": 0.4114953875541687, + "learning_rate": 3.44e-05, + "loss": 3.0346, + "step": 43 + }, + { + "epoch": 0.04783258594917788, + "grad_norm": 0.3668460249900818, + "learning_rate": 3.52e-05, + "loss": 3.0724, + "step": 44 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 0.34357163310050964, + "learning_rate": 3.6e-05, + "loss": 2.8962, + "step": 45 + }, + { + "epoch": 0.050006794401413236, + "grad_norm": 0.23081070184707642, + "learning_rate": 3.680000000000001e-05, + "loss": 2.8397, + "step": 46 + }, + { + "epoch": 0.051093898627530915, + "grad_norm": 0.3065420687198639, + "learning_rate": 3.76e-05, + "loss": 2.892, + "step": 47 + }, + { + "epoch": 0.05218100285364859, + "grad_norm": 0.36564749479293823, + "learning_rate": 3.8400000000000005e-05, + "loss": 2.8035, + "step": 48 + }, + { + "epoch": 0.05326810707976627, + "grad_norm": 0.40583041310310364, + "learning_rate": 3.9200000000000004e-05, + "loss": 2.954, + "step": 49 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 0.19353345036506653, + "learning_rate": 4e-05, + "loss": 2.765, + "step": 50 + }, + { + "epoch": 0.05544231553200163, + "grad_norm": 0.21138645708560944, + "learning_rate": 4.08e-05, + "loss": 2.8005, + "step": 51 + }, + { + "epoch": 0.05652941975811931, + "grad_norm": 0.27462801337242126, + "learning_rate": 4.16e-05, + "loss": 2.8831, + "step": 52 + }, + { + "epoch": 0.057616523984236986, + "grad_norm": 0.2978091239929199, + "learning_rate": 4.240000000000001e-05, + "loss": 2.7664, + "step": 53 + }, + { + "epoch": 0.058703628210354665, + "grad_norm": 0.19421067833900452, + "learning_rate": 4.3200000000000007e-05, + "loss": 2.6641, + "step": 54 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 0.3861449956893921, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.7319, + "step": 55 + }, + { + "epoch": 0.06087783666259003, + "grad_norm": 0.2742801010608673, + "learning_rate": 4.4800000000000005e-05, + "loss": 2.8144, + "step": 56 + }, + { + "epoch": 0.06196494088870771, + "grad_norm": 0.27361395955085754, + "learning_rate": 4.56e-05, + "loss": 2.8754, + "step": 57 + }, + { + "epoch": 0.06305204511482539, + "grad_norm": 0.22017507255077362, + "learning_rate": 4.64e-05, + "loss": 2.8239, + "step": 58 + }, + { + "epoch": 0.06413914934094306, + "grad_norm": 0.3103742003440857, + "learning_rate": 4.72e-05, + "loss": 2.9427, + "step": 59 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 0.26305684447288513, + "learning_rate": 4.8e-05, + "loss": 3.2949, + "step": 60 + }, + { + "epoch": 0.06631335779317842, + "grad_norm": 0.21772415935993195, + "learning_rate": 4.88e-05, + "loss": 2.6121, + "step": 61 + }, + { + "epoch": 0.0674004620192961, + "grad_norm": 0.47127923369407654, + "learning_rate": 4.9600000000000006e-05, + "loss": 2.6692, + "step": 62 + }, + { + "epoch": 0.06848756624541377, + "grad_norm": 0.2787991464138031, + "learning_rate": 5.0400000000000005e-05, + "loss": 2.8747, + "step": 63 + }, + { + "epoch": 0.06957467047153146, + "grad_norm": 0.2478831708431244, + "learning_rate": 5.1200000000000004e-05, + "loss": 2.9119, + "step": 64 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 0.20420241355895996, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.6599, + "step": 65 + }, + { + "epoch": 0.07174887892376682, + "grad_norm": 0.2753028869628906, + "learning_rate": 5.280000000000001e-05, + "loss": 2.8778, + "step": 66 + }, + { + "epoch": 0.0728359831498845, + "grad_norm": 0.2163364738225937, + "learning_rate": 5.360000000000001e-05, + "loss": 2.6554, + "step": 67 + }, + { + "epoch": 0.07392308737600217, + "grad_norm": 0.2634824812412262, + "learning_rate": 5.440000000000001e-05, + "loss": 2.8669, + "step": 68 + }, + { + "epoch": 0.07501019160211986, + "grad_norm": 0.23044374585151672, + "learning_rate": 5.52e-05, + "loss": 2.7099, + "step": 69 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 0.24113614857196808, + "learning_rate": 5.6e-05, + "loss": 2.8105, + "step": 70 + }, + { + "epoch": 0.07718440005435522, + "grad_norm": 0.2488006055355072, + "learning_rate": 5.6800000000000005e-05, + "loss": 2.7047, + "step": 71 + }, + { + "epoch": 0.07827150428047289, + "grad_norm": 0.23006896674633026, + "learning_rate": 5.7600000000000004e-05, + "loss": 2.5052, + "step": 72 + }, + { + "epoch": 0.07935860850659057, + "grad_norm": 0.2337830513715744, + "learning_rate": 5.84e-05, + "loss": 2.8153, + "step": 73 + }, + { + "epoch": 0.08044571273270824, + "grad_norm": 0.25199946761131287, + "learning_rate": 5.92e-05, + "loss": 2.9109, + "step": 74 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 0.3061909079551697, + "learning_rate": 6.000000000000001e-05, + "loss": 2.7768, + "step": 75 + }, + { + "epoch": 0.0826199211849436, + "grad_norm": 0.21918885409832, + "learning_rate": 6.080000000000001e-05, + "loss": 2.4612, + "step": 76 + }, + { + "epoch": 0.08370702541106129, + "grad_norm": 0.2534966468811035, + "learning_rate": 6.16e-05, + "loss": 2.5579, + "step": 77 + }, + { + "epoch": 0.08479412963717896, + "grad_norm": 0.3465976417064667, + "learning_rate": 6.240000000000001e-05, + "loss": 2.8096, + "step": 78 + }, + { + "epoch": 0.08588123386329664, + "grad_norm": 0.41591131687164307, + "learning_rate": 6.32e-05, + "loss": 2.9622, + "step": 79 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 0.46020373702049255, + "learning_rate": 6.400000000000001e-05, + "loss": 2.4985, + "step": 80 + }, + { + "epoch": 0.088055442315532, + "grad_norm": 0.3509288430213928, + "learning_rate": 6.48e-05, + "loss": 2.7941, + "step": 81 + }, + { + "epoch": 0.08914254654164969, + "grad_norm": 0.3739589750766754, + "learning_rate": 6.56e-05, + "loss": 2.5459, + "step": 82 + }, + { + "epoch": 0.09022965076776736, + "grad_norm": 0.34141457080841064, + "learning_rate": 6.64e-05, + "loss": 3.0349, + "step": 83 + }, + { + "epoch": 0.09131675499388504, + "grad_norm": 0.4145772159099579, + "learning_rate": 6.720000000000001e-05, + "loss": 2.7032, + "step": 84 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 0.26016175746917725, + "learning_rate": 6.8e-05, + "loss": 2.5074, + "step": 85 + }, + { + "epoch": 0.0934909634461204, + "grad_norm": 0.3912140727043152, + "learning_rate": 6.88e-05, + "loss": 2.7793, + "step": 86 + }, + { + "epoch": 0.09457806767223807, + "grad_norm": 0.33537185192108154, + "learning_rate": 6.960000000000001e-05, + "loss": 2.8683, + "step": 87 + }, + { + "epoch": 0.09566517189835576, + "grad_norm": 0.5199161171913147, + "learning_rate": 7.04e-05, + "loss": 2.4736, + "step": 88 + }, + { + "epoch": 0.09675227612447343, + "grad_norm": 0.40132808685302734, + "learning_rate": 7.120000000000001e-05, + "loss": 2.8865, + "step": 89 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 0.5440429449081421, + "learning_rate": 7.2e-05, + "loss": 2.6978, + "step": 90 + }, + { + "epoch": 0.09892648457670879, + "grad_norm": 0.4221319854259491, + "learning_rate": 7.280000000000001e-05, + "loss": 2.5253, + "step": 91 + }, + { + "epoch": 0.10001358880282647, + "grad_norm": 0.4072299301624298, + "learning_rate": 7.360000000000001e-05, + "loss": 2.4868, + "step": 92 + }, + { + "epoch": 0.10110069302894414, + "grad_norm": 0.3482096493244171, + "learning_rate": 7.44e-05, + "loss": 2.7824, + "step": 93 + }, + { + "epoch": 0.10218779725506183, + "grad_norm": 0.2974827289581299, + "learning_rate": 7.52e-05, + "loss": 2.6612, + "step": 94 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 0.33836594223976135, + "learning_rate": 7.6e-05, + "loss": 2.8567, + "step": 95 + }, + { + "epoch": 0.10436200570729719, + "grad_norm": 0.2650427520275116, + "learning_rate": 7.680000000000001e-05, + "loss": 2.5271, + "step": 96 + }, + { + "epoch": 0.10544910993341487, + "grad_norm": 0.3879508078098297, + "learning_rate": 7.76e-05, + "loss": 2.8121, + "step": 97 + }, + { + "epoch": 0.10653621415953254, + "grad_norm": 0.3120324909687042, + "learning_rate": 7.840000000000001e-05, + "loss": 2.8149, + "step": 98 + }, + { + "epoch": 0.10762331838565023, + "grad_norm": 0.2465900480747223, + "learning_rate": 7.92e-05, + "loss": 2.454, + "step": 99 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 0.5037747025489807, + "learning_rate": 8e-05, + "loss": 3.1017, + "step": 100 + }, + { + "epoch": 0.10979752683788559, + "grad_norm": 0.5572241544723511, + "learning_rate": 8e-05, + "loss": 2.6972, + "step": 101 + }, + { + "epoch": 0.11088463106400326, + "grad_norm": 0.36297839879989624, + "learning_rate": 8e-05, + "loss": 2.8079, + "step": 102 + }, + { + "epoch": 0.11197173529012094, + "grad_norm": 0.5766679048538208, + "learning_rate": 8e-05, + "loss": 2.7868, + "step": 103 + }, + { + "epoch": 0.11305883951623862, + "grad_norm": 0.2980547845363617, + "learning_rate": 8e-05, + "loss": 2.7063, + "step": 104 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 0.6680068373680115, + "learning_rate": 8e-05, + "loss": 2.7216, + "step": 105 + }, + { + "epoch": 0.11523304796847397, + "grad_norm": 0.4077845811843872, + "learning_rate": 8e-05, + "loss": 2.6532, + "step": 106 + }, + { + "epoch": 0.11632015219459166, + "grad_norm": 0.6247202157974243, + "learning_rate": 8e-05, + "loss": 3.0171, + "step": 107 + }, + { + "epoch": 0.11740725642070933, + "grad_norm": 0.45367133617401123, + "learning_rate": 8e-05, + "loss": 2.4446, + "step": 108 + }, + { + "epoch": 0.11849436064682702, + "grad_norm": 0.4844890236854553, + "learning_rate": 8e-05, + "loss": 2.9073, + "step": 109 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 0.6034656763076782, + "learning_rate": 8e-05, + "loss": 2.8585, + "step": 110 + }, + { + "epoch": 0.12066856909906237, + "grad_norm": 0.32010847330093384, + "learning_rate": 8e-05, + "loss": 2.7557, + "step": 111 + }, + { + "epoch": 0.12175567332518006, + "grad_norm": 0.7833703756332397, + "learning_rate": 8e-05, + "loss": 2.6376, + "step": 112 + }, + { + "epoch": 0.12284277755129773, + "grad_norm": 0.4453218877315521, + "learning_rate": 8e-05, + "loss": 2.7088, + "step": 113 + }, + { + "epoch": 0.12392988177741542, + "grad_norm": 0.5115008354187012, + "learning_rate": 8e-05, + "loss": 2.5499, + "step": 114 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 0.3353102505207062, + "learning_rate": 8e-05, + "loss": 2.6424, + "step": 115 + }, + { + "epoch": 0.12610409022965077, + "grad_norm": 0.3865357041358948, + "learning_rate": 8e-05, + "loss": 2.7787, + "step": 116 + }, + { + "epoch": 0.12719119445576846, + "grad_norm": 0.5271489024162292, + "learning_rate": 8e-05, + "loss": 2.6999, + "step": 117 + }, + { + "epoch": 0.12827829868188612, + "grad_norm": 0.3085884749889374, + "learning_rate": 8e-05, + "loss": 2.6128, + "step": 118 + }, + { + "epoch": 0.1293654029080038, + "grad_norm": 0.4037182033061981, + "learning_rate": 8e-05, + "loss": 2.686, + "step": 119 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 0.28804194927215576, + "learning_rate": 8e-05, + "loss": 2.6444, + "step": 120 + }, + { + "epoch": 0.13153961136023917, + "grad_norm": 0.554262638092041, + "learning_rate": 8e-05, + "loss": 2.7762, + "step": 121 + }, + { + "epoch": 0.13262671558635683, + "grad_norm": 0.27171409130096436, + "learning_rate": 8e-05, + "loss": 2.5823, + "step": 122 + }, + { + "epoch": 0.13371381981247452, + "grad_norm": 0.4178011417388916, + "learning_rate": 8e-05, + "loss": 2.6277, + "step": 123 + }, + { + "epoch": 0.1348009240385922, + "grad_norm": 0.300963819026947, + "learning_rate": 8e-05, + "loss": 2.9919, + "step": 124 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 0.4165850579738617, + "learning_rate": 8e-05, + "loss": 2.5905, + "step": 125 + }, + { + "epoch": 0.13697513249082754, + "grad_norm": 0.4002116024494171, + "learning_rate": 8e-05, + "loss": 2.8756, + "step": 126 + }, + { + "epoch": 0.13806223671694523, + "grad_norm": 0.36963963508605957, + "learning_rate": 8e-05, + "loss": 2.6691, + "step": 127 + }, + { + "epoch": 0.13914934094306292, + "grad_norm": 0.2826521396636963, + "learning_rate": 8e-05, + "loss": 2.47, + "step": 128 + }, + { + "epoch": 0.1402364451691806, + "grad_norm": 0.3212789297103882, + "learning_rate": 8e-05, + "loss": 2.6613, + "step": 129 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 0.25957366824150085, + "learning_rate": 8e-05, + "loss": 2.6636, + "step": 130 + }, + { + "epoch": 0.14241065362141594, + "grad_norm": 0.3244073987007141, + "learning_rate": 8e-05, + "loss": 2.6375, + "step": 131 + }, + { + "epoch": 0.14349775784753363, + "grad_norm": 0.2449469268321991, + "learning_rate": 8e-05, + "loss": 2.6309, + "step": 132 + }, + { + "epoch": 0.14458486207365132, + "grad_norm": 0.3098362982273102, + "learning_rate": 8e-05, + "loss": 2.5946, + "step": 133 + }, + { + "epoch": 0.145671966299769, + "grad_norm": 0.3512052595615387, + "learning_rate": 8e-05, + "loss": 2.5118, + "step": 134 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 0.4111577570438385, + "learning_rate": 8e-05, + "loss": 2.8764, + "step": 135 + }, + { + "epoch": 0.14784617475200434, + "grad_norm": 0.2309219241142273, + "learning_rate": 8e-05, + "loss": 2.6115, + "step": 136 + }, + { + "epoch": 0.14893327897812203, + "grad_norm": 0.2839241325855255, + "learning_rate": 8e-05, + "loss": 2.8208, + "step": 137 + }, + { + "epoch": 0.15002038320423972, + "grad_norm": 0.3878515660762787, + "learning_rate": 8e-05, + "loss": 2.8461, + "step": 138 + }, + { + "epoch": 0.15110748743035737, + "grad_norm": 0.3024687170982361, + "learning_rate": 8e-05, + "loss": 2.604, + "step": 139 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 0.22990410029888153, + "learning_rate": 8e-05, + "loss": 2.3805, + "step": 140 + }, + { + "epoch": 0.15328169588259274, + "grad_norm": 0.24732771515846252, + "learning_rate": 8e-05, + "loss": 2.5926, + "step": 141 + }, + { + "epoch": 0.15436880010871043, + "grad_norm": 0.26643362641334534, + "learning_rate": 8e-05, + "loss": 2.7555, + "step": 142 + }, + { + "epoch": 0.1554559043348281, + "grad_norm": 0.2520343065261841, + "learning_rate": 8e-05, + "loss": 2.6747, + "step": 143 + }, + { + "epoch": 0.15654300856094577, + "grad_norm": 0.24047300219535828, + "learning_rate": 8e-05, + "loss": 2.7427, + "step": 144 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 0.2692314088344574, + "learning_rate": 8e-05, + "loss": 2.7535, + "step": 145 + }, + { + "epoch": 0.15871721701318114, + "grad_norm": 0.27563148736953735, + "learning_rate": 8e-05, + "loss": 2.9195, + "step": 146 + }, + { + "epoch": 0.15980432123929883, + "grad_norm": 0.26135674118995667, + "learning_rate": 8e-05, + "loss": 2.9199, + "step": 147 + }, + { + "epoch": 0.1608914254654165, + "grad_norm": 0.28445643186569214, + "learning_rate": 8e-05, + "loss": 2.5978, + "step": 148 + }, + { + "epoch": 0.16197852969153417, + "grad_norm": 0.28675052523612976, + "learning_rate": 8e-05, + "loss": 2.7822, + "step": 149 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 0.2091285139322281, + "learning_rate": 8e-05, + "loss": 2.5101, + "step": 150 + }, + { + "epoch": 0.16415273814376954, + "grad_norm": 0.20553193986415863, + "learning_rate": 8e-05, + "loss": 2.4612, + "step": 151 + }, + { + "epoch": 0.1652398423698872, + "grad_norm": 0.21206006407737732, + "learning_rate": 8e-05, + "loss": 2.6467, + "step": 152 + }, + { + "epoch": 0.1663269465960049, + "grad_norm": 0.24279922246932983, + "learning_rate": 8e-05, + "loss": 2.696, + "step": 153 + }, + { + "epoch": 0.16741405082212257, + "grad_norm": 0.2367326319217682, + "learning_rate": 8e-05, + "loss": 2.6747, + "step": 154 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 0.2839709222316742, + "learning_rate": 8e-05, + "loss": 2.8172, + "step": 155 + }, + { + "epoch": 0.16958825927435792, + "grad_norm": 0.3134251534938812, + "learning_rate": 8e-05, + "loss": 2.5493, + "step": 156 + }, + { + "epoch": 0.1706753635004756, + "grad_norm": 0.2717304825782776, + "learning_rate": 8e-05, + "loss": 2.6943, + "step": 157 + }, + { + "epoch": 0.1717624677265933, + "grad_norm": 0.26062291860580444, + "learning_rate": 8e-05, + "loss": 2.5104, + "step": 158 + }, + { + "epoch": 0.17284957195271097, + "grad_norm": 0.3013114333152771, + "learning_rate": 8e-05, + "loss": 2.7947, + "step": 159 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 0.20062896609306335, + "learning_rate": 8e-05, + "loss": 2.4444, + "step": 160 + }, + { + "epoch": 0.17502378040494632, + "grad_norm": 0.4020882844924927, + "learning_rate": 8e-05, + "loss": 2.8952, + "step": 161 + }, + { + "epoch": 0.176110884631064, + "grad_norm": 0.25666365027427673, + "learning_rate": 8e-05, + "loss": 2.7683, + "step": 162 + }, + { + "epoch": 0.1771979888571817, + "grad_norm": 0.3367927372455597, + "learning_rate": 8e-05, + "loss": 2.5053, + "step": 163 + }, + { + "epoch": 0.17828509308329937, + "grad_norm": 0.2697398066520691, + "learning_rate": 8e-05, + "loss": 2.737, + "step": 164 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.5030338168144226, + "learning_rate": 8e-05, + "loss": 2.9525, + "step": 165 + }, + { + "epoch": 0.18045930153553472, + "grad_norm": 0.306744247674942, + "learning_rate": 8e-05, + "loss": 2.8699, + "step": 166 + }, + { + "epoch": 0.1815464057616524, + "grad_norm": 0.4985387623310089, + "learning_rate": 8e-05, + "loss": 2.5112, + "step": 167 + }, + { + "epoch": 0.1826335099877701, + "grad_norm": 0.4023090898990631, + "learning_rate": 8e-05, + "loss": 2.648, + "step": 168 + }, + { + "epoch": 0.18372061421388775, + "grad_norm": 0.625040590763092, + "learning_rate": 8e-05, + "loss": 2.8806, + "step": 169 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 0.3565062880516052, + "learning_rate": 8e-05, + "loss": 2.6986, + "step": 170 + }, + { + "epoch": 0.18589482266612312, + "grad_norm": 0.4763094186782837, + "learning_rate": 8e-05, + "loss": 2.8519, + "step": 171 + }, + { + "epoch": 0.1869819268922408, + "grad_norm": 0.3974241316318512, + "learning_rate": 8e-05, + "loss": 2.6697, + "step": 172 + }, + { + "epoch": 0.18806903111835846, + "grad_norm": 0.3001902103424072, + "learning_rate": 8e-05, + "loss": 2.801, + "step": 173 + }, + { + "epoch": 0.18915613534447615, + "grad_norm": 0.38552460074424744, + "learning_rate": 8e-05, + "loss": 2.6368, + "step": 174 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 0.2738027572631836, + "learning_rate": 8e-05, + "loss": 2.78, + "step": 175 + }, + { + "epoch": 0.19133034379671152, + "grad_norm": 0.384124755859375, + "learning_rate": 8e-05, + "loss": 2.4878, + "step": 176 + }, + { + "epoch": 0.1924174480228292, + "grad_norm": 0.24756291508674622, + "learning_rate": 8e-05, + "loss": 2.4496, + "step": 177 + }, + { + "epoch": 0.19350455224894686, + "grad_norm": 0.4402608573436737, + "learning_rate": 8e-05, + "loss": 2.8877, + "step": 178 + }, + { + "epoch": 0.19459165647506454, + "grad_norm": 0.27092137932777405, + "learning_rate": 8e-05, + "loss": 2.7196, + "step": 179 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 0.34909459948539734, + "learning_rate": 8e-05, + "loss": 2.5983, + "step": 180 + }, + { + "epoch": 0.19676586492729992, + "grad_norm": 0.2909966707229614, + "learning_rate": 8e-05, + "loss": 2.4896, + "step": 181 + }, + { + "epoch": 0.19785296915341757, + "grad_norm": 0.5153746604919434, + "learning_rate": 8e-05, + "loss": 3.0324, + "step": 182 + }, + { + "epoch": 0.19894007337953526, + "grad_norm": 0.401748389005661, + "learning_rate": 8e-05, + "loss": 2.8828, + "step": 183 + }, + { + "epoch": 0.20002717760565294, + "grad_norm": 0.5100975632667542, + "learning_rate": 8e-05, + "loss": 2.5779, + "step": 184 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 0.47615495324134827, + "learning_rate": 8e-05, + "loss": 2.7809, + "step": 185 + }, + { + "epoch": 0.2022013860578883, + "grad_norm": 0.3028770685195923, + "learning_rate": 8e-05, + "loss": 2.6011, + "step": 186 + }, + { + "epoch": 0.20328849028400597, + "grad_norm": 0.4174680709838867, + "learning_rate": 8e-05, + "loss": 2.5798, + "step": 187 + }, + { + "epoch": 0.20437559451012366, + "grad_norm": 0.42194369435310364, + "learning_rate": 8e-05, + "loss": 2.7546, + "step": 188 + }, + { + "epoch": 0.20546269873624134, + "grad_norm": 0.48476532101631165, + "learning_rate": 8e-05, + "loss": 2.5283, + "step": 189 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 0.4926374852657318, + "learning_rate": 8e-05, + "loss": 2.6779, + "step": 190 + }, + { + "epoch": 0.2076369071884767, + "grad_norm": 0.4315795302391052, + "learning_rate": 8e-05, + "loss": 2.462, + "step": 191 + }, + { + "epoch": 0.20872401141459437, + "grad_norm": 0.44672197103500366, + "learning_rate": 8e-05, + "loss": 2.587, + "step": 192 + }, + { + "epoch": 0.20981111564071206, + "grad_norm": 0.23925480246543884, + "learning_rate": 8e-05, + "loss": 2.5481, + "step": 193 + }, + { + "epoch": 0.21089821986682974, + "grad_norm": 0.62049400806427, + "learning_rate": 8e-05, + "loss": 3.0742, + "step": 194 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 0.28580546379089355, + "learning_rate": 8e-05, + "loss": 2.803, + "step": 195 + }, + { + "epoch": 0.2130724283190651, + "grad_norm": 0.466965913772583, + "learning_rate": 8e-05, + "loss": 2.594, + "step": 196 + }, + { + "epoch": 0.21415953254518277, + "grad_norm": 0.29863160848617554, + "learning_rate": 8e-05, + "loss": 2.6083, + "step": 197 + }, + { + "epoch": 0.21524663677130046, + "grad_norm": 0.24443824589252472, + "learning_rate": 8e-05, + "loss": 2.6672, + "step": 198 + }, + { + "epoch": 0.21633374099741812, + "grad_norm": 0.3745376467704773, + "learning_rate": 8e-05, + "loss": 2.6715, + "step": 199 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 0.2916417419910431, + "learning_rate": 8e-05, + "loss": 2.6092, + "step": 200 + }, + { + "epoch": 0.2185079494496535, + "grad_norm": 0.4010668694972992, + "learning_rate": 8e-05, + "loss": 2.7797, + "step": 201 + }, + { + "epoch": 0.21959505367577117, + "grad_norm": 0.2601282000541687, + "learning_rate": 8e-05, + "loss": 2.6054, + "step": 202 + }, + { + "epoch": 0.22068215790188883, + "grad_norm": 0.22905616462230682, + "learning_rate": 8e-05, + "loss": 2.5397, + "step": 203 + }, + { + "epoch": 0.22176926212800652, + "grad_norm": 0.2685694694519043, + "learning_rate": 8e-05, + "loss": 2.546, + "step": 204 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 0.19726788997650146, + "learning_rate": 8e-05, + "loss": 2.3117, + "step": 205 + }, + { + "epoch": 0.2239434705802419, + "grad_norm": 0.20461906492710114, + "learning_rate": 8e-05, + "loss": 2.5892, + "step": 206 + }, + { + "epoch": 0.22503057480635957, + "grad_norm": 0.30402520298957825, + "learning_rate": 8e-05, + "loss": 2.8895, + "step": 207 + }, + { + "epoch": 0.22611767903247723, + "grad_norm": 0.29612261056900024, + "learning_rate": 8e-05, + "loss": 2.5287, + "step": 208 + }, + { + "epoch": 0.22720478325859492, + "grad_norm": 0.2652565836906433, + "learning_rate": 8e-05, + "loss": 2.6632, + "step": 209 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 0.20809625089168549, + "learning_rate": 8e-05, + "loss": 2.3787, + "step": 210 + }, + { + "epoch": 0.2293789917108303, + "grad_norm": 0.2513427734375, + "learning_rate": 8e-05, + "loss": 2.8202, + "step": 211 + }, + { + "epoch": 0.23046609593694795, + "grad_norm": 0.2041294425725937, + "learning_rate": 8e-05, + "loss": 2.474, + "step": 212 + }, + { + "epoch": 0.23155320016306563, + "grad_norm": 0.2388080656528473, + "learning_rate": 8e-05, + "loss": 2.4549, + "step": 213 + }, + { + "epoch": 0.23264030438918332, + "grad_norm": 0.18037046492099762, + "learning_rate": 8e-05, + "loss": 2.3815, + "step": 214 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 0.401646226644516, + "learning_rate": 8e-05, + "loss": 2.714, + "step": 215 + }, + { + "epoch": 0.23481451284141866, + "grad_norm": 0.2445582002401352, + "learning_rate": 8e-05, + "loss": 2.9044, + "step": 216 + }, + { + "epoch": 0.23590161706753635, + "grad_norm": 0.43115776777267456, + "learning_rate": 8e-05, + "loss": 2.837, + "step": 217 + }, + { + "epoch": 0.23698872129365403, + "grad_norm": 0.21371930837631226, + "learning_rate": 8e-05, + "loss": 2.7065, + "step": 218 + }, + { + "epoch": 0.23807582551977172, + "grad_norm": 0.25618669390678406, + "learning_rate": 8e-05, + "loss": 2.616, + "step": 219 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 0.24008898437023163, + "learning_rate": 8e-05, + "loss": 2.5871, + "step": 220 + }, + { + "epoch": 0.24025003397200706, + "grad_norm": 0.2695084512233734, + "learning_rate": 8e-05, + "loss": 2.7921, + "step": 221 + }, + { + "epoch": 0.24133713819812475, + "grad_norm": 0.2882753312587738, + "learning_rate": 8e-05, + "loss": 2.8283, + "step": 222 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.2380722165107727, + "learning_rate": 8e-05, + "loss": 2.4755, + "step": 223 + }, + { + "epoch": 0.24351134665036012, + "grad_norm": 0.2560077905654907, + "learning_rate": 8e-05, + "loss": 2.7807, + "step": 224 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 0.23890511691570282, + "learning_rate": 8e-05, + "loss": 2.617, + "step": 225 + }, + { + "epoch": 0.24568555510259546, + "grad_norm": 0.18405930697917938, + "learning_rate": 8e-05, + "loss": 2.5359, + "step": 226 + }, + { + "epoch": 0.24677265932871315, + "grad_norm": 0.2747512757778168, + "learning_rate": 8e-05, + "loss": 2.6553, + "step": 227 + }, + { + "epoch": 0.24785976355483083, + "grad_norm": 0.27616727352142334, + "learning_rate": 8e-05, + "loss": 2.8316, + "step": 228 + }, + { + "epoch": 0.2489468677809485, + "grad_norm": 0.20825694501399994, + "learning_rate": 8e-05, + "loss": 2.7335, + "step": 229 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 0.27620628476142883, + "learning_rate": 8e-05, + "loss": 2.4846, + "step": 230 + }, + { + "epoch": 0.25112107623318386, + "grad_norm": 0.2194932997226715, + "learning_rate": 8e-05, + "loss": 2.481, + "step": 231 + }, + { + "epoch": 0.25220818045930155, + "grad_norm": 0.2152658998966217, + "learning_rate": 8e-05, + "loss": 2.6557, + "step": 232 + }, + { + "epoch": 0.25329528468541923, + "grad_norm": 0.2613218128681183, + "learning_rate": 8e-05, + "loss": 2.7292, + "step": 233 + }, + { + "epoch": 0.2543823889115369, + "grad_norm": 0.24911417067050934, + "learning_rate": 8e-05, + "loss": 2.5561, + "step": 234 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 0.21398596465587616, + "learning_rate": 8e-05, + "loss": 2.5883, + "step": 235 + }, + { + "epoch": 0.25655659736377223, + "grad_norm": 0.29950228333473206, + "learning_rate": 8e-05, + "loss": 2.9426, + "step": 236 + }, + { + "epoch": 0.2576437015898899, + "grad_norm": 0.2769708037376404, + "learning_rate": 8e-05, + "loss": 2.6879, + "step": 237 + }, + { + "epoch": 0.2587308058160076, + "grad_norm": 0.21842673420906067, + "learning_rate": 8e-05, + "loss": 2.4821, + "step": 238 + }, + { + "epoch": 0.2598179100421253, + "grad_norm": 0.3005446791648865, + "learning_rate": 8e-05, + "loss": 2.7082, + "step": 239 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 0.22701415419578552, + "learning_rate": 8e-05, + "loss": 2.7276, + "step": 240 + }, + { + "epoch": 0.26199211849436066, + "grad_norm": 0.23170171678066254, + "learning_rate": 8e-05, + "loss": 2.5763, + "step": 241 + }, + { + "epoch": 0.26307922272047835, + "grad_norm": 0.20267575979232788, + "learning_rate": 8e-05, + "loss": 2.5886, + "step": 242 + }, + { + "epoch": 0.26416632694659603, + "grad_norm": 0.326568067073822, + "learning_rate": 8e-05, + "loss": 2.6622, + "step": 243 + }, + { + "epoch": 0.26525343117271366, + "grad_norm": 0.27677053213119507, + "learning_rate": 8e-05, + "loss": 2.8204, + "step": 244 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 0.314101904630661, + "learning_rate": 8e-05, + "loss": 2.7894, + "step": 245 + }, + { + "epoch": 0.26742763962494903, + "grad_norm": 0.27384045720100403, + "learning_rate": 8e-05, + "loss": 2.7276, + "step": 246 + }, + { + "epoch": 0.2685147438510667, + "grad_norm": 0.19937290251255035, + "learning_rate": 8e-05, + "loss": 2.4933, + "step": 247 + }, + { + "epoch": 0.2696018480771844, + "grad_norm": 0.3280627727508545, + "learning_rate": 8e-05, + "loss": 2.9393, + "step": 248 + }, + { + "epoch": 0.2706889523033021, + "grad_norm": 0.2605300545692444, + "learning_rate": 8e-05, + "loss": 2.5535, + "step": 249 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 0.29271361231803894, + "learning_rate": 8e-05, + "loss": 2.7, + "step": 250 + }, + { + "epoch": 0.27286316075553746, + "grad_norm": 0.25908029079437256, + "learning_rate": 8e-05, + "loss": 2.6147, + "step": 251 + }, + { + "epoch": 0.2739502649816551, + "grad_norm": 0.32984668016433716, + "learning_rate": 8e-05, + "loss": 2.4952, + "step": 252 + }, + { + "epoch": 0.2750373692077728, + "grad_norm": 0.1817110776901245, + "learning_rate": 8e-05, + "loss": 2.3811, + "step": 253 + }, + { + "epoch": 0.27612447343389046, + "grad_norm": 0.38330700993537903, + "learning_rate": 8e-05, + "loss": 2.5844, + "step": 254 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 0.25955748558044434, + "learning_rate": 8e-05, + "loss": 2.7745, + "step": 255 + }, + { + "epoch": 0.27829868188612583, + "grad_norm": 0.2686518132686615, + "learning_rate": 8e-05, + "loss": 2.405, + "step": 256 + }, + { + "epoch": 0.2793857861122435, + "grad_norm": 0.32059964537620544, + "learning_rate": 8e-05, + "loss": 2.7394, + "step": 257 + }, + { + "epoch": 0.2804728903383612, + "grad_norm": 0.2641451358795166, + "learning_rate": 8e-05, + "loss": 2.8543, + "step": 258 + }, + { + "epoch": 0.2815599945644789, + "grad_norm": 0.2771840989589691, + "learning_rate": 8e-05, + "loss": 2.6053, + "step": 259 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 0.29152432084083557, + "learning_rate": 8e-05, + "loss": 2.7386, + "step": 260 + }, + { + "epoch": 0.2837342030167142, + "grad_norm": 0.2997722327709198, + "learning_rate": 8e-05, + "loss": 2.3482, + "step": 261 + }, + { + "epoch": 0.2848213072428319, + "grad_norm": 0.4489678144454956, + "learning_rate": 8e-05, + "loss": 2.7246, + "step": 262 + }, + { + "epoch": 0.2859084114689496, + "grad_norm": 0.6481120586395264, + "learning_rate": 8e-05, + "loss": 2.9414, + "step": 263 + }, + { + "epoch": 0.28699551569506726, + "grad_norm": 0.265381395816803, + "learning_rate": 8e-05, + "loss": 2.7031, + "step": 264 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 0.38403844833374023, + "learning_rate": 8e-05, + "loss": 2.4666, + "step": 265 + }, + { + "epoch": 0.28916972414730263, + "grad_norm": 0.27127566933631897, + "learning_rate": 8e-05, + "loss": 2.6384, + "step": 266 + }, + { + "epoch": 0.2902568283734203, + "grad_norm": 0.4415520429611206, + "learning_rate": 8e-05, + "loss": 2.4854, + "step": 267 + }, + { + "epoch": 0.291343932599538, + "grad_norm": 0.47857826948165894, + "learning_rate": 8e-05, + "loss": 2.7786, + "step": 268 + }, + { + "epoch": 0.29243103682565563, + "grad_norm": 0.44557562470436096, + "learning_rate": 8e-05, + "loss": 2.6502, + "step": 269 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 0.4563475251197815, + "learning_rate": 8e-05, + "loss": 2.6464, + "step": 270 + }, + { + "epoch": 0.294605245277891, + "grad_norm": 0.2607819437980652, + "learning_rate": 8e-05, + "loss": 2.6242, + "step": 271 + }, + { + "epoch": 0.2956923495040087, + "grad_norm": 0.686856746673584, + "learning_rate": 8e-05, + "loss": 3.1042, + "step": 272 + }, + { + "epoch": 0.2967794537301264, + "grad_norm": 0.23426967859268188, + "learning_rate": 8e-05, + "loss": 2.712, + "step": 273 + }, + { + "epoch": 0.29786655795624406, + "grad_norm": 0.7017249464988708, + "learning_rate": 8e-05, + "loss": 2.796, + "step": 274 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 0.22539305686950684, + "learning_rate": 8e-05, + "loss": 2.4547, + "step": 275 + }, + { + "epoch": 0.30004076640847943, + "grad_norm": 0.5437155961990356, + "learning_rate": 8e-05, + "loss": 2.5863, + "step": 276 + }, + { + "epoch": 0.3011278706345971, + "grad_norm": 0.32931384444236755, + "learning_rate": 8e-05, + "loss": 2.4025, + "step": 277 + }, + { + "epoch": 0.30221497486071475, + "grad_norm": 0.2990173101425171, + "learning_rate": 8e-05, + "loss": 2.7275, + "step": 278 + }, + { + "epoch": 0.30330207908683243, + "grad_norm": 0.47869154810905457, + "learning_rate": 8e-05, + "loss": 2.4735, + "step": 279 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 0.260394424200058, + "learning_rate": 8e-05, + "loss": 2.7238, + "step": 280 + }, + { + "epoch": 0.3054762875390678, + "grad_norm": 0.3463429808616638, + "learning_rate": 8e-05, + "loss": 2.9073, + "step": 281 + }, + { + "epoch": 0.3065633917651855, + "grad_norm": 0.24965578317642212, + "learning_rate": 8e-05, + "loss": 2.686, + "step": 282 + }, + { + "epoch": 0.3076504959913032, + "grad_norm": 0.24897950887680054, + "learning_rate": 8e-05, + "loss": 2.794, + "step": 283 + }, + { + "epoch": 0.30873760021742086, + "grad_norm": 0.1850753277540207, + "learning_rate": 8e-05, + "loss": 2.5439, + "step": 284 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 0.33542948961257935, + "learning_rate": 8e-05, + "loss": 2.8439, + "step": 285 + }, + { + "epoch": 0.3109118086696562, + "grad_norm": 0.2958002984523773, + "learning_rate": 8e-05, + "loss": 2.6063, + "step": 286 + }, + { + "epoch": 0.31199891289577386, + "grad_norm": 0.2417321503162384, + "learning_rate": 8e-05, + "loss": 2.7863, + "step": 287 + }, + { + "epoch": 0.31308601712189155, + "grad_norm": 0.22681960463523865, + "learning_rate": 8e-05, + "loss": 2.6345, + "step": 288 + }, + { + "epoch": 0.31417312134800923, + "grad_norm": 0.3645409047603607, + "learning_rate": 8e-05, + "loss": 2.6412, + "step": 289 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 0.24913059175014496, + "learning_rate": 8e-05, + "loss": 2.6702, + "step": 290 + }, + { + "epoch": 0.3163473298002446, + "grad_norm": 0.44635793566703796, + "learning_rate": 8e-05, + "loss": 2.7226, + "step": 291 + }, + { + "epoch": 0.3174344340263623, + "grad_norm": 0.1831802874803543, + "learning_rate": 8e-05, + "loss": 2.4525, + "step": 292 + }, + { + "epoch": 0.31852153825248, + "grad_norm": 0.38782718777656555, + "learning_rate": 8e-05, + "loss": 2.7055, + "step": 293 + }, + { + "epoch": 0.31960864247859766, + "grad_norm": 0.18159623444080353, + "learning_rate": 8e-05, + "loss": 2.5244, + "step": 294 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 0.21298684179782867, + "learning_rate": 8e-05, + "loss": 2.6829, + "step": 295 + }, + { + "epoch": 0.321782850930833, + "grad_norm": 0.2741154432296753, + "learning_rate": 8e-05, + "loss": 2.5856, + "step": 296 + }, + { + "epoch": 0.32286995515695066, + "grad_norm": 0.2257017344236374, + "learning_rate": 8e-05, + "loss": 2.6391, + "step": 297 + }, + { + "epoch": 0.32395705938306835, + "grad_norm": 0.2167813777923584, + "learning_rate": 8e-05, + "loss": 2.7613, + "step": 298 + }, + { + "epoch": 0.32504416360918603, + "grad_norm": 0.2137296348810196, + "learning_rate": 8e-05, + "loss": 2.4463, + "step": 299 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 0.28773000836372375, + "learning_rate": 8e-05, + "loss": 2.5254, + "step": 300 + }, + { + "epoch": 0.3272183720614214, + "grad_norm": 0.20781901478767395, + "learning_rate": 8e-05, + "loss": 2.6658, + "step": 301 + }, + { + "epoch": 0.3283054762875391, + "grad_norm": 0.2771778404712677, + "learning_rate": 8e-05, + "loss": 2.9115, + "step": 302 + }, + { + "epoch": 0.3293925805136568, + "grad_norm": 0.24414929747581482, + "learning_rate": 8e-05, + "loss": 2.7075, + "step": 303 + }, + { + "epoch": 0.3304796847397744, + "grad_norm": 0.1979932337999344, + "learning_rate": 8e-05, + "loss": 2.6466, + "step": 304 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 0.19733980298042297, + "learning_rate": 8e-05, + "loss": 2.5722, + "step": 305 + }, + { + "epoch": 0.3326538931920098, + "grad_norm": 0.24796366691589355, + "learning_rate": 8e-05, + "loss": 2.6416, + "step": 306 + }, + { + "epoch": 0.33374099741812746, + "grad_norm": 0.3053446412086487, + "learning_rate": 8e-05, + "loss": 2.7917, + "step": 307 + }, + { + "epoch": 0.33482810164424515, + "grad_norm": 0.2702461779117584, + "learning_rate": 8e-05, + "loss": 2.5917, + "step": 308 + }, + { + "epoch": 0.33591520587036283, + "grad_norm": 0.3687792420387268, + "learning_rate": 8e-05, + "loss": 2.8157, + "step": 309 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 0.2970832288265228, + "learning_rate": 8e-05, + "loss": 2.627, + "step": 310 + }, + { + "epoch": 0.3380894143225982, + "grad_norm": 0.27384626865386963, + "learning_rate": 8e-05, + "loss": 2.65, + "step": 311 + }, + { + "epoch": 0.33917651854871583, + "grad_norm": 0.18980079889297485, + "learning_rate": 8e-05, + "loss": 2.5708, + "step": 312 + }, + { + "epoch": 0.3402636227748335, + "grad_norm": 0.2395751178264618, + "learning_rate": 8e-05, + "loss": 2.6439, + "step": 313 + }, + { + "epoch": 0.3413507270009512, + "grad_norm": 0.19065041840076447, + "learning_rate": 8e-05, + "loss": 2.4159, + "step": 314 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 0.22037646174430847, + "learning_rate": 8e-05, + "loss": 2.5869, + "step": 315 + }, + { + "epoch": 0.3435249354531866, + "grad_norm": 0.20623396337032318, + "learning_rate": 8e-05, + "loss": 2.3962, + "step": 316 + }, + { + "epoch": 0.34461203967930426, + "grad_norm": 0.2784217894077301, + "learning_rate": 8e-05, + "loss": 2.5728, + "step": 317 + }, + { + "epoch": 0.34569914390542195, + "grad_norm": 0.1825850009918213, + "learning_rate": 8e-05, + "loss": 2.5317, + "step": 318 + }, + { + "epoch": 0.34678624813153963, + "grad_norm": 0.18753507733345032, + "learning_rate": 8e-05, + "loss": 2.59, + "step": 319 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 0.2197190821170807, + "learning_rate": 8e-05, + "loss": 2.7076, + "step": 320 + }, + { + "epoch": 0.34896045658377495, + "grad_norm": 0.19864331185817719, + "learning_rate": 8e-05, + "loss": 2.4527, + "step": 321 + }, + { + "epoch": 0.35004756080989263, + "grad_norm": 0.2344828099012375, + "learning_rate": 8e-05, + "loss": 2.8001, + "step": 322 + }, + { + "epoch": 0.3511346650360103, + "grad_norm": 0.18985828757286072, + "learning_rate": 8e-05, + "loss": 2.4641, + "step": 323 + }, + { + "epoch": 0.352221769262128, + "grad_norm": 0.23207929730415344, + "learning_rate": 8e-05, + "loss": 2.5095, + "step": 324 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 0.24603039026260376, + "learning_rate": 8e-05, + "loss": 2.3933, + "step": 325 + }, + { + "epoch": 0.3543959777143634, + "grad_norm": 0.23867091536521912, + "learning_rate": 8e-05, + "loss": 2.5696, + "step": 326 + }, + { + "epoch": 0.35548308194048106, + "grad_norm": 0.321333110332489, + "learning_rate": 8e-05, + "loss": 2.5598, + "step": 327 + }, + { + "epoch": 0.35657018616659875, + "grad_norm": 0.22439239919185638, + "learning_rate": 8e-05, + "loss": 2.8095, + "step": 328 + }, + { + "epoch": 0.3576572903927164, + "grad_norm": 0.2510218620300293, + "learning_rate": 8e-05, + "loss": 2.5496, + "step": 329 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.18319284915924072, + "learning_rate": 8e-05, + "loss": 2.4899, + "step": 330 + }, + { + "epoch": 0.35983149884495175, + "grad_norm": 0.2805016040802002, + "learning_rate": 8e-05, + "loss": 2.8361, + "step": 331 + }, + { + "epoch": 0.36091860307106943, + "grad_norm": 0.19867131114006042, + "learning_rate": 8e-05, + "loss": 2.5067, + "step": 332 + }, + { + "epoch": 0.3620057072971871, + "grad_norm": 0.15917038917541504, + "learning_rate": 8e-05, + "loss": 2.4492, + "step": 333 + }, + { + "epoch": 0.3630928115233048, + "grad_norm": 0.28437408804893494, + "learning_rate": 8e-05, + "loss": 2.5722, + "step": 334 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 0.2445293366909027, + "learning_rate": 8e-05, + "loss": 2.742, + "step": 335 + }, + { + "epoch": 0.3652670199755402, + "grad_norm": 0.2666874825954437, + "learning_rate": 8e-05, + "loss": 2.7537, + "step": 336 + }, + { + "epoch": 0.36635412420165786, + "grad_norm": 0.1791556477546692, + "learning_rate": 8e-05, + "loss": 2.59, + "step": 337 + }, + { + "epoch": 0.3674412284277755, + "grad_norm": 0.20954462885856628, + "learning_rate": 8e-05, + "loss": 2.4946, + "step": 338 + }, + { + "epoch": 0.3685283326538932, + "grad_norm": 0.2492891103029251, + "learning_rate": 8e-05, + "loss": 2.6309, + "step": 339 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 0.2827008366584778, + "learning_rate": 8e-05, + "loss": 2.9174, + "step": 340 + }, + { + "epoch": 0.37070254110612855, + "grad_norm": 0.200570747256279, + "learning_rate": 8e-05, + "loss": 2.6176, + "step": 341 + }, + { + "epoch": 0.37178964533224623, + "grad_norm": 0.2485036849975586, + "learning_rate": 8e-05, + "loss": 2.5081, + "step": 342 + }, + { + "epoch": 0.3728767495583639, + "grad_norm": 0.1741902381181717, + "learning_rate": 8e-05, + "loss": 2.421, + "step": 343 + }, + { + "epoch": 0.3739638537844816, + "grad_norm": 0.20756232738494873, + "learning_rate": 8e-05, + "loss": 2.4375, + "step": 344 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 0.22192849218845367, + "learning_rate": 8e-05, + "loss": 2.4881, + "step": 345 + }, + { + "epoch": 0.3761380622367169, + "grad_norm": 0.2701265215873718, + "learning_rate": 8e-05, + "loss": 2.8489, + "step": 346 + }, + { + "epoch": 0.3772251664628346, + "grad_norm": 0.183476060628891, + "learning_rate": 8e-05, + "loss": 2.5363, + "step": 347 + }, + { + "epoch": 0.3783122706889523, + "grad_norm": 0.26317840814590454, + "learning_rate": 8e-05, + "loss": 2.8179, + "step": 348 + }, + { + "epoch": 0.37939937491507, + "grad_norm": 0.2133825272321701, + "learning_rate": 8e-05, + "loss": 2.5242, + "step": 349 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 0.2778710722923279, + "learning_rate": 8e-05, + "loss": 2.5892, + "step": 350 + }, + { + "epoch": 0.38157358336730535, + "grad_norm": 0.22975800931453705, + "learning_rate": 8e-05, + "loss": 2.6756, + "step": 351 + }, + { + "epoch": 0.38266068759342303, + "grad_norm": 0.2850879430770874, + "learning_rate": 8e-05, + "loss": 2.6953, + "step": 352 + }, + { + "epoch": 0.3837477918195407, + "grad_norm": 0.18681801855564117, + "learning_rate": 8e-05, + "loss": 2.7098, + "step": 353 + }, + { + "epoch": 0.3848348960456584, + "grad_norm": 0.174485981464386, + "learning_rate": 8e-05, + "loss": 2.3186, + "step": 354 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 0.22386078536510468, + "learning_rate": 8e-05, + "loss": 2.6432, + "step": 355 + }, + { + "epoch": 0.3870091044978937, + "grad_norm": 0.17607641220092773, + "learning_rate": 8e-05, + "loss": 2.5093, + "step": 356 + }, + { + "epoch": 0.3880962087240114, + "grad_norm": 0.31591373682022095, + "learning_rate": 8e-05, + "loss": 2.6728, + "step": 357 + }, + { + "epoch": 0.3891833129501291, + "grad_norm": 0.19923368096351624, + "learning_rate": 8e-05, + "loss": 2.6799, + "step": 358 + }, + { + "epoch": 0.3902704171762468, + "grad_norm": 0.18405026197433472, + "learning_rate": 8e-05, + "loss": 2.5314, + "step": 359 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 0.2218071073293686, + "learning_rate": 8e-05, + "loss": 2.4172, + "step": 360 + }, + { + "epoch": 0.39244462562848215, + "grad_norm": 0.25192755460739136, + "learning_rate": 8e-05, + "loss": 2.5965, + "step": 361 + }, + { + "epoch": 0.39353172985459983, + "grad_norm": 0.21659573912620544, + "learning_rate": 8e-05, + "loss": 2.602, + "step": 362 + }, + { + "epoch": 0.39461883408071746, + "grad_norm": 0.25699228048324585, + "learning_rate": 8e-05, + "loss": 2.4906, + "step": 363 + }, + { + "epoch": 0.39570593830683515, + "grad_norm": 0.22029569745063782, + "learning_rate": 8e-05, + "loss": 2.7201, + "step": 364 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 0.22668136656284332, + "learning_rate": 8e-05, + "loss": 2.7366, + "step": 365 + }, + { + "epoch": 0.3978801467590705, + "grad_norm": 0.25682350993156433, + "learning_rate": 8e-05, + "loss": 2.8469, + "step": 366 + }, + { + "epoch": 0.3989672509851882, + "grad_norm": 0.20977407693862915, + "learning_rate": 8e-05, + "loss": 2.5293, + "step": 367 + }, + { + "epoch": 0.4000543552113059, + "grad_norm": 0.18233545124530792, + "learning_rate": 8e-05, + "loss": 2.6155, + "step": 368 + }, + { + "epoch": 0.4011414594374236, + "grad_norm": 0.22229976952075958, + "learning_rate": 8e-05, + "loss": 2.6526, + "step": 369 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.2936011850833893, + "learning_rate": 8e-05, + "loss": 2.6662, + "step": 370 + }, + { + "epoch": 0.40331566788965895, + "grad_norm": 0.27802130579948425, + "learning_rate": 8e-05, + "loss": 2.6508, + "step": 371 + }, + { + "epoch": 0.4044027721157766, + "grad_norm": 0.1855441778898239, + "learning_rate": 8e-05, + "loss": 2.718, + "step": 372 + }, + { + "epoch": 0.40548987634189426, + "grad_norm": 0.2801089882850647, + "learning_rate": 8e-05, + "loss": 2.9642, + "step": 373 + }, + { + "epoch": 0.40657698056801195, + "grad_norm": 0.19018062949180603, + "learning_rate": 8e-05, + "loss": 2.6812, + "step": 374 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.22897948324680328, + "learning_rate": 8e-05, + "loss": 2.4886, + "step": 375 + }, + { + "epoch": 0.4087511890202473, + "grad_norm": 0.23201127350330353, + "learning_rate": 8e-05, + "loss": 2.455, + "step": 376 + }, + { + "epoch": 0.409838293246365, + "grad_norm": 0.24789221584796906, + "learning_rate": 8e-05, + "loss": 2.7142, + "step": 377 + }, + { + "epoch": 0.4109253974724827, + "grad_norm": 0.194185271859169, + "learning_rate": 8e-05, + "loss": 2.4141, + "step": 378 + }, + { + "epoch": 0.4120125016986004, + "grad_norm": 0.2882381081581116, + "learning_rate": 8e-05, + "loss": 2.7273, + "step": 379 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.2271953821182251, + "learning_rate": 8e-05, + "loss": 2.7347, + "step": 380 + }, + { + "epoch": 0.4141867101508357, + "grad_norm": 0.30258652567863464, + "learning_rate": 8e-05, + "loss": 2.6196, + "step": 381 + }, + { + "epoch": 0.4152738143769534, + "grad_norm": 0.21576198935508728, + "learning_rate": 8e-05, + "loss": 2.4849, + "step": 382 + }, + { + "epoch": 0.41636091860307106, + "grad_norm": 0.23280194401741028, + "learning_rate": 8e-05, + "loss": 2.485, + "step": 383 + }, + { + "epoch": 0.41744802282918875, + "grad_norm": 0.17125369608402252, + "learning_rate": 8e-05, + "loss": 2.3511, + "step": 384 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.21508488059043884, + "learning_rate": 8e-05, + "loss": 2.4354, + "step": 385 + }, + { + "epoch": 0.4196222312814241, + "grad_norm": 0.21491070091724396, + "learning_rate": 8e-05, + "loss": 2.5373, + "step": 386 + }, + { + "epoch": 0.4207093355075418, + "grad_norm": 0.36531469225883484, + "learning_rate": 8e-05, + "loss": 2.91, + "step": 387 + }, + { + "epoch": 0.4217964397336595, + "grad_norm": 0.2808130979537964, + "learning_rate": 8e-05, + "loss": 2.9195, + "step": 388 + }, + { + "epoch": 0.4228835439597771, + "grad_norm": 0.34745627641677856, + "learning_rate": 8e-05, + "loss": 2.5437, + "step": 389 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.28565704822540283, + "learning_rate": 8e-05, + "loss": 2.7628, + "step": 390 + }, + { + "epoch": 0.4250577524120125, + "grad_norm": 0.24669288098812103, + "learning_rate": 8e-05, + "loss": 2.7123, + "step": 391 + }, + { + "epoch": 0.4261448566381302, + "grad_norm": 0.20600421726703644, + "learning_rate": 8e-05, + "loss": 2.5081, + "step": 392 + }, + { + "epoch": 0.42723196086424786, + "grad_norm": 0.24662664532661438, + "learning_rate": 8e-05, + "loss": 2.5985, + "step": 393 + }, + { + "epoch": 0.42831906509036555, + "grad_norm": 0.20637661218643188, + "learning_rate": 8e-05, + "loss": 2.6118, + "step": 394 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.2504507303237915, + "learning_rate": 8e-05, + "loss": 2.7384, + "step": 395 + }, + { + "epoch": 0.4304932735426009, + "grad_norm": 0.21672147512435913, + "learning_rate": 8e-05, + "loss": 2.6591, + "step": 396 + }, + { + "epoch": 0.43158037776871855, + "grad_norm": 0.18554607033729553, + "learning_rate": 8e-05, + "loss": 2.4652, + "step": 397 + }, + { + "epoch": 0.43266748199483623, + "grad_norm": 0.19278907775878906, + "learning_rate": 8e-05, + "loss": 2.4733, + "step": 398 + }, + { + "epoch": 0.4337545862209539, + "grad_norm": 0.28226155042648315, + "learning_rate": 8e-05, + "loss": 2.6512, + "step": 399 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 0.27327367663383484, + "learning_rate": 8e-05, + "loss": 2.7556, + "step": 400 + }, + { + "epoch": 0.4359287946731893, + "grad_norm": 0.21287570893764496, + "learning_rate": 8e-05, + "loss": 2.6486, + "step": 401 + }, + { + "epoch": 0.437015898899307, + "grad_norm": 0.17834673821926117, + "learning_rate": 8e-05, + "loss": 2.4794, + "step": 402 + }, + { + "epoch": 0.43810300312542466, + "grad_norm": 0.226215198636055, + "learning_rate": 8e-05, + "loss": 2.6491, + "step": 403 + }, + { + "epoch": 0.43919010735154235, + "grad_norm": 0.21208593249320984, + "learning_rate": 8e-05, + "loss": 2.6761, + "step": 404 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.2032519280910492, + "learning_rate": 8e-05, + "loss": 2.5648, + "step": 405 + }, + { + "epoch": 0.44136431580377766, + "grad_norm": 0.2813989222049713, + "learning_rate": 8e-05, + "loss": 2.578, + "step": 406 + }, + { + "epoch": 0.44245142002989535, + "grad_norm": 0.2081678807735443, + "learning_rate": 8e-05, + "loss": 2.6766, + "step": 407 + }, + { + "epoch": 0.44353852425601303, + "grad_norm": 0.16450271010398865, + "learning_rate": 8e-05, + "loss": 2.4384, + "step": 408 + }, + { + "epoch": 0.4446256284821307, + "grad_norm": 0.26878130435943604, + "learning_rate": 8e-05, + "loss": 2.7346, + "step": 409 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 0.2213621884584427, + "learning_rate": 8e-05, + "loss": 2.5341, + "step": 410 + }, + { + "epoch": 0.4467998369343661, + "grad_norm": 0.19212883710861206, + "learning_rate": 8e-05, + "loss": 2.7588, + "step": 411 + }, + { + "epoch": 0.4478869411604838, + "grad_norm": 0.206886425614357, + "learning_rate": 8e-05, + "loss": 2.6373, + "step": 412 + }, + { + "epoch": 0.44897404538660146, + "grad_norm": 0.2944885790348053, + "learning_rate": 8e-05, + "loss": 2.7282, + "step": 413 + }, + { + "epoch": 0.45006114961271915, + "grad_norm": 0.17455589771270752, + "learning_rate": 8e-05, + "loss": 2.6046, + "step": 414 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.20827996730804443, + "learning_rate": 8e-05, + "loss": 2.6364, + "step": 415 + }, + { + "epoch": 0.45223535806495446, + "grad_norm": 0.2872694134712219, + "learning_rate": 8e-05, + "loss": 2.5584, + "step": 416 + }, + { + "epoch": 0.45332246229107215, + "grad_norm": 0.2136147916316986, + "learning_rate": 8e-05, + "loss": 2.619, + "step": 417 + }, + { + "epoch": 0.45440956651718983, + "grad_norm": 0.19407716393470764, + "learning_rate": 8e-05, + "loss": 2.516, + "step": 418 + }, + { + "epoch": 0.4554966707433075, + "grad_norm": 0.2835613489151001, + "learning_rate": 8e-05, + "loss": 2.563, + "step": 419 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 0.21422918140888214, + "learning_rate": 8e-05, + "loss": 2.5913, + "step": 420 + }, + { + "epoch": 0.4576708791955429, + "grad_norm": 0.2230289876461029, + "learning_rate": 8e-05, + "loss": 2.4396, + "step": 421 + }, + { + "epoch": 0.4587579834216606, + "grad_norm": 0.18986904621124268, + "learning_rate": 8e-05, + "loss": 2.647, + "step": 422 + }, + { + "epoch": 0.4598450876477782, + "grad_norm": 0.21688848733901978, + "learning_rate": 8e-05, + "loss": 2.5867, + "step": 423 + }, + { + "epoch": 0.4609321918738959, + "grad_norm": 0.250792533159256, + "learning_rate": 8e-05, + "loss": 2.8648, + "step": 424 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.19201241433620453, + "learning_rate": 8e-05, + "loss": 2.602, + "step": 425 + }, + { + "epoch": 0.46310640032613126, + "grad_norm": 0.20771661400794983, + "learning_rate": 8e-05, + "loss": 2.6862, + "step": 426 + }, + { + "epoch": 0.46419350455224895, + "grad_norm": 0.27982455492019653, + "learning_rate": 8e-05, + "loss": 2.6253, + "step": 427 + }, + { + "epoch": 0.46528060877836663, + "grad_norm": 0.22421002388000488, + "learning_rate": 8e-05, + "loss": 2.7057, + "step": 428 + }, + { + "epoch": 0.4663677130044843, + "grad_norm": 0.26257553696632385, + "learning_rate": 8e-05, + "loss": 2.7638, + "step": 429 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.2060101181268692, + "learning_rate": 8e-05, + "loss": 2.3578, + "step": 430 + }, + { + "epoch": 0.4685419214567197, + "grad_norm": 0.21063673496246338, + "learning_rate": 8e-05, + "loss": 2.4644, + "step": 431 + }, + { + "epoch": 0.4696290256828373, + "grad_norm": 0.24110709130764008, + "learning_rate": 8e-05, + "loss": 2.4528, + "step": 432 + }, + { + "epoch": 0.470716129908955, + "grad_norm": 0.220398411154747, + "learning_rate": 8e-05, + "loss": 2.6468, + "step": 433 + }, + { + "epoch": 0.4718032341350727, + "grad_norm": 0.22534501552581787, + "learning_rate": 8e-05, + "loss": 2.6238, + "step": 434 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 0.3070676922798157, + "learning_rate": 8e-05, + "loss": 2.8271, + "step": 435 + }, + { + "epoch": 0.47397744258730806, + "grad_norm": 0.28768453001976013, + "learning_rate": 8e-05, + "loss": 2.4974, + "step": 436 + }, + { + "epoch": 0.47506454681342575, + "grad_norm": 0.19919167459011078, + "learning_rate": 8e-05, + "loss": 2.4308, + "step": 437 + }, + { + "epoch": 0.47615165103954343, + "grad_norm": 0.22018972039222717, + "learning_rate": 8e-05, + "loss": 2.7667, + "step": 438 + }, + { + "epoch": 0.4772387552656611, + "grad_norm": 0.3046324551105499, + "learning_rate": 8e-05, + "loss": 2.4361, + "step": 439 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.176994189620018, + "learning_rate": 8e-05, + "loss": 2.4605, + "step": 440 + }, + { + "epoch": 0.47941296371789643, + "grad_norm": 0.26426082849502563, + "learning_rate": 8e-05, + "loss": 2.6778, + "step": 441 + }, + { + "epoch": 0.4805000679440141, + "grad_norm": 0.28179702162742615, + "learning_rate": 8e-05, + "loss": 2.5435, + "step": 442 + }, + { + "epoch": 0.4815871721701318, + "grad_norm": 0.3115067780017853, + "learning_rate": 8e-05, + "loss": 2.6775, + "step": 443 + }, + { + "epoch": 0.4826742763962495, + "grad_norm": 0.29136329889297485, + "learning_rate": 8e-05, + "loss": 2.6037, + "step": 444 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.3193109631538391, + "learning_rate": 8e-05, + "loss": 2.7079, + "step": 445 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.21345189213752747, + "learning_rate": 8e-05, + "loss": 2.6314, + "step": 446 + }, + { + "epoch": 0.48593558907460255, + "grad_norm": 0.18337033689022064, + "learning_rate": 8e-05, + "loss": 2.768, + "step": 447 + }, + { + "epoch": 0.48702269330072023, + "grad_norm": 0.29228952527046204, + "learning_rate": 8e-05, + "loss": 2.3995, + "step": 448 + }, + { + "epoch": 0.48810979752683786, + "grad_norm": 0.21599094569683075, + "learning_rate": 8e-05, + "loss": 2.4368, + "step": 449 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.23692837357521057, + "learning_rate": 8e-05, + "loss": 2.5977, + "step": 450 + }, + { + "epoch": 0.49028400597907323, + "grad_norm": 0.2822672426700592, + "learning_rate": 8e-05, + "loss": 2.5887, + "step": 451 + }, + { + "epoch": 0.4913711102051909, + "grad_norm": 0.1884002983570099, + "learning_rate": 8e-05, + "loss": 2.418, + "step": 452 + }, + { + "epoch": 0.4924582144313086, + "grad_norm": 0.23114743828773499, + "learning_rate": 8e-05, + "loss": 2.8977, + "step": 453 + }, + { + "epoch": 0.4935453186574263, + "grad_norm": 0.36670178174972534, + "learning_rate": 8e-05, + "loss": 2.681, + "step": 454 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.2136538028717041, + "learning_rate": 8e-05, + "loss": 2.6545, + "step": 455 + }, + { + "epoch": 0.49571952710966166, + "grad_norm": 0.2428305298089981, + "learning_rate": 8e-05, + "loss": 2.6605, + "step": 456 + }, + { + "epoch": 0.4968066313357793, + "grad_norm": 0.2097429484128952, + "learning_rate": 8e-05, + "loss": 2.3287, + "step": 457 + }, + { + "epoch": 0.497893735561897, + "grad_norm": 0.30625641345977783, + "learning_rate": 8e-05, + "loss": 2.6579, + "step": 458 + }, + { + "epoch": 0.49898083978801466, + "grad_norm": 0.21543031930923462, + "learning_rate": 8e-05, + "loss": 2.4359, + "step": 459 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.1620841771364212, + "learning_rate": 8e-05, + "loss": 2.4045, + "step": 460 + }, + { + "epoch": 0.50115504824025, + "grad_norm": 0.46116378903388977, + "learning_rate": 8e-05, + "loss": 2.6635, + "step": 461 + }, + { + "epoch": 0.5022421524663677, + "grad_norm": 0.20259852707386017, + "learning_rate": 8e-05, + "loss": 2.6525, + "step": 462 + }, + { + "epoch": 0.5033292566924854, + "grad_norm": 0.39548853039741516, + "learning_rate": 8e-05, + "loss": 2.6511, + "step": 463 + }, + { + "epoch": 0.5044163609186031, + "grad_norm": 0.2123095542192459, + "learning_rate": 8e-05, + "loss": 2.7076, + "step": 464 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.4113270044326782, + "learning_rate": 8e-05, + "loss": 2.6109, + "step": 465 + }, + { + "epoch": 0.5065905693708385, + "grad_norm": 0.2013510912656784, + "learning_rate": 8e-05, + "loss": 2.5089, + "step": 466 + }, + { + "epoch": 0.5076776735969561, + "grad_norm": 0.2560182511806488, + "learning_rate": 8e-05, + "loss": 2.7458, + "step": 467 + }, + { + "epoch": 0.5087647778230738, + "grad_norm": 0.30504193902015686, + "learning_rate": 8e-05, + "loss": 2.6229, + "step": 468 + }, + { + "epoch": 0.5098518820491915, + "grad_norm": 0.2004050314426422, + "learning_rate": 8e-05, + "loss": 2.6744, + "step": 469 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.2589462399482727, + "learning_rate": 8e-05, + "loss": 2.5851, + "step": 470 + }, + { + "epoch": 0.5120260905014268, + "grad_norm": 0.23366373777389526, + "learning_rate": 8e-05, + "loss": 2.694, + "step": 471 + }, + { + "epoch": 0.5131131947275445, + "grad_norm": 0.24854260683059692, + "learning_rate": 8e-05, + "loss": 2.5622, + "step": 472 + }, + { + "epoch": 0.5142002989536621, + "grad_norm": 0.26010555028915405, + "learning_rate": 8e-05, + "loss": 2.4512, + "step": 473 + }, + { + "epoch": 0.5152874031797798, + "grad_norm": 0.22194120287895203, + "learning_rate": 8e-05, + "loss": 2.7111, + "step": 474 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 0.24803297221660614, + "learning_rate": 8e-05, + "loss": 2.4904, + "step": 475 + }, + { + "epoch": 0.5174616116320152, + "grad_norm": 0.23401300609111786, + "learning_rate": 8e-05, + "loss": 2.5487, + "step": 476 + }, + { + "epoch": 0.5185487158581329, + "grad_norm": 0.2260722517967224, + "learning_rate": 8e-05, + "loss": 2.7317, + "step": 477 + }, + { + "epoch": 0.5196358200842506, + "grad_norm": 0.20105749368667603, + "learning_rate": 8e-05, + "loss": 2.4866, + "step": 478 + }, + { + "epoch": 0.5207229243103683, + "grad_norm": 0.1568683683872223, + "learning_rate": 8e-05, + "loss": 2.4315, + "step": 479 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.22254113852977753, + "learning_rate": 8e-05, + "loss": 2.5453, + "step": 480 + }, + { + "epoch": 0.5228971327626036, + "grad_norm": 0.2921615540981293, + "learning_rate": 8e-05, + "loss": 2.8772, + "step": 481 + }, + { + "epoch": 0.5239842369887213, + "grad_norm": 0.19162219762802124, + "learning_rate": 8e-05, + "loss": 2.5175, + "step": 482 + }, + { + "epoch": 0.525071341214839, + "grad_norm": 0.22468820214271545, + "learning_rate": 8e-05, + "loss": 2.6928, + "step": 483 + }, + { + "epoch": 0.5261584454409567, + "grad_norm": 0.19029191136360168, + "learning_rate": 8e-05, + "loss": 2.2626, + "step": 484 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 0.1988745927810669, + "learning_rate": 8e-05, + "loss": 2.5301, + "step": 485 + }, + { + "epoch": 0.5283326538931921, + "grad_norm": 0.17997898161411285, + "learning_rate": 8e-05, + "loss": 2.3693, + "step": 486 + }, + { + "epoch": 0.5294197581193096, + "grad_norm": 0.23908165097236633, + "learning_rate": 8e-05, + "loss": 2.6242, + "step": 487 + }, + { + "epoch": 0.5305068623454273, + "grad_norm": 0.39755916595458984, + "learning_rate": 8e-05, + "loss": 2.7795, + "step": 488 + }, + { + "epoch": 0.531593966571545, + "grad_norm": 0.17780715227127075, + "learning_rate": 8e-05, + "loss": 2.6092, + "step": 489 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 0.25898826122283936, + "learning_rate": 8e-05, + "loss": 2.4112, + "step": 490 + }, + { + "epoch": 0.5337681750237804, + "grad_norm": 0.4918609857559204, + "learning_rate": 8e-05, + "loss": 2.7958, + "step": 491 + }, + { + "epoch": 0.5348552792498981, + "grad_norm": 0.34552201628685, + "learning_rate": 8e-05, + "loss": 2.7417, + "step": 492 + }, + { + "epoch": 0.5359423834760157, + "grad_norm": 0.4940396845340729, + "learning_rate": 8e-05, + "loss": 2.6919, + "step": 493 + }, + { + "epoch": 0.5370294877021334, + "grad_norm": 0.23338870704174042, + "learning_rate": 8e-05, + "loss": 2.399, + "step": 494 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.4842647314071655, + "learning_rate": 8e-05, + "loss": 2.7612, + "step": 495 + }, + { + "epoch": 0.5392036961543688, + "grad_norm": 0.23610053956508636, + "learning_rate": 8e-05, + "loss": 2.7365, + "step": 496 + }, + { + "epoch": 0.5402908003804865, + "grad_norm": 0.4675828516483307, + "learning_rate": 8e-05, + "loss": 2.6929, + "step": 497 + }, + { + "epoch": 0.5413779046066042, + "grad_norm": 0.2838115692138672, + "learning_rate": 8e-05, + "loss": 2.4773, + "step": 498 + }, + { + "epoch": 0.5424650088327219, + "grad_norm": 0.20726549625396729, + "learning_rate": 8e-05, + "loss": 2.3243, + "step": 499 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.3642233908176422, + "learning_rate": 8e-05, + "loss": 2.6699, + "step": 500 + }, + { + "epoch": 0.5446392172849572, + "grad_norm": 0.28025588393211365, + "learning_rate": 8e-05, + "loss": 2.6198, + "step": 501 + }, + { + "epoch": 0.5457263215110749, + "grad_norm": 0.2388845682144165, + "learning_rate": 8e-05, + "loss": 2.4221, + "step": 502 + }, + { + "epoch": 0.5468134257371926, + "grad_norm": 0.25004518032073975, + "learning_rate": 8e-05, + "loss": 2.4969, + "step": 503 + }, + { + "epoch": 0.5479005299633102, + "grad_norm": 0.24011000990867615, + "learning_rate": 8e-05, + "loss": 2.6083, + "step": 504 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.23871102929115295, + "learning_rate": 8e-05, + "loss": 2.4869, + "step": 505 + }, + { + "epoch": 0.5500747384155455, + "grad_norm": 0.34483233094215393, + "learning_rate": 8e-05, + "loss": 2.6895, + "step": 506 + }, + { + "epoch": 0.5511618426416632, + "grad_norm": 0.22084181010723114, + "learning_rate": 8e-05, + "loss": 2.8537, + "step": 507 + }, + { + "epoch": 0.5522489468677809, + "grad_norm": 0.24399428069591522, + "learning_rate": 8e-05, + "loss": 2.6289, + "step": 508 + }, + { + "epoch": 0.5533360510938986, + "grad_norm": 0.2644304037094116, + "learning_rate": 8e-05, + "loss": 2.6045, + "step": 509 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.19887222349643707, + "learning_rate": 8e-05, + "loss": 2.5859, + "step": 510 + }, + { + "epoch": 0.555510259546134, + "grad_norm": 0.2816789150238037, + "learning_rate": 8e-05, + "loss": 2.6778, + "step": 511 + }, + { + "epoch": 0.5565973637722517, + "grad_norm": 0.15038593113422394, + "learning_rate": 8e-05, + "loss": 2.3545, + "step": 512 + }, + { + "epoch": 0.5576844679983693, + "grad_norm": 0.17188872396945953, + "learning_rate": 8e-05, + "loss": 2.553, + "step": 513 + }, + { + "epoch": 0.558771572224487, + "grad_norm": 0.17198430001735687, + "learning_rate": 8e-05, + "loss": 2.4905, + "step": 514 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.1813175082206726, + "learning_rate": 8e-05, + "loss": 2.4595, + "step": 515 + }, + { + "epoch": 0.5609457806767224, + "grad_norm": 0.19259634613990784, + "learning_rate": 8e-05, + "loss": 2.5214, + "step": 516 + }, + { + "epoch": 0.5620328849028401, + "grad_norm": 0.24767690896987915, + "learning_rate": 8e-05, + "loss": 2.5865, + "step": 517 + }, + { + "epoch": 0.5631199891289578, + "grad_norm": 0.27908816933631897, + "learning_rate": 8e-05, + "loss": 2.3923, + "step": 518 + }, + { + "epoch": 0.5642070933550755, + "grad_norm": 0.1764037162065506, + "learning_rate": 8e-05, + "loss": 2.6532, + "step": 519 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.2394370436668396, + "learning_rate": 8e-05, + "loss": 2.8574, + "step": 520 + }, + { + "epoch": 0.5663813018073107, + "grad_norm": 0.204293891787529, + "learning_rate": 8e-05, + "loss": 2.8942, + "step": 521 + }, + { + "epoch": 0.5674684060334284, + "grad_norm": 0.2370186299085617, + "learning_rate": 8e-05, + "loss": 2.6307, + "step": 522 + }, + { + "epoch": 0.5685555102595461, + "grad_norm": 0.16939158737659454, + "learning_rate": 8e-05, + "loss": 2.6127, + "step": 523 + }, + { + "epoch": 0.5696426144856638, + "grad_norm": 0.1784953773021698, + "learning_rate": 8e-05, + "loss": 2.686, + "step": 524 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.15965938568115234, + "learning_rate": 8e-05, + "loss": 2.3886, + "step": 525 + }, + { + "epoch": 0.5718168229378991, + "grad_norm": 0.24406729638576508, + "learning_rate": 8e-05, + "loss": 2.6877, + "step": 526 + }, + { + "epoch": 0.5729039271640168, + "grad_norm": 0.2516114115715027, + "learning_rate": 8e-05, + "loss": 2.7208, + "step": 527 + }, + { + "epoch": 0.5739910313901345, + "grad_norm": 0.20796844363212585, + "learning_rate": 8e-05, + "loss": 2.6516, + "step": 528 + }, + { + "epoch": 0.5750781356162522, + "grad_norm": 0.1795925796031952, + "learning_rate": 8e-05, + "loss": 2.3656, + "step": 529 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.25777432322502136, + "learning_rate": 8e-05, + "loss": 2.5944, + "step": 530 + }, + { + "epoch": 0.5772523440684876, + "grad_norm": 0.31613433361053467, + "learning_rate": 8e-05, + "loss": 2.8876, + "step": 531 + }, + { + "epoch": 0.5783394482946053, + "grad_norm": 0.31211256980895996, + "learning_rate": 8e-05, + "loss": 2.74, + "step": 532 + }, + { + "epoch": 0.579426552520723, + "grad_norm": 0.2686806917190552, + "learning_rate": 8e-05, + "loss": 2.411, + "step": 533 + }, + { + "epoch": 0.5805136567468406, + "grad_norm": 0.18795549869537354, + "learning_rate": 8e-05, + "loss": 2.6824, + "step": 534 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 0.2706974744796753, + "learning_rate": 8e-05, + "loss": 2.4265, + "step": 535 + }, + { + "epoch": 0.582687865199076, + "grad_norm": 0.3143719434738159, + "learning_rate": 8e-05, + "loss": 2.7433, + "step": 536 + }, + { + "epoch": 0.5837749694251937, + "grad_norm": 0.3055052161216736, + "learning_rate": 8e-05, + "loss": 2.5379, + "step": 537 + }, + { + "epoch": 0.5848620736513113, + "grad_norm": 0.3382609188556671, + "learning_rate": 8e-05, + "loss": 3.0399, + "step": 538 + }, + { + "epoch": 0.585949177877429, + "grad_norm": 0.17904764413833618, + "learning_rate": 8e-05, + "loss": 2.5064, + "step": 539 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.3742446303367615, + "learning_rate": 8e-05, + "loss": 2.5411, + "step": 540 + }, + { + "epoch": 0.5881233863296643, + "grad_norm": 0.20331904292106628, + "learning_rate": 8e-05, + "loss": 2.5591, + "step": 541 + }, + { + "epoch": 0.589210490555782, + "grad_norm": 0.2534007728099823, + "learning_rate": 8e-05, + "loss": 2.4984, + "step": 542 + }, + { + "epoch": 0.5902975947818997, + "grad_norm": 0.3521711230278015, + "learning_rate": 8e-05, + "loss": 2.6077, + "step": 543 + }, + { + "epoch": 0.5913846990080174, + "grad_norm": 0.18652194738388062, + "learning_rate": 8e-05, + "loss": 2.7411, + "step": 544 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.2977702021598816, + "learning_rate": 8e-05, + "loss": 2.6113, + "step": 545 + }, + { + "epoch": 0.5935589074602527, + "grad_norm": 0.2756431996822357, + "learning_rate": 8e-05, + "loss": 2.5185, + "step": 546 + }, + { + "epoch": 0.5946460116863704, + "grad_norm": 0.22625668346881866, + "learning_rate": 8e-05, + "loss": 2.7018, + "step": 547 + }, + { + "epoch": 0.5957331159124881, + "grad_norm": 0.2432011365890503, + "learning_rate": 8e-05, + "loss": 2.6205, + "step": 548 + }, + { + "epoch": 0.5968202201386058, + "grad_norm": 0.2109351009130478, + "learning_rate": 8e-05, + "loss": 2.7193, + "step": 549 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.19625385105609894, + "learning_rate": 8e-05, + "loss": 2.5419, + "step": 550 + }, + { + "epoch": 0.5989944285908412, + "grad_norm": 0.24168696999549866, + "learning_rate": 8e-05, + "loss": 2.3459, + "step": 551 + }, + { + "epoch": 0.6000815328169589, + "grad_norm": 0.19131049513816833, + "learning_rate": 8e-05, + "loss": 2.5184, + "step": 552 + }, + { + "epoch": 0.6011686370430765, + "grad_norm": 0.24158018827438354, + "learning_rate": 8e-05, + "loss": 2.9306, + "step": 553 + }, + { + "epoch": 0.6022557412691942, + "grad_norm": 0.26671212911605835, + "learning_rate": 8e-05, + "loss": 2.6283, + "step": 554 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.21698710322380066, + "learning_rate": 8e-05, + "loss": 2.7353, + "step": 555 + }, + { + "epoch": 0.6044299497214295, + "grad_norm": 0.2515629827976227, + "learning_rate": 8e-05, + "loss": 2.6427, + "step": 556 + }, + { + "epoch": 0.6055170539475472, + "grad_norm": 0.1712554544210434, + "learning_rate": 8e-05, + "loss": 2.557, + "step": 557 + }, + { + "epoch": 0.6066041581736649, + "grad_norm": 0.2252780944108963, + "learning_rate": 8e-05, + "loss": 2.5625, + "step": 558 + }, + { + "epoch": 0.6076912623997826, + "grad_norm": 0.18625782430171967, + "learning_rate": 8e-05, + "loss": 2.7296, + "step": 559 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.24307093024253845, + "learning_rate": 8e-05, + "loss": 2.5914, + "step": 560 + }, + { + "epoch": 0.6098654708520179, + "grad_norm": 0.19246543943881989, + "learning_rate": 8e-05, + "loss": 2.5483, + "step": 561 + }, + { + "epoch": 0.6109525750781356, + "grad_norm": 0.2455776482820511, + "learning_rate": 8e-05, + "loss": 2.6067, + "step": 562 + }, + { + "epoch": 0.6120396793042533, + "grad_norm": 0.16997501254081726, + "learning_rate": 8e-05, + "loss": 2.3667, + "step": 563 + }, + { + "epoch": 0.613126783530371, + "grad_norm": 0.2945926785469055, + "learning_rate": 8e-05, + "loss": 2.8638, + "step": 564 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.21388252079486847, + "learning_rate": 8e-05, + "loss": 2.5535, + "step": 565 + }, + { + "epoch": 0.6153009919826063, + "grad_norm": 0.27458804845809937, + "learning_rate": 8e-05, + "loss": 2.6685, + "step": 566 + }, + { + "epoch": 0.616388096208724, + "grad_norm": 0.2044813334941864, + "learning_rate": 8e-05, + "loss": 2.517, + "step": 567 + }, + { + "epoch": 0.6174752004348417, + "grad_norm": 0.18248596787452698, + "learning_rate": 8e-05, + "loss": 2.5029, + "step": 568 + }, + { + "epoch": 0.6185623046609594, + "grad_norm": 0.2239370048046112, + "learning_rate": 8e-05, + "loss": 2.6914, + "step": 569 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.22053289413452148, + "learning_rate": 8e-05, + "loss": 2.5147, + "step": 570 + }, + { + "epoch": 0.6207365131131948, + "grad_norm": 0.15921327471733093, + "learning_rate": 8e-05, + "loss": 2.4666, + "step": 571 + }, + { + "epoch": 0.6218236173393124, + "grad_norm": 0.3132762014865875, + "learning_rate": 8e-05, + "loss": 2.7194, + "step": 572 + }, + { + "epoch": 0.62291072156543, + "grad_norm": 0.18798871338367462, + "learning_rate": 8e-05, + "loss": 2.7374, + "step": 573 + }, + { + "epoch": 0.6239978257915477, + "grad_norm": 0.19612906873226166, + "learning_rate": 8e-05, + "loss": 2.5695, + "step": 574 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.19844460487365723, + "learning_rate": 8e-05, + "loss": 2.4379, + "step": 575 + }, + { + "epoch": 0.6261720342437831, + "grad_norm": 0.21438564360141754, + "learning_rate": 8e-05, + "loss": 2.5401, + "step": 576 + }, + { + "epoch": 0.6272591384699008, + "grad_norm": 0.22666309773921967, + "learning_rate": 8e-05, + "loss": 2.5291, + "step": 577 + }, + { + "epoch": 0.6283462426960185, + "grad_norm": 0.17162497341632843, + "learning_rate": 8e-05, + "loss": 2.474, + "step": 578 + }, + { + "epoch": 0.6294333469221361, + "grad_norm": 0.2103007584810257, + "learning_rate": 8e-05, + "loss": 2.6709, + "step": 579 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.17329515516757965, + "learning_rate": 8e-05, + "loss": 2.5378, + "step": 580 + }, + { + "epoch": 0.6316075553743715, + "grad_norm": 0.18160247802734375, + "learning_rate": 8e-05, + "loss": 2.4837, + "step": 581 + }, + { + "epoch": 0.6326946596004892, + "grad_norm": 0.20657005906105042, + "learning_rate": 8e-05, + "loss": 2.4943, + "step": 582 + }, + { + "epoch": 0.6337817638266069, + "grad_norm": 0.2123149186372757, + "learning_rate": 8e-05, + "loss": 2.5828, + "step": 583 + }, + { + "epoch": 0.6348688680527246, + "grad_norm": 0.34785452485084534, + "learning_rate": 8e-05, + "loss": 2.849, + "step": 584 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.3708589971065521, + "learning_rate": 8e-05, + "loss": 2.6221, + "step": 585 + }, + { + "epoch": 0.63704307650496, + "grad_norm": 0.28052204847335815, + "learning_rate": 8e-05, + "loss": 2.8145, + "step": 586 + }, + { + "epoch": 0.6381301807310776, + "grad_norm": 0.2516515851020813, + "learning_rate": 8e-05, + "loss": 2.3935, + "step": 587 + }, + { + "epoch": 0.6392172849571953, + "grad_norm": 0.3348628580570221, + "learning_rate": 8e-05, + "loss": 2.5096, + "step": 588 + }, + { + "epoch": 0.640304389183313, + "grad_norm": 0.1764392852783203, + "learning_rate": 8e-05, + "loss": 2.389, + "step": 589 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.34492596983909607, + "learning_rate": 8e-05, + "loss": 2.8998, + "step": 590 + }, + { + "epoch": 0.6424785976355483, + "grad_norm": 0.16219203174114227, + "learning_rate": 8e-05, + "loss": 2.4584, + "step": 591 + }, + { + "epoch": 0.643565701861666, + "grad_norm": 0.1792818307876587, + "learning_rate": 8e-05, + "loss": 2.4709, + "step": 592 + }, + { + "epoch": 0.6446528060877836, + "grad_norm": 0.24766263365745544, + "learning_rate": 8e-05, + "loss": 2.6406, + "step": 593 + }, + { + "epoch": 0.6457399103139013, + "grad_norm": 0.19446979463100433, + "learning_rate": 8e-05, + "loss": 2.6262, + "step": 594 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.22750389575958252, + "learning_rate": 8e-05, + "loss": 2.4815, + "step": 595 + }, + { + "epoch": 0.6479141187661367, + "grad_norm": 0.24159416556358337, + "learning_rate": 8e-05, + "loss": 2.4862, + "step": 596 + }, + { + "epoch": 0.6490012229922544, + "grad_norm": 0.1971299797296524, + "learning_rate": 8e-05, + "loss": 2.8525, + "step": 597 + }, + { + "epoch": 0.6500883272183721, + "grad_norm": 0.24144640564918518, + "learning_rate": 8e-05, + "loss": 2.4892, + "step": 598 + }, + { + "epoch": 0.6511754314444897, + "grad_norm": 0.23594394326210022, + "learning_rate": 8e-05, + "loss": 2.7762, + "step": 599 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.24387364089488983, + "learning_rate": 8e-05, + "loss": 2.86, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 919, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.6600150075899904e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}