| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 460, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004347826086956522, | |
| "grad_norm": 209.3180389404297, | |
| "learning_rate": 0.0, | |
| "loss": 5.8188, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008695652173913044, | |
| "grad_norm": 215.69874572753906, | |
| "learning_rate": 4.347826086956522e-06, | |
| "loss": 5.9259, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.013043478260869565, | |
| "grad_norm": 62.712825775146484, | |
| "learning_rate": 8.695652173913044e-06, | |
| "loss": 5.4202, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017391304347826087, | |
| "grad_norm": 85.59194946289062, | |
| "learning_rate": 1.3043478260869566e-05, | |
| "loss": 5.3079, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.021739130434782608, | |
| "grad_norm": 22.901897430419922, | |
| "learning_rate": 1.739130434782609e-05, | |
| "loss": 5.0196, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02608695652173913, | |
| "grad_norm": 22.081829071044922, | |
| "learning_rate": 2.173913043478261e-05, | |
| "loss": 4.8222, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.030434782608695653, | |
| "grad_norm": 11.022245407104492, | |
| "learning_rate": 2.608695652173913e-05, | |
| "loss": 4.4617, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.034782608695652174, | |
| "grad_norm": 7.274469375610352, | |
| "learning_rate": 3.0434782608695656e-05, | |
| "loss": 4.335, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0391304347826087, | |
| "grad_norm": 3.8645834922790527, | |
| "learning_rate": 3.478260869565218e-05, | |
| "loss": 4.0476, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 2.6724016666412354, | |
| "learning_rate": 3.91304347826087e-05, | |
| "loss": 3.8387, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04782608695652174, | |
| "grad_norm": 2.258195161819458, | |
| "learning_rate": 4.347826086956522e-05, | |
| "loss": 3.8144, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.05217391304347826, | |
| "grad_norm": 1.8822625875473022, | |
| "learning_rate": 4.782608695652174e-05, | |
| "loss": 3.4008, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05652173913043478, | |
| "grad_norm": 2.047840118408203, | |
| "learning_rate": 5.217391304347826e-05, | |
| "loss": 3.2554, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.06086956521739131, | |
| "grad_norm": 1.8671568632125854, | |
| "learning_rate": 5.652173913043478e-05, | |
| "loss": 3.2461, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06521739130434782, | |
| "grad_norm": 1.6069483757019043, | |
| "learning_rate": 6.086956521739131e-05, | |
| "loss": 2.9738, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06956521739130435, | |
| "grad_norm": 1.3096915483474731, | |
| "learning_rate": 6.521739130434783e-05, | |
| "loss": 2.7823, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.07391304347826087, | |
| "grad_norm": 1.3594956398010254, | |
| "learning_rate": 6.956521739130436e-05, | |
| "loss": 2.6255, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0782608695652174, | |
| "grad_norm": 1.0210895538330078, | |
| "learning_rate": 7.391304347826086e-05, | |
| "loss": 2.4501, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08260869565217391, | |
| "grad_norm": 0.8942164182662964, | |
| "learning_rate": 7.82608695652174e-05, | |
| "loss": 2.2934, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 0.8361735343933105, | |
| "learning_rate": 8.260869565217392e-05, | |
| "loss": 2.2029, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09130434782608696, | |
| "grad_norm": 0.794482409954071, | |
| "learning_rate": 8.695652173913044e-05, | |
| "loss": 2.0223, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09565217391304348, | |
| "grad_norm": 0.7513137459754944, | |
| "learning_rate": 9.130434782608696e-05, | |
| "loss": 1.8504, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.76312655210495, | |
| "learning_rate": 9.565217391304348e-05, | |
| "loss": 1.6577, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10434782608695652, | |
| "grad_norm": 0.8560758829116821, | |
| "learning_rate": 0.0001, | |
| "loss": 1.5565, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.10869565217391304, | |
| "grad_norm": 0.7479954957962036, | |
| "learning_rate": 0.00010434782608695653, | |
| "loss": 1.4364, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11304347826086956, | |
| "grad_norm": 0.5951140522956848, | |
| "learning_rate": 0.00010869565217391305, | |
| "loss": 1.2957, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.11739130434782609, | |
| "grad_norm": 0.503224790096283, | |
| "learning_rate": 0.00011304347826086956, | |
| "loss": 1.1799, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.12173913043478261, | |
| "grad_norm": 0.47480374574661255, | |
| "learning_rate": 0.0001173913043478261, | |
| "loss": 1.1277, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.12608695652173912, | |
| "grad_norm": 0.38552260398864746, | |
| "learning_rate": 0.00012173913043478263, | |
| "loss": 1.0744, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 0.35596558451652527, | |
| "learning_rate": 0.00012608695652173915, | |
| "loss": 1.0023, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13478260869565217, | |
| "grad_norm": 0.32971665263175964, | |
| "learning_rate": 0.00013043478260869567, | |
| "loss": 0.9691, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.1391304347826087, | |
| "grad_norm": 0.37770169973373413, | |
| "learning_rate": 0.0001347826086956522, | |
| "loss": 0.9116, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.14347826086956522, | |
| "grad_norm": 0.22640736401081085, | |
| "learning_rate": 0.0001391304347826087, | |
| "loss": 0.8613, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.14782608695652175, | |
| "grad_norm": 0.20925410091876984, | |
| "learning_rate": 0.0001434782608695652, | |
| "loss": 0.8836, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.15217391304347827, | |
| "grad_norm": 0.20542123913764954, | |
| "learning_rate": 0.00014782608695652173, | |
| "loss": 0.8502, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1565217391304348, | |
| "grad_norm": 0.16715222597122192, | |
| "learning_rate": 0.00015217391304347827, | |
| "loss": 0.8292, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1608695652173913, | |
| "grad_norm": 0.1648133248090744, | |
| "learning_rate": 0.0001565217391304348, | |
| "loss": 0.8189, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.16521739130434782, | |
| "grad_norm": 0.13562779128551483, | |
| "learning_rate": 0.00016086956521739132, | |
| "loss": 0.8078, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.16956521739130434, | |
| "grad_norm": 0.1290610432624817, | |
| "learning_rate": 0.00016521739130434784, | |
| "loss": 0.7712, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 0.11024343967437744, | |
| "learning_rate": 0.00016956521739130436, | |
| "loss": 0.7448, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1782608695652174, | |
| "grad_norm": 0.12418993562459946, | |
| "learning_rate": 0.00017391304347826088, | |
| "loss": 0.7633, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1826086956521739, | |
| "grad_norm": 0.10319849103689194, | |
| "learning_rate": 0.0001782608695652174, | |
| "loss": 0.7463, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.18695652173913044, | |
| "grad_norm": 0.10371455550193787, | |
| "learning_rate": 0.00018260869565217392, | |
| "loss": 0.7516, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.19130434782608696, | |
| "grad_norm": 0.09219090640544891, | |
| "learning_rate": 0.00018695652173913045, | |
| "loss": 0.7265, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.1956521739130435, | |
| "grad_norm": 0.09577666968107224, | |
| "learning_rate": 0.00019130434782608697, | |
| "loss": 0.7382, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.08755916357040405, | |
| "learning_rate": 0.0001956521739130435, | |
| "loss": 0.7392, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.20434782608695654, | |
| "grad_norm": 0.08335893601179123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7182, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.20869565217391303, | |
| "grad_norm": 0.08622466027736664, | |
| "learning_rate": 0.00019999712083215463, | |
| "loss": 0.7196, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.21304347826086956, | |
| "grad_norm": 0.07222707569599152, | |
| "learning_rate": 0.00019998848349441062, | |
| "loss": 0.7014, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 0.07286012172698975, | |
| "learning_rate": 0.00019997408848413493, | |
| "loss": 0.6986, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2217391304347826, | |
| "grad_norm": 0.07811558246612549, | |
| "learning_rate": 0.00019995393663024054, | |
| "loss": 0.6922, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.22608695652173913, | |
| "grad_norm": 0.07095416635274887, | |
| "learning_rate": 0.0001999280290931388, | |
| "loss": 0.7188, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.23043478260869565, | |
| "grad_norm": 0.0705651044845581, | |
| "learning_rate": 0.00019989636736467278, | |
| "loss": 0.7135, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.23478260869565218, | |
| "grad_norm": 0.0649741142988205, | |
| "learning_rate": 0.00019985895326803097, | |
| "loss": 0.6833, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2391304347826087, | |
| "grad_norm": 0.07023416459560394, | |
| "learning_rate": 0.00019981578895764273, | |
| "loss": 0.6902, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.24347826086956523, | |
| "grad_norm": 0.065043605864048, | |
| "learning_rate": 0.00019976687691905393, | |
| "loss": 0.6933, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.24782608695652175, | |
| "grad_norm": 0.0647321566939354, | |
| "learning_rate": 0.00019971221996878394, | |
| "loss": 0.6946, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.25217391304347825, | |
| "grad_norm": 0.08214448392391205, | |
| "learning_rate": 0.0001996518212541634, | |
| "loss": 0.6789, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.2565217391304348, | |
| "grad_norm": 0.06106014922261238, | |
| "learning_rate": 0.00019958568425315314, | |
| "loss": 0.6826, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 0.06052952632308006, | |
| "learning_rate": 0.0001995138127741436, | |
| "loss": 0.6706, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.26521739130434785, | |
| "grad_norm": 0.06265316903591156, | |
| "learning_rate": 0.00019943621095573586, | |
| "loss": 0.6809, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.26956521739130435, | |
| "grad_norm": 0.0603368878364563, | |
| "learning_rate": 0.00019935288326650312, | |
| "loss": 0.6728, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.27391304347826084, | |
| "grad_norm": 0.06611189991235733, | |
| "learning_rate": 0.00019926383450473344, | |
| "loss": 0.6499, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2782608695652174, | |
| "grad_norm": 0.06278355419635773, | |
| "learning_rate": 0.00019916906979815347, | |
| "loss": 0.6561, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.2826086956521739, | |
| "grad_norm": 0.07379094511270523, | |
| "learning_rate": 0.00019906859460363307, | |
| "loss": 0.6786, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.28695652173913044, | |
| "grad_norm": 0.09574166685342789, | |
| "learning_rate": 0.0001989624147068713, | |
| "loss": 0.6625, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.29130434782608694, | |
| "grad_norm": 0.08743462711572647, | |
| "learning_rate": 0.00019885053622206304, | |
| "loss": 0.648, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.2956521739130435, | |
| "grad_norm": 0.08914034813642502, | |
| "learning_rate": 0.00019873296559154698, | |
| "loss": 0.6561, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.06804706901311874, | |
| "learning_rate": 0.0001986097095854347, | |
| "loss": 0.658, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 0.09893489629030228, | |
| "learning_rate": 0.00019848077530122083, | |
| "loss": 0.6708, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.30869565217391304, | |
| "grad_norm": 0.07928409427404404, | |
| "learning_rate": 0.0001983461701633742, | |
| "loss": 0.6407, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.3130434782608696, | |
| "grad_norm": 0.07455449551343918, | |
| "learning_rate": 0.0001982059019229106, | |
| "loss": 0.676, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.3173913043478261, | |
| "grad_norm": 0.0770968496799469, | |
| "learning_rate": 0.00019805997865694614, | |
| "loss": 0.6639, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3217391304347826, | |
| "grad_norm": 0.06771919876337051, | |
| "learning_rate": 0.00019790840876823232, | |
| "loss": 0.6486, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.32608695652173914, | |
| "grad_norm": 0.07457810640335083, | |
| "learning_rate": 0.0001977512009846721, | |
| "loss": 0.6681, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.33043478260869563, | |
| "grad_norm": 0.0826922208070755, | |
| "learning_rate": 0.00019758836435881746, | |
| "loss": 0.6356, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.3347826086956522, | |
| "grad_norm": 0.07923886179924011, | |
| "learning_rate": 0.00019741990826734794, | |
| "loss": 0.6682, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.3391304347826087, | |
| "grad_norm": 0.11045071482658386, | |
| "learning_rate": 0.0001972458424105307, | |
| "loss": 0.6203, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.34347826086956523, | |
| "grad_norm": 0.11731227487325668, | |
| "learning_rate": 0.00019706617681166218, | |
| "loss": 0.66, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 0.12649305164813995, | |
| "learning_rate": 0.00019688092181649065, | |
| "loss": 0.6613, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3521739130434783, | |
| "grad_norm": 0.1144268661737442, | |
| "learning_rate": 0.00019669008809262062, | |
| "loss": 0.6606, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3565217391304348, | |
| "grad_norm": 0.11361440271139145, | |
| "learning_rate": 0.00019649368662889855, | |
| "loss": 0.629, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.36086956521739133, | |
| "grad_norm": 0.12539249658584595, | |
| "learning_rate": 0.00019629172873477995, | |
| "loss": 0.6676, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3652173913043478, | |
| "grad_norm": 0.11141279339790344, | |
| "learning_rate": 0.00019608422603967836, | |
| "loss": 0.6376, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.3695652173913043, | |
| "grad_norm": 0.09837634861469269, | |
| "learning_rate": 0.00019587119049229557, | |
| "loss": 0.6503, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3739130434782609, | |
| "grad_norm": 0.15677575767040253, | |
| "learning_rate": 0.0001956526343599335, | |
| "loss": 0.6638, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.3782608695652174, | |
| "grad_norm": 0.252825528383255, | |
| "learning_rate": 0.0001954285702277879, | |
| "loss": 0.6713, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3826086956521739, | |
| "grad_norm": 0.3602813482284546, | |
| "learning_rate": 0.00019519901099822372, | |
| "loss": 0.6596, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.3869565217391304, | |
| "grad_norm": 0.3970949053764343, | |
| "learning_rate": 0.00019496396989003193, | |
| "loss": 0.6617, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 0.284343421459198, | |
| "learning_rate": 0.00019472346043766865, | |
| "loss": 0.6229, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.39565217391304347, | |
| "grad_norm": 0.19832171499729156, | |
| "learning_rate": 0.00019447749649047542, | |
| "loss": 0.6665, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.24541743099689484, | |
| "learning_rate": 0.00019422609221188207, | |
| "loss": 0.6585, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.4043478260869565, | |
| "grad_norm": 0.1915537267923355, | |
| "learning_rate": 0.00019396926207859084, | |
| "loss": 0.6343, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.40869565217391307, | |
| "grad_norm": 0.20492875576019287, | |
| "learning_rate": 0.00019370702087974302, | |
| "loss": 0.6438, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.41304347826086957, | |
| "grad_norm": 0.25835996866226196, | |
| "learning_rate": 0.00019343938371606712, | |
| "loss": 0.6502, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.41739130434782606, | |
| "grad_norm": 0.2585464417934418, | |
| "learning_rate": 0.00019316636599900946, | |
| "loss": 0.6393, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.4217391304347826, | |
| "grad_norm": 0.2317182868719101, | |
| "learning_rate": 0.00019288798344984672, | |
| "loss": 0.6275, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4260869565217391, | |
| "grad_norm": 0.23632416129112244, | |
| "learning_rate": 0.00019260425209878052, | |
| "loss": 0.6414, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.43043478260869567, | |
| "grad_norm": 0.1801244169473648, | |
| "learning_rate": 0.00019231518828401458, | |
| "loss": 0.6491, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 0.24871514737606049, | |
| "learning_rate": 0.00019202080865081368, | |
| "loss": 0.6581, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4391304347826087, | |
| "grad_norm": 0.26276353001594543, | |
| "learning_rate": 0.00019172113015054532, | |
| "loss": 0.644, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.4434782608695652, | |
| "grad_norm": 0.19743724167346954, | |
| "learning_rate": 0.0001914161700397035, | |
| "loss": 0.6519, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.44782608695652176, | |
| "grad_norm": 0.31385916471481323, | |
| "learning_rate": 0.00019110594587891519, | |
| "loss": 0.6462, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.45217391304347826, | |
| "grad_norm": 0.2689647674560547, | |
| "learning_rate": 0.0001907904755319289, | |
| "loss": 0.6517, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.45652173913043476, | |
| "grad_norm": 0.17245543003082275, | |
| "learning_rate": 0.00019046977716458626, | |
| "loss": 0.6245, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4608695652173913, | |
| "grad_norm": 0.4380849003791809, | |
| "learning_rate": 0.00019014386924377582, | |
| "loss": 0.6519, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4652173913043478, | |
| "grad_norm": 0.305043488740921, | |
| "learning_rate": 0.0001898127705363696, | |
| "loss": 0.6606, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.46956521739130436, | |
| "grad_norm": 0.20340269804000854, | |
| "learning_rate": 0.0001894765001081428, | |
| "loss": 0.6359, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.47391304347826085, | |
| "grad_norm": 0.15703125298023224, | |
| "learning_rate": 0.0001891350773226754, | |
| "loss": 0.6461, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 0.16932646930217743, | |
| "learning_rate": 0.0001887885218402375, | |
| "loss": 0.6413, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4826086956521739, | |
| "grad_norm": 0.1790553480386734, | |
| "learning_rate": 0.00018843685361665723, | |
| "loss": 0.6378, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.48695652173913045, | |
| "grad_norm": 0.24903282523155212, | |
| "learning_rate": 0.00018808009290217136, | |
| "loss": 0.6308, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.49130434782608695, | |
| "grad_norm": 0.20529182255268097, | |
| "learning_rate": 0.00018771826024025946, | |
| "loss": 0.6315, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.4956521739130435, | |
| "grad_norm": 0.18206629157066345, | |
| "learning_rate": 0.00018735137646646078, | |
| "loss": 0.6409, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.22906547784805298, | |
| "learning_rate": 0.00018697946270717467, | |
| "loss": 0.6522, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5043478260869565, | |
| "grad_norm": 0.23560722172260284, | |
| "learning_rate": 0.00018660254037844388, | |
| "loss": 0.6424, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.508695652173913, | |
| "grad_norm": 0.3479248881340027, | |
| "learning_rate": 0.00018622063118472134, | |
| "loss": 0.6591, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5130434782608696, | |
| "grad_norm": 0.48405924439430237, | |
| "learning_rate": 0.00018583375711762052, | |
| "loss": 0.6312, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5173913043478261, | |
| "grad_norm": 0.6660999655723572, | |
| "learning_rate": 0.00018544194045464886, | |
| "loss": 0.6492, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 0.6070662140846252, | |
| "learning_rate": 0.0001850452037579251, | |
| "loss": 0.631, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5260869565217391, | |
| "grad_norm": 0.2432556301355362, | |
| "learning_rate": 0.00018464356987288013, | |
| "loss": 0.6192, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5304347826086957, | |
| "grad_norm": 0.4718700647354126, | |
| "learning_rate": 0.00018423706192694116, | |
| "loss": 0.6385, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5347826086956522, | |
| "grad_norm": 0.41220200061798096, | |
| "learning_rate": 0.00018382570332820043, | |
| "loss": 0.6362, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5391304347826087, | |
| "grad_norm": 0.24313992261886597, | |
| "learning_rate": 0.00018340951776406694, | |
| "loss": 0.659, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5434782608695652, | |
| "grad_norm": 0.42307668924331665, | |
| "learning_rate": 0.00018298852919990252, | |
| "loss": 0.6484, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5478260869565217, | |
| "grad_norm": 0.2858572006225586, | |
| "learning_rate": 0.00018256276187764197, | |
| "loss": 0.6437, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5521739130434783, | |
| "grad_norm": 0.2318851351737976, | |
| "learning_rate": 0.0001821322403143969, | |
| "loss": 0.6191, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5565217391304348, | |
| "grad_norm": 0.3861188292503357, | |
| "learning_rate": 0.0001816969893010442, | |
| "loss": 0.639, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.5608695652173913, | |
| "grad_norm": 0.2969801127910614, | |
| "learning_rate": 0.0001812570339007983, | |
| "loss": 0.6624, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 0.29341548681259155, | |
| "learning_rate": 0.00018081239944776805, | |
| "loss": 0.639, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5695652173913044, | |
| "grad_norm": 0.43678849935531616, | |
| "learning_rate": 0.00018036311154549784, | |
| "loss": 0.6384, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5739130434782609, | |
| "grad_norm": 0.5248069167137146, | |
| "learning_rate": 0.00017990919606549328, | |
| "loss": 0.6451, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.5782608695652174, | |
| "grad_norm": 0.5387030243873596, | |
| "learning_rate": 0.00017945067914573146, | |
| "loss": 0.6198, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5826086956521739, | |
| "grad_norm": 0.55666184425354, | |
| "learning_rate": 0.00017898758718915586, | |
| "loss": 0.6391, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5869565217391305, | |
| "grad_norm": 0.4839560389518738, | |
| "learning_rate": 0.0001785199468621559, | |
| "loss": 0.6411, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.591304347826087, | |
| "grad_norm": 0.5173195004463196, | |
| "learning_rate": 0.00017804778509303138, | |
| "loss": 0.6318, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.5956521739130435, | |
| "grad_norm": 0.341448038816452, | |
| "learning_rate": 0.000177571129070442, | |
| "loss": 0.6427, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.2654604911804199, | |
| "learning_rate": 0.00017709000624184162, | |
| "loss": 0.616, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6043478260869565, | |
| "grad_norm": 0.4000408351421356, | |
| "learning_rate": 0.0001766044443118978, | |
| "loss": 0.611, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 0.2812383770942688, | |
| "learning_rate": 0.00017611447124089649, | |
| "loss": 0.6508, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6130434782608696, | |
| "grad_norm": 0.30483949184417725, | |
| "learning_rate": 0.00017562011524313185, | |
| "loss": 0.6628, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6173913043478261, | |
| "grad_norm": 0.4457907974720001, | |
| "learning_rate": 0.0001751214047852818, | |
| "loss": 0.6274, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6217391304347826, | |
| "grad_norm": 0.38395488262176514, | |
| "learning_rate": 0.00017461836858476856, | |
| "loss": 0.6528, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6260869565217392, | |
| "grad_norm": 0.573344886302948, | |
| "learning_rate": 0.00017411103560810526, | |
| "loss": 0.6504, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6304347826086957, | |
| "grad_norm": 0.5133661031723022, | |
| "learning_rate": 0.00017359943506922774, | |
| "loss": 0.6334, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6347826086956522, | |
| "grad_norm": 0.2995568513870239, | |
| "learning_rate": 0.00017308359642781242, | |
| "loss": 0.6328, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6391304347826087, | |
| "grad_norm": 0.5677820444107056, | |
| "learning_rate": 0.0001725635493875799, | |
| "loss": 0.639, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6434782608695652, | |
| "grad_norm": 0.4751092791557312, | |
| "learning_rate": 0.00017203932389458454, | |
| "loss": 0.6229, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6478260869565218, | |
| "grad_norm": 0.4374710023403168, | |
| "learning_rate": 0.00017151095013548994, | |
| "loss": 0.6377, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 0.4172927439212799, | |
| "learning_rate": 0.0001709784585358309, | |
| "loss": 0.6277, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6565217391304348, | |
| "grad_norm": 0.3994798958301544, | |
| "learning_rate": 0.00017044187975826124, | |
| "loss": 0.637, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6608695652173913, | |
| "grad_norm": 0.34366917610168457, | |
| "learning_rate": 0.00016990124470078822, | |
| "loss": 0.6556, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6652173913043479, | |
| "grad_norm": 0.533347487449646, | |
| "learning_rate": 0.0001693565844949933, | |
| "loss": 0.6073, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6695652173913044, | |
| "grad_norm": 0.4292946457862854, | |
| "learning_rate": 0.0001688079305042395, | |
| "loss": 0.6548, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.6739130434782609, | |
| "grad_norm": 0.2770076394081116, | |
| "learning_rate": 0.00016825531432186543, | |
| "loss": 0.6014, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6782608695652174, | |
| "grad_norm": 0.377838134765625, | |
| "learning_rate": 0.0001676987677693659, | |
| "loss": 0.6406, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6826086956521739, | |
| "grad_norm": 0.421268492937088, | |
| "learning_rate": 0.0001671383228945597, | |
| "loss": 0.6288, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6869565217391305, | |
| "grad_norm": 0.4219221770763397, | |
| "learning_rate": 0.00016657401196974405, | |
| "loss": 0.647, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.691304347826087, | |
| "grad_norm": 0.3563760221004486, | |
| "learning_rate": 0.00016600586748983641, | |
| "loss": 0.6307, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 0.39387866854667664, | |
| "learning_rate": 0.00016543392217050314, | |
| "loss": 0.631, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.36268243193626404, | |
| "learning_rate": 0.0001648582089462756, | |
| "loss": 0.6429, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7043478260869566, | |
| "grad_norm": 0.3702019155025482, | |
| "learning_rate": 0.00016427876096865394, | |
| "loss": 0.6338, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7086956521739131, | |
| "grad_norm": 0.44408297538757324, | |
| "learning_rate": 0.00016369561160419784, | |
| "loss": 0.6416, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7130434782608696, | |
| "grad_norm": 0.5986080765724182, | |
| "learning_rate": 0.00016310879443260528, | |
| "loss": 0.6187, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.717391304347826, | |
| "grad_norm": 0.7963016629219055, | |
| "learning_rate": 0.0001625183432447789, | |
| "loss": 0.6365, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7217391304347827, | |
| "grad_norm": 1.2156025171279907, | |
| "learning_rate": 0.0001619242920408802, | |
| "loss": 0.6625, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7260869565217392, | |
| "grad_norm": 0.7924716472625732, | |
| "learning_rate": 0.00016132667502837165, | |
| "loss": 0.6276, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7304347826086957, | |
| "grad_norm": 0.29551273584365845, | |
| "learning_rate": 0.00016072552662004696, | |
| "loss": 0.6159, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7347826086956522, | |
| "grad_norm": 0.7566269040107727, | |
| "learning_rate": 0.00016012088143204953, | |
| "loss": 0.6485, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 1.001354455947876, | |
| "learning_rate": 0.00015951277428187898, | |
| "loss": 0.6323, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7434782608695653, | |
| "grad_norm": 0.9103027582168579, | |
| "learning_rate": 0.00015890124018638638, | |
| "loss": 0.6255, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.7478260869565218, | |
| "grad_norm": 0.3885137736797333, | |
| "learning_rate": 0.00015828631435975784, | |
| "loss": 0.6323, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7521739130434782, | |
| "grad_norm": 0.6141281723976135, | |
| "learning_rate": 0.00015766803221148673, | |
| "loss": 0.6504, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7565217391304347, | |
| "grad_norm": 0.8024821281433105, | |
| "learning_rate": 0.0001570464293443346, | |
| "loss": 0.641, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7608695652173914, | |
| "grad_norm": 0.43333736062049866, | |
| "learning_rate": 0.00015642154155228122, | |
| "loss": 0.627, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7652173913043478, | |
| "grad_norm": 0.649389922618866, | |
| "learning_rate": 0.00015579340481846336, | |
| "loss": 0.6483, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7695652173913043, | |
| "grad_norm": 1.0359424352645874, | |
| "learning_rate": 0.00015516205531310273, | |
| "loss": 0.6332, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7739130434782608, | |
| "grad_norm": 0.7209396362304688, | |
| "learning_rate": 0.00015452752939142328, | |
| "loss": 0.6524, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7782608695652173, | |
| "grad_norm": 0.6178513169288635, | |
| "learning_rate": 0.00015388986359155758, | |
| "loss": 0.645, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 0.9886595606803894, | |
| "learning_rate": 0.00015324909463244296, | |
| "loss": 0.6642, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7869565217391304, | |
| "grad_norm": 0.7466373443603516, | |
| "learning_rate": 0.00015260525941170712, | |
| "loss": 0.6315, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7913043478260869, | |
| "grad_norm": 0.5552679896354675, | |
| "learning_rate": 0.00015195839500354335, | |
| "loss": 0.6207, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.7956521739130434, | |
| "grad_norm": 0.5576688647270203, | |
| "learning_rate": 0.0001513085386565758, | |
| "loss": 0.6421, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.4000707268714905, | |
| "learning_rate": 0.00015065572779171432, | |
| "loss": 0.6398, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.8043478260869565, | |
| "grad_norm": 0.4978863298892975, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.6456, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.808695652173913, | |
| "grad_norm": 0.4530424177646637, | |
| "learning_rate": 0.00014934139304044033, | |
| "loss": 0.6453, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.8130434782608695, | |
| "grad_norm": 0.29163071513175964, | |
| "learning_rate": 0.00014867994483783485, | |
| "loss": 0.6558, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.8173913043478261, | |
| "grad_norm": 0.33445900678634644, | |
| "learning_rate": 0.00014801569348059157, | |
| "loss": 0.6291, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8217391304347826, | |
| "grad_norm": 0.3891032934188843, | |
| "learning_rate": 0.0001473486772185334, | |
| "loss": 0.6458, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 0.4320944845676422, | |
| "learning_rate": 0.00014667893446069588, | |
| "loss": 0.6275, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8304347826086956, | |
| "grad_norm": 0.3652418553829193, | |
| "learning_rate": 0.00014600650377311522, | |
| "loss": 0.6434, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8347826086956521, | |
| "grad_norm": 0.2939096689224243, | |
| "learning_rate": 0.00014533142387660773, | |
| "loss": 0.6462, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.8391304347826087, | |
| "grad_norm": 0.36094796657562256, | |
| "learning_rate": 0.00014465373364454001, | |
| "loss": 0.6259, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.8434782608695652, | |
| "grad_norm": 0.503746747970581, | |
| "learning_rate": 0.00014397347210059057, | |
| "loss": 0.6565, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8478260869565217, | |
| "grad_norm": 0.501377522945404, | |
| "learning_rate": 0.00014329067841650274, | |
| "loss": 0.6358, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8521739130434782, | |
| "grad_norm": 0.40720251202583313, | |
| "learning_rate": 0.00014260539190982886, | |
| "loss": 0.636, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8565217391304348, | |
| "grad_norm": 0.3170947730541229, | |
| "learning_rate": 0.00014191765204166643, | |
| "loss": 0.6343, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8608695652173913, | |
| "grad_norm": 0.43554455041885376, | |
| "learning_rate": 0.00014122749841438575, | |
| "loss": 0.6319, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8652173913043478, | |
| "grad_norm": 0.5128415822982788, | |
| "learning_rate": 0.00014053497076934948, | |
| "loss": 0.6326, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.44992515444755554, | |
| "learning_rate": 0.00013984010898462416, | |
| "loss": 0.6343, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8739130434782608, | |
| "grad_norm": 0.506968080997467, | |
| "learning_rate": 0.00013914295307268396, | |
| "loss": 0.6472, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.8782608695652174, | |
| "grad_norm": 0.6257392764091492, | |
| "learning_rate": 0.0001384435431781065, | |
| "loss": 0.6535, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8826086956521739, | |
| "grad_norm": 0.9480230808258057, | |
| "learning_rate": 0.00013774191957526143, | |
| "loss": 0.6628, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8869565217391304, | |
| "grad_norm": 1.2171893119812012, | |
| "learning_rate": 0.00013703812266599113, | |
| "loss": 0.6585, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8913043478260869, | |
| "grad_norm": 0.3134421110153198, | |
| "learning_rate": 0.00013633219297728416, | |
| "loss": 0.6629, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8956521739130435, | |
| "grad_norm": 1.003349781036377, | |
| "learning_rate": 0.00013562417115894172, | |
| "loss": 0.6516, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.246419906616211, | |
| "learning_rate": 0.00013491409798123687, | |
| "loss": 0.6418, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.9043478260869565, | |
| "grad_norm": 0.46948862075805664, | |
| "learning_rate": 0.00013420201433256689, | |
| "loss": 0.6441, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.908695652173913, | |
| "grad_norm": 1.628340244293213, | |
| "learning_rate": 0.00013348796121709862, | |
| "loss": 0.6661, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 0.4027623236179352, | |
| "learning_rate": 0.0001327719797524075, | |
| "loss": 0.6342, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9173913043478261, | |
| "grad_norm": 1.3196384906768799, | |
| "learning_rate": 0.00013205411116710972, | |
| "loss": 0.6724, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9217391304347826, | |
| "grad_norm": 0.561631977558136, | |
| "learning_rate": 0.00013133439679848823, | |
| "loss": 0.6541, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.9260869565217391, | |
| "grad_norm": 0.7715569734573364, | |
| "learning_rate": 0.00013061287809011242, | |
| "loss": 0.6419, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9304347826086956, | |
| "grad_norm": 0.8591257333755493, | |
| "learning_rate": 0.0001298895965894516, | |
| "loss": 0.6197, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.9347826086956522, | |
| "grad_norm": 0.4229847192764282, | |
| "learning_rate": 0.0001291645939454825, | |
| "loss": 0.6472, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9391304347826087, | |
| "grad_norm": 0.7943733930587769, | |
| "learning_rate": 0.0001284379119062912, | |
| "loss": 0.6576, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9434782608695652, | |
| "grad_norm": 0.7454273104667664, | |
| "learning_rate": 0.0001277095923166689, | |
| "loss": 0.6245, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9478260869565217, | |
| "grad_norm": 0.4976602792739868, | |
| "learning_rate": 0.00012697967711570242, | |
| "loss": 0.644, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.9521739130434783, | |
| "grad_norm": 0.6845293641090393, | |
| "learning_rate": 0.00012624820833435937, | |
| "loss": 0.6412, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 0.7265484929084778, | |
| "learning_rate": 0.0001255152280930676, | |
| "loss": 0.6438, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9608695652173913, | |
| "grad_norm": 0.4346272647380829, | |
| "learning_rate": 0.00012478077859929, | |
| "loss": 0.6116, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9652173913043478, | |
| "grad_norm": 0.5768253803253174, | |
| "learning_rate": 0.00012404490214509386, | |
| "loss": 0.6242, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.9695652173913043, | |
| "grad_norm": 0.688556969165802, | |
| "learning_rate": 0.00012330764110471566, | |
| "loss": 0.6546, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9739130434782609, | |
| "grad_norm": 0.6147114634513855, | |
| "learning_rate": 0.00012256903793212107, | |
| "loss": 0.6286, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.9782608695652174, | |
| "grad_norm": 0.6598117351531982, | |
| "learning_rate": 0.00012182913515856015, | |
| "loss": 0.65, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9826086956521739, | |
| "grad_norm": 0.6232290863990784, | |
| "learning_rate": 0.00012108797539011847, | |
| "loss": 0.6465, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.9869565217391304, | |
| "grad_norm": 0.3764599561691284, | |
| "learning_rate": 0.0001203456013052634, | |
| "loss": 0.6397, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.991304347826087, | |
| "grad_norm": 0.4177006781101227, | |
| "learning_rate": 0.00011960205565238684, | |
| "loss": 0.6324, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.9956521739130435, | |
| "grad_norm": 0.6632861495018005, | |
| "learning_rate": 0.00011885738124734358, | |
| "loss": 0.6394, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8406037092208862, | |
| "learning_rate": 0.00011811162097098558, | |
| "loss": 0.6563, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0043478260869565, | |
| "grad_norm": 0.5756528973579407, | |
| "learning_rate": 0.00011736481776669306, | |
| "loss": 0.6087, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.008695652173913, | |
| "grad_norm": 0.4710526466369629, | |
| "learning_rate": 0.00011661701463790142, | |
| "loss": 0.632, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.0130434782608695, | |
| "grad_norm": 0.6560250520706177, | |
| "learning_rate": 0.00011586825464562514, | |
| "loss": 0.6305, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.017391304347826, | |
| "grad_norm": 0.7808629870414734, | |
| "learning_rate": 0.0001151185809059781, | |
| "loss": 0.6374, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0217391304347827, | |
| "grad_norm": 0.7576888799667358, | |
| "learning_rate": 0.00011436803658769082, | |
| "loss": 0.6436, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.0260869565217392, | |
| "grad_norm": 0.5998823046684265, | |
| "learning_rate": 0.00011361666490962468, | |
| "loss": 0.6245, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0304347826086957, | |
| "grad_norm": 0.4165184795856476, | |
| "learning_rate": 0.00011286450913828312, | |
| "loss": 0.6307, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.0347826086956522, | |
| "grad_norm": 0.7513805031776428, | |
| "learning_rate": 0.00011211161258532041, | |
| "loss": 0.6327, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.0391304347826087, | |
| "grad_norm": 0.6765947341918945, | |
| "learning_rate": 0.00011135801860504749, | |
| "loss": 0.6412, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.0434782608695652, | |
| "grad_norm": 0.6011433005332947, | |
| "learning_rate": 0.00011060377059193547, | |
| "loss": 0.6521, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.0478260869565217, | |
| "grad_norm": 0.4051482677459717, | |
| "learning_rate": 0.00010984891197811687, | |
| "loss": 0.6419, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.0521739130434782, | |
| "grad_norm": 0.7668951153755188, | |
| "learning_rate": 0.0001090934862308847, | |
| "loss": 0.6264, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.0565217391304347, | |
| "grad_norm": 0.6456301212310791, | |
| "learning_rate": 0.00010833753685018935, | |
| "loss": 0.6509, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.0608695652173914, | |
| "grad_norm": 0.3792930841445923, | |
| "learning_rate": 0.00010758110736613385, | |
| "loss": 0.6132, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.065217391304348, | |
| "grad_norm": 0.47149112820625305, | |
| "learning_rate": 0.0001068242413364671, | |
| "loss": 0.6346, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0695652173913044, | |
| "grad_norm": 0.5918865203857422, | |
| "learning_rate": 0.00010606698234407586, | |
| "loss": 0.6365, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.0739130434782609, | |
| "grad_norm": 0.4131185710430145, | |
| "learning_rate": 0.00010530937399447496, | |
| "loss": 0.6262, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.0782608695652174, | |
| "grad_norm": 0.6192177534103394, | |
| "learning_rate": 0.00010455145991329638, | |
| "loss": 0.6267, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.0826086956521739, | |
| "grad_norm": 0.7231637835502625, | |
| "learning_rate": 0.00010379328374377715, | |
| "loss": 0.6524, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.0869565217391304, | |
| "grad_norm": 0.5949220061302185, | |
| "learning_rate": 0.00010303488914424624, | |
| "loss": 0.6465, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0913043478260869, | |
| "grad_norm": 0.4544646441936493, | |
| "learning_rate": 0.00010227631978561056, | |
| "loss": 0.6383, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.0956521739130434, | |
| "grad_norm": 0.3706333041191101, | |
| "learning_rate": 0.00010151761934884028, | |
| "loss": 0.6277, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.4875901937484741, | |
| "learning_rate": 0.00010075883152245334, | |
| "loss": 0.6119, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.1043478260869566, | |
| "grad_norm": 0.3684738576412201, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6264, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.108695652173913, | |
| "grad_norm": 0.42785608768463135, | |
| "learning_rate": 9.92411684775467e-05, | |
| "loss": 0.6272, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.1130434782608696, | |
| "grad_norm": 0.3924098014831543, | |
| "learning_rate": 9.848238065115975e-05, | |
| "loss": 0.65, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.117391304347826, | |
| "grad_norm": 0.3814132809638977, | |
| "learning_rate": 9.772368021438943e-05, | |
| "loss": 0.6103, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.1217391304347826, | |
| "grad_norm": 0.3904854655265808, | |
| "learning_rate": 9.696511085575377e-05, | |
| "loss": 0.6422, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.126086956521739, | |
| "grad_norm": 0.27673909068107605, | |
| "learning_rate": 9.620671625622288e-05, | |
| "loss": 0.637, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.1304347826086956, | |
| "grad_norm": 0.3034502863883972, | |
| "learning_rate": 9.544854008670367e-05, | |
| "loss": 0.6286, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.134782608695652, | |
| "grad_norm": 0.3358616828918457, | |
| "learning_rate": 9.469062600552509e-05, | |
| "loss": 0.6427, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.1391304347826088, | |
| "grad_norm": 0.245226189494133, | |
| "learning_rate": 9.393301765592415e-05, | |
| "loss": 0.6269, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.1434782608695653, | |
| "grad_norm": 0.3370216488838196, | |
| "learning_rate": 9.317575866353292e-05, | |
| "loss": 0.6093, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.1478260869565218, | |
| "grad_norm": 0.27349230647087097, | |
| "learning_rate": 9.241889263386618e-05, | |
| "loss": 0.6495, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.1521739130434783, | |
| "grad_norm": 0.3180980086326599, | |
| "learning_rate": 9.166246314981066e-05, | |
| "loss": 0.6336, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.1565217391304348, | |
| "grad_norm": 0.32506948709487915, | |
| "learning_rate": 9.09065137691153e-05, | |
| "loss": 0.6264, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.1608695652173913, | |
| "grad_norm": 0.30634960532188416, | |
| "learning_rate": 9.015108802188313e-05, | |
| "loss": 0.6269, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.1652173913043478, | |
| "grad_norm": 0.36996349692344666, | |
| "learning_rate": 8.939622940806455e-05, | |
| "loss": 0.6454, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.1695652173913043, | |
| "grad_norm": 0.2220568060874939, | |
| "learning_rate": 8.86419813949525e-05, | |
| "loss": 0.6106, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.1739130434782608, | |
| "grad_norm": 0.26350730657577515, | |
| "learning_rate": 8.788838741467962e-05, | |
| "loss": 0.5946, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.1782608695652175, | |
| "grad_norm": 0.22457756102085114, | |
| "learning_rate": 8.713549086171691e-05, | |
| "loss": 0.617, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.182608695652174, | |
| "grad_norm": 0.28281551599502563, | |
| "learning_rate": 8.638333509037536e-05, | |
| "loss": 0.618, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.1869565217391305, | |
| "grad_norm": 0.3269217908382416, | |
| "learning_rate": 8.563196341230919e-05, | |
| "loss": 0.6261, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.191304347826087, | |
| "grad_norm": 0.2878088355064392, | |
| "learning_rate": 8.488141909402191e-05, | |
| "loss": 0.6104, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.1956521739130435, | |
| "grad_norm": 0.23815755546092987, | |
| "learning_rate": 8.413174535437487e-05, | |
| "loss": 0.6325, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.2873881459236145, | |
| "learning_rate": 8.33829853620986e-05, | |
| "loss": 0.6353, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.2043478260869565, | |
| "grad_norm": 0.24071787297725677, | |
| "learning_rate": 8.263518223330697e-05, | |
| "loss": 0.621, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.208695652173913, | |
| "grad_norm": 0.2793138921260834, | |
| "learning_rate": 8.188837902901442e-05, | |
| "loss": 0.6279, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.2130434782608694, | |
| "grad_norm": 0.4142613112926483, | |
| "learning_rate": 8.114261875265643e-05, | |
| "loss": 0.6164, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.2173913043478262, | |
| "grad_norm": 0.3821130692958832, | |
| "learning_rate": 8.039794434761318e-05, | |
| "loss": 0.6191, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.2217391304347827, | |
| "grad_norm": 0.24204838275909424, | |
| "learning_rate": 7.965439869473664e-05, | |
| "loss": 0.6149, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.2260869565217392, | |
| "grad_norm": 0.2838297188282013, | |
| "learning_rate": 7.891202460988158e-05, | |
| "loss": 0.6478, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.2304347826086957, | |
| "grad_norm": 0.35510286688804626, | |
| "learning_rate": 7.817086484143986e-05, | |
| "loss": 0.6418, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.2347826086956522, | |
| "grad_norm": 0.37343931198120117, | |
| "learning_rate": 7.743096206787894e-05, | |
| "loss": 0.6126, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.2391304347826086, | |
| "grad_norm": 0.264636754989624, | |
| "learning_rate": 7.669235889528436e-05, | |
| "loss": 0.6231, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.2434782608695651, | |
| "grad_norm": 0.2996392548084259, | |
| "learning_rate": 7.595509785490617e-05, | |
| "loss": 0.6343, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2478260869565219, | |
| "grad_norm": 0.38490474224090576, | |
| "learning_rate": 7.521922140071002e-05, | |
| "loss": 0.64, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.2521739130434781, | |
| "grad_norm": 0.297821044921875, | |
| "learning_rate": 7.448477190693238e-05, | |
| "loss": 0.6197, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.2565217391304349, | |
| "grad_norm": 0.2390124499797821, | |
| "learning_rate": 7.375179166564063e-05, | |
| "loss": 0.6283, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.2608695652173914, | |
| "grad_norm": 0.27224546670913696, | |
| "learning_rate": 7.302032288429756e-05, | |
| "loss": 0.6197, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2652173913043478, | |
| "grad_norm": 0.3059544563293457, | |
| "learning_rate": 7.229040768333115e-05, | |
| "loss": 0.6281, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.2695652173913043, | |
| "grad_norm": 0.23116622865200043, | |
| "learning_rate": 7.156208809370883e-05, | |
| "loss": 0.6235, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.2739130434782608, | |
| "grad_norm": 0.28964969515800476, | |
| "learning_rate": 7.08354060545175e-05, | |
| "loss": 0.6032, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.2782608695652173, | |
| "grad_norm": 0.23806549608707428, | |
| "learning_rate": 7.011040341054845e-05, | |
| "loss": 0.6113, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.2826086956521738, | |
| "grad_norm": 0.23752138018608093, | |
| "learning_rate": 6.93871219098876e-05, | |
| "loss": 0.6359, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.2869565217391306, | |
| "grad_norm": 0.23858202993869781, | |
| "learning_rate": 6.866560320151179e-05, | |
| "loss": 0.6207, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.2913043478260868, | |
| "grad_norm": 0.28167223930358887, | |
| "learning_rate": 6.79458888328903e-05, | |
| "loss": 0.6055, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.2956521739130435, | |
| "grad_norm": 0.24841302633285522, | |
| "learning_rate": 6.722802024759252e-05, | |
| "loss": 0.62, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.2754172682762146, | |
| "learning_rate": 6.651203878290139e-05, | |
| "loss": 0.6177, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.3043478260869565, | |
| "grad_norm": 0.3091265857219696, | |
| "learning_rate": 6.579798566743314e-05, | |
| "loss": 0.6295, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.308695652173913, | |
| "grad_norm": 0.2213958352804184, | |
| "learning_rate": 6.508590201876317e-05, | |
| "loss": 0.6017, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.3130434782608695, | |
| "grad_norm": 0.30063438415527344, | |
| "learning_rate": 6.437582884105835e-05, | |
| "loss": 0.6455, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.317391304347826, | |
| "grad_norm": 0.3458832800388336, | |
| "learning_rate": 6.366780702271589e-05, | |
| "loss": 0.6326, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.3217391304347825, | |
| "grad_norm": 0.24162618815898895, | |
| "learning_rate": 6.29618773340089e-05, | |
| "loss": 0.6155, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.3260869565217392, | |
| "grad_norm": 0.32919561862945557, | |
| "learning_rate": 6.225808042473858e-05, | |
| "loss": 0.6354, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.3304347826086955, | |
| "grad_norm": 0.3182857036590576, | |
| "learning_rate": 6.155645682189351e-05, | |
| "loss": 0.6007, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.3347826086956522, | |
| "grad_norm": 0.2895689308643341, | |
| "learning_rate": 6.085704692731609e-05, | |
| "loss": 0.6382, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.3391304347826087, | |
| "grad_norm": 0.2976822555065155, | |
| "learning_rate": 6.015989101537586e-05, | |
| "loss": 0.5889, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3434782608695652, | |
| "grad_norm": 0.2826453745365143, | |
| "learning_rate": 5.9465029230650534e-05, | |
| "loss": 0.6329, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.3478260869565217, | |
| "grad_norm": 0.3266195058822632, | |
| "learning_rate": 5.877250158561425e-05, | |
| "loss": 0.6359, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3521739130434782, | |
| "grad_norm": 0.2917690873146057, | |
| "learning_rate": 5.8082347958333625e-05, | |
| "loss": 0.6301, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.3565217391304347, | |
| "grad_norm": 0.25518476963043213, | |
| "learning_rate": 5.73946080901712e-05, | |
| "loss": 0.5977, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.3608695652173912, | |
| "grad_norm": 0.2891990542411804, | |
| "learning_rate": 5.670932158349731e-05, | |
| "loss": 0.6394, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.365217391304348, | |
| "grad_norm": 0.24494178593158722, | |
| "learning_rate": 5.602652789940941e-05, | |
| "loss": 0.6104, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.3695652173913042, | |
| "grad_norm": 0.2637173533439636, | |
| "learning_rate": 5.5346266355459995e-05, | |
| "loss": 0.6252, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.373913043478261, | |
| "grad_norm": 0.30065053701400757, | |
| "learning_rate": 5.466857612339229e-05, | |
| "loss": 0.638, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.3782608695652174, | |
| "grad_norm": 0.2335294783115387, | |
| "learning_rate": 5.399349622688479e-05, | |
| "loss": 0.6433, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.382608695652174, | |
| "grad_norm": 0.28032875061035156, | |
| "learning_rate": 5.332106553930414e-05, | |
| "loss": 0.6301, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.3869565217391304, | |
| "grad_norm": 0.3166956603527069, | |
| "learning_rate": 5.26513227814666e-05, | |
| "loss": 0.63, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.391304347826087, | |
| "grad_norm": 0.24381506443023682, | |
| "learning_rate": 5.1984306519408456e-05, | |
| "loss": 0.5954, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.3956521739130434, | |
| "grad_norm": 0.38320308923721313, | |
| "learning_rate": 5.1320055162165115e-05, | |
| "loss": 0.6464, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.30799657106399536, | |
| "learning_rate": 5.065860695955971e-05, | |
| "loss": 0.6356, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.4043478260869566, | |
| "grad_norm": 0.5910504460334778, | |
| "learning_rate": 5.000000000000002e-05, | |
| "loss": 0.6058, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.4086956521739131, | |
| "grad_norm": 0.35114794969558716, | |
| "learning_rate": 4.934427220828571e-05, | |
| "loss": 0.6172, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.4130434782608696, | |
| "grad_norm": 0.27586087584495544, | |
| "learning_rate": 4.869146134342426e-05, | |
| "loss": 0.6249, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.4173913043478261, | |
| "grad_norm": 0.2799661457538605, | |
| "learning_rate": 4.804160499645667e-05, | |
| "loss": 0.6144, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.4217391304347826, | |
| "grad_norm": 0.29662200808525085, | |
| "learning_rate": 4.739474058829289e-05, | |
| "loss": 0.6042, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.4260869565217391, | |
| "grad_norm": 0.26697418093681335, | |
| "learning_rate": 4.675090536755705e-05, | |
| "loss": 0.6164, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.4304347826086956, | |
| "grad_norm": 0.2900712490081787, | |
| "learning_rate": 4.611013640844245e-05, | |
| "loss": 0.6298, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.434782608695652, | |
| "grad_norm": 0.26294711232185364, | |
| "learning_rate": 4.547247060857675e-05, | |
| "loss": 0.6363, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.4391304347826086, | |
| "grad_norm": 0.22578993439674377, | |
| "learning_rate": 4.483794468689728e-05, | |
| "loss": 0.615, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.4434782608695653, | |
| "grad_norm": 0.2953335642814636, | |
| "learning_rate": 4.420659518153667e-05, | |
| "loss": 0.6278, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4478260869565218, | |
| "grad_norm": 0.27388015389442444, | |
| "learning_rate": 4.357845844771881e-05, | |
| "loss": 0.6222, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.4521739130434783, | |
| "grad_norm": 0.23460474610328674, | |
| "learning_rate": 4.295357065566543e-05, | |
| "loss": 0.6313, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.4565217391304348, | |
| "grad_norm": 0.3273029923439026, | |
| "learning_rate": 4.2331967788513295e-05, | |
| "loss": 0.6038, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.4608695652173913, | |
| "grad_norm": 0.27219584584236145, | |
| "learning_rate": 4.1713685640242165e-05, | |
| "loss": 0.6285, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.4652173913043478, | |
| "grad_norm": 0.3417372703552246, | |
| "learning_rate": 4.109875981361363e-05, | |
| "loss": 0.6361, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.4695652173913043, | |
| "grad_norm": 0.26904726028442383, | |
| "learning_rate": 4.048722571812105e-05, | |
| "loss": 0.6143, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.4739130434782608, | |
| "grad_norm": 0.3139411211013794, | |
| "learning_rate": 3.987911856795047e-05, | |
| "loss": 0.6209, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.4782608695652173, | |
| "grad_norm": 0.2561061978340149, | |
| "learning_rate": 3.927447337995304e-05, | |
| "loss": 0.614, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.482608695652174, | |
| "grad_norm": 0.2412693351507187, | |
| "learning_rate": 3.8673324971628357e-05, | |
| "loss": 0.6154, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.4869565217391305, | |
| "grad_norm": 0.20282143354415894, | |
| "learning_rate": 3.8075707959119846e-05, | |
| "loss": 0.6052, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.491304347826087, | |
| "grad_norm": 0.25691401958465576, | |
| "learning_rate": 3.7481656755221125e-05, | |
| "loss": 0.6081, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.4956521739130435, | |
| "grad_norm": 0.26478344202041626, | |
| "learning_rate": 3.689120556739475e-05, | |
| "loss": 0.6191, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.21863462030887604, | |
| "learning_rate": 3.630438839580217e-05, | |
| "loss": 0.6319, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.5043478260869565, | |
| "grad_norm": 0.22426074743270874, | |
| "learning_rate": 3.5721239031346066e-05, | |
| "loss": 0.6178, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.508695652173913, | |
| "grad_norm": 0.21831241250038147, | |
| "learning_rate": 3.5141791053724405e-05, | |
| "loss": 0.6303, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.5130434782608697, | |
| "grad_norm": 0.21894365549087524, | |
| "learning_rate": 3.456607782949689e-05, | |
| "loss": 0.6044, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.517391304347826, | |
| "grad_norm": 0.25870388746261597, | |
| "learning_rate": 3.399413251016359e-05, | |
| "loss": 0.6181, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.5217391304347827, | |
| "grad_norm": 0.2224799394607544, | |
| "learning_rate": 3.342598803025595e-05, | |
| "loss": 0.6028, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.526086956521739, | |
| "grad_norm": 0.25613147020339966, | |
| "learning_rate": 3.2861677105440336e-05, | |
| "loss": 0.5982, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.5304347826086957, | |
| "grad_norm": 0.24120664596557617, | |
| "learning_rate": 3.2301232230634104e-05, | |
| "loss": 0.6159, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.5347826086956522, | |
| "grad_norm": 0.22223018109798431, | |
| "learning_rate": 3.174468567813461e-05, | |
| "loss": 0.6113, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.5391304347826087, | |
| "grad_norm": 0.25855186581611633, | |
| "learning_rate": 3.119206949576052e-05, | |
| "loss": 0.6315, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.5434782608695652, | |
| "grad_norm": 0.21754255890846252, | |
| "learning_rate": 3.0643415505006735e-05, | |
| "loss": 0.6202, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.5478260869565217, | |
| "grad_norm": 0.2552974224090576, | |
| "learning_rate": 3.009875529921181e-05, | |
| "loss": 0.6195, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.5521739130434784, | |
| "grad_norm": 0.19017580151557922, | |
| "learning_rate": 2.9558120241738784e-05, | |
| "loss": 0.5913, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.5565217391304347, | |
| "grad_norm": 0.2394818663597107, | |
| "learning_rate": 2.90215414641691e-05, | |
| "loss": 0.6091, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.5608695652173914, | |
| "grad_norm": 0.29089704155921936, | |
| "learning_rate": 2.8489049864510054e-05, | |
| "loss": 0.639, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.5652173913043477, | |
| "grad_norm": 0.21274344623088837, | |
| "learning_rate": 2.7960676105415472e-05, | |
| "loss": 0.6182, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.5695652173913044, | |
| "grad_norm": 0.2907910645008087, | |
| "learning_rate": 2.7436450612420095e-05, | |
| "loss": 0.6111, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.5739130434782609, | |
| "grad_norm": 0.19825702905654907, | |
| "learning_rate": 2.691640357218759e-05, | |
| "loss": 0.6175, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.5782608695652174, | |
| "grad_norm": 0.2254411280155182, | |
| "learning_rate": 2.640056493077231e-05, | |
| "loss": 0.5938, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.5826086956521739, | |
| "grad_norm": 0.2379319965839386, | |
| "learning_rate": 2.5888964391894766e-05, | |
| "loss": 0.6105, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.5869565217391304, | |
| "grad_norm": 0.1910097748041153, | |
| "learning_rate": 2.5381631415231454e-05, | |
| "loss": 0.6088, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.591304347826087, | |
| "grad_norm": 0.2428610920906067, | |
| "learning_rate": 2.4878595214718236e-05, | |
| "loss": 0.5999, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.5956521739130434, | |
| "grad_norm": 0.21029497683048248, | |
| "learning_rate": 2.4379884756868167e-05, | |
| "loss": 0.617, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.22931547462940216, | |
| "learning_rate": 2.3885528759103538e-05, | |
| "loss": 0.5912, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.6043478260869564, | |
| "grad_norm": 0.19771426916122437, | |
| "learning_rate": 2.339555568810221e-05, | |
| "loss": 0.58, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.608695652173913, | |
| "grad_norm": 0.25638020038604736, | |
| "learning_rate": 2.2909993758158412e-05, | |
| "loss": 0.6217, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.6130434782608696, | |
| "grad_norm": 0.2538512945175171, | |
| "learning_rate": 2.242887092955801e-05, | |
| "loss": 0.6372, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.617391304347826, | |
| "grad_norm": 0.19667693972587585, | |
| "learning_rate": 2.1952214906968627e-05, | |
| "loss": 0.5964, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.6217391304347826, | |
| "grad_norm": 0.2506635785102844, | |
| "learning_rate": 2.1480053137844115e-05, | |
| "loss": 0.6196, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.626086956521739, | |
| "grad_norm": 0.19938236474990845, | |
| "learning_rate": 2.101241281084416e-05, | |
| "loss": 0.6214, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.6304347826086958, | |
| "grad_norm": 0.20109523832798004, | |
| "learning_rate": 2.054932085426856e-05, | |
| "loss": 0.6072, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.634782608695652, | |
| "grad_norm": 0.21539539098739624, | |
| "learning_rate": 2.0090803934506764e-05, | |
| "loss": 0.6062, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.6391304347826088, | |
| "grad_norm": 0.21811442077159882, | |
| "learning_rate": 1.9636888454502178e-05, | |
| "loss": 0.6093, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.643478260869565, | |
| "grad_norm": 0.1716393679380417, | |
| "learning_rate": 1.9187600552231955e-05, | |
| "loss": 0.5951, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.6478260869565218, | |
| "grad_norm": 0.22683413326740265, | |
| "learning_rate": 1.8742966099201697e-05, | |
| "loss": 0.6119, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.6521739130434783, | |
| "grad_norm": 0.213649183511734, | |
| "learning_rate": 1.8303010698955804e-05, | |
| "loss": 0.6013, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6565217391304348, | |
| "grad_norm": 0.1564229130744934, | |
| "learning_rate": 1.7867759685603114e-05, | |
| "loss": 0.6083, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6608695652173913, | |
| "grad_norm": 0.2164810448884964, | |
| "learning_rate": 1.7437238122358057e-05, | |
| "loss": 0.6306, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6652173913043478, | |
| "grad_norm": 0.18622331321239471, | |
| "learning_rate": 1.7011470800097496e-05, | |
| "loss": 0.5776, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6695652173913045, | |
| "grad_norm": 0.1888633519411087, | |
| "learning_rate": 1.659048223593308e-05, | |
| "loss": 0.6242, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.6739130434782608, | |
| "grad_norm": 0.16813671588897705, | |
| "learning_rate": 1.6174296671799572e-05, | |
| "loss": 0.5759, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.6782608695652175, | |
| "grad_norm": 0.20108157396316528, | |
| "learning_rate": 1.5762938073058853e-05, | |
| "loss": 0.6151, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.6826086956521737, | |
| "grad_norm": 0.17089848220348358, | |
| "learning_rate": 1.5356430127119913e-05, | |
| "loss": 0.5993, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.6869565217391305, | |
| "grad_norm": 0.15591366589069366, | |
| "learning_rate": 1.4954796242074898e-05, | |
| "loss": 0.614, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.691304347826087, | |
| "grad_norm": 0.16411159932613373, | |
| "learning_rate": 1.4558059545351143e-05, | |
| "loss": 0.6023, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.6956521739130435, | |
| "grad_norm": 0.17830075323581696, | |
| "learning_rate": 1.4166242882379476e-05, | |
| "loss": 0.6031, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.15914571285247803, | |
| "learning_rate": 1.3779368815278647e-05, | |
| "loss": 0.6173, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.7043478260869565, | |
| "grad_norm": 0.16238906979560852, | |
| "learning_rate": 1.339745962155613e-05, | |
| "loss": 0.6085, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.7086956521739132, | |
| "grad_norm": 0.1836780607700348, | |
| "learning_rate": 1.302053729282533e-05, | |
| "loss": 0.6147, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.7130434782608694, | |
| "grad_norm": 0.17346879839897156, | |
| "learning_rate": 1.2648623533539261e-05, | |
| "loss": 0.5877, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.7173913043478262, | |
| "grad_norm": 0.17038564383983612, | |
| "learning_rate": 1.2281739759740574e-05, | |
| "loss": 0.5981, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.7217391304347827, | |
| "grad_norm": 0.23689156770706177, | |
| "learning_rate": 1.1919907097828653e-05, | |
| "loss": 0.6318, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.7260869565217392, | |
| "grad_norm": 0.1710849106311798, | |
| "learning_rate": 1.1563146383342772e-05, | |
| "loss": 0.6007, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.7304347826086957, | |
| "grad_norm": 0.150990828871727, | |
| "learning_rate": 1.1211478159762478e-05, | |
| "loss": 0.5942, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.7347826086956522, | |
| "grad_norm": 0.17545339465141296, | |
| "learning_rate": 1.0864922677324618e-05, | |
| "loss": 0.6205, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.7391304347826086, | |
| "grad_norm": 0.15553072094917297, | |
| "learning_rate": 1.0523499891857225e-05, | |
| "loss": 0.5996, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7434782608695651, | |
| "grad_norm": 0.16285602748394012, | |
| "learning_rate": 1.01872294636304e-05, | |
| "loss": 0.5937, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.7478260869565219, | |
| "grad_norm": 0.16755890846252441, | |
| "learning_rate": 9.856130756224213e-06, | |
| "loss": 0.6023, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7521739130434781, | |
| "grad_norm": 0.141220822930336, | |
| "learning_rate": 9.530222835413738e-06, | |
| "loss": 0.6181, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.7565217391304349, | |
| "grad_norm": 0.1646806001663208, | |
| "learning_rate": 9.209524468071096e-06, | |
| "loss": 0.615, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7608695652173914, | |
| "grad_norm": 0.1562896966934204, | |
| "learning_rate": 8.894054121084838e-06, | |
| "loss": 0.6002, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7652173913043478, | |
| "grad_norm": 0.15589256584644318, | |
| "learning_rate": 8.58382996029652e-06, | |
| "loss": 0.6098, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.7695652173913043, | |
| "grad_norm": 0.14259637892246246, | |
| "learning_rate": 8.278869849454718e-06, | |
| "loss": 0.6032, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.7739130434782608, | |
| "grad_norm": 0.1473761945962906, | |
| "learning_rate": 7.97919134918632e-06, | |
| "loss": 0.6265, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.7782608695652173, | |
| "grad_norm": 0.1384039968252182, | |
| "learning_rate": 7.684811715985429e-06, | |
| "loss": 0.617, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.7826086956521738, | |
| "grad_norm": 0.16400642693042755, | |
| "learning_rate": 7.395747901219474e-06, | |
| "loss": 0.632, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.7869565217391306, | |
| "grad_norm": 0.154324010014534, | |
| "learning_rate": 7.1120165501533e-06, | |
| "loss": 0.6039, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.7913043478260868, | |
| "grad_norm": 0.15824785828590393, | |
| "learning_rate": 6.833634000990541e-06, | |
| "loss": 0.5953, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.7956521739130435, | |
| "grad_norm": 0.14589077234268188, | |
| "learning_rate": 6.560616283932897e-06, | |
| "loss": 0.6049, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.1429314911365509, | |
| "learning_rate": 6.292979120256992e-06, | |
| "loss": 0.6068, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.8043478260869565, | |
| "grad_norm": 0.1345595121383667, | |
| "learning_rate": 6.030737921409169e-06, | |
| "loss": 0.6233, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.808695652173913, | |
| "grad_norm": 0.14544036984443665, | |
| "learning_rate": 5.77390778811796e-06, | |
| "loss": 0.6229, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.8130434782608695, | |
| "grad_norm": 0.14767540991306305, | |
| "learning_rate": 5.52250350952459e-06, | |
| "loss": 0.6289, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.8173913043478263, | |
| "grad_norm": 0.14071600139141083, | |
| "learning_rate": 5.276539562331384e-06, | |
| "loss": 0.6006, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.8217391304347825, | |
| "grad_norm": 0.15143164992332458, | |
| "learning_rate": 5.036030109968082e-06, | |
| "loss": 0.6162, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.8260869565217392, | |
| "grad_norm": 0.1341582089662552, | |
| "learning_rate": 4.800989001776324e-06, | |
| "loss": 0.6015, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.8304347826086955, | |
| "grad_norm": 0.13631945848464966, | |
| "learning_rate": 4.5714297722121106e-06, | |
| "loss": 0.6197, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.8347826086956522, | |
| "grad_norm": 0.15245981514453888, | |
| "learning_rate": 4.347365640066525e-06, | |
| "loss": 0.6225, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.8391304347826087, | |
| "grad_norm": 0.14014090597629547, | |
| "learning_rate": 4.128809507704445e-06, | |
| "loss": 0.5993, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.8434782608695652, | |
| "grad_norm": 0.13918210566043854, | |
| "learning_rate": 3.915773960321634e-06, | |
| "loss": 0.6243, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.8478260869565217, | |
| "grad_norm": 0.13100025057792664, | |
| "learning_rate": 3.7082712652200867e-06, | |
| "loss": 0.6074, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8521739130434782, | |
| "grad_norm": 0.15842288732528687, | |
| "learning_rate": 3.5063133711014882e-06, | |
| "loss": 0.6097, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.856521739130435, | |
| "grad_norm": 0.1331150382757187, | |
| "learning_rate": 3.3099119073793928e-06, | |
| "loss": 0.6021, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.8608695652173912, | |
| "grad_norm": 0.13728439807891846, | |
| "learning_rate": 3.119078183509372e-06, | |
| "loss": 0.6015, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.865217391304348, | |
| "grad_norm": 0.1390867531299591, | |
| "learning_rate": 2.9338231883378366e-06, | |
| "loss": 0.603, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.8695652173913042, | |
| "grad_norm": 0.15344412624835968, | |
| "learning_rate": 2.7541575894693194e-06, | |
| "loss": 0.6045, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.873913043478261, | |
| "grad_norm": 0.14015014469623566, | |
| "learning_rate": 2.580091732652101e-06, | |
| "loss": 0.6161, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.8782608695652174, | |
| "grad_norm": 0.15435759723186493, | |
| "learning_rate": 2.4116356411825525e-06, | |
| "loss": 0.6281, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.882608695652174, | |
| "grad_norm": 0.14000248908996582, | |
| "learning_rate": 2.248799015327907e-06, | |
| "loss": 0.6358, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.8869565217391304, | |
| "grad_norm": 0.14310961961746216, | |
| "learning_rate": 2.091591231767709e-06, | |
| "loss": 0.63, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.891304347826087, | |
| "grad_norm": 0.14115604758262634, | |
| "learning_rate": 1.9400213430538773e-06, | |
| "loss": 0.637, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.8956521739130436, | |
| "grad_norm": 0.1358548104763031, | |
| "learning_rate": 1.7940980770894122e-06, | |
| "loss": 0.6241, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.13970565795898438, | |
| "learning_rate": 1.6538298366257976e-06, | |
| "loss": 0.6118, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.9043478260869566, | |
| "grad_norm": 0.1373710334300995, | |
| "learning_rate": 1.5192246987791981e-06, | |
| "loss": 0.6121, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.908695652173913, | |
| "grad_norm": 0.13853000104427338, | |
| "learning_rate": 1.3902904145653096e-06, | |
| "loss": 0.6433, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.9130434782608696, | |
| "grad_norm": 0.16299930214881897, | |
| "learning_rate": 1.2670344084530383e-06, | |
| "loss": 0.6114, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.9173913043478261, | |
| "grad_norm": 0.15240226686000824, | |
| "learning_rate": 1.1494637779369766e-06, | |
| "loss": 0.6392, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.9217391304347826, | |
| "grad_norm": 0.12335983663797379, | |
| "learning_rate": 1.0375852931286956e-06, | |
| "loss": 0.6224, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.9260869565217391, | |
| "grad_norm": 0.1319676637649536, | |
| "learning_rate": 9.314053963669245e-07, | |
| "loss": 0.618, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.9304347826086956, | |
| "grad_norm": 0.11884966492652893, | |
| "learning_rate": 8.309302018465581e-07, | |
| "loss": 0.5911, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.9347826086956523, | |
| "grad_norm": 0.1367439180612564, | |
| "learning_rate": 7.361654952665609e-07, | |
| "loss": 0.6186, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.9391304347826086, | |
| "grad_norm": 0.12649576365947723, | |
| "learning_rate": 6.471167334968886e-07, | |
| "loss": 0.6249, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.9434782608695653, | |
| "grad_norm": 0.1303686797618866, | |
| "learning_rate": 5.637890442641402e-07, | |
| "loss": 0.5979, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.9478260869565216, | |
| "grad_norm": 0.1282416582107544, | |
| "learning_rate": 4.861872258564049e-07, | |
| "loss": 0.6195, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.9521739130434783, | |
| "grad_norm": 0.12191151827573776, | |
| "learning_rate": 4.143157468468717e-07, | |
| "loss": 0.6097, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.9565217391304348, | |
| "grad_norm": 0.11873479187488556, | |
| "learning_rate": 3.481787458365915e-07, | |
| "loss": 0.6102, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9608695652173913, | |
| "grad_norm": 0.1280432939529419, | |
| "learning_rate": 2.877800312160783e-07, | |
| "loss": 0.5857, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.9652173913043478, | |
| "grad_norm": 0.12122710049152374, | |
| "learning_rate": 2.3312308094607382e-07, | |
| "loss": 0.5989, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.9695652173913043, | |
| "grad_norm": 0.12293203175067902, | |
| "learning_rate": 1.8421104235727405e-07, | |
| "loss": 0.6275, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.973913043478261, | |
| "grad_norm": 0.1292450726032257, | |
| "learning_rate": 1.4104673196903005e-07, | |
| "loss": 0.6009, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.9782608695652173, | |
| "grad_norm": 0.1343158781528473, | |
| "learning_rate": 1.0363263532724432e-07, | |
| "loss": 0.6207, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.982608695652174, | |
| "grad_norm": 0.12888604402542114, | |
| "learning_rate": 7.197090686119623e-08, | |
| "loss": 0.6183, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.9869565217391303, | |
| "grad_norm": 0.12688706815242767, | |
| "learning_rate": 4.606336975948589e-08, | |
| "loss": 0.6183, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.991304347826087, | |
| "grad_norm": 0.1274312287569046, | |
| "learning_rate": 2.5911515865084667e-08, | |
| "loss": 0.6094, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.9956521739130435, | |
| "grad_norm": 0.13021548092365265, | |
| "learning_rate": 1.1516505589381776e-08, | |
| "loss": 0.6078, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.12913620471954346, | |
| "learning_rate": 2.8791678453821135e-09, | |
| "loss": 0.6241, | |
| "step": 460 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 460, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7941728521827123e+19, | |
| "train_batch_size": 24, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
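
The state above is the standard `trainer_state.json` layout written by the Hugging Face `Trainer`: a `log_history` list of per-step dictionaries (each with `step`, `loss`, `learning_rate`, and `grad_norm`) followed by run-level metadata such as `max_steps`, `train_batch_size`, and `total_flos`. A minimal sketch of how such a file can be inspected offline is shown below; the file path `trainer_state.json` and the use of matplotlib are assumptions, not part of the state itself, so adjust them to your checkpoint directory and preferred plotting tool.

```python
# Minimal sketch: load a trainer_state.json like the one above and plot the
# training loss and learning-rate schedule. Path and matplotlib are assumptions.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:   # assumed location of the file above
    state = json.load(f)

history = state["log_history"]          # list of per-step log dictionaries

# Keep only entries that actually carry a training loss (eval entries may not).
steps = [h["step"] for h in history if "loss" in h]
losses = [h["loss"] for h in history if "loss" in h]
lrs = [h["learning_rate"] for h in history if "learning_rate" in h]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()
```

For the run logged here, such a plot would show the loss falling from roughly 5.8 at step 1 to about 0.6 by the end of epoch 2, with the learning rate warming up to its peak and then decaying toward zero over the 460 steps.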