{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99675535366645, "eval_steps": 500, "global_step": 2310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012978585334198572, "grad_norm": 51.281074849235615, "learning_rate": 0.0, "loss": 11.2228, "step": 1 }, { "epoch": 0.0025957170668397143, "grad_norm": 52.288477767034706, "learning_rate": 2.1645021645021646e-07, "loss": 11.2142, "step": 2 }, { "epoch": 0.003893575600259572, "grad_norm": 51.690439395204805, "learning_rate": 4.329004329004329e-07, "loss": 11.2982, "step": 3 }, { "epoch": 0.005191434133679429, "grad_norm": 52.70272970065567, "learning_rate": 6.493506493506494e-07, "loss": 11.2202, "step": 4 }, { "epoch": 0.006489292667099286, "grad_norm": 52.19249689221791, "learning_rate": 8.658008658008658e-07, "loss": 11.223, "step": 5 }, { "epoch": 0.007787151200519144, "grad_norm": 52.51499888342824, "learning_rate": 1.0822510822510822e-06, "loss": 11.1764, "step": 6 }, { "epoch": 0.009085009733939001, "grad_norm": 54.65313049626493, "learning_rate": 1.2987012987012988e-06, "loss": 11.1836, "step": 7 }, { "epoch": 0.010382868267358857, "grad_norm": 56.26332260448297, "learning_rate": 1.5151515151515152e-06, "loss": 10.9434, "step": 8 }, { "epoch": 0.011680726800778715, "grad_norm": 56.526613959934075, "learning_rate": 1.7316017316017317e-06, "loss": 10.8116, "step": 9 }, { "epoch": 0.012978585334198572, "grad_norm": 73.22487510820876, "learning_rate": 1.948051948051948e-06, "loss": 10.0099, "step": 10 }, { "epoch": 0.01427644386761843, "grad_norm": 78.76237638740871, "learning_rate": 2.1645021645021643e-06, "loss": 9.7022, "step": 11 }, { "epoch": 0.015574302401038288, "grad_norm": 84.53260376952899, "learning_rate": 2.3809523809523808e-06, "loss": 9.2879, "step": 12 }, { "epoch": 0.016872160934458143, "grad_norm": 93.89336284136559, "learning_rate": 2.5974025974025976e-06, "loss": 9.1068, "step": 13 }, { "epoch": 0.018170019467878003, "grad_norm": 67.50935080000205, "learning_rate": 2.813852813852814e-06, "loss": 4.1366, "step": 14 }, { "epoch": 0.01946787800129786, "grad_norm": 60.451555462271216, "learning_rate": 3.0303030303030305e-06, "loss": 3.7321, "step": 15 }, { "epoch": 0.020765736534717714, "grad_norm": 48.941978109448335, "learning_rate": 3.2467532467532465e-06, "loss": 3.3276, "step": 16 }, { "epoch": 0.022063595068137574, "grad_norm": 41.193189217962875, "learning_rate": 3.4632034632034634e-06, "loss": 3.0135, "step": 17 }, { "epoch": 0.02336145360155743, "grad_norm": 19.806882723512313, "learning_rate": 3.67965367965368e-06, "loss": 2.1732, "step": 18 }, { "epoch": 0.02465931213497729, "grad_norm": 6.601644234547365, "learning_rate": 3.896103896103896e-06, "loss": 1.5589, "step": 19 }, { "epoch": 0.025957170668397145, "grad_norm": 5.420878318524882, "learning_rate": 4.112554112554113e-06, "loss": 1.4694, "step": 20 }, { "epoch": 0.027255029201817, "grad_norm": 4.513535899746422, "learning_rate": 4.329004329004329e-06, "loss": 1.4127, "step": 21 }, { "epoch": 0.02855288773523686, "grad_norm": 3.4399120070700926, "learning_rate": 4.5454545454545455e-06, "loss": 1.308, "step": 22 }, { "epoch": 0.029850746268656716, "grad_norm": 2.737903832715275, "learning_rate": 4.7619047619047615e-06, "loss": 1.2391, "step": 23 }, { "epoch": 0.031148604802076575, "grad_norm": 2.229792362203531, "learning_rate": 4.978354978354978e-06, "loss": 1.207, "step": 24 }, { "epoch": 0.03244646333549643, "grad_norm": 1.67506952925642, "learning_rate": 5.194805194805195e-06, "loss": 1.164, "step": 25 }, { "epoch": 0.03374432186891629, "grad_norm": 7.810491533013112, "learning_rate": 5.411255411255411e-06, "loss": 1.0519, "step": 26 }, { "epoch": 0.03504218040233614, "grad_norm": 1.8418087052814074, "learning_rate": 5.627705627705628e-06, "loss": 1.0332, "step": 27 }, { "epoch": 0.036340038935756006, "grad_norm": 1.6946651958442733, "learning_rate": 5.844155844155844e-06, "loss": 1.0257, "step": 28 }, { "epoch": 0.03763789746917586, "grad_norm": 1.0837092117736122, "learning_rate": 6.060606060606061e-06, "loss": 0.9995, "step": 29 }, { "epoch": 0.03893575600259572, "grad_norm": 1.022247931246769, "learning_rate": 6.277056277056277e-06, "loss": 0.9658, "step": 30 }, { "epoch": 0.04023361453601557, "grad_norm": 1.0439465825515253, "learning_rate": 6.493506493506493e-06, "loss": 0.9258, "step": 31 }, { "epoch": 0.04153147306943543, "grad_norm": 0.8476038489892367, "learning_rate": 6.709956709956711e-06, "loss": 0.9183, "step": 32 }, { "epoch": 0.04282933160285529, "grad_norm": 0.7260505313959857, "learning_rate": 6.926406926406927e-06, "loss": 0.8859, "step": 33 }, { "epoch": 0.04412719013627515, "grad_norm": 0.9334322529996619, "learning_rate": 7.142857142857143e-06, "loss": 0.8775, "step": 34 }, { "epoch": 0.045425048669695, "grad_norm": 0.7507371805560344, "learning_rate": 7.35930735930736e-06, "loss": 0.8506, "step": 35 }, { "epoch": 0.04672290720311486, "grad_norm": 0.7037392218293158, "learning_rate": 7.5757575757575764e-06, "loss": 0.8373, "step": 36 }, { "epoch": 0.048020765736534715, "grad_norm": 0.7026115757535957, "learning_rate": 7.792207792207792e-06, "loss": 0.8148, "step": 37 }, { "epoch": 0.04931862426995458, "grad_norm": 0.6329073089403997, "learning_rate": 8.008658008658008e-06, "loss": 0.7989, "step": 38 }, { "epoch": 0.050616482803374434, "grad_norm": 0.576557174722755, "learning_rate": 8.225108225108225e-06, "loss": 0.801, "step": 39 }, { "epoch": 0.05191434133679429, "grad_norm": 0.633584321792007, "learning_rate": 8.441558441558442e-06, "loss": 0.8154, "step": 40 }, { "epoch": 0.053212199870214145, "grad_norm": 0.6357768126157509, "learning_rate": 8.658008658008657e-06, "loss": 0.7985, "step": 41 }, { "epoch": 0.054510058403634, "grad_norm": 0.4606140950872704, "learning_rate": 8.874458874458876e-06, "loss": 0.7875, "step": 42 }, { "epoch": 0.055807916937053864, "grad_norm": 0.42579840291728105, "learning_rate": 9.090909090909091e-06, "loss": 0.7882, "step": 43 }, { "epoch": 0.05710577547047372, "grad_norm": 0.5127047756782175, "learning_rate": 9.307359307359308e-06, "loss": 0.7668, "step": 44 }, { "epoch": 0.058403634003893576, "grad_norm": 0.5275747680292829, "learning_rate": 9.523809523809523e-06, "loss": 0.7556, "step": 45 }, { "epoch": 0.05970149253731343, "grad_norm": 0.4422307893352111, "learning_rate": 9.740259740259742e-06, "loss": 0.7469, "step": 46 }, { "epoch": 0.06099935107073329, "grad_norm": 0.3950972183567316, "learning_rate": 9.956709956709957e-06, "loss": 0.7257, "step": 47 }, { "epoch": 0.06229720960415315, "grad_norm": 0.4294144227066294, "learning_rate": 1.0173160173160174e-05, "loss": 0.7082, "step": 48 }, { "epoch": 0.063595068137573, "grad_norm": 0.4261355016492852, "learning_rate": 1.038961038961039e-05, "loss": 0.7202, "step": 49 }, { "epoch": 0.06489292667099286, "grad_norm": 0.40006881327817506, "learning_rate": 1.0606060606060607e-05, "loss": 0.7006, "step": 50 }, { "epoch": 0.06619078520441272, "grad_norm": 0.3484479390008924, "learning_rate": 1.0822510822510823e-05, "loss": 0.7141, "step": 51 }, { "epoch": 0.06748864373783257, "grad_norm": 0.3508582969164509, "learning_rate": 1.103896103896104e-05, "loss": 0.7203, "step": 52 }, { "epoch": 0.06878650227125244, "grad_norm": 0.3754749738655716, "learning_rate": 1.1255411255411256e-05, "loss": 0.7465, "step": 53 }, { "epoch": 0.07008436080467229, "grad_norm": 0.33275270814242425, "learning_rate": 1.1471861471861473e-05, "loss": 0.6844, "step": 54 }, { "epoch": 0.07138221933809215, "grad_norm": 0.29711493887953333, "learning_rate": 1.1688311688311688e-05, "loss": 0.668, "step": 55 }, { "epoch": 0.07268007787151201, "grad_norm": 0.3254569707924083, "learning_rate": 1.1904761904761905e-05, "loss": 0.7234, "step": 56 }, { "epoch": 0.07397793640493186, "grad_norm": 0.2925311216603525, "learning_rate": 1.2121212121212122e-05, "loss": 0.7004, "step": 57 }, { "epoch": 0.07527579493835172, "grad_norm": 0.2781203466736231, "learning_rate": 1.2337662337662339e-05, "loss": 0.6845, "step": 58 }, { "epoch": 0.07657365347177157, "grad_norm": 0.27946888261667213, "learning_rate": 1.2554112554112554e-05, "loss": 0.6999, "step": 59 }, { "epoch": 0.07787151200519143, "grad_norm": 0.2728571313678063, "learning_rate": 1.2770562770562773e-05, "loss": 0.6639, "step": 60 }, { "epoch": 0.0791693705386113, "grad_norm": 0.3093993935391829, "learning_rate": 1.2987012987012986e-05, "loss": 0.701, "step": 61 }, { "epoch": 0.08046722907203115, "grad_norm": 0.2852724472177098, "learning_rate": 1.3203463203463205e-05, "loss": 0.681, "step": 62 }, { "epoch": 0.08176508760545101, "grad_norm": 0.2693071822601781, "learning_rate": 1.3419913419913421e-05, "loss": 0.6679, "step": 63 }, { "epoch": 0.08306294613887086, "grad_norm": 0.2883803733655785, "learning_rate": 1.3636363636363637e-05, "loss": 0.6871, "step": 64 }, { "epoch": 0.08436080467229072, "grad_norm": 0.27168971444927753, "learning_rate": 1.3852813852813853e-05, "loss": 0.6478, "step": 65 }, { "epoch": 0.08565866320571058, "grad_norm": 0.2780741659791045, "learning_rate": 1.406926406926407e-05, "loss": 0.6654, "step": 66 }, { "epoch": 0.08695652173913043, "grad_norm": 0.2669958151004055, "learning_rate": 1.4285714285714285e-05, "loss": 0.6494, "step": 67 }, { "epoch": 0.0882543802725503, "grad_norm": 0.2645854701003351, "learning_rate": 1.4502164502164502e-05, "loss": 0.6343, "step": 68 }, { "epoch": 0.08955223880597014, "grad_norm": 0.27977755521966374, "learning_rate": 1.471861471861472e-05, "loss": 0.6703, "step": 69 }, { "epoch": 0.09085009733939, "grad_norm": 0.2701714280796314, "learning_rate": 1.4935064935064936e-05, "loss": 0.6657, "step": 70 }, { "epoch": 0.09214795587280987, "grad_norm": 0.3340236352400633, "learning_rate": 1.5151515151515153e-05, "loss": 0.6654, "step": 71 }, { "epoch": 0.09344581440622972, "grad_norm": 0.25125625871192836, "learning_rate": 1.5367965367965366e-05, "loss": 0.6829, "step": 72 }, { "epoch": 0.09474367293964958, "grad_norm": 0.27623404854696354, "learning_rate": 1.5584415584415583e-05, "loss": 0.6864, "step": 73 }, { "epoch": 0.09604153147306943, "grad_norm": 0.2855287411905892, "learning_rate": 1.5800865800865803e-05, "loss": 0.6669, "step": 74 }, { "epoch": 0.0973393900064893, "grad_norm": 0.2544109696892319, "learning_rate": 1.6017316017316017e-05, "loss": 0.6441, "step": 75 }, { "epoch": 0.09863724853990916, "grad_norm": 0.29021289781813303, "learning_rate": 1.6233766233766234e-05, "loss": 0.6664, "step": 76 }, { "epoch": 0.099935107073329, "grad_norm": 0.26812240880351107, "learning_rate": 1.645021645021645e-05, "loss": 0.6391, "step": 77 }, { "epoch": 0.10123296560674887, "grad_norm": 0.27576904300300786, "learning_rate": 1.6666666666666667e-05, "loss": 0.6355, "step": 78 }, { "epoch": 0.10253082414016872, "grad_norm": 0.2814054287273717, "learning_rate": 1.6883116883116884e-05, "loss": 0.6265, "step": 79 }, { "epoch": 0.10382868267358858, "grad_norm": 0.3103049215962741, "learning_rate": 1.70995670995671e-05, "loss": 0.6497, "step": 80 }, { "epoch": 0.10512654120700844, "grad_norm": 0.2728333867240695, "learning_rate": 1.7316017316017315e-05, "loss": 0.628, "step": 81 }, { "epoch": 0.10642439974042829, "grad_norm": 0.29691347602771223, "learning_rate": 1.7532467532467535e-05, "loss": 0.6481, "step": 82 }, { "epoch": 0.10772225827384815, "grad_norm": 0.29273954514595735, "learning_rate": 1.7748917748917752e-05, "loss": 0.6261, "step": 83 }, { "epoch": 0.109020116807268, "grad_norm": 0.3074962736781368, "learning_rate": 1.7965367965367965e-05, "loss": 0.6299, "step": 84 }, { "epoch": 0.11031797534068787, "grad_norm": 0.29602233662175786, "learning_rate": 1.8181818181818182e-05, "loss": 0.6335, "step": 85 }, { "epoch": 0.11161583387410773, "grad_norm": 0.2830666870801868, "learning_rate": 1.83982683982684e-05, "loss": 0.641, "step": 86 }, { "epoch": 0.11291369240752758, "grad_norm": 0.3125259689124729, "learning_rate": 1.8614718614718616e-05, "loss": 0.6388, "step": 87 }, { "epoch": 0.11421155094094744, "grad_norm": 0.26645549323423784, "learning_rate": 1.8831168831168833e-05, "loss": 0.6208, "step": 88 }, { "epoch": 0.11550940947436729, "grad_norm": 0.28954783071217016, "learning_rate": 1.9047619047619046e-05, "loss": 0.6273, "step": 89 }, { "epoch": 0.11680726800778715, "grad_norm": 0.28149679569001645, "learning_rate": 1.9264069264069266e-05, "loss": 0.6028, "step": 90 }, { "epoch": 0.11810512654120701, "grad_norm": 0.2906262772721881, "learning_rate": 1.9480519480519483e-05, "loss": 0.6245, "step": 91 }, { "epoch": 0.11940298507462686, "grad_norm": 0.2711185379042277, "learning_rate": 1.9696969696969697e-05, "loss": 0.6267, "step": 92 }, { "epoch": 0.12070084360804673, "grad_norm": 0.3052664513793837, "learning_rate": 1.9913419913419914e-05, "loss": 0.6346, "step": 93 }, { "epoch": 0.12199870214146658, "grad_norm": 0.29479074178005676, "learning_rate": 2.012987012987013e-05, "loss": 0.6255, "step": 94 }, { "epoch": 0.12329656067488644, "grad_norm": 0.3687002197662538, "learning_rate": 2.0346320346320347e-05, "loss": 0.6269, "step": 95 }, { "epoch": 0.1245944192083063, "grad_norm": 0.26974731920341294, "learning_rate": 2.0562770562770564e-05, "loss": 0.6355, "step": 96 }, { "epoch": 0.12589227774172615, "grad_norm": 0.35521751114512884, "learning_rate": 2.077922077922078e-05, "loss": 0.6293, "step": 97 }, { "epoch": 0.127190136275146, "grad_norm": 0.31122119266101045, "learning_rate": 2.0995670995670998e-05, "loss": 0.6548, "step": 98 }, { "epoch": 0.12848799480856588, "grad_norm": 0.32784103974924345, "learning_rate": 2.1212121212121215e-05, "loss": 0.6409, "step": 99 }, { "epoch": 0.12978585334198572, "grad_norm": 0.2862191321006967, "learning_rate": 2.1428571428571428e-05, "loss": 0.6287, "step": 100 }, { "epoch": 0.13108371187540557, "grad_norm": 0.2888970770108121, "learning_rate": 2.1645021645021645e-05, "loss": 0.5825, "step": 101 }, { "epoch": 0.13238157040882545, "grad_norm": 0.27541204550524634, "learning_rate": 2.1861471861471862e-05, "loss": 0.6056, "step": 102 }, { "epoch": 0.1336794289422453, "grad_norm": 0.2829745345550545, "learning_rate": 2.207792207792208e-05, "loss": 0.6388, "step": 103 }, { "epoch": 0.13497728747566515, "grad_norm": 0.31335331278223877, "learning_rate": 2.2294372294372296e-05, "loss": 0.6149, "step": 104 }, { "epoch": 0.136275146009085, "grad_norm": 0.26183513844983125, "learning_rate": 2.2510822510822512e-05, "loss": 0.598, "step": 105 }, { "epoch": 0.13757300454250487, "grad_norm": 0.3166303353223508, "learning_rate": 2.272727272727273e-05, "loss": 0.6153, "step": 106 }, { "epoch": 0.13887086307592472, "grad_norm": 0.2827827597423759, "learning_rate": 2.2943722943722946e-05, "loss": 0.5878, "step": 107 }, { "epoch": 0.14016872160934457, "grad_norm": 0.27950978868403287, "learning_rate": 2.3160173160173163e-05, "loss": 0.6022, "step": 108 }, { "epoch": 0.14146658014276445, "grad_norm": 0.31785506543954495, "learning_rate": 2.3376623376623376e-05, "loss": 0.6419, "step": 109 }, { "epoch": 0.1427644386761843, "grad_norm": 0.2760724448320942, "learning_rate": 2.3593073593073593e-05, "loss": 0.5892, "step": 110 }, { "epoch": 0.14406229720960415, "grad_norm": 0.31705667668464776, "learning_rate": 2.380952380952381e-05, "loss": 0.5828, "step": 111 }, { "epoch": 0.14536015574302402, "grad_norm": 0.2786427147511611, "learning_rate": 2.4025974025974027e-05, "loss": 0.6189, "step": 112 }, { "epoch": 0.14665801427644387, "grad_norm": 0.33800188191224245, "learning_rate": 2.4242424242424244e-05, "loss": 0.5867, "step": 113 }, { "epoch": 0.14795587280986372, "grad_norm": 0.3183986863565769, "learning_rate": 2.4458874458874457e-05, "loss": 0.6244, "step": 114 }, { "epoch": 0.14925373134328357, "grad_norm": 0.346611504802979, "learning_rate": 2.4675324675324678e-05, "loss": 0.6114, "step": 115 }, { "epoch": 0.15055158987670345, "grad_norm": 0.3193746967683076, "learning_rate": 2.4891774891774894e-05, "loss": 0.5847, "step": 116 }, { "epoch": 0.1518494484101233, "grad_norm": 0.329720331399979, "learning_rate": 2.5108225108225108e-05, "loss": 0.6104, "step": 117 }, { "epoch": 0.15314730694354314, "grad_norm": 0.30497761214035857, "learning_rate": 2.5324675324675325e-05, "loss": 0.6147, "step": 118 }, { "epoch": 0.15444516547696302, "grad_norm": 0.3065657873353463, "learning_rate": 2.5541125541125545e-05, "loss": 0.5891, "step": 119 }, { "epoch": 0.15574302401038287, "grad_norm": 0.3040591111660935, "learning_rate": 2.575757575757576e-05, "loss": 0.5874, "step": 120 }, { "epoch": 0.15704088254380272, "grad_norm": 0.3176140258251669, "learning_rate": 2.5974025974025972e-05, "loss": 0.5891, "step": 121 }, { "epoch": 0.1583387410772226, "grad_norm": 0.33129130491628744, "learning_rate": 2.6190476190476192e-05, "loss": 0.5754, "step": 122 }, { "epoch": 0.15963659961064244, "grad_norm": 0.3400250207622185, "learning_rate": 2.640692640692641e-05, "loss": 0.5927, "step": 123 }, { "epoch": 0.1609344581440623, "grad_norm": 0.3294442929975534, "learning_rate": 2.6623376623376623e-05, "loss": 0.6016, "step": 124 }, { "epoch": 0.16223231667748214, "grad_norm": 0.27952039743370355, "learning_rate": 2.6839826839826843e-05, "loss": 0.5674, "step": 125 }, { "epoch": 0.16353017521090202, "grad_norm": 0.3263152361115472, "learning_rate": 2.7056277056277056e-05, "loss": 0.6185, "step": 126 }, { "epoch": 0.16482803374432187, "grad_norm": 0.34561117525982527, "learning_rate": 2.7272727272727273e-05, "loss": 0.6003, "step": 127 }, { "epoch": 0.16612589227774172, "grad_norm": 0.36330136868220264, "learning_rate": 2.7489177489177493e-05, "loss": 0.5796, "step": 128 }, { "epoch": 0.1674237508111616, "grad_norm": 0.3448144052747857, "learning_rate": 2.7705627705627707e-05, "loss": 0.5858, "step": 129 }, { "epoch": 0.16872160934458144, "grad_norm": 0.30841385505522906, "learning_rate": 2.792207792207792e-05, "loss": 0.5913, "step": 130 }, { "epoch": 0.1700194678780013, "grad_norm": 0.3823986000835476, "learning_rate": 2.813852813852814e-05, "loss": 0.6089, "step": 131 }, { "epoch": 0.17131732641142117, "grad_norm": 0.3183137204294537, "learning_rate": 2.8354978354978357e-05, "loss": 0.5974, "step": 132 }, { "epoch": 0.17261518494484102, "grad_norm": 0.3375228791953999, "learning_rate": 2.857142857142857e-05, "loss": 0.5919, "step": 133 }, { "epoch": 0.17391304347826086, "grad_norm": 0.34769553896113353, "learning_rate": 2.878787878787879e-05, "loss": 0.587, "step": 134 }, { "epoch": 0.1752109020116807, "grad_norm": 0.34053830322587214, "learning_rate": 2.9004329004329005e-05, "loss": 0.5846, "step": 135 }, { "epoch": 0.1765087605451006, "grad_norm": 0.3327693629121813, "learning_rate": 2.922077922077922e-05, "loss": 0.5929, "step": 136 }, { "epoch": 0.17780661907852044, "grad_norm": 0.37595317145253215, "learning_rate": 2.943722943722944e-05, "loss": 0.5836, "step": 137 }, { "epoch": 0.1791044776119403, "grad_norm": 0.31124901930305726, "learning_rate": 2.9653679653679655e-05, "loss": 0.5946, "step": 138 }, { "epoch": 0.18040233614536016, "grad_norm": 0.41500685318923003, "learning_rate": 2.9870129870129872e-05, "loss": 0.599, "step": 139 }, { "epoch": 0.18170019467878, "grad_norm": 0.4422225800744917, "learning_rate": 3.0086580086580092e-05, "loss": 0.6079, "step": 140 }, { "epoch": 0.18299805321219986, "grad_norm": 0.3911349391427895, "learning_rate": 3.0303030303030306e-05, "loss": 0.5911, "step": 141 }, { "epoch": 0.18429591174561974, "grad_norm": 0.3565760473012978, "learning_rate": 3.051948051948052e-05, "loss": 0.5874, "step": 142 }, { "epoch": 0.1855937702790396, "grad_norm": 0.3316833578762426, "learning_rate": 3.073593073593073e-05, "loss": 0.5987, "step": 143 }, { "epoch": 0.18689162881245944, "grad_norm": 0.4255792906025628, "learning_rate": 3.095238095238095e-05, "loss": 0.5674, "step": 144 }, { "epoch": 0.18818948734587929, "grad_norm": 0.3111389344438918, "learning_rate": 3.1168831168831166e-05, "loss": 0.5916, "step": 145 }, { "epoch": 0.18948734587929916, "grad_norm": 0.40391893328316164, "learning_rate": 3.1385281385281387e-05, "loss": 0.5862, "step": 146 }, { "epoch": 0.190785204412719, "grad_norm": 0.3571856870514297, "learning_rate": 3.160173160173161e-05, "loss": 0.5783, "step": 147 }, { "epoch": 0.19208306294613886, "grad_norm": 0.34724535128608686, "learning_rate": 3.181818181818182e-05, "loss": 0.593, "step": 148 }, { "epoch": 0.19338092147955874, "grad_norm": 0.36623311715616075, "learning_rate": 3.2034632034632034e-05, "loss": 0.5791, "step": 149 }, { "epoch": 0.1946787800129786, "grad_norm": 0.35421377131407383, "learning_rate": 3.2251082251082254e-05, "loss": 0.5869, "step": 150 }, { "epoch": 0.19597663854639844, "grad_norm": 0.3580175565804796, "learning_rate": 3.246753246753247e-05, "loss": 0.5731, "step": 151 }, { "epoch": 0.1972744970798183, "grad_norm": 0.3779107544260428, "learning_rate": 3.268398268398268e-05, "loss": 0.5888, "step": 152 }, { "epoch": 0.19857235561323816, "grad_norm": 0.381401724832965, "learning_rate": 3.29004329004329e-05, "loss": 0.5754, "step": 153 }, { "epoch": 0.199870214146658, "grad_norm": 0.3996699371198549, "learning_rate": 3.311688311688312e-05, "loss": 0.5878, "step": 154 }, { "epoch": 0.20116807268007786, "grad_norm": 0.3498521285804811, "learning_rate": 3.3333333333333335e-05, "loss": 0.5624, "step": 155 }, { "epoch": 0.20246593121349774, "grad_norm": 0.4329402753533996, "learning_rate": 3.3549783549783555e-05, "loss": 0.5823, "step": 156 }, { "epoch": 0.20376378974691758, "grad_norm": 0.4715275117713498, "learning_rate": 3.376623376623377e-05, "loss": 0.5679, "step": 157 }, { "epoch": 0.20506164828033743, "grad_norm": 0.4087297995001702, "learning_rate": 3.398268398268398e-05, "loss": 0.5492, "step": 158 }, { "epoch": 0.2063595068137573, "grad_norm": 0.3963104181486302, "learning_rate": 3.41991341991342e-05, "loss": 0.5779, "step": 159 }, { "epoch": 0.20765736534717716, "grad_norm": 0.4626784659383467, "learning_rate": 3.4415584415584416e-05, "loss": 0.5791, "step": 160 }, { "epoch": 0.208955223880597, "grad_norm": 0.4629189431646934, "learning_rate": 3.463203463203463e-05, "loss": 0.5709, "step": 161 }, { "epoch": 0.21025308241401688, "grad_norm": 0.4327284524192223, "learning_rate": 3.484848484848485e-05, "loss": 0.5821, "step": 162 }, { "epoch": 0.21155094094743673, "grad_norm": 0.42226923421652224, "learning_rate": 3.506493506493507e-05, "loss": 0.579, "step": 163 }, { "epoch": 0.21284879948085658, "grad_norm": 0.37986989822155737, "learning_rate": 3.528138528138528e-05, "loss": 0.5656, "step": 164 }, { "epoch": 0.21414665801427643, "grad_norm": 0.4629547655665463, "learning_rate": 3.5497835497835503e-05, "loss": 0.5703, "step": 165 }, { "epoch": 0.2154445165476963, "grad_norm": 0.41674661311211725, "learning_rate": 3.571428571428572e-05, "loss": 0.5775, "step": 166 }, { "epoch": 0.21674237508111616, "grad_norm": 0.37812170353301494, "learning_rate": 3.593073593073593e-05, "loss": 0.5647, "step": 167 }, { "epoch": 0.218040233614536, "grad_norm": 0.3533683709945352, "learning_rate": 3.6147186147186144e-05, "loss": 0.5742, "step": 168 }, { "epoch": 0.21933809214795588, "grad_norm": 0.3327311378407231, "learning_rate": 3.6363636363636364e-05, "loss": 0.5616, "step": 169 }, { "epoch": 0.22063595068137573, "grad_norm": 0.31532044110811386, "learning_rate": 3.6580086580086584e-05, "loss": 0.5434, "step": 170 }, { "epoch": 0.22193380921479558, "grad_norm": 0.3191051407668251, "learning_rate": 3.67965367965368e-05, "loss": 0.5719, "step": 171 }, { "epoch": 0.22323166774821546, "grad_norm": 0.36508138364995835, "learning_rate": 3.701298701298702e-05, "loss": 0.5496, "step": 172 }, { "epoch": 0.2245295262816353, "grad_norm": 0.35917301960844417, "learning_rate": 3.722943722943723e-05, "loss": 0.5599, "step": 173 }, { "epoch": 0.22582738481505515, "grad_norm": 0.332146935347223, "learning_rate": 3.7445887445887445e-05, "loss": 0.5729, "step": 174 }, { "epoch": 0.227125243348475, "grad_norm": 0.3761709742507644, "learning_rate": 3.7662337662337665e-05, "loss": 0.5395, "step": 175 }, { "epoch": 0.22842310188189488, "grad_norm": 0.357015737631827, "learning_rate": 3.787878787878788e-05, "loss": 0.5661, "step": 176 }, { "epoch": 0.22972096041531473, "grad_norm": 0.34903223053324706, "learning_rate": 3.809523809523809e-05, "loss": 0.5633, "step": 177 }, { "epoch": 0.23101881894873458, "grad_norm": 0.3592680565530814, "learning_rate": 3.831168831168831e-05, "loss": 0.5504, "step": 178 }, { "epoch": 0.23231667748215445, "grad_norm": 0.31763219861115954, "learning_rate": 3.852813852813853e-05, "loss": 0.5693, "step": 179 }, { "epoch": 0.2336145360155743, "grad_norm": 0.37831351950166914, "learning_rate": 3.8744588744588746e-05, "loss": 0.5545, "step": 180 }, { "epoch": 0.23491239454899415, "grad_norm": 0.3032029950520603, "learning_rate": 3.8961038961038966e-05, "loss": 0.5483, "step": 181 }, { "epoch": 0.23621025308241403, "grad_norm": 0.45423832636818023, "learning_rate": 3.917748917748918e-05, "loss": 0.5646, "step": 182 }, { "epoch": 0.23750811161583388, "grad_norm": 0.3885683241280637, "learning_rate": 3.939393939393939e-05, "loss": 0.5782, "step": 183 }, { "epoch": 0.23880597014925373, "grad_norm": 0.4113001708391826, "learning_rate": 3.9610389610389614e-05, "loss": 0.5709, "step": 184 }, { "epoch": 0.24010382868267358, "grad_norm": 0.43060691545097807, "learning_rate": 3.982683982683983e-05, "loss": 0.5357, "step": 185 }, { "epoch": 0.24140168721609345, "grad_norm": 0.48621329563873417, "learning_rate": 4.004329004329004e-05, "loss": 0.5438, "step": 186 }, { "epoch": 0.2426995457495133, "grad_norm": 0.346819520203559, "learning_rate": 4.025974025974026e-05, "loss": 0.5448, "step": 187 }, { "epoch": 0.24399740428293315, "grad_norm": 0.5771040244138606, "learning_rate": 4.047619047619048e-05, "loss": 0.5609, "step": 188 }, { "epoch": 0.24529526281635303, "grad_norm": 0.5691856093486398, "learning_rate": 4.0692640692640695e-05, "loss": 0.5509, "step": 189 }, { "epoch": 0.24659312134977288, "grad_norm": 0.5658078548327832, "learning_rate": 4.0909090909090915e-05, "loss": 0.5457, "step": 190 }, { "epoch": 0.24789097988319272, "grad_norm": 0.32565141992028185, "learning_rate": 4.112554112554113e-05, "loss": 0.5576, "step": 191 }, { "epoch": 0.2491888384166126, "grad_norm": 0.693035116005457, "learning_rate": 4.134199134199134e-05, "loss": 0.5818, "step": 192 }, { "epoch": 0.25048669695003245, "grad_norm": 0.5767521454545272, "learning_rate": 4.155844155844156e-05, "loss": 0.5651, "step": 193 }, { "epoch": 0.2517845554834523, "grad_norm": 0.5780821207088752, "learning_rate": 4.1774891774891775e-05, "loss": 0.569, "step": 194 }, { "epoch": 0.25308241401687215, "grad_norm": 0.37604239901597153, "learning_rate": 4.1991341991341996e-05, "loss": 0.543, "step": 195 }, { "epoch": 0.254380272550292, "grad_norm": 0.5156588377708116, "learning_rate": 4.220779220779221e-05, "loss": 0.5655, "step": 196 }, { "epoch": 0.2556781310837119, "grad_norm": 0.547020541236707, "learning_rate": 4.242424242424243e-05, "loss": 0.5823, "step": 197 }, { "epoch": 0.25697598961713175, "grad_norm": 0.4902045464021542, "learning_rate": 4.264069264069264e-05, "loss": 0.5819, "step": 198 }, { "epoch": 0.2582738481505516, "grad_norm": 0.43892225186858413, "learning_rate": 4.2857142857142856e-05, "loss": 0.5407, "step": 199 }, { "epoch": 0.25957170668397145, "grad_norm": 0.35311657422045256, "learning_rate": 4.3073593073593077e-05, "loss": 0.5333, "step": 200 }, { "epoch": 0.2608695652173913, "grad_norm": 0.550191337628468, "learning_rate": 4.329004329004329e-05, "loss": 0.5691, "step": 201 }, { "epoch": 0.26216742375081115, "grad_norm": 0.3252801360738619, "learning_rate": 4.3506493506493503e-05, "loss": 0.5533, "step": 202 }, { "epoch": 0.263465282284231, "grad_norm": 0.4553304513015423, "learning_rate": 4.3722943722943724e-05, "loss": 0.5565, "step": 203 }, { "epoch": 0.2647631408176509, "grad_norm": 0.3263307722581273, "learning_rate": 4.3939393939393944e-05, "loss": 0.5671, "step": 204 }, { "epoch": 0.26606099935107075, "grad_norm": 0.4000844274004943, "learning_rate": 4.415584415584416e-05, "loss": 0.5399, "step": 205 }, { "epoch": 0.2673588578844906, "grad_norm": 0.38431545582799964, "learning_rate": 4.437229437229438e-05, "loss": 0.5417, "step": 206 }, { "epoch": 0.26865671641791045, "grad_norm": 0.40816346897613404, "learning_rate": 4.458874458874459e-05, "loss": 0.5592, "step": 207 }, { "epoch": 0.2699545749513303, "grad_norm": 0.34584489381728045, "learning_rate": 4.4805194805194805e-05, "loss": 0.5438, "step": 208 }, { "epoch": 0.27125243348475014, "grad_norm": 0.4537307823944973, "learning_rate": 4.5021645021645025e-05, "loss": 0.5399, "step": 209 }, { "epoch": 0.27255029201817, "grad_norm": 0.5752635325535591, "learning_rate": 4.523809523809524e-05, "loss": 0.5672, "step": 210 }, { "epoch": 0.2738481505515899, "grad_norm": 0.4083818210095887, "learning_rate": 4.545454545454546e-05, "loss": 0.5617, "step": 211 }, { "epoch": 0.27514600908500975, "grad_norm": 0.3170399657064755, "learning_rate": 4.567099567099568e-05, "loss": 0.5352, "step": 212 }, { "epoch": 0.2764438676184296, "grad_norm": 0.31917717130826856, "learning_rate": 4.588744588744589e-05, "loss": 0.5617, "step": 213 }, { "epoch": 0.27774172615184944, "grad_norm": 0.3772448329589651, "learning_rate": 4.6103896103896106e-05, "loss": 0.5662, "step": 214 }, { "epoch": 0.2790395846852693, "grad_norm": 0.3799776585928483, "learning_rate": 4.6320346320346326e-05, "loss": 0.5814, "step": 215 }, { "epoch": 0.28033744321868914, "grad_norm": 0.376252486652199, "learning_rate": 4.653679653679654e-05, "loss": 0.5475, "step": 216 }, { "epoch": 0.28163530175210905, "grad_norm": 0.3703755301826731, "learning_rate": 4.675324675324675e-05, "loss": 0.5488, "step": 217 }, { "epoch": 0.2829331602855289, "grad_norm": 0.32861168704985866, "learning_rate": 4.696969696969697e-05, "loss": 0.554, "step": 218 }, { "epoch": 0.28423101881894874, "grad_norm": 0.3475845025879547, "learning_rate": 4.718614718614719e-05, "loss": 0.5407, "step": 219 }, { "epoch": 0.2855288773523686, "grad_norm": 0.3648655973805309, "learning_rate": 4.740259740259741e-05, "loss": 0.5466, "step": 220 }, { "epoch": 0.28682673588578844, "grad_norm": 0.3350866523035428, "learning_rate": 4.761904761904762e-05, "loss": 0.5548, "step": 221 }, { "epoch": 0.2881245944192083, "grad_norm": 0.43767143054287594, "learning_rate": 4.783549783549784e-05, "loss": 0.5684, "step": 222 }, { "epoch": 0.28942245295262814, "grad_norm": 0.421178133286777, "learning_rate": 4.8051948051948054e-05, "loss": 0.5639, "step": 223 }, { "epoch": 0.29072031148604804, "grad_norm": 0.37835877083504477, "learning_rate": 4.826839826839827e-05, "loss": 0.5566, "step": 224 }, { "epoch": 0.2920181700194679, "grad_norm": 0.3417724143733512, "learning_rate": 4.848484848484849e-05, "loss": 0.5552, "step": 225 }, { "epoch": 0.29331602855288774, "grad_norm": 0.3870541340632366, "learning_rate": 4.87012987012987e-05, "loss": 0.549, "step": 226 }, { "epoch": 0.2946138870863076, "grad_norm": 0.4889598386044001, "learning_rate": 4.8917748917748915e-05, "loss": 0.5538, "step": 227 }, { "epoch": 0.29591174561972744, "grad_norm": 0.4543222558469965, "learning_rate": 4.9134199134199135e-05, "loss": 0.5651, "step": 228 }, { "epoch": 0.2972096041531473, "grad_norm": 0.38147571297168936, "learning_rate": 4.9350649350649355e-05, "loss": 0.5456, "step": 229 }, { "epoch": 0.29850746268656714, "grad_norm": 0.48062886052178266, "learning_rate": 4.956709956709957e-05, "loss": 0.5519, "step": 230 }, { "epoch": 0.29980532121998704, "grad_norm": 0.3436776708584428, "learning_rate": 4.978354978354979e-05, "loss": 0.5572, "step": 231 }, { "epoch": 0.3011031797534069, "grad_norm": 0.48075516118965306, "learning_rate": 5e-05, "loss": 0.5566, "step": 232 }, { "epoch": 0.30240103828682674, "grad_norm": 0.5764784128417795, "learning_rate": 4.997594997594998e-05, "loss": 0.5819, "step": 233 }, { "epoch": 0.3036988968202466, "grad_norm": 0.396476527818061, "learning_rate": 4.995189995189995e-05, "loss": 0.5411, "step": 234 }, { "epoch": 0.30499675535366644, "grad_norm": 0.46291378630567925, "learning_rate": 4.992784992784993e-05, "loss": 0.5552, "step": 235 }, { "epoch": 0.3062946138870863, "grad_norm": 0.44861478710130637, "learning_rate": 4.990379990379991e-05, "loss": 0.544, "step": 236 }, { "epoch": 0.3075924724205062, "grad_norm": 0.3873611746053732, "learning_rate": 4.987974987974988e-05, "loss": 0.5556, "step": 237 }, { "epoch": 0.30889033095392604, "grad_norm": 0.41664468323948045, "learning_rate": 4.985569985569986e-05, "loss": 0.5771, "step": 238 }, { "epoch": 0.3101881894873459, "grad_norm": 0.3859756658142421, "learning_rate": 4.983164983164983e-05, "loss": 0.532, "step": 239 }, { "epoch": 0.31148604802076574, "grad_norm": 0.44937677319362224, "learning_rate": 4.980759980759981e-05, "loss": 0.5426, "step": 240 }, { "epoch": 0.3127839065541856, "grad_norm": 0.4349437401518082, "learning_rate": 4.978354978354979e-05, "loss": 0.5259, "step": 241 }, { "epoch": 0.31408176508760544, "grad_norm": 0.400324790012705, "learning_rate": 4.9759499759499764e-05, "loss": 0.5537, "step": 242 }, { "epoch": 0.3153796236210253, "grad_norm": 0.43872297509664254, "learning_rate": 4.973544973544973e-05, "loss": 0.549, "step": 243 }, { "epoch": 0.3166774821544452, "grad_norm": 0.4653708053643151, "learning_rate": 4.971139971139971e-05, "loss": 0.5254, "step": 244 }, { "epoch": 0.31797534068786504, "grad_norm": 0.40941811760654495, "learning_rate": 4.968734968734969e-05, "loss": 0.568, "step": 245 }, { "epoch": 0.3192731992212849, "grad_norm": 0.5368348479077355, "learning_rate": 4.966329966329967e-05, "loss": 0.5476, "step": 246 }, { "epoch": 0.32057105775470474, "grad_norm": 0.4544642184839637, "learning_rate": 4.963924963924964e-05, "loss": 0.5566, "step": 247 }, { "epoch": 0.3218689162881246, "grad_norm": 0.42978031279738266, "learning_rate": 4.961519961519962e-05, "loss": 0.548, "step": 248 }, { "epoch": 0.32316677482154443, "grad_norm": 0.41191622365654873, "learning_rate": 4.9591149591149594e-05, "loss": 0.5458, "step": 249 }, { "epoch": 0.3244646333549643, "grad_norm": 0.6074054124348204, "learning_rate": 4.956709956709957e-05, "loss": 0.5519, "step": 250 }, { "epoch": 0.3257624918883842, "grad_norm": 0.4651053481351256, "learning_rate": 4.9543049543049543e-05, "loss": 0.5811, "step": 251 }, { "epoch": 0.32706035042180404, "grad_norm": 0.4240240962916135, "learning_rate": 4.951899951899952e-05, "loss": 0.5523, "step": 252 }, { "epoch": 0.3283582089552239, "grad_norm": 0.5066208761057746, "learning_rate": 4.94949494949495e-05, "loss": 0.544, "step": 253 }, { "epoch": 0.32965606748864373, "grad_norm": 0.38109072762259544, "learning_rate": 4.9470899470899475e-05, "loss": 0.5538, "step": 254 }, { "epoch": 0.3309539260220636, "grad_norm": 0.5117807003713138, "learning_rate": 4.944684944684945e-05, "loss": 0.5577, "step": 255 }, { "epoch": 0.33225178455548343, "grad_norm": 0.44912086500472626, "learning_rate": 4.9422799422799424e-05, "loss": 0.5495, "step": 256 }, { "epoch": 0.33354964308890334, "grad_norm": 0.3651331486905666, "learning_rate": 4.93987493987494e-05, "loss": 0.5631, "step": 257 }, { "epoch": 0.3348475016223232, "grad_norm": 0.5611125950484844, "learning_rate": 4.937469937469938e-05, "loss": 0.5465, "step": 258 }, { "epoch": 0.33614536015574303, "grad_norm": 0.5300284860526002, "learning_rate": 4.9350649350649355e-05, "loss": 0.5425, "step": 259 }, { "epoch": 0.3374432186891629, "grad_norm": 0.42241934122178765, "learning_rate": 4.932659932659932e-05, "loss": 0.5613, "step": 260 }, { "epoch": 0.33874107722258273, "grad_norm": 0.6480707951702842, "learning_rate": 4.9302549302549305e-05, "loss": 0.5443, "step": 261 }, { "epoch": 0.3400389357560026, "grad_norm": 0.5458559898285835, "learning_rate": 4.927849927849928e-05, "loss": 0.5362, "step": 262 }, { "epoch": 0.34133679428942243, "grad_norm": 0.4307852761395753, "learning_rate": 4.925444925444926e-05, "loss": 0.5405, "step": 263 }, { "epoch": 0.34263465282284233, "grad_norm": 0.5693990395449862, "learning_rate": 4.923039923039923e-05, "loss": 0.5455, "step": 264 }, { "epoch": 0.3439325113562622, "grad_norm": 0.4427765568418805, "learning_rate": 4.9206349206349204e-05, "loss": 0.5475, "step": 265 }, { "epoch": 0.34523036988968203, "grad_norm": 0.4724926699957873, "learning_rate": 4.9182299182299185e-05, "loss": 0.5502, "step": 266 }, { "epoch": 0.3465282284231019, "grad_norm": 0.6296467164625645, "learning_rate": 4.915824915824916e-05, "loss": 0.555, "step": 267 }, { "epoch": 0.34782608695652173, "grad_norm": 0.521325771991002, "learning_rate": 4.9134199134199135e-05, "loss": 0.5521, "step": 268 }, { "epoch": 0.3491239454899416, "grad_norm": 0.4920183923356473, "learning_rate": 4.911014911014911e-05, "loss": 0.5589, "step": 269 }, { "epoch": 0.3504218040233614, "grad_norm": 0.6860051439883974, "learning_rate": 4.908609908609909e-05, "loss": 0.5396, "step": 270 }, { "epoch": 0.35171966255678133, "grad_norm": 0.38098025544839875, "learning_rate": 4.9062049062049066e-05, "loss": 0.5439, "step": 271 }, { "epoch": 0.3530175210902012, "grad_norm": 0.545523500518169, "learning_rate": 4.903799903799904e-05, "loss": 0.5285, "step": 272 }, { "epoch": 0.35431537962362103, "grad_norm": 0.4773245042110645, "learning_rate": 4.9013949013949016e-05, "loss": 0.5506, "step": 273 }, { "epoch": 0.3556132381570409, "grad_norm": 0.41823644467627, "learning_rate": 4.898989898989899e-05, "loss": 0.5382, "step": 274 }, { "epoch": 0.3569110966904607, "grad_norm": 0.43108861210799143, "learning_rate": 4.896584896584897e-05, "loss": 0.5312, "step": 275 }, { "epoch": 0.3582089552238806, "grad_norm": 0.35256122918946825, "learning_rate": 4.894179894179895e-05, "loss": 0.5507, "step": 276 }, { "epoch": 0.3595068137573005, "grad_norm": 0.641603115121163, "learning_rate": 4.8917748917748915e-05, "loss": 0.5494, "step": 277 }, { "epoch": 0.36080467229072033, "grad_norm": 0.42144449610145046, "learning_rate": 4.8893698893698896e-05, "loss": 0.5537, "step": 278 }, { "epoch": 0.3621025308241402, "grad_norm": 0.4421221398296794, "learning_rate": 4.886964886964887e-05, "loss": 0.5305, "step": 279 }, { "epoch": 0.36340038935756, "grad_norm": 0.34904354726043524, "learning_rate": 4.884559884559885e-05, "loss": 0.5151, "step": 280 }, { "epoch": 0.3646982478909799, "grad_norm": 0.567323138161088, "learning_rate": 4.882154882154882e-05, "loss": 0.5394, "step": 281 }, { "epoch": 0.3659961064243997, "grad_norm": 0.4275900383373202, "learning_rate": 4.8797498797498795e-05, "loss": 0.5793, "step": 282 }, { "epoch": 0.3672939649578196, "grad_norm": 0.43590374764579676, "learning_rate": 4.877344877344878e-05, "loss": 0.5238, "step": 283 }, { "epoch": 0.3685918234912395, "grad_norm": 0.3806185860855704, "learning_rate": 4.874939874939875e-05, "loss": 0.5311, "step": 284 }, { "epoch": 0.3698896820246593, "grad_norm": 0.36597622680635733, "learning_rate": 4.8725348725348726e-05, "loss": 0.5483, "step": 285 }, { "epoch": 0.3711875405580792, "grad_norm": 0.39934249219009466, "learning_rate": 4.87012987012987e-05, "loss": 0.5323, "step": 286 }, { "epoch": 0.372485399091499, "grad_norm": 0.35489673738601485, "learning_rate": 4.8677248677248676e-05, "loss": 0.5197, "step": 287 }, { "epoch": 0.3737832576249189, "grad_norm": 0.35597996373456253, "learning_rate": 4.865319865319866e-05, "loss": 0.541, "step": 288 }, { "epoch": 0.3750811161583387, "grad_norm": 0.30995272924377104, "learning_rate": 4.862914862914863e-05, "loss": 0.531, "step": 289 }, { "epoch": 0.37637897469175857, "grad_norm": 0.3041222657562842, "learning_rate": 4.860509860509861e-05, "loss": 0.5263, "step": 290 }, { "epoch": 0.3776768332251785, "grad_norm": 0.27479710316885086, "learning_rate": 4.858104858104858e-05, "loss": 0.5179, "step": 291 }, { "epoch": 0.3789746917585983, "grad_norm": 0.4108809131825242, "learning_rate": 4.8556998556998563e-05, "loss": 0.5285, "step": 292 }, { "epoch": 0.3802725502920182, "grad_norm": 0.3283706178094482, "learning_rate": 4.853294853294854e-05, "loss": 0.5485, "step": 293 }, { "epoch": 0.381570408825438, "grad_norm": 0.3628325275789365, "learning_rate": 4.8508898508898506e-05, "loss": 0.5342, "step": 294 }, { "epoch": 0.38286826735885787, "grad_norm": 0.3545709020214379, "learning_rate": 4.848484848484849e-05, "loss": 0.5441, "step": 295 }, { "epoch": 0.3841661258922777, "grad_norm": 0.27536849505708144, "learning_rate": 4.846079846079846e-05, "loss": 0.5189, "step": 296 }, { "epoch": 0.3854639844256976, "grad_norm": 0.31314568760395595, "learning_rate": 4.8436748436748444e-05, "loss": 0.5165, "step": 297 }, { "epoch": 0.3867618429591175, "grad_norm": 0.31727676668467136, "learning_rate": 4.841269841269841e-05, "loss": 0.5185, "step": 298 }, { "epoch": 0.3880597014925373, "grad_norm": 0.35285183197833564, "learning_rate": 4.838864838864839e-05, "loss": 0.5328, "step": 299 }, { "epoch": 0.3893575600259572, "grad_norm": 0.2990420731073892, "learning_rate": 4.836459836459837e-05, "loss": 0.532, "step": 300 }, { "epoch": 0.390655418559377, "grad_norm": 0.38606448483559813, "learning_rate": 4.834054834054834e-05, "loss": 0.5461, "step": 301 }, { "epoch": 0.39195327709279687, "grad_norm": 0.37652402001442803, "learning_rate": 4.831649831649832e-05, "loss": 0.5379, "step": 302 }, { "epoch": 0.3932511356262167, "grad_norm": 0.34953978468725405, "learning_rate": 4.829244829244829e-05, "loss": 0.5359, "step": 303 }, { "epoch": 0.3945489941596366, "grad_norm": 0.3382778166946982, "learning_rate": 4.826839826839827e-05, "loss": 0.5342, "step": 304 }, { "epoch": 0.3958468526930565, "grad_norm": 0.34560665492104875, "learning_rate": 4.824434824434825e-05, "loss": 0.5324, "step": 305 }, { "epoch": 0.3971447112264763, "grad_norm": 0.34496470111641636, "learning_rate": 4.8220298220298224e-05, "loss": 0.5339, "step": 306 }, { "epoch": 0.39844256975989617, "grad_norm": 0.40001685434062584, "learning_rate": 4.81962481962482e-05, "loss": 0.5272, "step": 307 }, { "epoch": 0.399740428293316, "grad_norm": 0.366032696592655, "learning_rate": 4.8172198172198173e-05, "loss": 0.5306, "step": 308 }, { "epoch": 0.40103828682673587, "grad_norm": 0.37927598899770393, "learning_rate": 4.814814814814815e-05, "loss": 0.5591, "step": 309 }, { "epoch": 0.4023361453601557, "grad_norm": 0.32812121834756386, "learning_rate": 4.812409812409813e-05, "loss": 0.5422, "step": 310 }, { "epoch": 0.4036340038935756, "grad_norm": 0.35171717899329513, "learning_rate": 4.81000481000481e-05, "loss": 0.532, "step": 311 }, { "epoch": 0.40493186242699547, "grad_norm": 0.3756784968486016, "learning_rate": 4.807599807599808e-05, "loss": 0.5392, "step": 312 }, { "epoch": 0.4062297209604153, "grad_norm": 0.3426264703785813, "learning_rate": 4.8051948051948054e-05, "loss": 0.5457, "step": 313 }, { "epoch": 0.40752757949383517, "grad_norm": 0.39836935230937326, "learning_rate": 4.8027898027898036e-05, "loss": 0.5268, "step": 314 }, { "epoch": 0.408825438027255, "grad_norm": 0.33486717640072616, "learning_rate": 4.8003848003848004e-05, "loss": 0.5298, "step": 315 }, { "epoch": 0.41012329656067487, "grad_norm": 0.3463640087410465, "learning_rate": 4.797979797979798e-05, "loss": 0.5372, "step": 316 }, { "epoch": 0.41142115509409477, "grad_norm": 0.2981951724559669, "learning_rate": 4.795574795574796e-05, "loss": 0.5193, "step": 317 }, { "epoch": 0.4127190136275146, "grad_norm": 0.37701472504733063, "learning_rate": 4.7931697931697935e-05, "loss": 0.5262, "step": 318 }, { "epoch": 0.41401687216093447, "grad_norm": 0.2958251594721693, "learning_rate": 4.790764790764791e-05, "loss": 0.5262, "step": 319 }, { "epoch": 0.4153147306943543, "grad_norm": 0.36512530778352836, "learning_rate": 4.7883597883597884e-05, "loss": 0.5451, "step": 320 }, { "epoch": 0.41661258922777417, "grad_norm": 0.32275488837011096, "learning_rate": 4.785954785954786e-05, "loss": 0.5378, "step": 321 }, { "epoch": 0.417910447761194, "grad_norm": 0.29968884353456, "learning_rate": 4.783549783549784e-05, "loss": 0.5308, "step": 322 }, { "epoch": 0.41920830629461386, "grad_norm": 0.3472967912976659, "learning_rate": 4.7811447811447815e-05, "loss": 0.5262, "step": 323 }, { "epoch": 0.42050616482803377, "grad_norm": 0.3256673670375662, "learning_rate": 4.778739778739779e-05, "loss": 0.5349, "step": 324 }, { "epoch": 0.4218040233614536, "grad_norm": 0.3830575202323324, "learning_rate": 4.7763347763347765e-05, "loss": 0.5343, "step": 325 }, { "epoch": 0.42310188189487347, "grad_norm": 0.34375094386741617, "learning_rate": 4.773929773929774e-05, "loss": 0.5295, "step": 326 }, { "epoch": 0.4243997404282933, "grad_norm": 0.32100699117380493, "learning_rate": 4.771524771524772e-05, "loss": 0.5241, "step": 327 }, { "epoch": 0.42569759896171316, "grad_norm": 0.3546414912790039, "learning_rate": 4.769119769119769e-05, "loss": 0.5292, "step": 328 }, { "epoch": 0.426995457495133, "grad_norm": 0.367282717001635, "learning_rate": 4.766714766714767e-05, "loss": 0.525, "step": 329 }, { "epoch": 0.42829331602855286, "grad_norm": 0.36332040957365974, "learning_rate": 4.7643097643097646e-05, "loss": 0.5395, "step": 330 }, { "epoch": 0.42959117456197277, "grad_norm": 0.36242424332632034, "learning_rate": 4.761904761904762e-05, "loss": 0.5533, "step": 331 }, { "epoch": 0.4308890330953926, "grad_norm": 0.36697609874383924, "learning_rate": 4.7594997594997595e-05, "loss": 0.5359, "step": 332 }, { "epoch": 0.43218689162881246, "grad_norm": 0.33118162802731466, "learning_rate": 4.757094757094757e-05, "loss": 0.5317, "step": 333 }, { "epoch": 0.4334847501622323, "grad_norm": 0.30441401984534905, "learning_rate": 4.754689754689755e-05, "loss": 0.5117, "step": 334 }, { "epoch": 0.43478260869565216, "grad_norm": 0.38992931838953715, "learning_rate": 4.7522847522847526e-05, "loss": 0.5461, "step": 335 }, { "epoch": 0.436080467229072, "grad_norm": 0.3335314021890073, "learning_rate": 4.74987974987975e-05, "loss": 0.521, "step": 336 }, { "epoch": 0.4373783257624919, "grad_norm": 0.44568000339670255, "learning_rate": 4.7474747474747476e-05, "loss": 0.5351, "step": 337 }, { "epoch": 0.43867618429591176, "grad_norm": 0.30521804239806394, "learning_rate": 4.745069745069745e-05, "loss": 0.5354, "step": 338 }, { "epoch": 0.4399740428293316, "grad_norm": 0.40068879251857975, "learning_rate": 4.742664742664743e-05, "loss": 0.519, "step": 339 }, { "epoch": 0.44127190136275146, "grad_norm": 0.2988879048992771, "learning_rate": 4.740259740259741e-05, "loss": 0.5219, "step": 340 }, { "epoch": 0.4425697598961713, "grad_norm": 0.3568374963385924, "learning_rate": 4.737854737854738e-05, "loss": 0.5407, "step": 341 }, { "epoch": 0.44386761842959116, "grad_norm": 0.3314619863009568, "learning_rate": 4.7354497354497356e-05, "loss": 0.5293, "step": 342 }, { "epoch": 0.445165476963011, "grad_norm": 0.38859118585963526, "learning_rate": 4.733044733044733e-05, "loss": 0.5313, "step": 343 }, { "epoch": 0.4464633354964309, "grad_norm": 0.29870660291558937, "learning_rate": 4.730639730639731e-05, "loss": 0.5235, "step": 344 }, { "epoch": 0.44776119402985076, "grad_norm": 0.33559539703922564, "learning_rate": 4.728234728234728e-05, "loss": 0.5165, "step": 345 }, { "epoch": 0.4490590525632706, "grad_norm": 0.29408841700456767, "learning_rate": 4.725829725829726e-05, "loss": 0.5385, "step": 346 }, { "epoch": 0.45035691109669046, "grad_norm": 0.37054598216697, "learning_rate": 4.723424723424724e-05, "loss": 0.5107, "step": 347 }, { "epoch": 0.4516547696301103, "grad_norm": 0.3250925011044857, "learning_rate": 4.721019721019721e-05, "loss": 0.5233, "step": 348 }, { "epoch": 0.45295262816353016, "grad_norm": 0.35856402885603006, "learning_rate": 4.718614718614719e-05, "loss": 0.5055, "step": 349 }, { "epoch": 0.45425048669695, "grad_norm": 0.34612856309129164, "learning_rate": 4.716209716209716e-05, "loss": 0.5404, "step": 350 }, { "epoch": 0.4555483452303699, "grad_norm": 0.33303853027334285, "learning_rate": 4.713804713804714e-05, "loss": 0.5178, "step": 351 }, { "epoch": 0.45684620376378976, "grad_norm": 0.32091795432054987, "learning_rate": 4.711399711399712e-05, "loss": 0.5222, "step": 352 }, { "epoch": 0.4581440622972096, "grad_norm": 0.34934754787554123, "learning_rate": 4.708994708994709e-05, "loss": 0.5405, "step": 353 }, { "epoch": 0.45944192083062946, "grad_norm": 0.2937202903692653, "learning_rate": 4.706589706589707e-05, "loss": 0.5283, "step": 354 }, { "epoch": 0.4607397793640493, "grad_norm": 0.3464664667340698, "learning_rate": 4.704184704184704e-05, "loss": 0.5257, "step": 355 }, { "epoch": 0.46203763789746916, "grad_norm": 0.3021689351056674, "learning_rate": 4.7017797017797024e-05, "loss": 0.5256, "step": 356 }, { "epoch": 0.46333549643088906, "grad_norm": 0.3373492256075124, "learning_rate": 4.6993746993747e-05, "loss": 0.5262, "step": 357 }, { "epoch": 0.4646333549643089, "grad_norm": 0.3279466476251607, "learning_rate": 4.696969696969697e-05, "loss": 0.5252, "step": 358 }, { "epoch": 0.46593121349772876, "grad_norm": 0.3151794533478745, "learning_rate": 4.694564694564695e-05, "loss": 0.5355, "step": 359 }, { "epoch": 0.4672290720311486, "grad_norm": 0.3676469350203011, "learning_rate": 4.692159692159692e-05, "loss": 0.5536, "step": 360 }, { "epoch": 0.46852693056456846, "grad_norm": 0.2638129347242171, "learning_rate": 4.6897546897546904e-05, "loss": 0.5167, "step": 361 }, { "epoch": 0.4698247890979883, "grad_norm": 0.32036105761534295, "learning_rate": 4.687349687349687e-05, "loss": 0.5105, "step": 362 }, { "epoch": 0.47112264763140815, "grad_norm": 0.3312350329187521, "learning_rate": 4.6849446849446854e-05, "loss": 0.5403, "step": 363 }, { "epoch": 0.47242050616482806, "grad_norm": 0.2691270807481844, "learning_rate": 4.682539682539683e-05, "loss": 0.5129, "step": 364 }, { "epoch": 0.4737183646982479, "grad_norm": 0.31512841418214993, "learning_rate": 4.68013468013468e-05, "loss": 0.5188, "step": 365 }, { "epoch": 0.47501622323166776, "grad_norm": 0.32669598998030175, "learning_rate": 4.677729677729678e-05, "loss": 0.5384, "step": 366 }, { "epoch": 0.4763140817650876, "grad_norm": 0.3235775681673074, "learning_rate": 4.675324675324675e-05, "loss": 0.5194, "step": 367 }, { "epoch": 0.47761194029850745, "grad_norm": 0.3095310587265592, "learning_rate": 4.6729196729196734e-05, "loss": 0.5323, "step": 368 }, { "epoch": 0.4789097988319273, "grad_norm": 0.32341229991033627, "learning_rate": 4.670514670514671e-05, "loss": 0.5192, "step": 369 }, { "epoch": 0.48020765736534715, "grad_norm": 0.3175500385279334, "learning_rate": 4.6681096681096684e-05, "loss": 0.53, "step": 370 }, { "epoch": 0.48150551589876706, "grad_norm": 0.34206247662543693, "learning_rate": 4.665704665704666e-05, "loss": 0.5314, "step": 371 }, { "epoch": 0.4828033744321869, "grad_norm": 0.3185235414836324, "learning_rate": 4.6632996632996634e-05, "loss": 0.5367, "step": 372 }, { "epoch": 0.48410123296560675, "grad_norm": 0.28453878466972515, "learning_rate": 4.6608946608946615e-05, "loss": 0.5223, "step": 373 }, { "epoch": 0.4853990914990266, "grad_norm": 0.2957039810409513, "learning_rate": 4.658489658489659e-05, "loss": 0.5005, "step": 374 }, { "epoch": 0.48669695003244645, "grad_norm": 0.2940626178906619, "learning_rate": 4.656084656084656e-05, "loss": 0.5098, "step": 375 }, { "epoch": 0.4879948085658663, "grad_norm": 0.33976965417394467, "learning_rate": 4.653679653679654e-05, "loss": 0.5155, "step": 376 }, { "epoch": 0.4892926670992862, "grad_norm": 0.2993214412594064, "learning_rate": 4.6512746512746514e-05, "loss": 0.5183, "step": 377 }, { "epoch": 0.49059052563270605, "grad_norm": 0.3550998192684892, "learning_rate": 4.6488696488696496e-05, "loss": 0.5404, "step": 378 }, { "epoch": 0.4918883841661259, "grad_norm": 0.3961098073492471, "learning_rate": 4.6464646464646464e-05, "loss": 0.5411, "step": 379 }, { "epoch": 0.49318624269954575, "grad_norm": 0.34269318810223304, "learning_rate": 4.6440596440596445e-05, "loss": 0.5206, "step": 380 }, { "epoch": 0.4944841012329656, "grad_norm": 0.29509416892424534, "learning_rate": 4.641654641654642e-05, "loss": 0.5189, "step": 381 }, { "epoch": 0.49578195976638545, "grad_norm": 0.32772442148274133, "learning_rate": 4.6392496392496395e-05, "loss": 0.5368, "step": 382 }, { "epoch": 0.4970798182998053, "grad_norm": 0.2719733229054414, "learning_rate": 4.636844636844637e-05, "loss": 0.5027, "step": 383 }, { "epoch": 0.4983776768332252, "grad_norm": 0.32827976870034653, "learning_rate": 4.6344396344396344e-05, "loss": 0.5225, "step": 384 }, { "epoch": 0.49967553536664505, "grad_norm": 0.36529779846696075, "learning_rate": 4.6320346320346326e-05, "loss": 0.4961, "step": 385 }, { "epoch": 0.5009733939000649, "grad_norm": 0.34737533192311987, "learning_rate": 4.62962962962963e-05, "loss": 0.5056, "step": 386 }, { "epoch": 0.5022712524334848, "grad_norm": 0.32570267249669654, "learning_rate": 4.6272246272246276e-05, "loss": 0.5114, "step": 387 }, { "epoch": 0.5035691109669046, "grad_norm": 0.3419484703073112, "learning_rate": 4.624819624819625e-05, "loss": 0.5141, "step": 388 }, { "epoch": 0.5048669695003245, "grad_norm": 0.34141193067026915, "learning_rate": 4.6224146224146225e-05, "loss": 0.5296, "step": 389 }, { "epoch": 0.5061648280337443, "grad_norm": 0.314416021500765, "learning_rate": 4.620009620009621e-05, "loss": 0.5202, "step": 390 }, { "epoch": 0.5074626865671642, "grad_norm": 0.2608120507481533, "learning_rate": 4.617604617604618e-05, "loss": 0.507, "step": 391 }, { "epoch": 0.508760545100584, "grad_norm": 0.3415961124944403, "learning_rate": 4.615199615199615e-05, "loss": 0.5513, "step": 392 }, { "epoch": 0.5100584036340039, "grad_norm": 0.3545772824977214, "learning_rate": 4.612794612794613e-05, "loss": 0.5195, "step": 393 }, { "epoch": 0.5113562621674238, "grad_norm": 0.2759048918555112, "learning_rate": 4.6103896103896106e-05, "loss": 0.5145, "step": 394 }, { "epoch": 0.5126541207008436, "grad_norm": 0.29424902451922874, "learning_rate": 4.607984607984609e-05, "loss": 0.4965, "step": 395 }, { "epoch": 0.5139519792342635, "grad_norm": 0.2865983981532377, "learning_rate": 4.6055796055796055e-05, "loss": 0.51, "step": 396 }, { "epoch": 0.5152498377676833, "grad_norm": 0.2826395400080094, "learning_rate": 4.603174603174603e-05, "loss": 0.5177, "step": 397 }, { "epoch": 0.5165476963011032, "grad_norm": 0.3154832346968727, "learning_rate": 4.600769600769601e-05, "loss": 0.5167, "step": 398 }, { "epoch": 0.517845554834523, "grad_norm": 0.28657500627349963, "learning_rate": 4.5983645983645986e-05, "loss": 0.4915, "step": 399 }, { "epoch": 0.5191434133679429, "grad_norm": 0.3108878669769025, "learning_rate": 4.595959595959596e-05, "loss": 0.5094, "step": 400 }, { "epoch": 0.5204412719013628, "grad_norm": 0.3105653890499028, "learning_rate": 4.5935545935545936e-05, "loss": 0.5395, "step": 401 }, { "epoch": 0.5217391304347826, "grad_norm": 0.3041614944176647, "learning_rate": 4.591149591149592e-05, "loss": 0.5015, "step": 402 }, { "epoch": 0.5230369889682025, "grad_norm": 0.2774234169618742, "learning_rate": 4.588744588744589e-05, "loss": 0.4953, "step": 403 }, { "epoch": 0.5243348475016223, "grad_norm": 0.3292325244305617, "learning_rate": 4.586339586339587e-05, "loss": 0.5326, "step": 404 }, { "epoch": 0.5256327060350422, "grad_norm": 0.280230976708198, "learning_rate": 4.583934583934584e-05, "loss": 0.5186, "step": 405 }, { "epoch": 0.526930564568462, "grad_norm": 0.29220060360384115, "learning_rate": 4.5815295815295817e-05, "loss": 0.5224, "step": 406 }, { "epoch": 0.5282284231018819, "grad_norm": 0.2973987484773138, "learning_rate": 4.57912457912458e-05, "loss": 0.5198, "step": 407 }, { "epoch": 0.5295262816353018, "grad_norm": 0.31013343434720114, "learning_rate": 4.576719576719577e-05, "loss": 0.5303, "step": 408 }, { "epoch": 0.5308241401687216, "grad_norm": 0.3091402470001352, "learning_rate": 4.574314574314574e-05, "loss": 0.5176, "step": 409 }, { "epoch": 0.5321219987021415, "grad_norm": 0.2974831281530903, "learning_rate": 4.571909571909572e-05, "loss": 0.5153, "step": 410 }, { "epoch": 0.5334198572355613, "grad_norm": 0.30558238497100093, "learning_rate": 4.56950456950457e-05, "loss": 0.5171, "step": 411 }, { "epoch": 0.5347177157689812, "grad_norm": 0.3050417584271822, "learning_rate": 4.567099567099568e-05, "loss": 0.51, "step": 412 }, { "epoch": 0.536015574302401, "grad_norm": 0.3062163771011129, "learning_rate": 4.564694564694565e-05, "loss": 0.5182, "step": 413 }, { "epoch": 0.5373134328358209, "grad_norm": 0.2918708356467548, "learning_rate": 4.562289562289562e-05, "loss": 0.5372, "step": 414 }, { "epoch": 0.5386112913692408, "grad_norm": 0.3100802477130393, "learning_rate": 4.55988455988456e-05, "loss": 0.504, "step": 415 }, { "epoch": 0.5399091499026606, "grad_norm": 0.28080959444703624, "learning_rate": 4.557479557479558e-05, "loss": 0.5133, "step": 416 }, { "epoch": 0.5412070084360805, "grad_norm": 0.31510663571529784, "learning_rate": 4.555074555074555e-05, "loss": 0.5083, "step": 417 }, { "epoch": 0.5425048669695003, "grad_norm": 0.2987074534694197, "learning_rate": 4.552669552669553e-05, "loss": 0.5233, "step": 418 }, { "epoch": 0.5438027255029202, "grad_norm": 0.3018100646658762, "learning_rate": 4.55026455026455e-05, "loss": 0.5125, "step": 419 }, { "epoch": 0.54510058403634, "grad_norm": 0.3068942632441547, "learning_rate": 4.5478595478595484e-05, "loss": 0.5141, "step": 420 }, { "epoch": 0.5463984425697599, "grad_norm": 0.30401355001057445, "learning_rate": 4.545454545454546e-05, "loss": 0.5275, "step": 421 }, { "epoch": 0.5476963011031798, "grad_norm": 0.3203825407640607, "learning_rate": 4.543049543049543e-05, "loss": 0.5374, "step": 422 }, { "epoch": 0.5489941596365996, "grad_norm": 0.2890332355349151, "learning_rate": 4.540644540644541e-05, "loss": 0.4928, "step": 423 }, { "epoch": 0.5502920181700195, "grad_norm": 0.3167375640999411, "learning_rate": 4.538239538239538e-05, "loss": 0.5238, "step": 424 }, { "epoch": 0.5515898767034393, "grad_norm": 0.2838739525584143, "learning_rate": 4.535834535834536e-05, "loss": 0.5193, "step": 425 }, { "epoch": 0.5528877352368592, "grad_norm": 0.31600565211015347, "learning_rate": 4.533429533429533e-05, "loss": 0.5408, "step": 426 }, { "epoch": 0.5541855937702791, "grad_norm": 0.2936881048070723, "learning_rate": 4.5310245310245314e-05, "loss": 0.517, "step": 427 }, { "epoch": 0.5554834523036989, "grad_norm": 0.2869325695453175, "learning_rate": 4.528619528619529e-05, "loss": 0.5272, "step": 428 }, { "epoch": 0.5567813108371188, "grad_norm": 0.26396682696755225, "learning_rate": 4.5262145262145264e-05, "loss": 0.5092, "step": 429 }, { "epoch": 0.5580791693705386, "grad_norm": 0.2778797627866081, "learning_rate": 4.523809523809524e-05, "loss": 0.5107, "step": 430 }, { "epoch": 0.5593770279039585, "grad_norm": 0.29638166733777366, "learning_rate": 4.521404521404521e-05, "loss": 0.5207, "step": 431 }, { "epoch": 0.5606748864373783, "grad_norm": 0.31086056140406293, "learning_rate": 4.5189995189995195e-05, "loss": 0.5213, "step": 432 }, { "epoch": 0.5619727449707982, "grad_norm": 0.2777342628277087, "learning_rate": 4.516594516594517e-05, "loss": 0.5225, "step": 433 }, { "epoch": 0.5632706035042181, "grad_norm": 0.31153041204212967, "learning_rate": 4.5141895141895144e-05, "loss": 0.5029, "step": 434 }, { "epoch": 0.5645684620376379, "grad_norm": 0.2833387330391144, "learning_rate": 4.511784511784512e-05, "loss": 0.5269, "step": 435 }, { "epoch": 0.5658663205710578, "grad_norm": 0.3208653719812624, "learning_rate": 4.5093795093795094e-05, "loss": 0.5027, "step": 436 }, { "epoch": 0.5671641791044776, "grad_norm": 0.3024059727560176, "learning_rate": 4.5069745069745075e-05, "loss": 0.5219, "step": 437 }, { "epoch": 0.5684620376378975, "grad_norm": 0.2822549725146957, "learning_rate": 4.504569504569504e-05, "loss": 0.5095, "step": 438 }, { "epoch": 0.5697598961713173, "grad_norm": 0.3016137705597104, "learning_rate": 4.5021645021645025e-05, "loss": 0.5104, "step": 439 }, { "epoch": 0.5710577547047372, "grad_norm": 0.3627945641240943, "learning_rate": 4.4997594997595e-05, "loss": 0.5264, "step": 440 }, { "epoch": 0.5723556132381571, "grad_norm": 0.27507558856590775, "learning_rate": 4.4973544973544974e-05, "loss": 0.5139, "step": 441 }, { "epoch": 0.5736534717715769, "grad_norm": 0.3593784232319199, "learning_rate": 4.494949494949495e-05, "loss": 0.5212, "step": 442 }, { "epoch": 0.5749513303049968, "grad_norm": 0.29557452932516204, "learning_rate": 4.4925444925444924e-05, "loss": 0.502, "step": 443 }, { "epoch": 0.5762491888384166, "grad_norm": 0.3005471775259294, "learning_rate": 4.4901394901394906e-05, "loss": 0.4912, "step": 444 }, { "epoch": 0.5775470473718365, "grad_norm": 0.26919062978615377, "learning_rate": 4.487734487734488e-05, "loss": 0.5313, "step": 445 }, { "epoch": 0.5788449059052563, "grad_norm": 0.2556241030912058, "learning_rate": 4.4853294853294855e-05, "loss": 0.5085, "step": 446 }, { "epoch": 0.5801427644386762, "grad_norm": 0.2733433348389189, "learning_rate": 4.482924482924483e-05, "loss": 0.5232, "step": 447 }, { "epoch": 0.5814406229720961, "grad_norm": 0.2699232360629045, "learning_rate": 4.4805194805194805e-05, "loss": 0.5049, "step": 448 }, { "epoch": 0.5827384815055159, "grad_norm": 0.28747431650418886, "learning_rate": 4.4781144781144786e-05, "loss": 0.5112, "step": 449 }, { "epoch": 0.5840363400389358, "grad_norm": 0.28804716410878617, "learning_rate": 4.475709475709476e-05, "loss": 0.4905, "step": 450 }, { "epoch": 0.5853341985723556, "grad_norm": 0.2919584848714507, "learning_rate": 4.4733044733044736e-05, "loss": 0.5272, "step": 451 }, { "epoch": 0.5866320571057755, "grad_norm": 0.27840735576549713, "learning_rate": 4.470899470899471e-05, "loss": 0.4885, "step": 452 }, { "epoch": 0.5879299156391953, "grad_norm": 0.29490953800516345, "learning_rate": 4.4684944684944685e-05, "loss": 0.5164, "step": 453 }, { "epoch": 0.5892277741726152, "grad_norm": 0.3240170627979527, "learning_rate": 4.466089466089467e-05, "loss": 0.5173, "step": 454 }, { "epoch": 0.5905256327060351, "grad_norm": 0.2665880580848304, "learning_rate": 4.4636844636844635e-05, "loss": 0.5045, "step": 455 }, { "epoch": 0.5918234912394549, "grad_norm": 0.36453608305554464, "learning_rate": 4.4612794612794616e-05, "loss": 0.5176, "step": 456 }, { "epoch": 0.5931213497728748, "grad_norm": 0.2971475504780928, "learning_rate": 4.458874458874459e-05, "loss": 0.5047, "step": 457 }, { "epoch": 0.5944192083062946, "grad_norm": 0.34100322893736984, "learning_rate": 4.4564694564694566e-05, "loss": 0.5111, "step": 458 }, { "epoch": 0.5957170668397145, "grad_norm": 0.27240121549823265, "learning_rate": 4.454064454064454e-05, "loss": 0.4885, "step": 459 }, { "epoch": 0.5970149253731343, "grad_norm": 0.31589767714076145, "learning_rate": 4.4516594516594515e-05, "loss": 0.4994, "step": 460 }, { "epoch": 0.5983127839065542, "grad_norm": 0.2801464661937106, "learning_rate": 4.44925444925445e-05, "loss": 0.497, "step": 461 }, { "epoch": 0.5996106424399741, "grad_norm": 0.33064691940201346, "learning_rate": 4.446849446849447e-05, "loss": 0.5336, "step": 462 }, { "epoch": 0.6009085009733939, "grad_norm": 0.2866479133025442, "learning_rate": 4.4444444444444447e-05, "loss": 0.5346, "step": 463 }, { "epoch": 0.6022063595068138, "grad_norm": 0.33165659226246563, "learning_rate": 4.442039442039442e-05, "loss": 0.5145, "step": 464 }, { "epoch": 0.6035042180402336, "grad_norm": 0.4080281603274731, "learning_rate": 4.4396344396344396e-05, "loss": 0.5083, "step": 465 }, { "epoch": 0.6048020765736535, "grad_norm": 0.32997017809734314, "learning_rate": 4.437229437229438e-05, "loss": 0.5108, "step": 466 }, { "epoch": 0.6060999351070734, "grad_norm": 0.4193305220535223, "learning_rate": 4.434824434824435e-05, "loss": 0.5333, "step": 467 }, { "epoch": 0.6073977936404932, "grad_norm": 0.33536790460482346, "learning_rate": 4.432419432419432e-05, "loss": 0.5315, "step": 468 }, { "epoch": 0.6086956521739131, "grad_norm": 0.30331774787903265, "learning_rate": 4.43001443001443e-05, "loss": 0.5044, "step": 469 }, { "epoch": 0.6099935107073329, "grad_norm": 0.3332494229182582, "learning_rate": 4.427609427609428e-05, "loss": 0.5271, "step": 470 }, { "epoch": 0.6112913692407528, "grad_norm": 0.27631817841293893, "learning_rate": 4.425204425204426e-05, "loss": 0.4869, "step": 471 }, { "epoch": 0.6125892277741726, "grad_norm": 0.30864904652906816, "learning_rate": 4.4227994227994226e-05, "loss": 0.5189, "step": 472 }, { "epoch": 0.6138870863075925, "grad_norm": 0.25286124571299967, "learning_rate": 4.420394420394421e-05, "loss": 0.5243, "step": 473 }, { "epoch": 0.6151849448410124, "grad_norm": 0.32194768257527906, "learning_rate": 4.417989417989418e-05, "loss": 0.517, "step": 474 }, { "epoch": 0.6164828033744322, "grad_norm": 0.25295912491765415, "learning_rate": 4.415584415584416e-05, "loss": 0.5119, "step": 475 }, { "epoch": 0.6177806619078521, "grad_norm": 0.30031239529360704, "learning_rate": 4.413179413179413e-05, "loss": 0.5167, "step": 476 }, { "epoch": 0.6190785204412719, "grad_norm": 0.3457458465491688, "learning_rate": 4.410774410774411e-05, "loss": 0.5122, "step": 477 }, { "epoch": 0.6203763789746918, "grad_norm": 0.3091494265523315, "learning_rate": 4.408369408369409e-05, "loss": 0.4996, "step": 478 }, { "epoch": 0.6216742375081116, "grad_norm": 0.34841455852307157, "learning_rate": 4.405964405964406e-05, "loss": 0.5187, "step": 479 }, { "epoch": 0.6229720960415315, "grad_norm": 0.28466491288804874, "learning_rate": 4.403559403559404e-05, "loss": 0.5081, "step": 480 }, { "epoch": 0.6242699545749514, "grad_norm": 0.31239695738713696, "learning_rate": 4.401154401154401e-05, "loss": 0.5188, "step": 481 }, { "epoch": 0.6255678131083712, "grad_norm": 0.2906342979686575, "learning_rate": 4.398749398749399e-05, "loss": 0.4979, "step": 482 }, { "epoch": 0.6268656716417911, "grad_norm": 0.32742410431546853, "learning_rate": 4.396344396344397e-05, "loss": 0.5096, "step": 483 }, { "epoch": 0.6281635301752109, "grad_norm": 0.28587236180759673, "learning_rate": 4.3939393939393944e-05, "loss": 0.5282, "step": 484 }, { "epoch": 0.6294613887086308, "grad_norm": 0.2939115603096443, "learning_rate": 4.391534391534391e-05, "loss": 0.5172, "step": 485 }, { "epoch": 0.6307592472420506, "grad_norm": 0.2873210431560928, "learning_rate": 4.3891293891293894e-05, "loss": 0.5229, "step": 486 }, { "epoch": 0.6320571057754705, "grad_norm": 0.342818284796367, "learning_rate": 4.386724386724387e-05, "loss": 0.5272, "step": 487 }, { "epoch": 0.6333549643088904, "grad_norm": 0.2663532951872564, "learning_rate": 4.384319384319385e-05, "loss": 0.5087, "step": 488 }, { "epoch": 0.6346528228423102, "grad_norm": 0.25849575306366, "learning_rate": 4.381914381914382e-05, "loss": 0.503, "step": 489 }, { "epoch": 0.6359506813757301, "grad_norm": 0.2682428237326465, "learning_rate": 4.379509379509379e-05, "loss": 0.5178, "step": 490 }, { "epoch": 0.6372485399091499, "grad_norm": 0.2899634415594277, "learning_rate": 4.3771043771043774e-05, "loss": 0.4964, "step": 491 }, { "epoch": 0.6385463984425698, "grad_norm": 0.3453086828842896, "learning_rate": 4.374699374699375e-05, "loss": 0.53, "step": 492 }, { "epoch": 0.6398442569759896, "grad_norm": 0.34399408107909996, "learning_rate": 4.3722943722943724e-05, "loss": 0.5193, "step": 493 }, { "epoch": 0.6411421155094095, "grad_norm": 0.3610030879163129, "learning_rate": 4.36988936988937e-05, "loss": 0.5144, "step": 494 }, { "epoch": 0.6424399740428294, "grad_norm": 0.23843044383570788, "learning_rate": 4.367484367484368e-05, "loss": 0.4999, "step": 495 }, { "epoch": 0.6437378325762492, "grad_norm": 0.3654439591676623, "learning_rate": 4.3650793650793655e-05, "loss": 0.4888, "step": 496 }, { "epoch": 0.6450356911096691, "grad_norm": 0.28776010247656836, "learning_rate": 4.362674362674363e-05, "loss": 0.5022, "step": 497 }, { "epoch": 0.6463335496430889, "grad_norm": 0.3647131705869751, "learning_rate": 4.3602693602693604e-05, "loss": 0.5092, "step": 498 }, { "epoch": 0.6476314081765088, "grad_norm": 0.30736812776643446, "learning_rate": 4.357864357864358e-05, "loss": 0.5171, "step": 499 }, { "epoch": 0.6489292667099286, "grad_norm": 0.36290147629484104, "learning_rate": 4.355459355459356e-05, "loss": 0.5223, "step": 500 }, { "epoch": 0.6502271252433485, "grad_norm": 0.32228223382695725, "learning_rate": 4.3530543530543535e-05, "loss": 0.5099, "step": 501 }, { "epoch": 0.6515249837767684, "grad_norm": 0.31393689007483594, "learning_rate": 4.3506493506493503e-05, "loss": 0.496, "step": 502 }, { "epoch": 0.6528228423101882, "grad_norm": 0.2966759326603879, "learning_rate": 4.3482443482443485e-05, "loss": 0.5173, "step": 503 }, { "epoch": 0.6541207008436081, "grad_norm": 0.2864744517114858, "learning_rate": 4.345839345839346e-05, "loss": 0.503, "step": 504 }, { "epoch": 0.6554185593770279, "grad_norm": 0.28016826596559247, "learning_rate": 4.343434343434344e-05, "loss": 0.5095, "step": 505 }, { "epoch": 0.6567164179104478, "grad_norm": 0.3045274480234983, "learning_rate": 4.341029341029341e-05, "loss": 0.5234, "step": 506 }, { "epoch": 0.6580142764438677, "grad_norm": 0.2865539821179636, "learning_rate": 4.3386243386243384e-05, "loss": 0.5057, "step": 507 }, { "epoch": 0.6593121349772875, "grad_norm": 0.28016725527352626, "learning_rate": 4.3362193362193366e-05, "loss": 0.5054, "step": 508 }, { "epoch": 0.6606099935107074, "grad_norm": 0.2779087438851858, "learning_rate": 4.333814333814334e-05, "loss": 0.4844, "step": 509 }, { "epoch": 0.6619078520441272, "grad_norm": 0.29308593442315034, "learning_rate": 4.3314093314093315e-05, "loss": 0.5026, "step": 510 }, { "epoch": 0.6632057105775471, "grad_norm": 0.24617150101353785, "learning_rate": 4.329004329004329e-05, "loss": 0.505, "step": 511 }, { "epoch": 0.6645035691109669, "grad_norm": 0.2801536462432465, "learning_rate": 4.3265993265993265e-05, "loss": 0.4957, "step": 512 }, { "epoch": 0.6658014276443868, "grad_norm": 0.2590262249669081, "learning_rate": 4.3241943241943246e-05, "loss": 0.4976, "step": 513 }, { "epoch": 0.6670992861778067, "grad_norm": 0.27675213215164485, "learning_rate": 4.321789321789322e-05, "loss": 0.5016, "step": 514 }, { "epoch": 0.6683971447112265, "grad_norm": 0.3211262394859621, "learning_rate": 4.3193843193843196e-05, "loss": 0.5285, "step": 515 }, { "epoch": 0.6696950032446464, "grad_norm": 0.2847594895492132, "learning_rate": 4.316979316979317e-05, "loss": 0.5174, "step": 516 }, { "epoch": 0.6709928617780662, "grad_norm": 0.31731208678548406, "learning_rate": 4.314574314574315e-05, "loss": 0.5287, "step": 517 }, { "epoch": 0.6722907203114861, "grad_norm": 0.26600293134050695, "learning_rate": 4.312169312169313e-05, "loss": 0.5105, "step": 518 }, { "epoch": 0.6735885788449059, "grad_norm": 0.29880462234281113, "learning_rate": 4.3097643097643095e-05, "loss": 0.5375, "step": 519 }, { "epoch": 0.6748864373783258, "grad_norm": 0.2652094878668775, "learning_rate": 4.3073593073593077e-05, "loss": 0.5033, "step": 520 }, { "epoch": 0.6761842959117457, "grad_norm": 0.315140738606816, "learning_rate": 4.304954304954305e-05, "loss": 0.5238, "step": 521 }, { "epoch": 0.6774821544451655, "grad_norm": 0.2852888179467452, "learning_rate": 4.302549302549303e-05, "loss": 0.5125, "step": 522 }, { "epoch": 0.6787800129785854, "grad_norm": 0.3217782609108167, "learning_rate": 4.3001443001443e-05, "loss": 0.5084, "step": 523 }, { "epoch": 0.6800778715120052, "grad_norm": 0.3067930968649758, "learning_rate": 4.2977392977392976e-05, "loss": 0.4999, "step": 524 }, { "epoch": 0.6813757300454251, "grad_norm": 0.2937819263154037, "learning_rate": 4.295334295334296e-05, "loss": 0.5256, "step": 525 }, { "epoch": 0.6826735885788449, "grad_norm": 0.32438054578281567, "learning_rate": 4.292929292929293e-05, "loss": 0.4907, "step": 526 }, { "epoch": 0.6839714471122648, "grad_norm": 0.2742147889295781, "learning_rate": 4.290524290524291e-05, "loss": 0.5068, "step": 527 }, { "epoch": 0.6852693056456847, "grad_norm": 0.35488588986537717, "learning_rate": 4.288119288119288e-05, "loss": 0.5248, "step": 528 }, { "epoch": 0.6865671641791045, "grad_norm": 0.26229530604678386, "learning_rate": 4.2857142857142856e-05, "loss": 0.5037, "step": 529 }, { "epoch": 0.6878650227125244, "grad_norm": 0.3461696681941986, "learning_rate": 4.283309283309284e-05, "loss": 0.5025, "step": 530 }, { "epoch": 0.6891628812459442, "grad_norm": 0.266178675206237, "learning_rate": 4.280904280904281e-05, "loss": 0.4896, "step": 531 }, { "epoch": 0.6904607397793641, "grad_norm": 0.34686998824653287, "learning_rate": 4.278499278499279e-05, "loss": 0.5034, "step": 532 }, { "epoch": 0.6917585983127839, "grad_norm": 0.3320503579783302, "learning_rate": 4.276094276094276e-05, "loss": 0.5178, "step": 533 }, { "epoch": 0.6930564568462038, "grad_norm": 0.3083799644603529, "learning_rate": 4.273689273689274e-05, "loss": 0.526, "step": 534 }, { "epoch": 0.6943543153796237, "grad_norm": 0.36890093348582187, "learning_rate": 4.271284271284272e-05, "loss": 0.5311, "step": 535 }, { "epoch": 0.6956521739130435, "grad_norm": 0.30993196929952377, "learning_rate": 4.2688792688792686e-05, "loss": 0.5246, "step": 536 }, { "epoch": 0.6969500324464634, "grad_norm": 0.3135809737260554, "learning_rate": 4.266474266474267e-05, "loss": 0.5042, "step": 537 }, { "epoch": 0.6982478909798832, "grad_norm": 0.32235641085299527, "learning_rate": 4.264069264069264e-05, "loss": 0.5226, "step": 538 }, { "epoch": 0.6995457495133031, "grad_norm": 0.29955767973991637, "learning_rate": 4.2616642616642624e-05, "loss": 0.5003, "step": 539 }, { "epoch": 0.7008436080467229, "grad_norm": 0.2728001277044658, "learning_rate": 4.259259259259259e-05, "loss": 0.5035, "step": 540 }, { "epoch": 0.7021414665801428, "grad_norm": 0.3125500639615984, "learning_rate": 4.256854256854257e-05, "loss": 0.4982, "step": 541 }, { "epoch": 0.7034393251135627, "grad_norm": 0.3214238247187388, "learning_rate": 4.254449254449255e-05, "loss": 0.5015, "step": 542 }, { "epoch": 0.7047371836469825, "grad_norm": 0.34677033141949526, "learning_rate": 4.2520442520442523e-05, "loss": 0.5071, "step": 543 }, { "epoch": 0.7060350421804024, "grad_norm": 0.320737794563556, "learning_rate": 4.24963924963925e-05, "loss": 0.5036, "step": 544 }, { "epoch": 0.7073329007138222, "grad_norm": 0.28140631509820974, "learning_rate": 4.247234247234247e-05, "loss": 0.472, "step": 545 }, { "epoch": 0.7086307592472421, "grad_norm": 0.2876262450309547, "learning_rate": 4.244829244829245e-05, "loss": 0.5033, "step": 546 }, { "epoch": 0.7099286177806619, "grad_norm": 0.28203604302358143, "learning_rate": 4.242424242424243e-05, "loss": 0.4933, "step": 547 }, { "epoch": 0.7112264763140818, "grad_norm": 0.3106404772330975, "learning_rate": 4.2400192400192404e-05, "loss": 0.4972, "step": 548 }, { "epoch": 0.7125243348475017, "grad_norm": 0.25531650904916336, "learning_rate": 4.237614237614238e-05, "loss": 0.524, "step": 549 }, { "epoch": 0.7138221933809215, "grad_norm": 0.3694832680055122, "learning_rate": 4.2352092352092354e-05, "loss": 0.5215, "step": 550 }, { "epoch": 0.7151200519143414, "grad_norm": 0.29317455967258776, "learning_rate": 4.232804232804233e-05, "loss": 0.4978, "step": 551 }, { "epoch": 0.7164179104477612, "grad_norm": 0.36952833950680053, "learning_rate": 4.230399230399231e-05, "loss": 0.5091, "step": 552 }, { "epoch": 0.7177157689811811, "grad_norm": 0.3458300719165068, "learning_rate": 4.227994227994228e-05, "loss": 0.5179, "step": 553 }, { "epoch": 0.719013627514601, "grad_norm": 0.35910338257547214, "learning_rate": 4.225589225589226e-05, "loss": 0.4967, "step": 554 }, { "epoch": 0.7203114860480208, "grad_norm": 0.3832017565467235, "learning_rate": 4.2231842231842234e-05, "loss": 0.506, "step": 555 }, { "epoch": 0.7216093445814407, "grad_norm": 0.3270524496685099, "learning_rate": 4.220779220779221e-05, "loss": 0.4956, "step": 556 }, { "epoch": 0.7229072031148605, "grad_norm": 0.31306380662178745, "learning_rate": 4.2183742183742184e-05, "loss": 0.516, "step": 557 }, { "epoch": 0.7242050616482804, "grad_norm": 0.294604631221543, "learning_rate": 4.215969215969216e-05, "loss": 0.5055, "step": 558 }, { "epoch": 0.7255029201817002, "grad_norm": 0.3534780044338388, "learning_rate": 4.213564213564214e-05, "loss": 0.4975, "step": 559 }, { "epoch": 0.72680077871512, "grad_norm": 0.33032987931239965, "learning_rate": 4.2111592111592115e-05, "loss": 0.5118, "step": 560 }, { "epoch": 0.72809863724854, "grad_norm": 0.3196832192635056, "learning_rate": 4.208754208754209e-05, "loss": 0.5093, "step": 561 }, { "epoch": 0.7293964957819598, "grad_norm": 0.36785704594666774, "learning_rate": 4.2063492063492065e-05, "loss": 0.5069, "step": 562 }, { "epoch": 0.7306943543153797, "grad_norm": 0.3702503469527744, "learning_rate": 4.203944203944204e-05, "loss": 0.513, "step": 563 }, { "epoch": 0.7319922128487995, "grad_norm": 0.3070674221331164, "learning_rate": 4.201539201539202e-05, "loss": 0.497, "step": 564 }, { "epoch": 0.7332900713822194, "grad_norm": 0.35750959007798994, "learning_rate": 4.1991341991341996e-05, "loss": 0.5085, "step": 565 }, { "epoch": 0.7345879299156391, "grad_norm": 0.2835364219292076, "learning_rate": 4.196729196729197e-05, "loss": 0.5091, "step": 566 }, { "epoch": 0.735885788449059, "grad_norm": 0.2884098082465498, "learning_rate": 4.1943241943241945e-05, "loss": 0.4884, "step": 567 }, { "epoch": 0.737183646982479, "grad_norm": 0.3203510406175552, "learning_rate": 4.191919191919192e-05, "loss": 0.4971, "step": 568 }, { "epoch": 0.7384815055158988, "grad_norm": 0.27371373687668255, "learning_rate": 4.18951418951419e-05, "loss": 0.5095, "step": 569 }, { "epoch": 0.7397793640493187, "grad_norm": 0.34717402203397457, "learning_rate": 4.187109187109187e-05, "loss": 0.5014, "step": 570 }, { "epoch": 0.7410772225827384, "grad_norm": 0.30582393639621713, "learning_rate": 4.184704184704185e-05, "loss": 0.5181, "step": 571 }, { "epoch": 0.7423750811161584, "grad_norm": 0.32112393843480735, "learning_rate": 4.1822991822991826e-05, "loss": 0.5006, "step": 572 }, { "epoch": 0.7436729396495781, "grad_norm": 0.33979137877685406, "learning_rate": 4.17989417989418e-05, "loss": 0.5248, "step": 573 }, { "epoch": 0.744970798182998, "grad_norm": 0.3209001348833202, "learning_rate": 4.1774891774891775e-05, "loss": 0.5013, "step": 574 }, { "epoch": 0.746268656716418, "grad_norm": 0.3266878409508907, "learning_rate": 4.175084175084175e-05, "loss": 0.4915, "step": 575 }, { "epoch": 0.7475665152498377, "grad_norm": 0.30503612561210064, "learning_rate": 4.172679172679173e-05, "loss": 0.5046, "step": 576 }, { "epoch": 0.7488643737832577, "grad_norm": 0.28971405798539174, "learning_rate": 4.1702741702741707e-05, "loss": 0.494, "step": 577 }, { "epoch": 0.7501622323166774, "grad_norm": 0.29780493895562776, "learning_rate": 4.167869167869168e-05, "loss": 0.492, "step": 578 }, { "epoch": 0.7514600908500974, "grad_norm": 0.32685361493691256, "learning_rate": 4.1654641654641656e-05, "loss": 0.5101, "step": 579 }, { "epoch": 0.7527579493835171, "grad_norm": 0.34614664346128227, "learning_rate": 4.163059163059163e-05, "loss": 0.4953, "step": 580 }, { "epoch": 0.754055807916937, "grad_norm": 0.24964382812915295, "learning_rate": 4.160654160654161e-05, "loss": 0.4974, "step": 581 }, { "epoch": 0.755353666450357, "grad_norm": 0.3266391465805975, "learning_rate": 4.158249158249159e-05, "loss": 0.5039, "step": 582 }, { "epoch": 0.7566515249837767, "grad_norm": 0.34857873431761155, "learning_rate": 4.155844155844156e-05, "loss": 0.5056, "step": 583 }, { "epoch": 0.7579493835171967, "grad_norm": 0.2921239047290261, "learning_rate": 4.153439153439154e-05, "loss": 0.4967, "step": 584 }, { "epoch": 0.7592472420506164, "grad_norm": 0.33268372260105683, "learning_rate": 4.151034151034151e-05, "loss": 0.5066, "step": 585 }, { "epoch": 0.7605451005840363, "grad_norm": 0.318064660501317, "learning_rate": 4.148629148629149e-05, "loss": 0.5014, "step": 586 }, { "epoch": 0.7618429591174561, "grad_norm": 0.40789097570888044, "learning_rate": 4.146224146224146e-05, "loss": 0.5236, "step": 587 }, { "epoch": 0.763140817650876, "grad_norm": 0.36460994866717067, "learning_rate": 4.143819143819144e-05, "loss": 0.5028, "step": 588 }, { "epoch": 0.764438676184296, "grad_norm": 0.32621095461004207, "learning_rate": 4.141414141414142e-05, "loss": 0.5, "step": 589 }, { "epoch": 0.7657365347177157, "grad_norm": 0.35949920016397585, "learning_rate": 4.139009139009139e-05, "loss": 0.5028, "step": 590 }, { "epoch": 0.7670343932511356, "grad_norm": 0.26064162839844207, "learning_rate": 4.136604136604137e-05, "loss": 0.4926, "step": 591 }, { "epoch": 0.7683322517845554, "grad_norm": 0.33955569046497985, "learning_rate": 4.134199134199134e-05, "loss": 0.491, "step": 592 }, { "epoch": 0.7696301103179753, "grad_norm": 0.302141896432662, "learning_rate": 4.131794131794132e-05, "loss": 0.4884, "step": 593 }, { "epoch": 0.7709279688513953, "grad_norm": 0.30640851435457384, "learning_rate": 4.12938912938913e-05, "loss": 0.4993, "step": 594 }, { "epoch": 0.772225827384815, "grad_norm": 0.3040997259974209, "learning_rate": 4.126984126984127e-05, "loss": 0.5207, "step": 595 }, { "epoch": 0.773523685918235, "grad_norm": 0.3146837697580934, "learning_rate": 4.124579124579125e-05, "loss": 0.5041, "step": 596 }, { "epoch": 0.7748215444516547, "grad_norm": 0.2837662015282775, "learning_rate": 4.122174122174122e-05, "loss": 0.4958, "step": 597 }, { "epoch": 0.7761194029850746, "grad_norm": 0.3005344336762094, "learning_rate": 4.1197691197691204e-05, "loss": 0.4995, "step": 598 }, { "epoch": 0.7774172615184944, "grad_norm": 0.2593994091267606, "learning_rate": 4.117364117364118e-05, "loss": 0.4949, "step": 599 }, { "epoch": 0.7787151200519143, "grad_norm": 0.31127336265884026, "learning_rate": 4.114959114959115e-05, "loss": 0.5026, "step": 600 }, { "epoch": 0.7800129785853342, "grad_norm": 0.24332809263192706, "learning_rate": 4.112554112554113e-05, "loss": 0.506, "step": 601 }, { "epoch": 0.781310837118754, "grad_norm": 0.268091615023721, "learning_rate": 4.11014911014911e-05, "loss": 0.5148, "step": 602 }, { "epoch": 0.782608695652174, "grad_norm": 0.27606994914293354, "learning_rate": 4.1077441077441085e-05, "loss": 0.5058, "step": 603 }, { "epoch": 0.7839065541855937, "grad_norm": 0.25091695629276883, "learning_rate": 4.105339105339105e-05, "loss": 0.4939, "step": 604 }, { "epoch": 0.7852044127190136, "grad_norm": 0.25969299604058604, "learning_rate": 4.1029341029341034e-05, "loss": 0.5045, "step": 605 }, { "epoch": 0.7865022712524334, "grad_norm": 0.28645937532292653, "learning_rate": 4.100529100529101e-05, "loss": 0.51, "step": 606 }, { "epoch": 0.7878001297858533, "grad_norm": 0.2867588479387004, "learning_rate": 4.0981240981240984e-05, "loss": 0.4849, "step": 607 }, { "epoch": 0.7890979883192732, "grad_norm": 0.2646936431763157, "learning_rate": 4.095719095719096e-05, "loss": 0.5139, "step": 608 }, { "epoch": 0.790395846852693, "grad_norm": 0.3083485975619241, "learning_rate": 4.093314093314093e-05, "loss": 0.52, "step": 609 }, { "epoch": 0.791693705386113, "grad_norm": 0.29615253606758346, "learning_rate": 4.0909090909090915e-05, "loss": 0.491, "step": 610 }, { "epoch": 0.7929915639195327, "grad_norm": 0.29803496441037525, "learning_rate": 4.088504088504089e-05, "loss": 0.5054, "step": 611 }, { "epoch": 0.7942894224529526, "grad_norm": 0.2854912264040868, "learning_rate": 4.0860990860990864e-05, "loss": 0.4838, "step": 612 }, { "epoch": 0.7955872809863724, "grad_norm": 0.2861142997625756, "learning_rate": 4.083694083694084e-05, "loss": 0.4777, "step": 613 }, { "epoch": 0.7968851395197923, "grad_norm": 0.28878974958497106, "learning_rate": 4.0812890812890814e-05, "loss": 0.5043, "step": 614 }, { "epoch": 0.7981829980532122, "grad_norm": 0.28908126130671624, "learning_rate": 4.0788840788840795e-05, "loss": 0.5077, "step": 615 }, { "epoch": 0.799480856586632, "grad_norm": 0.31059225731020423, "learning_rate": 4.0764790764790763e-05, "loss": 0.4846, "step": 616 }, { "epoch": 0.8007787151200519, "grad_norm": 0.2990367658353648, "learning_rate": 4.074074074074074e-05, "loss": 0.5075, "step": 617 }, { "epoch": 0.8020765736534717, "grad_norm": 0.29529728033655306, "learning_rate": 4.071669071669072e-05, "loss": 0.4876, "step": 618 }, { "epoch": 0.8033744321868916, "grad_norm": 0.299331177429087, "learning_rate": 4.0692640692640695e-05, "loss": 0.5032, "step": 619 }, { "epoch": 0.8046722907203114, "grad_norm": 0.2842657684401892, "learning_rate": 4.066859066859067e-05, "loss": 0.4889, "step": 620 }, { "epoch": 0.8059701492537313, "grad_norm": 0.2692733717815561, "learning_rate": 4.0644540644540644e-05, "loss": 0.5009, "step": 621 }, { "epoch": 0.8072680077871512, "grad_norm": 0.3436883319030681, "learning_rate": 4.062049062049062e-05, "loss": 0.5185, "step": 622 }, { "epoch": 0.808565866320571, "grad_norm": 0.2653175278056993, "learning_rate": 4.05964405964406e-05, "loss": 0.4739, "step": 623 }, { "epoch": 0.8098637248539909, "grad_norm": 0.31209657077735303, "learning_rate": 4.0572390572390575e-05, "loss": 0.4944, "step": 624 }, { "epoch": 0.8111615833874107, "grad_norm": 0.3396825057908641, "learning_rate": 4.054834054834055e-05, "loss": 0.5203, "step": 625 }, { "epoch": 0.8124594419208306, "grad_norm": 0.31652376647001546, "learning_rate": 4.0524290524290525e-05, "loss": 0.4973, "step": 626 }, { "epoch": 0.8137573004542504, "grad_norm": 0.41987696956302806, "learning_rate": 4.05002405002405e-05, "loss": 0.5008, "step": 627 }, { "epoch": 0.8150551589876703, "grad_norm": 0.32125784926625567, "learning_rate": 4.047619047619048e-05, "loss": 0.5107, "step": 628 }, { "epoch": 0.8163530175210902, "grad_norm": 0.43302794720660975, "learning_rate": 4.045214045214045e-05, "loss": 0.5174, "step": 629 }, { "epoch": 0.81765087605451, "grad_norm": 0.29529734876987174, "learning_rate": 4.042809042809043e-05, "loss": 0.4881, "step": 630 }, { "epoch": 0.8189487345879299, "grad_norm": 0.4076264173563411, "learning_rate": 4.0404040404040405e-05, "loss": 0.5034, "step": 631 }, { "epoch": 0.8202465931213497, "grad_norm": 0.30337707563686833, "learning_rate": 4.037999037999039e-05, "loss": 0.5119, "step": 632 }, { "epoch": 0.8215444516547696, "grad_norm": 0.39849923453663594, "learning_rate": 4.0355940355940355e-05, "loss": 0.5216, "step": 633 }, { "epoch": 0.8228423101881895, "grad_norm": 0.26447226558452136, "learning_rate": 4.033189033189033e-05, "loss": 0.5122, "step": 634 }, { "epoch": 0.8241401687216093, "grad_norm": 0.36530243282807756, "learning_rate": 4.030784030784031e-05, "loss": 0.4918, "step": 635 }, { "epoch": 0.8254380272550292, "grad_norm": 0.3160155549438362, "learning_rate": 4.0283790283790286e-05, "loss": 0.5024, "step": 636 }, { "epoch": 0.826735885788449, "grad_norm": 0.33636766065888035, "learning_rate": 4.025974025974026e-05, "loss": 0.5075, "step": 637 }, { "epoch": 0.8280337443218689, "grad_norm": 0.29456591212102723, "learning_rate": 4.0235690235690236e-05, "loss": 0.5241, "step": 638 }, { "epoch": 0.8293316028552887, "grad_norm": 0.3220137418817016, "learning_rate": 4.021164021164021e-05, "loss": 0.5028, "step": 639 }, { "epoch": 0.8306294613887086, "grad_norm": 0.279849046005973, "learning_rate": 4.018759018759019e-05, "loss": 0.4937, "step": 640 }, { "epoch": 0.8319273199221285, "grad_norm": 0.34243863539028374, "learning_rate": 4.016354016354017e-05, "loss": 0.4992, "step": 641 }, { "epoch": 0.8332251784555483, "grad_norm": 0.3077281111260478, "learning_rate": 4.013949013949014e-05, "loss": 0.5013, "step": 642 }, { "epoch": 0.8345230369889682, "grad_norm": 0.2917135387110751, "learning_rate": 4.0115440115440116e-05, "loss": 0.5084, "step": 643 }, { "epoch": 0.835820895522388, "grad_norm": 0.4035461806364624, "learning_rate": 4.009139009139009e-05, "loss": 0.5075, "step": 644 }, { "epoch": 0.8371187540558079, "grad_norm": 0.28209498726622767, "learning_rate": 4.006734006734007e-05, "loss": 0.5018, "step": 645 }, { "epoch": 0.8384166125892277, "grad_norm": 0.32365984928312647, "learning_rate": 4.004329004329004e-05, "loss": 0.5059, "step": 646 }, { "epoch": 0.8397144711226476, "grad_norm": 0.27577530319773297, "learning_rate": 4.001924001924002e-05, "loss": 0.4833, "step": 647 }, { "epoch": 0.8410123296560675, "grad_norm": 0.28322226416250573, "learning_rate": 3.999518999519e-05, "loss": 0.4918, "step": 648 }, { "epoch": 0.8423101881894873, "grad_norm": 0.2855631382527533, "learning_rate": 3.997113997113997e-05, "loss": 0.4865, "step": 649 }, { "epoch": 0.8436080467229072, "grad_norm": 0.32968743054146016, "learning_rate": 3.9947089947089946e-05, "loss": 0.4995, "step": 650 }, { "epoch": 0.844905905256327, "grad_norm": 0.2546632505302031, "learning_rate": 3.992303992303992e-05, "loss": 0.5007, "step": 651 }, { "epoch": 0.8462037637897469, "grad_norm": 0.3379202280608477, "learning_rate": 3.98989898989899e-05, "loss": 0.5191, "step": 652 }, { "epoch": 0.8475016223231667, "grad_norm": 0.2828398301217406, "learning_rate": 3.987493987493988e-05, "loss": 0.511, "step": 653 }, { "epoch": 0.8487994808565866, "grad_norm": 0.2759621170821239, "learning_rate": 3.985088985088985e-05, "loss": 0.4833, "step": 654 }, { "epoch": 0.8500973393900065, "grad_norm": 0.30549235671106895, "learning_rate": 3.982683982683983e-05, "loss": 0.5041, "step": 655 }, { "epoch": 0.8513951979234263, "grad_norm": 0.27394626074529427, "learning_rate": 3.98027898027898e-05, "loss": 0.4765, "step": 656 }, { "epoch": 0.8526930564568462, "grad_norm": 0.28778374402990897, "learning_rate": 3.9778739778739783e-05, "loss": 0.5204, "step": 657 }, { "epoch": 0.853990914990266, "grad_norm": 0.34038234379097493, "learning_rate": 3.975468975468976e-05, "loss": 0.4974, "step": 658 }, { "epoch": 0.8552887735236859, "grad_norm": 0.2866059277868264, "learning_rate": 3.973063973063973e-05, "loss": 0.5054, "step": 659 }, { "epoch": 0.8565866320571057, "grad_norm": 0.2927739560082091, "learning_rate": 3.970658970658971e-05, "loss": 0.4974, "step": 660 }, { "epoch": 0.8578844905905256, "grad_norm": 0.3262341662863849, "learning_rate": 3.968253968253968e-05, "loss": 0.4989, "step": 661 }, { "epoch": 0.8591823491239455, "grad_norm": 0.2845679103896212, "learning_rate": 3.9658489658489664e-05, "loss": 0.4874, "step": 662 }, { "epoch": 0.8604802076573653, "grad_norm": 0.2655596774616536, "learning_rate": 3.963443963443963e-05, "loss": 0.4843, "step": 663 }, { "epoch": 0.8617780661907852, "grad_norm": 0.2820305028454277, "learning_rate": 3.9610389610389614e-05, "loss": 0.5164, "step": 664 }, { "epoch": 0.863075924724205, "grad_norm": 0.2940016391705861, "learning_rate": 3.958633958633959e-05, "loss": 0.4886, "step": 665 }, { "epoch": 0.8643737832576249, "grad_norm": 0.2547644694514986, "learning_rate": 3.956228956228956e-05, "loss": 0.5051, "step": 666 }, { "epoch": 0.8656716417910447, "grad_norm": 0.26023827863988136, "learning_rate": 3.953823953823954e-05, "loss": 0.5217, "step": 667 }, { "epoch": 0.8669695003244646, "grad_norm": 0.27927530276749113, "learning_rate": 3.951418951418951e-05, "loss": 0.5092, "step": 668 }, { "epoch": 0.8682673588578845, "grad_norm": 0.26218361367284654, "learning_rate": 3.9490139490139494e-05, "loss": 0.4889, "step": 669 }, { "epoch": 0.8695652173913043, "grad_norm": 0.2574083587232944, "learning_rate": 3.946608946608947e-05, "loss": 0.498, "step": 670 }, { "epoch": 0.8708630759247242, "grad_norm": 0.28970396117550484, "learning_rate": 3.9442039442039444e-05, "loss": 0.5005, "step": 671 }, { "epoch": 0.872160934458144, "grad_norm": 0.253672603517478, "learning_rate": 3.941798941798942e-05, "loss": 0.4802, "step": 672 }, { "epoch": 0.8734587929915639, "grad_norm": 0.28176527595198714, "learning_rate": 3.939393939393939e-05, "loss": 0.5009, "step": 673 }, { "epoch": 0.8747566515249838, "grad_norm": 0.29970103183262975, "learning_rate": 3.9369889369889375e-05, "loss": 0.494, "step": 674 }, { "epoch": 0.8760545100584036, "grad_norm": 0.28208623298136165, "learning_rate": 3.934583934583935e-05, "loss": 0.4784, "step": 675 }, { "epoch": 0.8773523685918235, "grad_norm": 0.31291964058774646, "learning_rate": 3.9321789321789324e-05, "loss": 0.4869, "step": 676 }, { "epoch": 0.8786502271252433, "grad_norm": 0.2979349183006068, "learning_rate": 3.92977392977393e-05, "loss": 0.5046, "step": 677 }, { "epoch": 0.8799480856586632, "grad_norm": 0.2908014815865184, "learning_rate": 3.9273689273689274e-05, "loss": 0.5265, "step": 678 }, { "epoch": 0.881245944192083, "grad_norm": 0.26416143914819024, "learning_rate": 3.9249639249639256e-05, "loss": 0.4818, "step": 679 }, { "epoch": 0.8825438027255029, "grad_norm": 0.2933751247833131, "learning_rate": 3.9225589225589224e-05, "loss": 0.5012, "step": 680 }, { "epoch": 0.8838416612589228, "grad_norm": 0.3126465117694497, "learning_rate": 3.9201539201539205e-05, "loss": 0.5146, "step": 681 }, { "epoch": 0.8851395197923426, "grad_norm": 0.2524418661547154, "learning_rate": 3.917748917748918e-05, "loss": 0.5016, "step": 682 }, { "epoch": 0.8864373783257625, "grad_norm": 0.25838371662824994, "learning_rate": 3.9153439153439155e-05, "loss": 0.4788, "step": 683 }, { "epoch": 0.8877352368591823, "grad_norm": 0.26415640627712256, "learning_rate": 3.912938912938913e-05, "loss": 0.5026, "step": 684 }, { "epoch": 0.8890330953926022, "grad_norm": 0.24247698201137077, "learning_rate": 3.9105339105339104e-05, "loss": 0.4785, "step": 685 }, { "epoch": 0.890330953926022, "grad_norm": 0.2592072694481307, "learning_rate": 3.9081289081289086e-05, "loss": 0.4977, "step": 686 }, { "epoch": 0.8916288124594419, "grad_norm": 0.2542832037438216, "learning_rate": 3.905723905723906e-05, "loss": 0.4965, "step": 687 }, { "epoch": 0.8929266709928618, "grad_norm": 0.262249098843731, "learning_rate": 3.9033189033189035e-05, "loss": 0.4973, "step": 688 }, { "epoch": 0.8942245295262816, "grad_norm": 0.31610300163969496, "learning_rate": 3.900913900913901e-05, "loss": 0.4825, "step": 689 }, { "epoch": 0.8955223880597015, "grad_norm": 0.2740619461513036, "learning_rate": 3.8985088985088985e-05, "loss": 0.4975, "step": 690 }, { "epoch": 0.8968202465931213, "grad_norm": 0.2741811791339238, "learning_rate": 3.8961038961038966e-05, "loss": 0.4959, "step": 691 }, { "epoch": 0.8981181051265412, "grad_norm": 0.271574424484243, "learning_rate": 3.893698893698894e-05, "loss": 0.4871, "step": 692 }, { "epoch": 0.899415963659961, "grad_norm": 0.23758978275848563, "learning_rate": 3.891293891293891e-05, "loss": 0.4862, "step": 693 }, { "epoch": 0.9007138221933809, "grad_norm": 0.27263432443106744, "learning_rate": 3.888888888888889e-05, "loss": 0.5041, "step": 694 }, { "epoch": 0.9020116807268008, "grad_norm": 0.2707424886921677, "learning_rate": 3.8864838864838866e-05, "loss": 0.4669, "step": 695 }, { "epoch": 0.9033095392602206, "grad_norm": 0.25600324849109557, "learning_rate": 3.884078884078885e-05, "loss": 0.4915, "step": 696 }, { "epoch": 0.9046073977936405, "grad_norm": 0.28010923150865535, "learning_rate": 3.8816738816738815e-05, "loss": 0.5038, "step": 697 }, { "epoch": 0.9059052563270603, "grad_norm": 0.28506679888273495, "learning_rate": 3.87926887926888e-05, "loss": 0.4965, "step": 698 }, { "epoch": 0.9072031148604802, "grad_norm": 0.26956128070889057, "learning_rate": 3.876863876863877e-05, "loss": 0.5141, "step": 699 }, { "epoch": 0.9085009733939, "grad_norm": 0.25947152657252537, "learning_rate": 3.8744588744588746e-05, "loss": 0.477, "step": 700 }, { "epoch": 0.9097988319273199, "grad_norm": 0.29443853949197607, "learning_rate": 3.872053872053872e-05, "loss": 0.5059, "step": 701 }, { "epoch": 0.9110966904607398, "grad_norm": 0.23371391184316104, "learning_rate": 3.8696488696488696e-05, "loss": 0.4783, "step": 702 }, { "epoch": 0.9123945489941596, "grad_norm": 0.2960003000238748, "learning_rate": 3.867243867243868e-05, "loss": 0.4975, "step": 703 }, { "epoch": 0.9136924075275795, "grad_norm": 0.2447202791191107, "learning_rate": 3.864838864838865e-05, "loss": 0.4922, "step": 704 }, { "epoch": 0.9149902660609993, "grad_norm": 0.26642870018861, "learning_rate": 3.862433862433863e-05, "loss": 0.5252, "step": 705 }, { "epoch": 0.9162881245944192, "grad_norm": 0.2560976171455726, "learning_rate": 3.86002886002886e-05, "loss": 0.5012, "step": 706 }, { "epoch": 0.917585983127839, "grad_norm": 0.28883925682227224, "learning_rate": 3.8576238576238576e-05, "loss": 0.509, "step": 707 }, { "epoch": 0.9188838416612589, "grad_norm": 0.2513723680500846, "learning_rate": 3.855218855218856e-05, "loss": 0.5064, "step": 708 }, { "epoch": 0.9201817001946788, "grad_norm": 0.26385183299541554, "learning_rate": 3.852813852813853e-05, "loss": 0.4844, "step": 709 }, { "epoch": 0.9214795587280986, "grad_norm": 0.2739471730680778, "learning_rate": 3.85040885040885e-05, "loss": 0.5062, "step": 710 }, { "epoch": 0.9227774172615185, "grad_norm": 0.29349547383156327, "learning_rate": 3.848003848003848e-05, "loss": 0.4663, "step": 711 }, { "epoch": 0.9240752757949383, "grad_norm": 0.24335538681766872, "learning_rate": 3.845598845598846e-05, "loss": 0.4802, "step": 712 }, { "epoch": 0.9253731343283582, "grad_norm": 0.26992374528269225, "learning_rate": 3.843193843193844e-05, "loss": 0.4737, "step": 713 }, { "epoch": 0.9266709928617781, "grad_norm": 0.2601068353604271, "learning_rate": 3.8407888407888407e-05, "loss": 0.4871, "step": 714 }, { "epoch": 0.9279688513951979, "grad_norm": 0.2758032793334676, "learning_rate": 3.838383838383838e-05, "loss": 0.497, "step": 715 }, { "epoch": 0.9292667099286178, "grad_norm": 0.28995105804236226, "learning_rate": 3.835978835978836e-05, "loss": 0.4954, "step": 716 }, { "epoch": 0.9305645684620376, "grad_norm": 0.2711126929498793, "learning_rate": 3.833573833573834e-05, "loss": 0.4899, "step": 717 }, { "epoch": 0.9318624269954575, "grad_norm": 0.27659146601067053, "learning_rate": 3.831168831168831e-05, "loss": 0.4724, "step": 718 }, { "epoch": 0.9331602855288773, "grad_norm": 0.27572149711082233, "learning_rate": 3.828763828763829e-05, "loss": 0.4771, "step": 719 }, { "epoch": 0.9344581440622972, "grad_norm": 0.26747547869691946, "learning_rate": 3.826358826358827e-05, "loss": 0.508, "step": 720 }, { "epoch": 0.9357560025957171, "grad_norm": 0.25233671196063495, "learning_rate": 3.8239538239538244e-05, "loss": 0.5112, "step": 721 }, { "epoch": 0.9370538611291369, "grad_norm": 0.25520294945614, "learning_rate": 3.821548821548822e-05, "loss": 0.5015, "step": 722 }, { "epoch": 0.9383517196625568, "grad_norm": 0.2638285175394357, "learning_rate": 3.819143819143819e-05, "loss": 0.5159, "step": 723 }, { "epoch": 0.9396495781959766, "grad_norm": 0.2424275513941752, "learning_rate": 3.816738816738817e-05, "loss": 0.4934, "step": 724 }, { "epoch": 0.9409474367293965, "grad_norm": 0.25978321355889966, "learning_rate": 3.814333814333815e-05, "loss": 0.493, "step": 725 }, { "epoch": 0.9422452952628163, "grad_norm": 0.2717237370055125, "learning_rate": 3.8119288119288124e-05, "loss": 0.4848, "step": 726 }, { "epoch": 0.9435431537962362, "grad_norm": 0.2649262510228781, "learning_rate": 3.809523809523809e-05, "loss": 0.5106, "step": 727 }, { "epoch": 0.9448410123296561, "grad_norm": 0.2630223525297261, "learning_rate": 3.8071188071188074e-05, "loss": 0.4819, "step": 728 }, { "epoch": 0.9461388708630759, "grad_norm": 0.2552060740231092, "learning_rate": 3.804713804713805e-05, "loss": 0.4846, "step": 729 }, { "epoch": 0.9474367293964958, "grad_norm": 0.27269809006023926, "learning_rate": 3.802308802308803e-05, "loss": 0.4646, "step": 730 }, { "epoch": 0.9487345879299156, "grad_norm": 0.26086007457601984, "learning_rate": 3.7999037999038e-05, "loss": 0.4881, "step": 731 }, { "epoch": 0.9500324464633355, "grad_norm": 0.261121106269547, "learning_rate": 3.797498797498797e-05, "loss": 0.4676, "step": 732 }, { "epoch": 0.9513303049967553, "grad_norm": 0.24120860308268785, "learning_rate": 3.7950937950937954e-05, "loss": 0.475, "step": 733 }, { "epoch": 0.9526281635301752, "grad_norm": 0.23338097940225563, "learning_rate": 3.792688792688793e-05, "loss": 0.4913, "step": 734 }, { "epoch": 0.9539260220635951, "grad_norm": 0.27856238721903515, "learning_rate": 3.7902837902837904e-05, "loss": 0.4817, "step": 735 }, { "epoch": 0.9552238805970149, "grad_norm": 0.23489180306981414, "learning_rate": 3.787878787878788e-05, "loss": 0.4876, "step": 736 }, { "epoch": 0.9565217391304348, "grad_norm": 0.24747777349253483, "learning_rate": 3.7854737854737854e-05, "loss": 0.4926, "step": 737 }, { "epoch": 0.9578195976638546, "grad_norm": 0.2464671646367973, "learning_rate": 3.7830687830687835e-05, "loss": 0.4988, "step": 738 }, { "epoch": 0.9591174561972745, "grad_norm": 0.2444290486715431, "learning_rate": 3.780663780663781e-05, "loss": 0.5102, "step": 739 }, { "epoch": 0.9604153147306943, "grad_norm": 0.23262037897628499, "learning_rate": 3.7782587782587785e-05, "loss": 0.4928, "step": 740 }, { "epoch": 0.9617131732641142, "grad_norm": 0.2320844900099328, "learning_rate": 3.775853775853776e-05, "loss": 0.4858, "step": 741 }, { "epoch": 0.9630110317975341, "grad_norm": 0.25468927720147666, "learning_rate": 3.773448773448774e-05, "loss": 0.4978, "step": 742 }, { "epoch": 0.9643088903309539, "grad_norm": 0.2686348081196404, "learning_rate": 3.7710437710437716e-05, "loss": 0.4774, "step": 743 }, { "epoch": 0.9656067488643738, "grad_norm": 0.25462323553264127, "learning_rate": 3.7686387686387684e-05, "loss": 0.4967, "step": 744 }, { "epoch": 0.9669046073977936, "grad_norm": 0.23826397601414423, "learning_rate": 3.7662337662337665e-05, "loss": 0.4803, "step": 745 }, { "epoch": 0.9682024659312135, "grad_norm": 0.28828391446426194, "learning_rate": 3.763828763828764e-05, "loss": 0.4853, "step": 746 }, { "epoch": 0.9695003244646333, "grad_norm": 0.28433469305996517, "learning_rate": 3.761423761423762e-05, "loss": 0.4952, "step": 747 }, { "epoch": 0.9707981829980532, "grad_norm": 0.23492438563324666, "learning_rate": 3.759018759018759e-05, "loss": 0.4707, "step": 748 }, { "epoch": 0.9720960415314731, "grad_norm": 0.24099143399264922, "learning_rate": 3.7566137566137564e-05, "loss": 0.5025, "step": 749 }, { "epoch": 0.9733939000648929, "grad_norm": 0.26521862280904784, "learning_rate": 3.7542087542087546e-05, "loss": 0.4801, "step": 750 }, { "epoch": 0.9746917585983128, "grad_norm": 0.25301181696938563, "learning_rate": 3.751803751803752e-05, "loss": 0.4812, "step": 751 }, { "epoch": 0.9759896171317326, "grad_norm": 0.27340749938479236, "learning_rate": 3.7493987493987495e-05, "loss": 0.5218, "step": 752 }, { "epoch": 0.9772874756651525, "grad_norm": 0.2758375543775843, "learning_rate": 3.746993746993747e-05, "loss": 0.5003, "step": 753 }, { "epoch": 0.9785853341985724, "grad_norm": 0.2894871095962033, "learning_rate": 3.7445887445887445e-05, "loss": 0.4898, "step": 754 }, { "epoch": 0.9798831927319922, "grad_norm": 0.24603375955455123, "learning_rate": 3.7421837421837427e-05, "loss": 0.4975, "step": 755 }, { "epoch": 0.9811810512654121, "grad_norm": 0.3430084532330243, "learning_rate": 3.73977873977874e-05, "loss": 0.4986, "step": 756 }, { "epoch": 0.9824789097988319, "grad_norm": 0.26260792839174424, "learning_rate": 3.7373737373737376e-05, "loss": 0.4948, "step": 757 }, { "epoch": 0.9837767683322518, "grad_norm": 0.3417705691604155, "learning_rate": 3.734968734968735e-05, "loss": 0.4892, "step": 758 }, { "epoch": 0.9850746268656716, "grad_norm": 0.2367132738219532, "learning_rate": 3.7325637325637326e-05, "loss": 0.4837, "step": 759 }, { "epoch": 0.9863724853990915, "grad_norm": 0.35539307871553294, "learning_rate": 3.730158730158731e-05, "loss": 0.5081, "step": 760 }, { "epoch": 0.9876703439325114, "grad_norm": 0.22735379818962181, "learning_rate": 3.7277537277537275e-05, "loss": 0.4791, "step": 761 }, { "epoch": 0.9889682024659312, "grad_norm": 0.30959128828024085, "learning_rate": 3.725348725348726e-05, "loss": 0.4977, "step": 762 }, { "epoch": 0.9902660609993511, "grad_norm": 0.24407830637335193, "learning_rate": 3.722943722943723e-05, "loss": 0.4889, "step": 763 }, { "epoch": 0.9915639195327709, "grad_norm": 0.37047588851002045, "learning_rate": 3.720538720538721e-05, "loss": 0.4887, "step": 764 }, { "epoch": 0.9928617780661908, "grad_norm": 0.3132029888056198, "learning_rate": 3.718133718133718e-05, "loss": 0.4808, "step": 765 }, { "epoch": 0.9941596365996106, "grad_norm": 0.336725063578875, "learning_rate": 3.7157287157287156e-05, "loss": 0.4944, "step": 766 }, { "epoch": 0.9954574951330305, "grad_norm": 0.29559283526551605, "learning_rate": 3.713323713323714e-05, "loss": 0.485, "step": 767 }, { "epoch": 0.9967553536664504, "grad_norm": 0.3181505432986373, "learning_rate": 3.710918710918711e-05, "loss": 0.5109, "step": 768 }, { "epoch": 0.9980532121998702, "grad_norm": 0.25792486030536, "learning_rate": 3.708513708513709e-05, "loss": 0.4837, "step": 769 }, { "epoch": 0.9993510707332901, "grad_norm": 0.2662559802060241, "learning_rate": 3.706108706108706e-05, "loss": 0.4768, "step": 770 }, { "epoch": 1.0, "grad_norm": 0.2662559802060241, "learning_rate": 3.7037037037037037e-05, "loss": 0.4749, "step": 771 }, { "epoch": 1.0012978585334198, "grad_norm": 0.45710604360462226, "learning_rate": 3.701298701298702e-05, "loss": 0.4295, "step": 772 }, { "epoch": 1.0025957170668398, "grad_norm": 0.3984763478659025, "learning_rate": 3.698893698893699e-05, "loss": 0.4211, "step": 773 }, { "epoch": 1.0038935756002596, "grad_norm": 0.33604362226653633, "learning_rate": 3.696488696488697e-05, "loss": 0.4472, "step": 774 }, { "epoch": 1.0051914341336794, "grad_norm": 0.35492193650921217, "learning_rate": 3.694083694083694e-05, "loss": 0.4144, "step": 775 }, { "epoch": 1.0064892926670992, "grad_norm": 0.34395756575343545, "learning_rate": 3.691678691678692e-05, "loss": 0.4371, "step": 776 }, { "epoch": 1.0077871512005192, "grad_norm": 0.3027073670465442, "learning_rate": 3.68927368927369e-05, "loss": 0.4418, "step": 777 }, { "epoch": 1.009085009733939, "grad_norm": 0.33164997403681556, "learning_rate": 3.686868686868687e-05, "loss": 0.4236, "step": 778 }, { "epoch": 1.0103828682673588, "grad_norm": 0.29836741552546153, "learning_rate": 3.684463684463685e-05, "loss": 0.4291, "step": 779 }, { "epoch": 1.0116807268007788, "grad_norm": 0.31908724983675907, "learning_rate": 3.682058682058682e-05, "loss": 0.4444, "step": 780 }, { "epoch": 1.0129785853341986, "grad_norm": 0.3063522863393575, "learning_rate": 3.67965367965368e-05, "loss": 0.4235, "step": 781 }, { "epoch": 1.0142764438676184, "grad_norm": 0.286969078111983, "learning_rate": 3.677248677248677e-05, "loss": 0.4273, "step": 782 }, { "epoch": 1.0155743024010382, "grad_norm": 0.30969412667904017, "learning_rate": 3.674843674843675e-05, "loss": 0.4179, "step": 783 }, { "epoch": 1.0168721609344582, "grad_norm": 0.28846243749065087, "learning_rate": 3.672438672438673e-05, "loss": 0.445, "step": 784 }, { "epoch": 1.018170019467878, "grad_norm": 0.3052289303187025, "learning_rate": 3.6700336700336704e-05, "loss": 0.4282, "step": 785 }, { "epoch": 1.0194678780012978, "grad_norm": 0.28776824445687055, "learning_rate": 3.667628667628668e-05, "loss": 0.4361, "step": 786 }, { "epoch": 1.0207657365347178, "grad_norm": 0.25471106959244577, "learning_rate": 3.665223665223665e-05, "loss": 0.4177, "step": 787 }, { "epoch": 1.0220635950681376, "grad_norm": 0.2740291049864792, "learning_rate": 3.662818662818663e-05, "loss": 0.4394, "step": 788 }, { "epoch": 1.0233614536015574, "grad_norm": 0.3017972640732574, "learning_rate": 3.660413660413661e-05, "loss": 0.4386, "step": 789 }, { "epoch": 1.0246593121349772, "grad_norm": 0.23597430678446688, "learning_rate": 3.6580086580086584e-05, "loss": 0.4499, "step": 790 }, { "epoch": 1.0259571706683972, "grad_norm": 0.2879673662024183, "learning_rate": 3.655603655603656e-05, "loss": 0.4293, "step": 791 }, { "epoch": 1.027255029201817, "grad_norm": 0.2700731107191033, "learning_rate": 3.6531986531986534e-05, "loss": 0.4258, "step": 792 }, { "epoch": 1.0285528877352368, "grad_norm": 0.25488494703667464, "learning_rate": 3.650793650793651e-05, "loss": 0.4074, "step": 793 }, { "epoch": 1.0298507462686568, "grad_norm": 0.26394634723070287, "learning_rate": 3.648388648388649e-05, "loss": 0.4508, "step": 794 }, { "epoch": 1.0311486048020766, "grad_norm": 0.27582489513269015, "learning_rate": 3.645983645983646e-05, "loss": 0.4412, "step": 795 }, { "epoch": 1.0324464633354964, "grad_norm": 0.24726232362927064, "learning_rate": 3.643578643578644e-05, "loss": 0.4168, "step": 796 }, { "epoch": 1.0337443218689162, "grad_norm": 0.28259272497475907, "learning_rate": 3.6411736411736415e-05, "loss": 0.4154, "step": 797 }, { "epoch": 1.0350421804023362, "grad_norm": 0.2663149191606831, "learning_rate": 3.638768638768639e-05, "loss": 0.4189, "step": 798 }, { "epoch": 1.036340038935756, "grad_norm": 0.23665837923986385, "learning_rate": 3.6363636363636364e-05, "loss": 0.4098, "step": 799 }, { "epoch": 1.0376378974691758, "grad_norm": 0.26291163598038214, "learning_rate": 3.633958633958634e-05, "loss": 0.4311, "step": 800 }, { "epoch": 1.0389357560025958, "grad_norm": 0.27970192834528934, "learning_rate": 3.631553631553632e-05, "loss": 0.4145, "step": 801 }, { "epoch": 1.0402336145360156, "grad_norm": 0.2710942408225925, "learning_rate": 3.6291486291486295e-05, "loss": 0.4277, "step": 802 }, { "epoch": 1.0415314730694354, "grad_norm": 0.2727015101923209, "learning_rate": 3.626743626743627e-05, "loss": 0.4232, "step": 803 }, { "epoch": 1.0428293316028552, "grad_norm": 0.3128392157246963, "learning_rate": 3.6243386243386245e-05, "loss": 0.4335, "step": 804 }, { "epoch": 1.0441271901362752, "grad_norm": 0.2349817412715045, "learning_rate": 3.621933621933622e-05, "loss": 0.4253, "step": 805 }, { "epoch": 1.045425048669695, "grad_norm": 0.28497577605120694, "learning_rate": 3.61952861952862e-05, "loss": 0.4168, "step": 806 }, { "epoch": 1.0467229072031148, "grad_norm": 0.2587789310002607, "learning_rate": 3.617123617123617e-05, "loss": 0.4217, "step": 807 }, { "epoch": 1.0480207657365348, "grad_norm": 0.26828544455125314, "learning_rate": 3.6147186147186144e-05, "loss": 0.4074, "step": 808 }, { "epoch": 1.0493186242699546, "grad_norm": 0.2598287776154231, "learning_rate": 3.6123136123136125e-05, "loss": 0.4184, "step": 809 }, { "epoch": 1.0506164828033744, "grad_norm": 0.2703287966299121, "learning_rate": 3.60990860990861e-05, "loss": 0.4323, "step": 810 }, { "epoch": 1.0519143413367944, "grad_norm": 0.318496639053465, "learning_rate": 3.6075036075036075e-05, "loss": 0.4209, "step": 811 }, { "epoch": 1.0532121998702142, "grad_norm": 0.2697909678170961, "learning_rate": 3.605098605098605e-05, "loss": 0.4244, "step": 812 }, { "epoch": 1.054510058403634, "grad_norm": 0.2969724315201258, "learning_rate": 3.602693602693603e-05, "loss": 0.4172, "step": 813 }, { "epoch": 1.0558079169370538, "grad_norm": 0.30303096771353466, "learning_rate": 3.6002886002886006e-05, "loss": 0.404, "step": 814 }, { "epoch": 1.0571057754704738, "grad_norm": 0.240870306849622, "learning_rate": 3.597883597883598e-05, "loss": 0.4226, "step": 815 }, { "epoch": 1.0584036340038936, "grad_norm": 0.31791173210740425, "learning_rate": 3.5954785954785956e-05, "loss": 0.411, "step": 816 }, { "epoch": 1.0597014925373134, "grad_norm": 0.2804167030463435, "learning_rate": 3.593073593073593e-05, "loss": 0.4297, "step": 817 }, { "epoch": 1.0609993510707332, "grad_norm": 0.24716383688085644, "learning_rate": 3.590668590668591e-05, "loss": 0.434, "step": 818 }, { "epoch": 1.0622972096041532, "grad_norm": 0.3039180273884032, "learning_rate": 3.588263588263589e-05, "loss": 0.4212, "step": 819 }, { "epoch": 1.063595068137573, "grad_norm": 0.319488059258111, "learning_rate": 3.5858585858585855e-05, "loss": 0.4068, "step": 820 }, { "epoch": 1.0648929266709928, "grad_norm": 0.22619788557964504, "learning_rate": 3.5834535834535836e-05, "loss": 0.4311, "step": 821 }, { "epoch": 1.0661907852044128, "grad_norm": 0.34410773893661634, "learning_rate": 3.581048581048581e-05, "loss": 0.4349, "step": 822 }, { "epoch": 1.0674886437378326, "grad_norm": 0.3100368353729728, "learning_rate": 3.578643578643579e-05, "loss": 0.4352, "step": 823 }, { "epoch": 1.0687865022712524, "grad_norm": 0.2901826811884039, "learning_rate": 3.576238576238576e-05, "loss": 0.4157, "step": 824 }, { "epoch": 1.0700843608046724, "grad_norm": 0.31611344846131356, "learning_rate": 3.5738335738335735e-05, "loss": 0.4693, "step": 825 }, { "epoch": 1.0713822193380922, "grad_norm": 0.35227684302990314, "learning_rate": 3.571428571428572e-05, "loss": 0.4356, "step": 826 }, { "epoch": 1.072680077871512, "grad_norm": 0.25990758753916315, "learning_rate": 3.569023569023569e-05, "loss": 0.4149, "step": 827 }, { "epoch": 1.0739779364049318, "grad_norm": 0.33795998379210196, "learning_rate": 3.5666185666185667e-05, "loss": 0.4231, "step": 828 }, { "epoch": 1.0752757949383518, "grad_norm": 0.260416289520159, "learning_rate": 3.564213564213564e-05, "loss": 0.4439, "step": 829 }, { "epoch": 1.0765736534717716, "grad_norm": 0.2745403629951124, "learning_rate": 3.5618085618085616e-05, "loss": 0.4363, "step": 830 }, { "epoch": 1.0778715120051914, "grad_norm": 0.30247544618833483, "learning_rate": 3.55940355940356e-05, "loss": 0.4195, "step": 831 }, { "epoch": 1.0791693705386114, "grad_norm": 0.32082708667036386, "learning_rate": 3.556998556998557e-05, "loss": 0.4207, "step": 832 }, { "epoch": 1.0804672290720312, "grad_norm": 0.2897448897920795, "learning_rate": 3.554593554593555e-05, "loss": 0.4256, "step": 833 }, { "epoch": 1.081765087605451, "grad_norm": 0.2799359981594651, "learning_rate": 3.552188552188552e-05, "loss": 0.4566, "step": 834 }, { "epoch": 1.0830629461388708, "grad_norm": 0.3069382837366587, "learning_rate": 3.5497835497835503e-05, "loss": 0.4422, "step": 835 }, { "epoch": 1.0843608046722908, "grad_norm": 0.2432398771659819, "learning_rate": 3.547378547378548e-05, "loss": 0.4058, "step": 836 }, { "epoch": 1.0856586632057106, "grad_norm": 0.2946884230374921, "learning_rate": 3.5449735449735446e-05, "loss": 0.4212, "step": 837 }, { "epoch": 1.0869565217391304, "grad_norm": 0.2833055873065403, "learning_rate": 3.542568542568543e-05, "loss": 0.4236, "step": 838 }, { "epoch": 1.0882543802725504, "grad_norm": 0.24633104520706475, "learning_rate": 3.54016354016354e-05, "loss": 0.4296, "step": 839 }, { "epoch": 1.0895522388059702, "grad_norm": 0.30305714876494183, "learning_rate": 3.5377585377585384e-05, "loss": 0.4125, "step": 840 }, { "epoch": 1.09085009733939, "grad_norm": 0.27515539449115956, "learning_rate": 3.535353535353535e-05, "loss": 0.4343, "step": 841 }, { "epoch": 1.0921479558728098, "grad_norm": 0.2566570554342314, "learning_rate": 3.532948532948533e-05, "loss": 0.4259, "step": 842 }, { "epoch": 1.0934458144062298, "grad_norm": 0.27580070959828384, "learning_rate": 3.530543530543531e-05, "loss": 0.4429, "step": 843 }, { "epoch": 1.0947436729396496, "grad_norm": 0.40316368454416424, "learning_rate": 3.528138528138528e-05, "loss": 0.4471, "step": 844 }, { "epoch": 1.0960415314730694, "grad_norm": 0.2693638650733015, "learning_rate": 3.525733525733526e-05, "loss": 0.4259, "step": 845 }, { "epoch": 1.0973393900064894, "grad_norm": 0.3044405084823207, "learning_rate": 3.523328523328523e-05, "loss": 0.4187, "step": 846 }, { "epoch": 1.0986372485399092, "grad_norm": 0.24933029657871267, "learning_rate": 3.520923520923521e-05, "loss": 0.4328, "step": 847 }, { "epoch": 1.099935107073329, "grad_norm": 0.26906838606368316, "learning_rate": 3.518518518518519e-05, "loss": 0.4368, "step": 848 }, { "epoch": 1.1012329656067488, "grad_norm": 0.26971278689485273, "learning_rate": 3.5161135161135164e-05, "loss": 0.4297, "step": 849 }, { "epoch": 1.1025308241401688, "grad_norm": 0.2747922938561738, "learning_rate": 3.513708513708514e-05, "loss": 0.4373, "step": 850 }, { "epoch": 1.1038286826735886, "grad_norm": 0.2761210141034917, "learning_rate": 3.5113035113035113e-05, "loss": 0.4382, "step": 851 }, { "epoch": 1.1051265412070084, "grad_norm": 0.2964790930440498, "learning_rate": 3.508898508898509e-05, "loss": 0.4231, "step": 852 }, { "epoch": 1.1064243997404284, "grad_norm": 0.25620765135533435, "learning_rate": 3.506493506493507e-05, "loss": 0.4408, "step": 853 }, { "epoch": 1.1077222582738482, "grad_norm": 0.21064152286484897, "learning_rate": 3.504088504088504e-05, "loss": 0.417, "step": 854 }, { "epoch": 1.109020116807268, "grad_norm": 0.2958430063839789, "learning_rate": 3.501683501683502e-05, "loss": 0.4339, "step": 855 }, { "epoch": 1.1103179753406878, "grad_norm": 0.22482067437635359, "learning_rate": 3.4992784992784994e-05, "loss": 0.4295, "step": 856 }, { "epoch": 1.1116158338741078, "grad_norm": 0.2955967782295721, "learning_rate": 3.4968734968734976e-05, "loss": 0.4015, "step": 857 }, { "epoch": 1.1129136924075276, "grad_norm": 0.2843718668047481, "learning_rate": 3.4944684944684944e-05, "loss": 0.4218, "step": 858 }, { "epoch": 1.1142115509409474, "grad_norm": 0.2764727160970055, "learning_rate": 3.492063492063492e-05, "loss": 0.4246, "step": 859 }, { "epoch": 1.1155094094743674, "grad_norm": 0.2624405170772858, "learning_rate": 3.48965848965849e-05, "loss": 0.407, "step": 860 }, { "epoch": 1.1168072680077872, "grad_norm": 0.2666157900698976, "learning_rate": 3.4872534872534875e-05, "loss": 0.4402, "step": 861 }, { "epoch": 1.118105126541207, "grad_norm": 1.2301966564667228, "learning_rate": 3.484848484848485e-05, "loss": 0.4452, "step": 862 }, { "epoch": 1.1194029850746268, "grad_norm": 0.3054038468605886, "learning_rate": 3.4824434824434824e-05, "loss": 0.4354, "step": 863 }, { "epoch": 1.1207008436080468, "grad_norm": 0.2582864922377692, "learning_rate": 3.48003848003848e-05, "loss": 0.4495, "step": 864 }, { "epoch": 1.1219987021414666, "grad_norm": 0.2500252146157672, "learning_rate": 3.477633477633478e-05, "loss": 0.4395, "step": 865 }, { "epoch": 1.1232965606748864, "grad_norm": 0.29631159663934065, "learning_rate": 3.4752284752284755e-05, "loss": 0.4514, "step": 866 }, { "epoch": 1.1245944192083064, "grad_norm": 0.28152136468600736, "learning_rate": 3.472823472823473e-05, "loss": 0.4397, "step": 867 }, { "epoch": 1.1258922777417262, "grad_norm": 0.21131772623693806, "learning_rate": 3.4704184704184705e-05, "loss": 0.4285, "step": 868 }, { "epoch": 1.127190136275146, "grad_norm": 0.25615102365106246, "learning_rate": 3.468013468013468e-05, "loss": 0.4133, "step": 869 }, { "epoch": 1.128487994808566, "grad_norm": 0.23917269321305726, "learning_rate": 3.465608465608466e-05, "loss": 0.4435, "step": 870 }, { "epoch": 1.1297858533419858, "grad_norm": 0.27161646725256705, "learning_rate": 3.463203463203463e-05, "loss": 0.4233, "step": 871 }, { "epoch": 1.1310837118754056, "grad_norm": 0.2412152469942244, "learning_rate": 3.460798460798461e-05, "loss": 0.4461, "step": 872 }, { "epoch": 1.1323815704088254, "grad_norm": 0.3179450028674149, "learning_rate": 3.4583934583934586e-05, "loss": 0.4119, "step": 873 }, { "epoch": 1.1336794289422454, "grad_norm": 0.23384786069007624, "learning_rate": 3.455988455988456e-05, "loss": 0.4103, "step": 874 }, { "epoch": 1.1349772874756652, "grad_norm": 0.2576089230129144, "learning_rate": 3.4535834535834535e-05, "loss": 0.4256, "step": 875 }, { "epoch": 1.136275146009085, "grad_norm": 0.25522949186712174, "learning_rate": 3.451178451178451e-05, "loss": 0.4306, "step": 876 }, { "epoch": 1.1375730045425048, "grad_norm": 0.30146067353056377, "learning_rate": 3.448773448773449e-05, "loss": 0.4373, "step": 877 }, { "epoch": 1.1388708630759248, "grad_norm": 0.2821696115546317, "learning_rate": 3.4463684463684466e-05, "loss": 0.4262, "step": 878 }, { "epoch": 1.1401687216093446, "grad_norm": 0.23559687187392536, "learning_rate": 3.443963443963444e-05, "loss": 0.4128, "step": 879 }, { "epoch": 1.1414665801427644, "grad_norm": 0.3096443772818808, "learning_rate": 3.4415584415584416e-05, "loss": 0.424, "step": 880 }, { "epoch": 1.1427644386761844, "grad_norm": 0.27131628167000316, "learning_rate": 3.439153439153439e-05, "loss": 0.4243, "step": 881 }, { "epoch": 1.1440622972096042, "grad_norm": 0.3321877999366987, "learning_rate": 3.436748436748437e-05, "loss": 0.3962, "step": 882 }, { "epoch": 1.145360155743024, "grad_norm": 0.252836911633108, "learning_rate": 3.434343434343435e-05, "loss": 0.4364, "step": 883 }, { "epoch": 1.146658014276444, "grad_norm": 0.28472809342059574, "learning_rate": 3.431938431938432e-05, "loss": 0.4265, "step": 884 }, { "epoch": 1.1479558728098638, "grad_norm": 0.25893718977038643, "learning_rate": 3.4295334295334296e-05, "loss": 0.4239, "step": 885 }, { "epoch": 1.1492537313432836, "grad_norm": 0.29888823830438355, "learning_rate": 3.427128427128427e-05, "loss": 0.4641, "step": 886 }, { "epoch": 1.1505515898767034, "grad_norm": 0.2336271423650757, "learning_rate": 3.424723424723425e-05, "loss": 0.4363, "step": 887 }, { "epoch": 1.1518494484101234, "grad_norm": 0.28355868807379, "learning_rate": 3.422318422318422e-05, "loss": 0.4446, "step": 888 }, { "epoch": 1.1531473069435432, "grad_norm": 0.25482834362533163, "learning_rate": 3.41991341991342e-05, "loss": 0.432, "step": 889 }, { "epoch": 1.154445165476963, "grad_norm": 0.257058818612092, "learning_rate": 3.417508417508418e-05, "loss": 0.4238, "step": 890 }, { "epoch": 1.1557430240103828, "grad_norm": 0.2964878759739716, "learning_rate": 3.415103415103415e-05, "loss": 0.4341, "step": 891 }, { "epoch": 1.1570408825438028, "grad_norm": 0.24581976687613294, "learning_rate": 3.412698412698413e-05, "loss": 0.445, "step": 892 }, { "epoch": 1.1583387410772226, "grad_norm": 0.2672951398900844, "learning_rate": 3.41029341029341e-05, "loss": 0.4146, "step": 893 }, { "epoch": 1.1596365996106424, "grad_norm": 0.29744313702383335, "learning_rate": 3.407888407888408e-05, "loss": 0.4291, "step": 894 }, { "epoch": 1.1609344581440624, "grad_norm": 0.2594025370257348, "learning_rate": 3.405483405483406e-05, "loss": 0.4393, "step": 895 }, { "epoch": 1.1622323166774822, "grad_norm": 0.27322558327059043, "learning_rate": 3.403078403078403e-05, "loss": 0.413, "step": 896 }, { "epoch": 1.163530175210902, "grad_norm": 0.27895427053368943, "learning_rate": 3.400673400673401e-05, "loss": 0.4282, "step": 897 }, { "epoch": 1.164828033744322, "grad_norm": 0.3278143440045291, "learning_rate": 3.398268398268398e-05, "loss": 0.4503, "step": 898 }, { "epoch": 1.1661258922777418, "grad_norm": 0.2878645741851875, "learning_rate": 3.3958633958633964e-05, "loss": 0.418, "step": 899 }, { "epoch": 1.1674237508111616, "grad_norm": 0.279091054078343, "learning_rate": 3.393458393458394e-05, "loss": 0.4351, "step": 900 }, { "epoch": 1.1687216093445814, "grad_norm": 0.300972554323965, "learning_rate": 3.391053391053391e-05, "loss": 0.4177, "step": 901 }, { "epoch": 1.1700194678780014, "grad_norm": 0.2912604255538886, "learning_rate": 3.388648388648389e-05, "loss": 0.4239, "step": 902 }, { "epoch": 1.1713173264114212, "grad_norm": 0.28729371845984225, "learning_rate": 3.386243386243386e-05, "loss": 0.4498, "step": 903 }, { "epoch": 1.172615184944841, "grad_norm": 0.2983707424965568, "learning_rate": 3.3838383838383844e-05, "loss": 0.4093, "step": 904 }, { "epoch": 1.1739130434782608, "grad_norm": 0.2396146655592429, "learning_rate": 3.381433381433381e-05, "loss": 0.4134, "step": 905 }, { "epoch": 1.1752109020116808, "grad_norm": 0.25743340902304185, "learning_rate": 3.3790283790283794e-05, "loss": 0.4024, "step": 906 }, { "epoch": 1.1765087605451006, "grad_norm": 0.27027531302973373, "learning_rate": 3.376623376623377e-05, "loss": 0.4518, "step": 907 }, { "epoch": 1.1778066190785204, "grad_norm": 0.25280300819232365, "learning_rate": 3.3742183742183743e-05, "loss": 0.4185, "step": 908 }, { "epoch": 1.1791044776119404, "grad_norm": 0.22682160703006224, "learning_rate": 3.371813371813372e-05, "loss": 0.4174, "step": 909 }, { "epoch": 1.1804023361453602, "grad_norm": 0.23204503630025836, "learning_rate": 3.369408369408369e-05, "loss": 0.4177, "step": 910 }, { "epoch": 1.18170019467878, "grad_norm": 0.25880635574030375, "learning_rate": 3.3670033670033675e-05, "loss": 0.4179, "step": 911 }, { "epoch": 1.1829980532122, "grad_norm": 0.2522597708371833, "learning_rate": 3.364598364598365e-05, "loss": 0.4283, "step": 912 }, { "epoch": 1.1842959117456198, "grad_norm": 0.2883869624140782, "learning_rate": 3.3621933621933624e-05, "loss": 0.4372, "step": 913 }, { "epoch": 1.1855937702790396, "grad_norm": 0.25106486957221746, "learning_rate": 3.35978835978836e-05, "loss": 0.4281, "step": 914 }, { "epoch": 1.1868916288124594, "grad_norm": 0.292526125260076, "learning_rate": 3.3573833573833574e-05, "loss": 0.4365, "step": 915 }, { "epoch": 1.1881894873458794, "grad_norm": 0.2676690874911841, "learning_rate": 3.3549783549783555e-05, "loss": 0.4326, "step": 916 }, { "epoch": 1.1894873458792992, "grad_norm": 0.26481922535161423, "learning_rate": 3.352573352573353e-05, "loss": 0.4274, "step": 917 }, { "epoch": 1.190785204412719, "grad_norm": 0.2798203995195467, "learning_rate": 3.35016835016835e-05, "loss": 0.4368, "step": 918 }, { "epoch": 1.1920830629461387, "grad_norm": 0.2861488916957064, "learning_rate": 3.347763347763348e-05, "loss": 0.4137, "step": 919 }, { "epoch": 1.1933809214795588, "grad_norm": 0.29631412123985384, "learning_rate": 3.3453583453583454e-05, "loss": 0.4569, "step": 920 }, { "epoch": 1.1946787800129786, "grad_norm": 0.2342734368682864, "learning_rate": 3.3429533429533436e-05, "loss": 0.4161, "step": 921 }, { "epoch": 1.1959766385463984, "grad_norm": 0.2701444926397203, "learning_rate": 3.3405483405483404e-05, "loss": 0.4436, "step": 922 }, { "epoch": 1.1972744970798184, "grad_norm": 0.24533441345926324, "learning_rate": 3.3381433381433385e-05, "loss": 0.4169, "step": 923 }, { "epoch": 1.1985723556132382, "grad_norm": 0.2540671392711727, "learning_rate": 3.335738335738336e-05, "loss": 0.4157, "step": 924 }, { "epoch": 1.199870214146658, "grad_norm": 0.24799812272450378, "learning_rate": 3.3333333333333335e-05, "loss": 0.4185, "step": 925 }, { "epoch": 1.201168072680078, "grad_norm": 0.22598144682861446, "learning_rate": 3.330928330928331e-05, "loss": 0.4316, "step": 926 }, { "epoch": 1.2024659312134978, "grad_norm": 0.2448180172195478, "learning_rate": 3.3285233285233284e-05, "loss": 0.4224, "step": 927 }, { "epoch": 1.2037637897469176, "grad_norm": 0.2765575264990623, "learning_rate": 3.3261183261183266e-05, "loss": 0.4152, "step": 928 }, { "epoch": 1.2050616482803373, "grad_norm": 0.2804413742079038, "learning_rate": 3.323713323713324e-05, "loss": 0.4268, "step": 929 }, { "epoch": 1.2063595068137574, "grad_norm": 0.241273559139525, "learning_rate": 3.3213083213083216e-05, "loss": 0.44, "step": 930 }, { "epoch": 1.2076573653471772, "grad_norm": 0.26424527656796193, "learning_rate": 3.318903318903319e-05, "loss": 0.423, "step": 931 }, { "epoch": 1.208955223880597, "grad_norm": 0.2618836109205494, "learning_rate": 3.3164983164983165e-05, "loss": 0.4263, "step": 932 }, { "epoch": 1.210253082414017, "grad_norm": 0.232277628692469, "learning_rate": 3.314093314093315e-05, "loss": 0.4285, "step": 933 }, { "epoch": 1.2115509409474368, "grad_norm": 0.22920838397957313, "learning_rate": 3.311688311688312e-05, "loss": 0.4222, "step": 934 }, { "epoch": 1.2128487994808566, "grad_norm": 0.22768841564535697, "learning_rate": 3.309283309283309e-05, "loss": 0.4343, "step": 935 }, { "epoch": 1.2141466580142763, "grad_norm": 0.2503532795435805, "learning_rate": 3.306878306878307e-05, "loss": 0.439, "step": 936 }, { "epoch": 1.2154445165476964, "grad_norm": 0.22461109640165802, "learning_rate": 3.3044733044733046e-05, "loss": 0.4188, "step": 937 }, { "epoch": 1.2167423750811162, "grad_norm": 0.209011116297864, "learning_rate": 3.302068302068303e-05, "loss": 0.4095, "step": 938 }, { "epoch": 1.218040233614536, "grad_norm": 0.2266971578892572, "learning_rate": 3.2996632996632995e-05, "loss": 0.4354, "step": 939 }, { "epoch": 1.219338092147956, "grad_norm": 0.22300168989786548, "learning_rate": 3.297258297258297e-05, "loss": 0.4225, "step": 940 }, { "epoch": 1.2206359506813758, "grad_norm": 0.24047207294507855, "learning_rate": 3.294853294853295e-05, "loss": 0.4485, "step": 941 }, { "epoch": 1.2219338092147956, "grad_norm": 0.26188839036093997, "learning_rate": 3.2924482924482926e-05, "loss": 0.4338, "step": 942 }, { "epoch": 1.2232316677482156, "grad_norm": 0.2235845694258825, "learning_rate": 3.29004329004329e-05, "loss": 0.4242, "step": 943 }, { "epoch": 1.2245295262816354, "grad_norm": 0.21723162921446287, "learning_rate": 3.2876382876382876e-05, "loss": 0.4241, "step": 944 }, { "epoch": 1.2258273848150552, "grad_norm": 0.25526775092171644, "learning_rate": 3.285233285233286e-05, "loss": 0.4253, "step": 945 }, { "epoch": 1.227125243348475, "grad_norm": 0.20573746450142508, "learning_rate": 3.282828282828283e-05, "loss": 0.4479, "step": 946 }, { "epoch": 1.228423101881895, "grad_norm": 0.2510082428750361, "learning_rate": 3.280423280423281e-05, "loss": 0.4347, "step": 947 }, { "epoch": 1.2297209604153148, "grad_norm": 0.2669964257731318, "learning_rate": 3.278018278018278e-05, "loss": 0.4291, "step": 948 }, { "epoch": 1.2310188189487346, "grad_norm": 0.23768286255343224, "learning_rate": 3.275613275613276e-05, "loss": 0.4348, "step": 949 }, { "epoch": 1.2323166774821543, "grad_norm": 0.253301068721141, "learning_rate": 3.273208273208274e-05, "loss": 0.4165, "step": 950 }, { "epoch": 1.2336145360155744, "grad_norm": 0.24211175208943894, "learning_rate": 3.270803270803271e-05, "loss": 0.4525, "step": 951 }, { "epoch": 1.2349123945489942, "grad_norm": 0.20694813799256812, "learning_rate": 3.268398268398268e-05, "loss": 0.4353, "step": 952 }, { "epoch": 1.236210253082414, "grad_norm": 0.24139016385768045, "learning_rate": 3.265993265993266e-05, "loss": 0.4255, "step": 953 }, { "epoch": 1.237508111615834, "grad_norm": 0.23298523425827472, "learning_rate": 3.263588263588264e-05, "loss": 0.4453, "step": 954 }, { "epoch": 1.2388059701492538, "grad_norm": 0.22089226803029272, "learning_rate": 3.261183261183262e-05, "loss": 0.4142, "step": 955 }, { "epoch": 1.2401038286826735, "grad_norm": 0.21279087400923866, "learning_rate": 3.258778258778259e-05, "loss": 0.3967, "step": 956 }, { "epoch": 1.2414016872160936, "grad_norm": 0.21365588686190673, "learning_rate": 3.256373256373256e-05, "loss": 0.4265, "step": 957 }, { "epoch": 1.2426995457495134, "grad_norm": 0.2297103141691502, "learning_rate": 3.253968253968254e-05, "loss": 0.4247, "step": 958 }, { "epoch": 1.2439974042829332, "grad_norm": 0.21237733020443594, "learning_rate": 3.251563251563252e-05, "loss": 0.4187, "step": 959 }, { "epoch": 1.245295262816353, "grad_norm": 0.25205287449171737, "learning_rate": 3.249158249158249e-05, "loss": 0.4195, "step": 960 }, { "epoch": 1.246593121349773, "grad_norm": 0.2275565335826296, "learning_rate": 3.246753246753247e-05, "loss": 0.4222, "step": 961 }, { "epoch": 1.2478909798831928, "grad_norm": 0.24497964692242122, "learning_rate": 3.244348244348244e-05, "loss": 0.4282, "step": 962 }, { "epoch": 1.2491888384166125, "grad_norm": 0.2154307123634933, "learning_rate": 3.2419432419432424e-05, "loss": 0.4354, "step": 963 }, { "epoch": 1.2504866969500323, "grad_norm": 0.21744389583650917, "learning_rate": 3.23953823953824e-05, "loss": 0.4138, "step": 964 }, { "epoch": 1.2517845554834524, "grad_norm": 0.2430698493626977, "learning_rate": 3.237133237133237e-05, "loss": 0.4342, "step": 965 }, { "epoch": 1.2530824140168721, "grad_norm": 0.2426482535744958, "learning_rate": 3.234728234728235e-05, "loss": 0.4253, "step": 966 }, { "epoch": 1.254380272550292, "grad_norm": 0.22495203456758703, "learning_rate": 3.232323232323233e-05, "loss": 0.4308, "step": 967 }, { "epoch": 1.255678131083712, "grad_norm": 0.24355897774937213, "learning_rate": 3.2299182299182304e-05, "loss": 0.4284, "step": 968 }, { "epoch": 1.2569759896171318, "grad_norm": 0.2417579603003613, "learning_rate": 3.227513227513227e-05, "loss": 0.4236, "step": 969 }, { "epoch": 1.2582738481505515, "grad_norm": 0.21950714489690643, "learning_rate": 3.2251082251082254e-05, "loss": 0.4319, "step": 970 }, { "epoch": 1.2595717066839716, "grad_norm": 0.248967863409087, "learning_rate": 3.222703222703223e-05, "loss": 0.414, "step": 971 }, { "epoch": 1.2608695652173914, "grad_norm": 0.2320971622059916, "learning_rate": 3.220298220298221e-05, "loss": 0.4396, "step": 972 }, { "epoch": 1.2621674237508111, "grad_norm": 0.23309901348515835, "learning_rate": 3.217893217893218e-05, "loss": 0.423, "step": 973 }, { "epoch": 1.263465282284231, "grad_norm": 0.2227973619365033, "learning_rate": 3.215488215488215e-05, "loss": 0.4256, "step": 974 }, { "epoch": 1.264763140817651, "grad_norm": 0.245760384681702, "learning_rate": 3.2130832130832135e-05, "loss": 0.4326, "step": 975 }, { "epoch": 1.2660609993510707, "grad_norm": 0.2789130551480554, "learning_rate": 3.210678210678211e-05, "loss": 0.4158, "step": 976 }, { "epoch": 1.2673588578844905, "grad_norm": 0.24466337361794802, "learning_rate": 3.2082732082732084e-05, "loss": 0.4132, "step": 977 }, { "epoch": 1.2686567164179103, "grad_norm": 0.26994667632692604, "learning_rate": 3.205868205868206e-05, "loss": 0.4193, "step": 978 }, { "epoch": 1.2699545749513304, "grad_norm": 0.27229427206268936, "learning_rate": 3.2034632034632034e-05, "loss": 0.4438, "step": 979 }, { "epoch": 1.2712524334847501, "grad_norm": 0.2562325137645907, "learning_rate": 3.2010582010582015e-05, "loss": 0.4356, "step": 980 }, { "epoch": 1.27255029201817, "grad_norm": 0.25248121832020193, "learning_rate": 3.198653198653199e-05, "loss": 0.4143, "step": 981 }, { "epoch": 1.27384815055159, "grad_norm": 0.2656639346583922, "learning_rate": 3.1962481962481965e-05, "loss": 0.4116, "step": 982 }, { "epoch": 1.2751460090850097, "grad_norm": 0.22210045965124164, "learning_rate": 3.193843193843194e-05, "loss": 0.4194, "step": 983 }, { "epoch": 1.2764438676184295, "grad_norm": 0.29306826782606415, "learning_rate": 3.1914381914381914e-05, "loss": 0.4148, "step": 984 }, { "epoch": 1.2777417261518496, "grad_norm": 0.24612787413957143, "learning_rate": 3.1890331890331896e-05, "loss": 0.419, "step": 985 }, { "epoch": 1.2790395846852693, "grad_norm": 0.24144328983707405, "learning_rate": 3.1866281866281864e-05, "loss": 0.4412, "step": 986 }, { "epoch": 1.2803374432186891, "grad_norm": 0.24433831385926233, "learning_rate": 3.1842231842231846e-05, "loss": 0.4237, "step": 987 }, { "epoch": 1.2816353017521092, "grad_norm": 0.29652784983616687, "learning_rate": 3.181818181818182e-05, "loss": 0.4052, "step": 988 }, { "epoch": 1.282933160285529, "grad_norm": 0.22517076903481237, "learning_rate": 3.1794131794131795e-05, "loss": 0.437, "step": 989 }, { "epoch": 1.2842310188189487, "grad_norm": 0.2695004176632199, "learning_rate": 3.177008177008177e-05, "loss": 0.4051, "step": 990 }, { "epoch": 1.2855288773523685, "grad_norm": 0.2831795101586642, "learning_rate": 3.1746031746031745e-05, "loss": 0.4142, "step": 991 }, { "epoch": 1.2868267358857883, "grad_norm": 0.2305605000016992, "learning_rate": 3.1721981721981726e-05, "loss": 0.4318, "step": 992 }, { "epoch": 1.2881245944192083, "grad_norm": 0.27421133904277795, "learning_rate": 3.16979316979317e-05, "loss": 0.4303, "step": 993 }, { "epoch": 1.2894224529526281, "grad_norm": 0.28015539417195207, "learning_rate": 3.1673881673881676e-05, "loss": 0.4332, "step": 994 }, { "epoch": 1.290720311486048, "grad_norm": 0.2456153256375182, "learning_rate": 3.164983164983165e-05, "loss": 0.4377, "step": 995 }, { "epoch": 1.292018170019468, "grad_norm": 0.2875091277813538, "learning_rate": 3.1625781625781625e-05, "loss": 0.4306, "step": 996 }, { "epoch": 1.2933160285528877, "grad_norm": 0.24534524749679693, "learning_rate": 3.160173160173161e-05, "loss": 0.4389, "step": 997 }, { "epoch": 1.2946138870863075, "grad_norm": 0.2604745981282834, "learning_rate": 3.1577681577681575e-05, "loss": 0.4221, "step": 998 }, { "epoch": 1.2959117456197276, "grad_norm": 0.2519193028255322, "learning_rate": 3.1553631553631556e-05, "loss": 0.4613, "step": 999 }, { "epoch": 1.2972096041531473, "grad_norm": 0.28305211286597437, "learning_rate": 3.152958152958153e-05, "loss": 0.4364, "step": 1000 }, { "epoch": 1.2985074626865671, "grad_norm": 0.31302239991262554, "learning_rate": 3.1505531505531506e-05, "loss": 0.4315, "step": 1001 }, { "epoch": 1.2998053212199872, "grad_norm": 0.21929034373943385, "learning_rate": 3.148148148148148e-05, "loss": 0.4389, "step": 1002 }, { "epoch": 1.301103179753407, "grad_norm": 0.33967578917657276, "learning_rate": 3.1457431457431456e-05, "loss": 0.4245, "step": 1003 }, { "epoch": 1.3024010382868267, "grad_norm": 0.26773599652804664, "learning_rate": 3.143338143338144e-05, "loss": 0.4322, "step": 1004 }, { "epoch": 1.3036988968202465, "grad_norm": 0.3377829113222901, "learning_rate": 3.140933140933141e-05, "loss": 0.4156, "step": 1005 }, { "epoch": 1.3049967553536663, "grad_norm": 0.28733563773070486, "learning_rate": 3.1385281385281387e-05, "loss": 0.4153, "step": 1006 }, { "epoch": 1.3062946138870863, "grad_norm": 0.29064269118004, "learning_rate": 3.136123136123136e-05, "loss": 0.4248, "step": 1007 }, { "epoch": 1.3075924724205061, "grad_norm": 0.3578706488624722, "learning_rate": 3.1337181337181336e-05, "loss": 0.4103, "step": 1008 }, { "epoch": 1.308890330953926, "grad_norm": 0.2506804986945498, "learning_rate": 3.131313131313132e-05, "loss": 0.4455, "step": 1009 }, { "epoch": 1.310188189487346, "grad_norm": 0.26163932223113945, "learning_rate": 3.128908128908129e-05, "loss": 0.418, "step": 1010 }, { "epoch": 1.3114860480207657, "grad_norm": 0.2968145670132288, "learning_rate": 3.126503126503126e-05, "loss": 0.4419, "step": 1011 }, { "epoch": 1.3127839065541855, "grad_norm": 0.29985902672925774, "learning_rate": 3.124098124098124e-05, "loss": 0.4397, "step": 1012 }, { "epoch": 1.3140817650876055, "grad_norm": 0.2693824680991638, "learning_rate": 3.121693121693122e-05, "loss": 0.4477, "step": 1013 }, { "epoch": 1.3153796236210253, "grad_norm": 0.2703405621428287, "learning_rate": 3.11928811928812e-05, "loss": 0.4325, "step": 1014 }, { "epoch": 1.3166774821544451, "grad_norm": 0.2721038339798775, "learning_rate": 3.1168831168831166e-05, "loss": 0.4373, "step": 1015 }, { "epoch": 1.3179753406878651, "grad_norm": 0.26849320227585655, "learning_rate": 3.114478114478115e-05, "loss": 0.4246, "step": 1016 }, { "epoch": 1.319273199221285, "grad_norm": 0.28294666170474586, "learning_rate": 3.112073112073112e-05, "loss": 0.4405, "step": 1017 }, { "epoch": 1.3205710577547047, "grad_norm": 0.7257885246786743, "learning_rate": 3.10966810966811e-05, "loss": 0.419, "step": 1018 }, { "epoch": 1.3218689162881245, "grad_norm": 0.26474834284364107, "learning_rate": 3.107263107263107e-05, "loss": 0.4561, "step": 1019 }, { "epoch": 1.3231667748215443, "grad_norm": 0.2836196696373746, "learning_rate": 3.104858104858105e-05, "loss": 0.4187, "step": 1020 }, { "epoch": 1.3244646333549643, "grad_norm": 0.2978444678442113, "learning_rate": 3.102453102453103e-05, "loss": 0.4245, "step": 1021 }, { "epoch": 1.3257624918883841, "grad_norm": 0.27039863643097406, "learning_rate": 3.1000481000481e-05, "loss": 0.4492, "step": 1022 }, { "epoch": 1.327060350421804, "grad_norm": 0.28245910629768817, "learning_rate": 3.097643097643098e-05, "loss": 0.4205, "step": 1023 }, { "epoch": 1.328358208955224, "grad_norm": 0.235926922135542, "learning_rate": 3.095238095238095e-05, "loss": 0.4176, "step": 1024 }, { "epoch": 1.3296560674886437, "grad_norm": 0.24582624312732296, "learning_rate": 3.092833092833093e-05, "loss": 0.4409, "step": 1025 }, { "epoch": 1.3309539260220635, "grad_norm": 0.25618143979663144, "learning_rate": 3.090428090428091e-05, "loss": 0.4411, "step": 1026 }, { "epoch": 1.3322517845554835, "grad_norm": 0.3044844628215498, "learning_rate": 3.0880230880230884e-05, "loss": 0.4628, "step": 1027 }, { "epoch": 1.3335496430889033, "grad_norm": 0.25846935424140755, "learning_rate": 3.085618085618085e-05, "loss": 0.431, "step": 1028 }, { "epoch": 1.3348475016223231, "grad_norm": 0.2299843153442688, "learning_rate": 3.0832130832130834e-05, "loss": 0.4091, "step": 1029 }, { "epoch": 1.3361453601557431, "grad_norm": 0.2461059374323843, "learning_rate": 3.080808080808081e-05, "loss": 0.4095, "step": 1030 }, { "epoch": 1.337443218689163, "grad_norm": 0.2591709406665044, "learning_rate": 3.078403078403079e-05, "loss": 0.4311, "step": 1031 }, { "epoch": 1.3387410772225827, "grad_norm": 0.22916233734032224, "learning_rate": 3.075998075998076e-05, "loss": 0.4485, "step": 1032 }, { "epoch": 1.3400389357560025, "grad_norm": 0.2600469770283129, "learning_rate": 3.073593073593073e-05, "loss": 0.4356, "step": 1033 }, { "epoch": 1.3413367942894223, "grad_norm": 0.24066487258683386, "learning_rate": 3.0711880711880714e-05, "loss": 0.4185, "step": 1034 }, { "epoch": 1.3426346528228423, "grad_norm": 0.27145997155071, "learning_rate": 3.068783068783069e-05, "loss": 0.4145, "step": 1035 }, { "epoch": 1.3439325113562621, "grad_norm": 0.22303833825329822, "learning_rate": 3.0663780663780664e-05, "loss": 0.4544, "step": 1036 }, { "epoch": 1.345230369889682, "grad_norm": 0.2631308244918525, "learning_rate": 3.063973063973064e-05, "loss": 0.4171, "step": 1037 }, { "epoch": 1.346528228423102, "grad_norm": 0.24013765031743725, "learning_rate": 3.061568061568062e-05, "loss": 0.4162, "step": 1038 }, { "epoch": 1.3478260869565217, "grad_norm": 0.2563444998106366, "learning_rate": 3.0591630591630595e-05, "loss": 0.4228, "step": 1039 }, { "epoch": 1.3491239454899415, "grad_norm": 0.26755087682195894, "learning_rate": 3.056758056758057e-05, "loss": 0.4393, "step": 1040 }, { "epoch": 1.3504218040233615, "grad_norm": 0.2651622032346258, "learning_rate": 3.0543530543530544e-05, "loss": 0.4149, "step": 1041 }, { "epoch": 1.3517196625567813, "grad_norm": 0.25334754508930496, "learning_rate": 3.051948051948052e-05, "loss": 0.3989, "step": 1042 }, { "epoch": 1.3530175210902011, "grad_norm": 0.23721003539310276, "learning_rate": 3.04954304954305e-05, "loss": 0.4221, "step": 1043 }, { "epoch": 1.3543153796236211, "grad_norm": 0.23037112505555338, "learning_rate": 3.0471380471380472e-05, "loss": 0.4182, "step": 1044 }, { "epoch": 1.355613238157041, "grad_norm": 0.2436111206549788, "learning_rate": 3.0447330447330447e-05, "loss": 0.4109, "step": 1045 }, { "epoch": 1.3569110966904607, "grad_norm": 0.24241659874410296, "learning_rate": 3.0423280423280425e-05, "loss": 0.4223, "step": 1046 }, { "epoch": 1.3582089552238805, "grad_norm": 0.25749535429929715, "learning_rate": 3.03992303992304e-05, "loss": 0.4182, "step": 1047 }, { "epoch": 1.3595068137573005, "grad_norm": 0.22344908087958898, "learning_rate": 3.0375180375180378e-05, "loss": 0.4321, "step": 1048 }, { "epoch": 1.3608046722907203, "grad_norm": 0.23197019372372432, "learning_rate": 3.0351130351130353e-05, "loss": 0.4399, "step": 1049 }, { "epoch": 1.3621025308241401, "grad_norm": 0.237479688071686, "learning_rate": 3.0327080327080328e-05, "loss": 0.4395, "step": 1050 }, { "epoch": 1.36340038935756, "grad_norm": 0.2607112703183096, "learning_rate": 3.0303030303030306e-05, "loss": 0.4379, "step": 1051 }, { "epoch": 1.36469824789098, "grad_norm": 0.2367521283120222, "learning_rate": 3.027898027898028e-05, "loss": 0.4102, "step": 1052 }, { "epoch": 1.3659961064243997, "grad_norm": 0.29194250880671824, "learning_rate": 3.025493025493026e-05, "loss": 0.443, "step": 1053 }, { "epoch": 1.3672939649578195, "grad_norm": 0.26648481874523033, "learning_rate": 3.0230880230880233e-05, "loss": 0.434, "step": 1054 }, { "epoch": 1.3685918234912395, "grad_norm": 0.26285311775511155, "learning_rate": 3.0206830206830205e-05, "loss": 0.4094, "step": 1055 }, { "epoch": 1.3698896820246593, "grad_norm": 0.2564406034453145, "learning_rate": 3.0182780182780186e-05, "loss": 0.4398, "step": 1056 }, { "epoch": 1.3711875405580791, "grad_norm": 0.2796151000660689, "learning_rate": 3.0158730158730158e-05, "loss": 0.4276, "step": 1057 }, { "epoch": 1.3724853990914991, "grad_norm": 0.26631625570118417, "learning_rate": 3.013468013468014e-05, "loss": 0.4343, "step": 1058 }, { "epoch": 1.373783257624919, "grad_norm": 0.23806771079205116, "learning_rate": 3.011063011063011e-05, "loss": 0.4334, "step": 1059 }, { "epoch": 1.3750811161583387, "grad_norm": 0.27717084828254007, "learning_rate": 3.0086580086580092e-05, "loss": 0.4445, "step": 1060 }, { "epoch": 1.3763789746917585, "grad_norm": 0.25955745890460635, "learning_rate": 3.0062530062530064e-05, "loss": 0.4339, "step": 1061 }, { "epoch": 1.3776768332251785, "grad_norm": 0.27833631860695396, "learning_rate": 3.003848003848004e-05, "loss": 0.4236, "step": 1062 }, { "epoch": 1.3789746917585983, "grad_norm": 0.23547677910993067, "learning_rate": 3.0014430014430017e-05, "loss": 0.4451, "step": 1063 }, { "epoch": 1.3802725502920181, "grad_norm": 0.2818218077693615, "learning_rate": 2.999037999037999e-05, "loss": 0.444, "step": 1064 }, { "epoch": 1.381570408825438, "grad_norm": 0.22221888861761538, "learning_rate": 2.996632996632997e-05, "loss": 0.403, "step": 1065 }, { "epoch": 1.382868267358858, "grad_norm": 0.2445641007844883, "learning_rate": 2.9942279942279944e-05, "loss": 0.4163, "step": 1066 }, { "epoch": 1.3841661258922777, "grad_norm": 0.24596043479534688, "learning_rate": 2.991822991822992e-05, "loss": 0.4227, "step": 1067 }, { "epoch": 1.3854639844256975, "grad_norm": 0.2278867553746751, "learning_rate": 2.9894179894179897e-05, "loss": 0.4252, "step": 1068 }, { "epoch": 1.3867618429591175, "grad_norm": 0.2275831703154012, "learning_rate": 2.9870129870129872e-05, "loss": 0.4242, "step": 1069 }, { "epoch": 1.3880597014925373, "grad_norm": 0.2792734033461531, "learning_rate": 2.984607984607985e-05, "loss": 0.4117, "step": 1070 }, { "epoch": 1.3893575600259571, "grad_norm": 0.22328171573584032, "learning_rate": 2.9822029822029825e-05, "loss": 0.4115, "step": 1071 }, { "epoch": 1.3906554185593771, "grad_norm": 0.2554385321095662, "learning_rate": 2.9797979797979796e-05, "loss": 0.4193, "step": 1072 }, { "epoch": 1.391953277092797, "grad_norm": 0.23411526096024632, "learning_rate": 2.9773929773929778e-05, "loss": 0.401, "step": 1073 }, { "epoch": 1.3932511356262167, "grad_norm": 0.2417797864860212, "learning_rate": 2.974987974987975e-05, "loss": 0.4175, "step": 1074 }, { "epoch": 1.3945489941596367, "grad_norm": 0.22909915502634573, "learning_rate": 2.972582972582973e-05, "loss": 0.4145, "step": 1075 }, { "epoch": 1.3958468526930565, "grad_norm": 0.24654124988803922, "learning_rate": 2.9701779701779702e-05, "loss": 0.4363, "step": 1076 }, { "epoch": 1.3971447112264763, "grad_norm": 0.23841085503876774, "learning_rate": 2.9677729677729677e-05, "loss": 0.4263, "step": 1077 }, { "epoch": 1.3984425697598961, "grad_norm": 0.2372620155067687, "learning_rate": 2.9653679653679655e-05, "loss": 0.4164, "step": 1078 }, { "epoch": 1.399740428293316, "grad_norm": 0.2174655800712752, "learning_rate": 2.962962962962963e-05, "loss": 0.4131, "step": 1079 }, { "epoch": 1.401038286826736, "grad_norm": 0.25800767555211196, "learning_rate": 2.9605579605579608e-05, "loss": 0.4353, "step": 1080 }, { "epoch": 1.4023361453601557, "grad_norm": 0.22921820584725752, "learning_rate": 2.9581529581529583e-05, "loss": 0.4048, "step": 1081 }, { "epoch": 1.4036340038935755, "grad_norm": 0.23402586035822698, "learning_rate": 2.955747955747956e-05, "loss": 0.4424, "step": 1082 }, { "epoch": 1.4049318624269955, "grad_norm": 0.2352437377010652, "learning_rate": 2.9533429533429536e-05, "loss": 0.4026, "step": 1083 }, { "epoch": 1.4062297209604153, "grad_norm": 0.2247412452603025, "learning_rate": 2.950937950937951e-05, "loss": 0.4336, "step": 1084 }, { "epoch": 1.4075275794938351, "grad_norm": 0.22277806225908384, "learning_rate": 2.948532948532949e-05, "loss": 0.4114, "step": 1085 }, { "epoch": 1.4088254380272551, "grad_norm": 0.241538295583898, "learning_rate": 2.946127946127946e-05, "loss": 0.4394, "step": 1086 }, { "epoch": 1.410123296560675, "grad_norm": 0.2610774457770794, "learning_rate": 2.943722943722944e-05, "loss": 0.4078, "step": 1087 }, { "epoch": 1.4114211550940947, "grad_norm": 0.28762989292521274, "learning_rate": 2.9413179413179413e-05, "loss": 0.4109, "step": 1088 }, { "epoch": 1.4127190136275147, "grad_norm": 0.22764926899208376, "learning_rate": 2.9389129389129388e-05, "loss": 0.4113, "step": 1089 }, { "epoch": 1.4140168721609345, "grad_norm": 0.2951748901233817, "learning_rate": 2.9365079365079366e-05, "loss": 0.4112, "step": 1090 }, { "epoch": 1.4153147306943543, "grad_norm": 0.2491523290558, "learning_rate": 2.934102934102934e-05, "loss": 0.4061, "step": 1091 }, { "epoch": 1.416612589227774, "grad_norm": 0.24774585578620387, "learning_rate": 2.931697931697932e-05, "loss": 0.4208, "step": 1092 }, { "epoch": 1.417910447761194, "grad_norm": 0.2383954293588994, "learning_rate": 2.9292929292929294e-05, "loss": 0.4243, "step": 1093 }, { "epoch": 1.419208306294614, "grad_norm": 0.2551439406874695, "learning_rate": 2.926887926887927e-05, "loss": 0.4254, "step": 1094 }, { "epoch": 1.4205061648280337, "grad_norm": 0.24117464674091157, "learning_rate": 2.9244829244829247e-05, "loss": 0.4082, "step": 1095 }, { "epoch": 1.4218040233614535, "grad_norm": 0.216503714519975, "learning_rate": 2.922077922077922e-05, "loss": 0.4242, "step": 1096 }, { "epoch": 1.4231018818948735, "grad_norm": 0.2422351302114647, "learning_rate": 2.91967291967292e-05, "loss": 0.4181, "step": 1097 }, { "epoch": 1.4243997404282933, "grad_norm": 0.22788810318626124, "learning_rate": 2.9172679172679174e-05, "loss": 0.4203, "step": 1098 }, { "epoch": 1.425697598961713, "grad_norm": 0.24325054021695863, "learning_rate": 2.9148629148629146e-05, "loss": 0.423, "step": 1099 }, { "epoch": 1.4269954574951331, "grad_norm": 0.23344217723128893, "learning_rate": 2.9124579124579127e-05, "loss": 0.4162, "step": 1100 }, { "epoch": 1.428293316028553, "grad_norm": 0.2543879197815251, "learning_rate": 2.91005291005291e-05, "loss": 0.4376, "step": 1101 }, { "epoch": 1.4295911745619727, "grad_norm": 0.2501682851968916, "learning_rate": 2.907647907647908e-05, "loss": 0.4164, "step": 1102 }, { "epoch": 1.4308890330953927, "grad_norm": 0.21492688667239696, "learning_rate": 2.905242905242905e-05, "loss": 0.4113, "step": 1103 }, { "epoch": 1.4321868916288125, "grad_norm": 0.2658354623358409, "learning_rate": 2.9028379028379033e-05, "loss": 0.4315, "step": 1104 }, { "epoch": 1.4334847501622323, "grad_norm": 0.2831492165913101, "learning_rate": 2.9004329004329005e-05, "loss": 0.4338, "step": 1105 }, { "epoch": 1.434782608695652, "grad_norm": 0.22471574523786844, "learning_rate": 2.898027898027898e-05, "loss": 0.4157, "step": 1106 }, { "epoch": 1.436080467229072, "grad_norm": 0.24385415494263882, "learning_rate": 2.8956228956228958e-05, "loss": 0.4172, "step": 1107 }, { "epoch": 1.437378325762492, "grad_norm": 0.2722198360023313, "learning_rate": 2.8932178932178932e-05, "loss": 0.407, "step": 1108 }, { "epoch": 1.4386761842959117, "grad_norm": 0.21019395837235147, "learning_rate": 2.890812890812891e-05, "loss": 0.4189, "step": 1109 }, { "epoch": 1.4399740428293315, "grad_norm": 0.23984115871051997, "learning_rate": 2.8884078884078885e-05, "loss": 0.4261, "step": 1110 }, { "epoch": 1.4412719013627515, "grad_norm": 0.25738974208155574, "learning_rate": 2.886002886002886e-05, "loss": 0.4694, "step": 1111 }, { "epoch": 1.4425697598961713, "grad_norm": 0.2739742760545878, "learning_rate": 2.8835978835978838e-05, "loss": 0.4284, "step": 1112 }, { "epoch": 1.443867618429591, "grad_norm": 0.2563615388623274, "learning_rate": 2.8811928811928813e-05, "loss": 0.4187, "step": 1113 }, { "epoch": 1.4451654769630111, "grad_norm": 0.2355873474417628, "learning_rate": 2.878787878787879e-05, "loss": 0.4201, "step": 1114 }, { "epoch": 1.446463335496431, "grad_norm": 0.3037078461896459, "learning_rate": 2.8763828763828766e-05, "loss": 0.4164, "step": 1115 }, { "epoch": 1.4477611940298507, "grad_norm": 0.2778889853166693, "learning_rate": 2.8739778739778737e-05, "loss": 0.4263, "step": 1116 }, { "epoch": 1.4490590525632707, "grad_norm": 0.25304453875189337, "learning_rate": 2.871572871572872e-05, "loss": 0.4238, "step": 1117 }, { "epoch": 1.4503569110966905, "grad_norm": 0.2617845594600046, "learning_rate": 2.869167869167869e-05, "loss": 0.4325, "step": 1118 }, { "epoch": 1.4516547696301103, "grad_norm": 0.28455794197858575, "learning_rate": 2.8667628667628672e-05, "loss": 0.4156, "step": 1119 }, { "epoch": 1.45295262816353, "grad_norm": 0.23740191563596869, "learning_rate": 2.8643578643578643e-05, "loss": 0.4261, "step": 1120 }, { "epoch": 1.45425048669695, "grad_norm": 0.24072843737266889, "learning_rate": 2.8619528619528618e-05, "loss": 0.4151, "step": 1121 }, { "epoch": 1.45554834523037, "grad_norm": 0.2700405419220064, "learning_rate": 2.8595478595478596e-05, "loss": 0.4252, "step": 1122 }, { "epoch": 1.4568462037637897, "grad_norm": 0.2488359814096937, "learning_rate": 2.857142857142857e-05, "loss": 0.4428, "step": 1123 }, { "epoch": 1.4581440622972095, "grad_norm": 0.23487079361910798, "learning_rate": 2.854737854737855e-05, "loss": 0.4216, "step": 1124 }, { "epoch": 1.4594419208306295, "grad_norm": 0.2466576788103327, "learning_rate": 2.8523328523328524e-05, "loss": 0.4226, "step": 1125 }, { "epoch": 1.4607397793640493, "grad_norm": 0.2391996649480345, "learning_rate": 2.8499278499278502e-05, "loss": 0.4096, "step": 1126 }, { "epoch": 1.462037637897469, "grad_norm": 0.24258796808928063, "learning_rate": 2.8475228475228477e-05, "loss": 0.4145, "step": 1127 }, { "epoch": 1.4633354964308891, "grad_norm": 0.2560406117346898, "learning_rate": 2.845117845117845e-05, "loss": 0.4312, "step": 1128 }, { "epoch": 1.464633354964309, "grad_norm": 0.2861667925744788, "learning_rate": 2.842712842712843e-05, "loss": 0.4328, "step": 1129 }, { "epoch": 1.4659312134977287, "grad_norm": 0.25402106108095945, "learning_rate": 2.8403078403078404e-05, "loss": 0.4258, "step": 1130 }, { "epoch": 1.4672290720311487, "grad_norm": 0.24075563965908323, "learning_rate": 2.8379028379028383e-05, "loss": 0.4397, "step": 1131 }, { "epoch": 1.4685269305645685, "grad_norm": 0.28522807177447185, "learning_rate": 2.8354978354978357e-05, "loss": 0.4303, "step": 1132 }, { "epoch": 1.4698247890979883, "grad_norm": 0.26004049344709895, "learning_rate": 2.833092833092833e-05, "loss": 0.4294, "step": 1133 }, { "epoch": 1.471122647631408, "grad_norm": 0.29853418709486346, "learning_rate": 2.830687830687831e-05, "loss": 0.4323, "step": 1134 }, { "epoch": 1.4724205061648281, "grad_norm": 0.2633187635030568, "learning_rate": 2.8282828282828282e-05, "loss": 0.439, "step": 1135 }, { "epoch": 1.473718364698248, "grad_norm": 0.3157910451013907, "learning_rate": 2.8258778258778263e-05, "loss": 0.4337, "step": 1136 }, { "epoch": 1.4750162232316677, "grad_norm": 0.27203466331005977, "learning_rate": 2.8234728234728235e-05, "loss": 0.4198, "step": 1137 }, { "epoch": 1.4763140817650875, "grad_norm": 0.2508023724498563, "learning_rate": 2.821067821067821e-05, "loss": 0.4267, "step": 1138 }, { "epoch": 1.4776119402985075, "grad_norm": 0.2808852939089359, "learning_rate": 2.8186628186628188e-05, "loss": 0.4281, "step": 1139 }, { "epoch": 1.4789097988319273, "grad_norm": 0.24576060851892864, "learning_rate": 2.8162578162578162e-05, "loss": 0.4206, "step": 1140 }, { "epoch": 1.480207657365347, "grad_norm": 0.29052064234787545, "learning_rate": 2.813852813852814e-05, "loss": 0.413, "step": 1141 }, { "epoch": 1.4815055158987671, "grad_norm": 0.23398439998245094, "learning_rate": 2.8114478114478115e-05, "loss": 0.4352, "step": 1142 }, { "epoch": 1.482803374432187, "grad_norm": 0.25678491256047153, "learning_rate": 2.809042809042809e-05, "loss": 0.4346, "step": 1143 }, { "epoch": 1.4841012329656067, "grad_norm": 0.2826048101734635, "learning_rate": 2.8066378066378068e-05, "loss": 0.4262, "step": 1144 }, { "epoch": 1.4853990914990267, "grad_norm": 0.25015775061708817, "learning_rate": 2.8042328042328043e-05, "loss": 0.4129, "step": 1145 }, { "epoch": 1.4866969500324465, "grad_norm": 0.2561977557181458, "learning_rate": 2.801827801827802e-05, "loss": 0.417, "step": 1146 }, { "epoch": 1.4879948085658663, "grad_norm": 0.25036370521793533, "learning_rate": 2.7994227994227996e-05, "loss": 0.4079, "step": 1147 }, { "epoch": 1.4892926670992863, "grad_norm": 0.28901223175805674, "learning_rate": 2.7970177970177974e-05, "loss": 0.4094, "step": 1148 }, { "epoch": 1.490590525632706, "grad_norm": 0.23134484811007663, "learning_rate": 2.794612794612795e-05, "loss": 0.4476, "step": 1149 }, { "epoch": 1.491888384166126, "grad_norm": 0.25137689970727467, "learning_rate": 2.792207792207792e-05, "loss": 0.4416, "step": 1150 }, { "epoch": 1.4931862426995457, "grad_norm": 0.2524284266331274, "learning_rate": 2.7898027898027902e-05, "loss": 0.4295, "step": 1151 }, { "epoch": 1.4944841012329655, "grad_norm": 0.22266682751444122, "learning_rate": 2.7873977873977873e-05, "loss": 0.4115, "step": 1152 }, { "epoch": 1.4957819597663855, "grad_norm": 0.2085505106465029, "learning_rate": 2.7849927849927855e-05, "loss": 0.4271, "step": 1153 }, { "epoch": 1.4970798182998053, "grad_norm": 0.2352572065506912, "learning_rate": 2.7825877825877826e-05, "loss": 0.4129, "step": 1154 }, { "epoch": 1.498377676833225, "grad_norm": 0.2322270923460416, "learning_rate": 2.78018278018278e-05, "loss": 0.4404, "step": 1155 }, { "epoch": 1.499675535366645, "grad_norm": 0.20327079840186968, "learning_rate": 2.777777777777778e-05, "loss": 0.3992, "step": 1156 }, { "epoch": 1.500973393900065, "grad_norm": 0.22409767153079405, "learning_rate": 2.7753727753727754e-05, "loss": 0.4351, "step": 1157 }, { "epoch": 1.5022712524334847, "grad_norm": 0.21789564363803948, "learning_rate": 2.7729677729677732e-05, "loss": 0.4263, "step": 1158 }, { "epoch": 1.5035691109669047, "grad_norm": 0.23289144485137522, "learning_rate": 2.7705627705627707e-05, "loss": 0.4296, "step": 1159 }, { "epoch": 1.5048669695003245, "grad_norm": 0.22790992420912343, "learning_rate": 2.768157768157768e-05, "loss": 0.4275, "step": 1160 }, { "epoch": 1.5061648280337443, "grad_norm": 0.2180550660231808, "learning_rate": 2.765752765752766e-05, "loss": 0.4242, "step": 1161 }, { "epoch": 1.5074626865671643, "grad_norm": 0.23490836544769822, "learning_rate": 2.7633477633477635e-05, "loss": 0.43, "step": 1162 }, { "epoch": 1.5087605451005839, "grad_norm": 0.22192430223007012, "learning_rate": 2.7609427609427613e-05, "loss": 0.429, "step": 1163 }, { "epoch": 1.510058403634004, "grad_norm": 0.2311428363766677, "learning_rate": 2.7585377585377587e-05, "loss": 0.4414, "step": 1164 }, { "epoch": 1.511356262167424, "grad_norm": 0.2001408824188442, "learning_rate": 2.756132756132756e-05, "loss": 0.4225, "step": 1165 }, { "epoch": 1.5126541207008435, "grad_norm": 0.2279619365636095, "learning_rate": 2.753727753727754e-05, "loss": 0.4149, "step": 1166 }, { "epoch": 1.5139519792342635, "grad_norm": 0.24302952170040615, "learning_rate": 2.7513227513227512e-05, "loss": 0.4153, "step": 1167 }, { "epoch": 1.5152498377676833, "grad_norm": 0.2252876935922963, "learning_rate": 2.7489177489177493e-05, "loss": 0.4135, "step": 1168 }, { "epoch": 1.516547696301103, "grad_norm": 0.22162900859128726, "learning_rate": 2.7465127465127465e-05, "loss": 0.4283, "step": 1169 }, { "epoch": 1.517845554834523, "grad_norm": 0.23728269459202284, "learning_rate": 2.7441077441077446e-05, "loss": 0.4284, "step": 1170 }, { "epoch": 1.519143413367943, "grad_norm": 0.2073068726020532, "learning_rate": 2.7417027417027418e-05, "loss": 0.419, "step": 1171 }, { "epoch": 1.5204412719013627, "grad_norm": 0.2308870482056988, "learning_rate": 2.7392977392977392e-05, "loss": 0.4409, "step": 1172 }, { "epoch": 1.5217391304347827, "grad_norm": 0.21898533880032697, "learning_rate": 2.736892736892737e-05, "loss": 0.4171, "step": 1173 }, { "epoch": 1.5230369889682025, "grad_norm": 0.21000995819843474, "learning_rate": 2.7344877344877345e-05, "loss": 0.417, "step": 1174 }, { "epoch": 1.5243348475016223, "grad_norm": 0.2150245170655777, "learning_rate": 2.7320827320827324e-05, "loss": 0.4365, "step": 1175 }, { "epoch": 1.5256327060350423, "grad_norm": 0.24290565598308295, "learning_rate": 2.72967772967773e-05, "loss": 0.4201, "step": 1176 }, { "epoch": 1.5269305645684619, "grad_norm": 0.2304464719146474, "learning_rate": 2.7272727272727273e-05, "loss": 0.4149, "step": 1177 }, { "epoch": 1.528228423101882, "grad_norm": 0.23523933221515506, "learning_rate": 2.724867724867725e-05, "loss": 0.453, "step": 1178 }, { "epoch": 1.529526281635302, "grad_norm": 0.2253825255615944, "learning_rate": 2.7224627224627226e-05, "loss": 0.4209, "step": 1179 }, { "epoch": 1.5308241401687215, "grad_norm": 0.2742775834013937, "learning_rate": 2.7200577200577204e-05, "loss": 0.442, "step": 1180 }, { "epoch": 1.5321219987021415, "grad_norm": 0.2176528388600847, "learning_rate": 2.717652717652718e-05, "loss": 0.4329, "step": 1181 }, { "epoch": 1.5334198572355613, "grad_norm": 0.23818415433225926, "learning_rate": 2.715247715247715e-05, "loss": 0.4187, "step": 1182 }, { "epoch": 1.534717715768981, "grad_norm": 0.26109881547859903, "learning_rate": 2.7128427128427132e-05, "loss": 0.4251, "step": 1183 }, { "epoch": 1.536015574302401, "grad_norm": 0.2196942384869763, "learning_rate": 2.7104377104377103e-05, "loss": 0.418, "step": 1184 }, { "epoch": 1.537313432835821, "grad_norm": 0.2400322015222156, "learning_rate": 2.7080327080327085e-05, "loss": 0.4109, "step": 1185 }, { "epoch": 1.5386112913692407, "grad_norm": 0.23150552647711828, "learning_rate": 2.7056277056277056e-05, "loss": 0.4264, "step": 1186 }, { "epoch": 1.5399091499026607, "grad_norm": 0.22005403208488783, "learning_rate": 2.703222703222703e-05, "loss": 0.4039, "step": 1187 }, { "epoch": 1.5412070084360805, "grad_norm": 0.22581597634393283, "learning_rate": 2.700817700817701e-05, "loss": 0.425, "step": 1188 }, { "epoch": 1.5425048669695003, "grad_norm": 0.2382695341310579, "learning_rate": 2.6984126984126984e-05, "loss": 0.4496, "step": 1189 }, { "epoch": 1.5438027255029203, "grad_norm": 0.2203961917107305, "learning_rate": 2.6960076960076962e-05, "loss": 0.4155, "step": 1190 }, { "epoch": 1.5451005840363399, "grad_norm": 0.25210372953982285, "learning_rate": 2.6936026936026937e-05, "loss": 0.4342, "step": 1191 }, { "epoch": 1.54639844256976, "grad_norm": 0.238604523146027, "learning_rate": 2.691197691197691e-05, "loss": 0.4323, "step": 1192 }, { "epoch": 1.54769630110318, "grad_norm": 0.23138471132633792, "learning_rate": 2.688792688792689e-05, "loss": 0.4242, "step": 1193 }, { "epoch": 1.5489941596365995, "grad_norm": 0.2320529813667351, "learning_rate": 2.6863876863876865e-05, "loss": 0.4363, "step": 1194 }, { "epoch": 1.5502920181700195, "grad_norm": 0.22679612862184145, "learning_rate": 2.6839826839826843e-05, "loss": 0.4253, "step": 1195 }, { "epoch": 1.5515898767034393, "grad_norm": 0.2665688161045152, "learning_rate": 2.6815776815776818e-05, "loss": 0.4222, "step": 1196 }, { "epoch": 1.552887735236859, "grad_norm": 0.21178913986030537, "learning_rate": 2.6791726791726796e-05, "loss": 0.422, "step": 1197 }, { "epoch": 1.554185593770279, "grad_norm": 0.24464931528999015, "learning_rate": 2.676767676767677e-05, "loss": 0.4241, "step": 1198 }, { "epoch": 1.5554834523036989, "grad_norm": 0.22319718290311183, "learning_rate": 2.6743626743626742e-05, "loss": 0.4168, "step": 1199 }, { "epoch": 1.5567813108371187, "grad_norm": 0.2302808693777694, "learning_rate": 2.6719576719576723e-05, "loss": 0.41, "step": 1200 }, { "epoch": 1.5580791693705387, "grad_norm": 0.2317544115600513, "learning_rate": 2.6695526695526695e-05, "loss": 0.4555, "step": 1201 }, { "epoch": 1.5593770279039585, "grad_norm": 0.2554067046842974, "learning_rate": 2.6671476671476676e-05, "loss": 0.4075, "step": 1202 }, { "epoch": 1.5606748864373783, "grad_norm": 0.23832375199078534, "learning_rate": 2.6647426647426648e-05, "loss": 0.4149, "step": 1203 }, { "epoch": 1.5619727449707983, "grad_norm": 0.2387421511606967, "learning_rate": 2.6623376623376623e-05, "loss": 0.4349, "step": 1204 }, { "epoch": 1.563270603504218, "grad_norm": 0.24466721743899011, "learning_rate": 2.65993265993266e-05, "loss": 0.4158, "step": 1205 }, { "epoch": 1.5645684620376379, "grad_norm": 0.23174513616055498, "learning_rate": 2.6575276575276575e-05, "loss": 0.4443, "step": 1206 }, { "epoch": 1.565866320571058, "grad_norm": 0.218582346579111, "learning_rate": 2.6551226551226554e-05, "loss": 0.4228, "step": 1207 }, { "epoch": 1.5671641791044775, "grad_norm": 0.23236180907143378, "learning_rate": 2.652717652717653e-05, "loss": 0.4198, "step": 1208 }, { "epoch": 1.5684620376378975, "grad_norm": 0.2461597550351122, "learning_rate": 2.6503126503126503e-05, "loss": 0.4388, "step": 1209 }, { "epoch": 1.5697598961713173, "grad_norm": 0.24135274528584466, "learning_rate": 2.647907647907648e-05, "loss": 0.4182, "step": 1210 }, { "epoch": 1.571057754704737, "grad_norm": 0.23011430180334824, "learning_rate": 2.6455026455026456e-05, "loss": 0.4345, "step": 1211 }, { "epoch": 1.572355613238157, "grad_norm": 0.25813925411615873, "learning_rate": 2.6430976430976434e-05, "loss": 0.4152, "step": 1212 }, { "epoch": 1.5736534717715769, "grad_norm": 0.2361569395941438, "learning_rate": 2.640692640692641e-05, "loss": 0.4107, "step": 1213 }, { "epoch": 1.5749513303049967, "grad_norm": 0.26363884372789825, "learning_rate": 2.638287638287638e-05, "loss": 0.4392, "step": 1214 }, { "epoch": 1.5762491888384167, "grad_norm": 0.24244610329485705, "learning_rate": 2.6358826358826362e-05, "loss": 0.4164, "step": 1215 }, { "epoch": 1.5775470473718365, "grad_norm": 0.2552987758465308, "learning_rate": 2.6334776334776333e-05, "loss": 0.4339, "step": 1216 }, { "epoch": 1.5788449059052563, "grad_norm": 0.2622601300659554, "learning_rate": 2.6310726310726315e-05, "loss": 0.4081, "step": 1217 }, { "epoch": 1.5801427644386763, "grad_norm": 0.23435950487013313, "learning_rate": 2.6286676286676286e-05, "loss": 0.4266, "step": 1218 }, { "epoch": 1.581440622972096, "grad_norm": 0.31150362868262865, "learning_rate": 2.6262626262626268e-05, "loss": 0.4205, "step": 1219 }, { "epoch": 1.5827384815055159, "grad_norm": 0.2356568945579236, "learning_rate": 2.623857623857624e-05, "loss": 0.4235, "step": 1220 }, { "epoch": 1.584036340038936, "grad_norm": 0.2636851026847217, "learning_rate": 2.6214526214526214e-05, "loss": 0.4194, "step": 1221 }, { "epoch": 1.5853341985723555, "grad_norm": 0.2609824789762705, "learning_rate": 2.6190476190476192e-05, "loss": 0.4386, "step": 1222 }, { "epoch": 1.5866320571057755, "grad_norm": 0.2503475112982072, "learning_rate": 2.6166426166426167e-05, "loss": 0.4295, "step": 1223 }, { "epoch": 1.5879299156391953, "grad_norm": 0.2748789264904923, "learning_rate": 2.6142376142376145e-05, "loss": 0.431, "step": 1224 }, { "epoch": 1.589227774172615, "grad_norm": 0.2122856536086439, "learning_rate": 2.611832611832612e-05, "loss": 0.4151, "step": 1225 }, { "epoch": 1.590525632706035, "grad_norm": 0.2882371321327433, "learning_rate": 2.6094276094276095e-05, "loss": 0.4242, "step": 1226 }, { "epoch": 1.5918234912394549, "grad_norm": 0.22024360438567706, "learning_rate": 2.6070226070226073e-05, "loss": 0.4173, "step": 1227 }, { "epoch": 1.5931213497728747, "grad_norm": 0.23708353175014626, "learning_rate": 2.6046176046176048e-05, "loss": 0.4251, "step": 1228 }, { "epoch": 1.5944192083062947, "grad_norm": 0.2658200863217972, "learning_rate": 2.6022126022126026e-05, "loss": 0.4408, "step": 1229 }, { "epoch": 1.5957170668397145, "grad_norm": 0.21583066555363375, "learning_rate": 2.5998075998076e-05, "loss": 0.4191, "step": 1230 }, { "epoch": 1.5970149253731343, "grad_norm": 0.2777242614566809, "learning_rate": 2.5974025974025972e-05, "loss": 0.4393, "step": 1231 }, { "epoch": 1.5983127839065543, "grad_norm": 0.23219187892619703, "learning_rate": 2.5949975949975954e-05, "loss": 0.4265, "step": 1232 }, { "epoch": 1.599610642439974, "grad_norm": 0.29387387169794943, "learning_rate": 2.5925925925925925e-05, "loss": 0.4244, "step": 1233 }, { "epoch": 1.6009085009733939, "grad_norm": 0.22151955032464254, "learning_rate": 2.5901875901875906e-05, "loss": 0.4085, "step": 1234 }, { "epoch": 1.602206359506814, "grad_norm": 0.24242248455059523, "learning_rate": 2.5877825877825878e-05, "loss": 0.4285, "step": 1235 }, { "epoch": 1.6035042180402335, "grad_norm": 0.2621217435997206, "learning_rate": 2.5853775853775853e-05, "loss": 0.4379, "step": 1236 }, { "epoch": 1.6048020765736535, "grad_norm": 0.22823804591889496, "learning_rate": 2.582972582972583e-05, "loss": 0.4211, "step": 1237 }, { "epoch": 1.6060999351070735, "grad_norm": 0.2353758128022499, "learning_rate": 2.5805675805675806e-05, "loss": 0.4127, "step": 1238 }, { "epoch": 1.607397793640493, "grad_norm": 0.22811130965496038, "learning_rate": 2.5781625781625784e-05, "loss": 0.4246, "step": 1239 }, { "epoch": 1.608695652173913, "grad_norm": 0.2366434232412805, "learning_rate": 2.575757575757576e-05, "loss": 0.4109, "step": 1240 }, { "epoch": 1.6099935107073329, "grad_norm": 0.20375567441674386, "learning_rate": 2.5733525733525737e-05, "loss": 0.4146, "step": 1241 }, { "epoch": 1.6112913692407527, "grad_norm": 0.2436449942466404, "learning_rate": 2.570947570947571e-05, "loss": 0.4162, "step": 1242 }, { "epoch": 1.6125892277741727, "grad_norm": 0.22023021001348508, "learning_rate": 2.5685425685425686e-05, "loss": 0.4136, "step": 1243 }, { "epoch": 1.6138870863075925, "grad_norm": 0.2069116265186359, "learning_rate": 2.5661375661375664e-05, "loss": 0.3999, "step": 1244 }, { "epoch": 1.6151849448410123, "grad_norm": 0.24450308671714907, "learning_rate": 2.563732563732564e-05, "loss": 0.4352, "step": 1245 }, { "epoch": 1.6164828033744323, "grad_norm": 0.2361666753423955, "learning_rate": 2.5613275613275617e-05, "loss": 0.4215, "step": 1246 }, { "epoch": 1.617780661907852, "grad_norm": 0.24709753794900446, "learning_rate": 2.5589225589225592e-05, "loss": 0.4095, "step": 1247 }, { "epoch": 1.6190785204412719, "grad_norm": 0.22770211218246428, "learning_rate": 2.5565175565175563e-05, "loss": 0.4208, "step": 1248 }, { "epoch": 1.6203763789746919, "grad_norm": 0.24442985342584414, "learning_rate": 2.5541125541125545e-05, "loss": 0.4048, "step": 1249 }, { "epoch": 1.6216742375081115, "grad_norm": 0.2449341182023967, "learning_rate": 2.5517075517075516e-05, "loss": 0.429, "step": 1250 }, { "epoch": 1.6229720960415315, "grad_norm": 0.22314422338157636, "learning_rate": 2.5493025493025498e-05, "loss": 0.4161, "step": 1251 }, { "epoch": 1.6242699545749515, "grad_norm": 0.22271710889727703, "learning_rate": 2.546897546897547e-05, "loss": 0.4223, "step": 1252 }, { "epoch": 1.625567813108371, "grad_norm": 0.23943855813232637, "learning_rate": 2.5444925444925444e-05, "loss": 0.4263, "step": 1253 }, { "epoch": 1.626865671641791, "grad_norm": 0.22346829290932305, "learning_rate": 2.5420875420875422e-05, "loss": 0.4002, "step": 1254 }, { "epoch": 1.6281635301752109, "grad_norm": 0.21819410830608127, "learning_rate": 2.5396825396825397e-05, "loss": 0.4228, "step": 1255 }, { "epoch": 1.6294613887086307, "grad_norm": 0.2487542450136884, "learning_rate": 2.5372775372775375e-05, "loss": 0.431, "step": 1256 }, { "epoch": 1.6307592472420507, "grad_norm": 0.22276858066653343, "learning_rate": 2.534872534872535e-05, "loss": 0.3975, "step": 1257 }, { "epoch": 1.6320571057754705, "grad_norm": 0.20406534653386582, "learning_rate": 2.5324675324675325e-05, "loss": 0.4308, "step": 1258 }, { "epoch": 1.6333549643088903, "grad_norm": 0.2369459882014465, "learning_rate": 2.5300625300625303e-05, "loss": 0.4434, "step": 1259 }, { "epoch": 1.6346528228423103, "grad_norm": 0.23054872564198348, "learning_rate": 2.5276575276575278e-05, "loss": 0.4296, "step": 1260 }, { "epoch": 1.63595068137573, "grad_norm": 0.21314688817002478, "learning_rate": 2.5252525252525256e-05, "loss": 0.4234, "step": 1261 }, { "epoch": 1.6372485399091499, "grad_norm": 0.22937591574682323, "learning_rate": 2.522847522847523e-05, "loss": 0.425, "step": 1262 }, { "epoch": 1.6385463984425699, "grad_norm": 0.23974213218799267, "learning_rate": 2.520442520442521e-05, "loss": 0.4393, "step": 1263 }, { "epoch": 1.6398442569759895, "grad_norm": 0.23441342590653153, "learning_rate": 2.5180375180375184e-05, "loss": 0.4474, "step": 1264 }, { "epoch": 1.6411421155094095, "grad_norm": 0.22460634450789943, "learning_rate": 2.5156325156325155e-05, "loss": 0.4405, "step": 1265 }, { "epoch": 1.6424399740428295, "grad_norm": 0.21099257965853147, "learning_rate": 2.5132275132275137e-05, "loss": 0.4147, "step": 1266 }, { "epoch": 1.643737832576249, "grad_norm": 0.24160346011701583, "learning_rate": 2.5108225108225108e-05, "loss": 0.4397, "step": 1267 }, { "epoch": 1.645035691109669, "grad_norm": 0.21504387068528427, "learning_rate": 2.5084175084175086e-05, "loss": 0.4134, "step": 1268 }, { "epoch": 1.6463335496430889, "grad_norm": 0.20136235310740322, "learning_rate": 2.506012506012506e-05, "loss": 0.4352, "step": 1269 }, { "epoch": 1.6476314081765087, "grad_norm": 0.20297036044525715, "learning_rate": 2.5036075036075036e-05, "loss": 0.4181, "step": 1270 }, { "epoch": 1.6489292667099287, "grad_norm": 0.22303019507601843, "learning_rate": 2.5012025012025014e-05, "loss": 0.4028, "step": 1271 }, { "epoch": 1.6502271252433485, "grad_norm": 0.22166881997259968, "learning_rate": 2.498797498797499e-05, "loss": 0.4347, "step": 1272 }, { "epoch": 1.6515249837767683, "grad_norm": 0.20648838786480744, "learning_rate": 2.4963924963924963e-05, "loss": 0.4236, "step": 1273 }, { "epoch": 1.6528228423101883, "grad_norm": 0.23349839066379247, "learning_rate": 2.493987493987494e-05, "loss": 0.419, "step": 1274 }, { "epoch": 1.654120700843608, "grad_norm": 0.23063394385414213, "learning_rate": 2.4915824915824916e-05, "loss": 0.4152, "step": 1275 }, { "epoch": 1.6554185593770279, "grad_norm": 0.2190005315364852, "learning_rate": 2.4891774891774894e-05, "loss": 0.4115, "step": 1276 }, { "epoch": 1.6567164179104479, "grad_norm": 0.20078160624348626, "learning_rate": 2.4867724867724866e-05, "loss": 0.4245, "step": 1277 }, { "epoch": 1.6580142764438677, "grad_norm": 0.24133729159661466, "learning_rate": 2.4843674843674844e-05, "loss": 0.4293, "step": 1278 }, { "epoch": 1.6593121349772875, "grad_norm": 0.23794916923086656, "learning_rate": 2.481962481962482e-05, "loss": 0.4271, "step": 1279 }, { "epoch": 1.6606099935107075, "grad_norm": 0.2574981267536903, "learning_rate": 2.4795574795574797e-05, "loss": 0.447, "step": 1280 }, { "epoch": 1.661907852044127, "grad_norm": 0.23168835516119046, "learning_rate": 2.4771524771524772e-05, "loss": 0.4193, "step": 1281 }, { "epoch": 1.663205710577547, "grad_norm": 0.23742732882318857, "learning_rate": 2.474747474747475e-05, "loss": 0.4476, "step": 1282 }, { "epoch": 1.6645035691109669, "grad_norm": 0.2514178670168895, "learning_rate": 2.4723424723424725e-05, "loss": 0.4204, "step": 1283 }, { "epoch": 1.6658014276443867, "grad_norm": 0.24624362103838568, "learning_rate": 2.46993746993747e-05, "loss": 0.4212, "step": 1284 }, { "epoch": 1.6670992861778067, "grad_norm": 0.23275179943867672, "learning_rate": 2.4675324675324678e-05, "loss": 0.4312, "step": 1285 }, { "epoch": 1.6683971447112265, "grad_norm": 0.24951766353093746, "learning_rate": 2.4651274651274652e-05, "loss": 0.4068, "step": 1286 }, { "epoch": 1.6696950032446463, "grad_norm": 0.2052390086188538, "learning_rate": 2.462722462722463e-05, "loss": 0.4173, "step": 1287 }, { "epoch": 1.6709928617780663, "grad_norm": 0.23198703435419166, "learning_rate": 2.4603174603174602e-05, "loss": 0.4177, "step": 1288 }, { "epoch": 1.672290720311486, "grad_norm": 0.212107274947473, "learning_rate": 2.457912457912458e-05, "loss": 0.4166, "step": 1289 }, { "epoch": 1.6735885788449059, "grad_norm": 0.2322378778891487, "learning_rate": 2.4555074555074555e-05, "loss": 0.434, "step": 1290 }, { "epoch": 1.6748864373783259, "grad_norm": 0.21435317963286998, "learning_rate": 2.4531024531024533e-05, "loss": 0.4133, "step": 1291 }, { "epoch": 1.6761842959117457, "grad_norm": 0.2156907157084962, "learning_rate": 2.4506974506974508e-05, "loss": 0.4285, "step": 1292 }, { "epoch": 1.6774821544451655, "grad_norm": 0.2568679217265247, "learning_rate": 2.4482924482924486e-05, "loss": 0.4134, "step": 1293 }, { "epoch": 1.6787800129785855, "grad_norm": 0.23974841708540973, "learning_rate": 2.4458874458874457e-05, "loss": 0.432, "step": 1294 }, { "epoch": 1.680077871512005, "grad_norm": 0.24031934038845462, "learning_rate": 2.4434824434824436e-05, "loss": 0.4444, "step": 1295 }, { "epoch": 1.681375730045425, "grad_norm": 0.2672955058745279, "learning_rate": 2.441077441077441e-05, "loss": 0.4369, "step": 1296 }, { "epoch": 1.6826735885788449, "grad_norm": 0.250150232180256, "learning_rate": 2.438672438672439e-05, "loss": 0.4273, "step": 1297 }, { "epoch": 1.6839714471122647, "grad_norm": 0.23626717443698833, "learning_rate": 2.4362674362674363e-05, "loss": 0.4236, "step": 1298 }, { "epoch": 1.6852693056456847, "grad_norm": 0.22422589742898313, "learning_rate": 2.4338624338624338e-05, "loss": 0.4338, "step": 1299 }, { "epoch": 1.6865671641791045, "grad_norm": 0.2308927071463409, "learning_rate": 2.4314574314574316e-05, "loss": 0.4386, "step": 1300 }, { "epoch": 1.6878650227125243, "grad_norm": 0.2564472802977678, "learning_rate": 2.429052429052429e-05, "loss": 0.4117, "step": 1301 }, { "epoch": 1.6891628812459443, "grad_norm": 0.23392174813654176, "learning_rate": 2.426647426647427e-05, "loss": 0.4154, "step": 1302 }, { "epoch": 1.690460739779364, "grad_norm": 0.2397393509201778, "learning_rate": 2.4242424242424244e-05, "loss": 0.4117, "step": 1303 }, { "epoch": 1.6917585983127839, "grad_norm": 0.22912904331451653, "learning_rate": 2.4218374218374222e-05, "loss": 0.4273, "step": 1304 }, { "epoch": 1.6930564568462039, "grad_norm": 0.23255973129828944, "learning_rate": 2.4194324194324193e-05, "loss": 0.4199, "step": 1305 }, { "epoch": 1.6943543153796237, "grad_norm": 0.2459474867528304, "learning_rate": 2.417027417027417e-05, "loss": 0.3879, "step": 1306 }, { "epoch": 1.6956521739130435, "grad_norm": 0.21055785182005404, "learning_rate": 2.4146224146224146e-05, "loss": 0.4284, "step": 1307 }, { "epoch": 1.6969500324464635, "grad_norm": 0.23246957442423627, "learning_rate": 2.4122174122174125e-05, "loss": 0.4131, "step": 1308 }, { "epoch": 1.698247890979883, "grad_norm": 0.23994403266599254, "learning_rate": 2.40981240981241e-05, "loss": 0.4333, "step": 1309 }, { "epoch": 1.699545749513303, "grad_norm": 0.2316301617751929, "learning_rate": 2.4074074074074074e-05, "loss": 0.4486, "step": 1310 }, { "epoch": 1.7008436080467229, "grad_norm": 0.22426960542300423, "learning_rate": 2.405002405002405e-05, "loss": 0.4087, "step": 1311 }, { "epoch": 1.7021414665801426, "grad_norm": 0.2150721322470821, "learning_rate": 2.4025974025974027e-05, "loss": 0.4073, "step": 1312 }, { "epoch": 1.7034393251135627, "grad_norm": 0.2519319561232589, "learning_rate": 2.4001924001924002e-05, "loss": 0.4312, "step": 1313 }, { "epoch": 1.7047371836469825, "grad_norm": 0.21097482449181498, "learning_rate": 2.397787397787398e-05, "loss": 0.4319, "step": 1314 }, { "epoch": 1.7060350421804023, "grad_norm": 0.24220719812817199, "learning_rate": 2.3953823953823955e-05, "loss": 0.4203, "step": 1315 }, { "epoch": 1.7073329007138223, "grad_norm": 0.22812751540222995, "learning_rate": 2.392977392977393e-05, "loss": 0.4325, "step": 1316 }, { "epoch": 1.708630759247242, "grad_norm": 0.21991172303538978, "learning_rate": 2.3905723905723908e-05, "loss": 0.4307, "step": 1317 }, { "epoch": 1.7099286177806619, "grad_norm": 0.24924033452167085, "learning_rate": 2.3881673881673882e-05, "loss": 0.4049, "step": 1318 }, { "epoch": 1.7112264763140819, "grad_norm": 0.20641915282626047, "learning_rate": 2.385762385762386e-05, "loss": 0.4171, "step": 1319 }, { "epoch": 1.7125243348475017, "grad_norm": 0.24635070927670322, "learning_rate": 2.3833573833573835e-05, "loss": 0.4273, "step": 1320 }, { "epoch": 1.7138221933809215, "grad_norm": 0.22793016083957607, "learning_rate": 2.380952380952381e-05, "loss": 0.4028, "step": 1321 }, { "epoch": 1.7151200519143415, "grad_norm": 0.21686313598641418, "learning_rate": 2.3785473785473785e-05, "loss": 0.4216, "step": 1322 }, { "epoch": 1.716417910447761, "grad_norm": 0.20935343473455395, "learning_rate": 2.3761423761423763e-05, "loss": 0.4002, "step": 1323 }, { "epoch": 1.717715768981181, "grad_norm": 0.22891020801302248, "learning_rate": 2.3737373737373738e-05, "loss": 0.411, "step": 1324 }, { "epoch": 1.719013627514601, "grad_norm": 0.20360937762091913, "learning_rate": 2.3713323713323716e-05, "loss": 0.4445, "step": 1325 }, { "epoch": 1.7203114860480206, "grad_norm": 0.21662567258205914, "learning_rate": 2.368927368927369e-05, "loss": 0.4182, "step": 1326 }, { "epoch": 1.7216093445814407, "grad_norm": 0.2112365052652544, "learning_rate": 2.3665223665223666e-05, "loss": 0.4119, "step": 1327 }, { "epoch": 1.7229072031148605, "grad_norm": 0.24045539800451038, "learning_rate": 2.364117364117364e-05, "loss": 0.4294, "step": 1328 }, { "epoch": 1.7242050616482802, "grad_norm": 0.22344351793204972, "learning_rate": 2.361712361712362e-05, "loss": 0.4159, "step": 1329 }, { "epoch": 1.7255029201817003, "grad_norm": 0.21385701142195507, "learning_rate": 2.3593073593073593e-05, "loss": 0.4208, "step": 1330 }, { "epoch": 1.72680077871512, "grad_norm": 0.22306282993754703, "learning_rate": 2.356902356902357e-05, "loss": 0.417, "step": 1331 }, { "epoch": 1.7280986372485398, "grad_norm": 0.23450328976859844, "learning_rate": 2.3544973544973546e-05, "loss": 0.397, "step": 1332 }, { "epoch": 1.7293964957819599, "grad_norm": 0.23314049600175976, "learning_rate": 2.352092352092352e-05, "loss": 0.4297, "step": 1333 }, { "epoch": 1.7306943543153797, "grad_norm": 0.21399972133644776, "learning_rate": 2.34968734968735e-05, "loss": 0.4366, "step": 1334 }, { "epoch": 1.7319922128487995, "grad_norm": 0.22426899700350952, "learning_rate": 2.3472823472823474e-05, "loss": 0.433, "step": 1335 }, { "epoch": 1.7332900713822195, "grad_norm": 0.21720443031252623, "learning_rate": 2.3448773448773452e-05, "loss": 0.4135, "step": 1336 }, { "epoch": 1.734587929915639, "grad_norm": 0.22022369229968872, "learning_rate": 2.3424723424723427e-05, "loss": 0.4035, "step": 1337 }, { "epoch": 1.735885788449059, "grad_norm": 0.2432882987842844, "learning_rate": 2.34006734006734e-05, "loss": 0.4274, "step": 1338 }, { "epoch": 1.737183646982479, "grad_norm": 0.22954645223280482, "learning_rate": 2.3376623376623376e-05, "loss": 0.4265, "step": 1339 }, { "epoch": 1.7384815055158986, "grad_norm": 0.23456332298959323, "learning_rate": 2.3352573352573355e-05, "loss": 0.4261, "step": 1340 }, { "epoch": 1.7397793640493187, "grad_norm": 0.23090513352220413, "learning_rate": 2.332852332852333e-05, "loss": 0.4343, "step": 1341 }, { "epoch": 1.7410772225827384, "grad_norm": 0.22635007188997747, "learning_rate": 2.3304473304473308e-05, "loss": 0.434, "step": 1342 }, { "epoch": 1.7423750811161582, "grad_norm": 0.24328716551223983, "learning_rate": 2.328042328042328e-05, "loss": 0.4329, "step": 1343 }, { "epoch": 1.7436729396495783, "grad_norm": 0.2245296632717372, "learning_rate": 2.3256373256373257e-05, "loss": 0.4135, "step": 1344 }, { "epoch": 1.744970798182998, "grad_norm": 0.23430249945874695, "learning_rate": 2.3232323232323232e-05, "loss": 0.4178, "step": 1345 }, { "epoch": 1.7462686567164178, "grad_norm": 0.21397181948116892, "learning_rate": 2.320827320827321e-05, "loss": 0.4236, "step": 1346 }, { "epoch": 1.7475665152498379, "grad_norm": 0.21776737560072357, "learning_rate": 2.3184223184223185e-05, "loss": 0.425, "step": 1347 }, { "epoch": 1.7488643737832577, "grad_norm": 0.23739059292954565, "learning_rate": 2.3160173160173163e-05, "loss": 0.4262, "step": 1348 }, { "epoch": 1.7501622323166774, "grad_norm": 0.2207747605074272, "learning_rate": 2.3136123136123138e-05, "loss": 0.4097, "step": 1349 }, { "epoch": 1.7514600908500975, "grad_norm": 0.19291564676436485, "learning_rate": 2.3112073112073113e-05, "loss": 0.4222, "step": 1350 }, { "epoch": 1.752757949383517, "grad_norm": 0.20208738954938904, "learning_rate": 2.308802308802309e-05, "loss": 0.4199, "step": 1351 }, { "epoch": 1.754055807916937, "grad_norm": 0.22066527169458836, "learning_rate": 2.3063973063973065e-05, "loss": 0.426, "step": 1352 }, { "epoch": 1.755353666450357, "grad_norm": 0.22615489279435733, "learning_rate": 2.3039923039923044e-05, "loss": 0.4103, "step": 1353 }, { "epoch": 1.7566515249837766, "grad_norm": 0.24657435823356594, "learning_rate": 2.3015873015873015e-05, "loss": 0.4006, "step": 1354 }, { "epoch": 1.7579493835171967, "grad_norm": 0.2287984884377898, "learning_rate": 2.2991822991822993e-05, "loss": 0.4481, "step": 1355 }, { "epoch": 1.7592472420506164, "grad_norm": 0.21060281438071618, "learning_rate": 2.2967772967772968e-05, "loss": 0.399, "step": 1356 }, { "epoch": 1.7605451005840362, "grad_norm": 0.2265028463512503, "learning_rate": 2.2943722943722946e-05, "loss": 0.4258, "step": 1357 }, { "epoch": 1.7618429591174563, "grad_norm": 0.21362689950493072, "learning_rate": 2.291967291967292e-05, "loss": 0.4178, "step": 1358 }, { "epoch": 1.763140817650876, "grad_norm": 0.21905319629937445, "learning_rate": 2.28956228956229e-05, "loss": 0.435, "step": 1359 }, { "epoch": 1.7644386761842958, "grad_norm": 0.2390674190592371, "learning_rate": 2.287157287157287e-05, "loss": 0.4374, "step": 1360 }, { "epoch": 1.7657365347177159, "grad_norm": 0.262047575243414, "learning_rate": 2.284752284752285e-05, "loss": 0.4249, "step": 1361 }, { "epoch": 1.7670343932511356, "grad_norm": 0.215263905999125, "learning_rate": 2.2823472823472823e-05, "loss": 0.4358, "step": 1362 }, { "epoch": 1.7683322517845554, "grad_norm": 0.2566335364538136, "learning_rate": 2.27994227994228e-05, "loss": 0.4358, "step": 1363 }, { "epoch": 1.7696301103179755, "grad_norm": 0.23396394290955702, "learning_rate": 2.2775372775372776e-05, "loss": 0.4345, "step": 1364 }, { "epoch": 1.7709279688513953, "grad_norm": 0.2997951835182788, "learning_rate": 2.275132275132275e-05, "loss": 0.417, "step": 1365 }, { "epoch": 1.772225827384815, "grad_norm": 0.25908281100055225, "learning_rate": 2.272727272727273e-05, "loss": 0.4283, "step": 1366 }, { "epoch": 1.773523685918235, "grad_norm": 0.2710663340672724, "learning_rate": 2.2703222703222704e-05, "loss": 0.4405, "step": 1367 }, { "epoch": 1.7748215444516546, "grad_norm": 0.25367420601807966, "learning_rate": 2.267917267917268e-05, "loss": 0.4149, "step": 1368 }, { "epoch": 1.7761194029850746, "grad_norm": 0.2872738552914004, "learning_rate": 2.2655122655122657e-05, "loss": 0.4339, "step": 1369 }, { "epoch": 1.7774172615184944, "grad_norm": 0.21919850447737751, "learning_rate": 2.2631072631072632e-05, "loss": 0.4445, "step": 1370 }, { "epoch": 1.7787151200519142, "grad_norm": 0.27892242525176375, "learning_rate": 2.2607022607022607e-05, "loss": 0.4131, "step": 1371 }, { "epoch": 1.7800129785853342, "grad_norm": 0.24026730555070555, "learning_rate": 2.2582972582972585e-05, "loss": 0.427, "step": 1372 }, { "epoch": 1.781310837118754, "grad_norm": 0.2145688542497997, "learning_rate": 2.255892255892256e-05, "loss": 0.4391, "step": 1373 }, { "epoch": 1.7826086956521738, "grad_norm": 0.23661442477067585, "learning_rate": 2.2534872534872538e-05, "loss": 0.41, "step": 1374 }, { "epoch": 1.7839065541855939, "grad_norm": 0.2610547392581578, "learning_rate": 2.2510822510822512e-05, "loss": 0.4188, "step": 1375 }, { "epoch": 1.7852044127190136, "grad_norm": 0.2493506467256105, "learning_rate": 2.2486772486772487e-05, "loss": 0.4206, "step": 1376 }, { "epoch": 1.7865022712524334, "grad_norm": 0.2649167628997299, "learning_rate": 2.2462722462722462e-05, "loss": 0.4237, "step": 1377 }, { "epoch": 1.7878001297858535, "grad_norm": 0.2534895267108062, "learning_rate": 2.243867243867244e-05, "loss": 0.4283, "step": 1378 }, { "epoch": 1.7890979883192732, "grad_norm": 0.2709616156112994, "learning_rate": 2.2414622414622415e-05, "loss": 0.4113, "step": 1379 }, { "epoch": 1.790395846852693, "grad_norm": 0.22792963880042075, "learning_rate": 2.2390572390572393e-05, "loss": 0.4267, "step": 1380 }, { "epoch": 1.791693705386113, "grad_norm": 0.24622268955355062, "learning_rate": 2.2366522366522368e-05, "loss": 0.4051, "step": 1381 }, { "epoch": 1.7929915639195326, "grad_norm": 0.27590035842972194, "learning_rate": 2.2342472342472343e-05, "loss": 0.4378, "step": 1382 }, { "epoch": 1.7942894224529526, "grad_norm": 0.2264860712514965, "learning_rate": 2.2318422318422317e-05, "loss": 0.4171, "step": 1383 }, { "epoch": 1.7955872809863724, "grad_norm": 0.27527712703496315, "learning_rate": 2.2294372294372296e-05, "loss": 0.4136, "step": 1384 }, { "epoch": 1.7968851395197922, "grad_norm": 0.27052531643386396, "learning_rate": 2.227032227032227e-05, "loss": 0.3935, "step": 1385 }, { "epoch": 1.7981829980532122, "grad_norm": 0.2479444281803134, "learning_rate": 2.224627224627225e-05, "loss": 0.4331, "step": 1386 }, { "epoch": 1.799480856586632, "grad_norm": 0.2373284631481721, "learning_rate": 2.2222222222222223e-05, "loss": 0.417, "step": 1387 }, { "epoch": 1.8007787151200518, "grad_norm": 0.2575638652052547, "learning_rate": 2.2198172198172198e-05, "loss": 0.4323, "step": 1388 }, { "epoch": 1.8020765736534718, "grad_norm": 0.2407980171885747, "learning_rate": 2.2174122174122176e-05, "loss": 0.4127, "step": 1389 }, { "epoch": 1.8033744321868916, "grad_norm": 0.21117036443387086, "learning_rate": 2.215007215007215e-05, "loss": 0.4191, "step": 1390 }, { "epoch": 1.8046722907203114, "grad_norm": 0.20129164818193102, "learning_rate": 2.212602212602213e-05, "loss": 0.4115, "step": 1391 }, { "epoch": 1.8059701492537314, "grad_norm": 0.2540001501490721, "learning_rate": 2.2101972101972104e-05, "loss": 0.431, "step": 1392 }, { "epoch": 1.8072680077871512, "grad_norm": 0.24804686013462887, "learning_rate": 2.207792207792208e-05, "loss": 0.4228, "step": 1393 }, { "epoch": 1.808565866320571, "grad_norm": 0.2022520818624456, "learning_rate": 2.2053872053872053e-05, "loss": 0.4203, "step": 1394 }, { "epoch": 1.809863724853991, "grad_norm": 0.23238075295532062, "learning_rate": 2.202982202982203e-05, "loss": 0.425, "step": 1395 }, { "epoch": 1.8111615833874106, "grad_norm": 0.21664815140422355, "learning_rate": 2.2005772005772006e-05, "loss": 0.4299, "step": 1396 }, { "epoch": 1.8124594419208306, "grad_norm": 0.21744531033366538, "learning_rate": 2.1981721981721985e-05, "loss": 0.4019, "step": 1397 }, { "epoch": 1.8137573004542504, "grad_norm": 0.2138946808987489, "learning_rate": 2.1957671957671956e-05, "loss": 0.4256, "step": 1398 }, { "epoch": 1.8150551589876702, "grad_norm": 0.2224576706896047, "learning_rate": 2.1933621933621934e-05, "loss": 0.4147, "step": 1399 }, { "epoch": 1.8163530175210902, "grad_norm": 0.2002966133486591, "learning_rate": 2.190957190957191e-05, "loss": 0.426, "step": 1400 }, { "epoch": 1.81765087605451, "grad_norm": 0.202531126088113, "learning_rate": 2.1885521885521887e-05, "loss": 0.4429, "step": 1401 }, { "epoch": 1.8189487345879298, "grad_norm": 0.20516498421820234, "learning_rate": 2.1861471861471862e-05, "loss": 0.4208, "step": 1402 }, { "epoch": 1.8202465931213498, "grad_norm": 0.23389135616856488, "learning_rate": 2.183742183742184e-05, "loss": 0.4261, "step": 1403 }, { "epoch": 1.8215444516547696, "grad_norm": 0.21459274849252136, "learning_rate": 2.1813371813371815e-05, "loss": 0.4346, "step": 1404 }, { "epoch": 1.8228423101881894, "grad_norm": 0.2078147272516738, "learning_rate": 2.178932178932179e-05, "loss": 0.4092, "step": 1405 }, { "epoch": 1.8241401687216094, "grad_norm": 0.24580373752808737, "learning_rate": 2.1765271765271768e-05, "loss": 0.4233, "step": 1406 }, { "epoch": 1.8254380272550292, "grad_norm": 0.22454106978014404, "learning_rate": 2.1741221741221743e-05, "loss": 0.4147, "step": 1407 }, { "epoch": 1.826735885788449, "grad_norm": 0.21599295500558674, "learning_rate": 2.171717171717172e-05, "loss": 0.4349, "step": 1408 }, { "epoch": 1.828033744321869, "grad_norm": 0.2127033614651673, "learning_rate": 2.1693121693121692e-05, "loss": 0.4206, "step": 1409 }, { "epoch": 1.8293316028552886, "grad_norm": 0.21925596786696352, "learning_rate": 2.166907166907167e-05, "loss": 0.4399, "step": 1410 }, { "epoch": 1.8306294613887086, "grad_norm": 0.21016324905145667, "learning_rate": 2.1645021645021645e-05, "loss": 0.4222, "step": 1411 }, { "epoch": 1.8319273199221286, "grad_norm": 0.22520381391920555, "learning_rate": 2.1620971620971623e-05, "loss": 0.4258, "step": 1412 }, { "epoch": 1.8332251784555482, "grad_norm": 0.22141690462102792, "learning_rate": 2.1596921596921598e-05, "loss": 0.412, "step": 1413 }, { "epoch": 1.8345230369889682, "grad_norm": 0.2429839281627191, "learning_rate": 2.1572871572871576e-05, "loss": 0.4269, "step": 1414 }, { "epoch": 1.835820895522388, "grad_norm": 0.2160140354835784, "learning_rate": 2.1548821548821547e-05, "loss": 0.4205, "step": 1415 }, { "epoch": 1.8371187540558078, "grad_norm": 0.2402260672982623, "learning_rate": 2.1524771524771526e-05, "loss": 0.4193, "step": 1416 }, { "epoch": 1.8384166125892278, "grad_norm": 0.29744843810112265, "learning_rate": 2.15007215007215e-05, "loss": 0.4325, "step": 1417 }, { "epoch": 1.8397144711226476, "grad_norm": 0.22530015703559994, "learning_rate": 2.147667147667148e-05, "loss": 0.4318, "step": 1418 }, { "epoch": 1.8410123296560674, "grad_norm": 0.29390956909610316, "learning_rate": 2.1452621452621453e-05, "loss": 0.4288, "step": 1419 }, { "epoch": 1.8423101881894874, "grad_norm": 0.23358318964698258, "learning_rate": 2.1428571428571428e-05, "loss": 0.4084, "step": 1420 }, { "epoch": 1.8436080467229072, "grad_norm": 0.21167664114993395, "learning_rate": 2.1404521404521406e-05, "loss": 0.4313, "step": 1421 }, { "epoch": 1.844905905256327, "grad_norm": 0.2460698519801602, "learning_rate": 2.138047138047138e-05, "loss": 0.4155, "step": 1422 }, { "epoch": 1.846203763789747, "grad_norm": 0.23025941782631765, "learning_rate": 2.135642135642136e-05, "loss": 0.4057, "step": 1423 }, { "epoch": 1.8475016223231666, "grad_norm": 0.19865359681586736, "learning_rate": 2.1332371332371334e-05, "loss": 0.4075, "step": 1424 }, { "epoch": 1.8487994808565866, "grad_norm": 0.22150167838157933, "learning_rate": 2.1308321308321312e-05, "loss": 0.4338, "step": 1425 }, { "epoch": 1.8500973393900066, "grad_norm": 0.27381218064289997, "learning_rate": 2.1284271284271284e-05, "loss": 0.4385, "step": 1426 }, { "epoch": 1.8513951979234262, "grad_norm": 0.2386126810899565, "learning_rate": 2.1260221260221262e-05, "loss": 0.4138, "step": 1427 }, { "epoch": 1.8526930564568462, "grad_norm": 0.23844253499070778, "learning_rate": 2.1236171236171237e-05, "loss": 0.417, "step": 1428 }, { "epoch": 1.853990914990266, "grad_norm": 0.24734871284649604, "learning_rate": 2.1212121212121215e-05, "loss": 0.4266, "step": 1429 }, { "epoch": 1.8552887735236858, "grad_norm": 0.2581372866509555, "learning_rate": 2.118807118807119e-05, "loss": 0.4073, "step": 1430 }, { "epoch": 1.8565866320571058, "grad_norm": 0.20591243236055737, "learning_rate": 2.1164021164021164e-05, "loss": 0.3973, "step": 1431 }, { "epoch": 1.8578844905905256, "grad_norm": 0.25393718244850216, "learning_rate": 2.113997113997114e-05, "loss": 0.4237, "step": 1432 }, { "epoch": 1.8591823491239454, "grad_norm": 0.256757051595813, "learning_rate": 2.1115921115921117e-05, "loss": 0.4276, "step": 1433 }, { "epoch": 1.8604802076573654, "grad_norm": 0.2199746107316156, "learning_rate": 2.1091871091871092e-05, "loss": 0.4027, "step": 1434 }, { "epoch": 1.8617780661907852, "grad_norm": 0.22993418151409517, "learning_rate": 2.106782106782107e-05, "loss": 0.4258, "step": 1435 }, { "epoch": 1.863075924724205, "grad_norm": 0.23986794245337564, "learning_rate": 2.1043771043771045e-05, "loss": 0.4092, "step": 1436 }, { "epoch": 1.864373783257625, "grad_norm": 0.2503767269878855, "learning_rate": 2.101972101972102e-05, "loss": 0.4337, "step": 1437 }, { "epoch": 1.8656716417910446, "grad_norm": 0.19966379931345576, "learning_rate": 2.0995670995670998e-05, "loss": 0.4083, "step": 1438 }, { "epoch": 1.8669695003244646, "grad_norm": 0.22975695557758422, "learning_rate": 2.0971620971620973e-05, "loss": 0.4155, "step": 1439 }, { "epoch": 1.8682673588578846, "grad_norm": 0.26927614268096606, "learning_rate": 2.094757094757095e-05, "loss": 0.3885, "step": 1440 }, { "epoch": 1.8695652173913042, "grad_norm": 0.20373734252329936, "learning_rate": 2.0923520923520926e-05, "loss": 0.4291, "step": 1441 }, { "epoch": 1.8708630759247242, "grad_norm": 0.2683958306016899, "learning_rate": 2.08994708994709e-05, "loss": 0.433, "step": 1442 }, { "epoch": 1.872160934458144, "grad_norm": 0.26568578858883407, "learning_rate": 2.0875420875420875e-05, "loss": 0.4379, "step": 1443 }, { "epoch": 1.8734587929915638, "grad_norm": 0.2365065973899857, "learning_rate": 2.0851370851370853e-05, "loss": 0.4372, "step": 1444 }, { "epoch": 1.8747566515249838, "grad_norm": 0.2160536365282337, "learning_rate": 2.0827320827320828e-05, "loss": 0.4249, "step": 1445 }, { "epoch": 1.8760545100584036, "grad_norm": 0.2698594967367338, "learning_rate": 2.0803270803270806e-05, "loss": 0.4251, "step": 1446 }, { "epoch": 1.8773523685918234, "grad_norm": 0.240476141319818, "learning_rate": 2.077922077922078e-05, "loss": 0.4051, "step": 1447 }, { "epoch": 1.8786502271252434, "grad_norm": 0.20313150197250998, "learning_rate": 2.0755170755170756e-05, "loss": 0.4142, "step": 1448 }, { "epoch": 1.8799480856586632, "grad_norm": 0.2513888218859537, "learning_rate": 2.073112073112073e-05, "loss": 0.4235, "step": 1449 }, { "epoch": 1.881245944192083, "grad_norm": 0.263020254508393, "learning_rate": 2.070707070707071e-05, "loss": 0.4133, "step": 1450 }, { "epoch": 1.882543802725503, "grad_norm": 0.19807928758761542, "learning_rate": 2.0683020683020683e-05, "loss": 0.4211, "step": 1451 }, { "epoch": 1.8838416612589228, "grad_norm": 0.28553479995616016, "learning_rate": 2.065897065897066e-05, "loss": 0.4295, "step": 1452 }, { "epoch": 1.8851395197923426, "grad_norm": 0.22454512768715873, "learning_rate": 2.0634920634920636e-05, "loss": 0.4164, "step": 1453 }, { "epoch": 1.8864373783257626, "grad_norm": 0.2212283630425153, "learning_rate": 2.061087061087061e-05, "loss": 0.4309, "step": 1454 }, { "epoch": 1.8877352368591822, "grad_norm": 0.21880750363041376, "learning_rate": 2.058682058682059e-05, "loss": 0.4148, "step": 1455 }, { "epoch": 1.8890330953926022, "grad_norm": 0.2698709703952382, "learning_rate": 2.0562770562770564e-05, "loss": 0.4339, "step": 1456 }, { "epoch": 1.890330953926022, "grad_norm": 0.21631366892137663, "learning_rate": 2.0538720538720542e-05, "loss": 0.4209, "step": 1457 }, { "epoch": 1.8916288124594418, "grad_norm": 0.22312561756649457, "learning_rate": 2.0514670514670517e-05, "loss": 0.4205, "step": 1458 }, { "epoch": 1.8929266709928618, "grad_norm": 0.22982817420831553, "learning_rate": 2.0490620490620492e-05, "loss": 0.4127, "step": 1459 }, { "epoch": 1.8942245295262816, "grad_norm": 0.23011803773822845, "learning_rate": 2.0466570466570467e-05, "loss": 0.4018, "step": 1460 }, { "epoch": 1.8955223880597014, "grad_norm": 0.2156774757909124, "learning_rate": 2.0442520442520445e-05, "loss": 0.448, "step": 1461 }, { "epoch": 1.8968202465931214, "grad_norm": 0.2465313564522942, "learning_rate": 2.041847041847042e-05, "loss": 0.4433, "step": 1462 }, { "epoch": 1.8981181051265412, "grad_norm": 0.21906089100592563, "learning_rate": 2.0394420394420398e-05, "loss": 0.4103, "step": 1463 }, { "epoch": 1.899415963659961, "grad_norm": 0.21024875722994074, "learning_rate": 2.037037037037037e-05, "loss": 0.3951, "step": 1464 }, { "epoch": 1.900713822193381, "grad_norm": 0.18955824822069273, "learning_rate": 2.0346320346320347e-05, "loss": 0.4174, "step": 1465 }, { "epoch": 1.9020116807268008, "grad_norm": 0.23144049792472646, "learning_rate": 2.0322270322270322e-05, "loss": 0.3998, "step": 1466 }, { "epoch": 1.9033095392602206, "grad_norm": 0.21081067517865779, "learning_rate": 2.02982202982203e-05, "loss": 0.4103, "step": 1467 }, { "epoch": 1.9046073977936406, "grad_norm": 0.2194010873045385, "learning_rate": 2.0274170274170275e-05, "loss": 0.4123, "step": 1468 }, { "epoch": 1.9059052563270602, "grad_norm": 0.20757690294910305, "learning_rate": 2.025012025012025e-05, "loss": 0.4012, "step": 1469 }, { "epoch": 1.9072031148604802, "grad_norm": 0.2628488845364361, "learning_rate": 2.0226070226070225e-05, "loss": 0.4118, "step": 1470 }, { "epoch": 1.9085009733939, "grad_norm": 0.2503984380267546, "learning_rate": 2.0202020202020203e-05, "loss": 0.4277, "step": 1471 }, { "epoch": 1.9097988319273198, "grad_norm": 0.20731051341055765, "learning_rate": 2.0177970177970177e-05, "loss": 0.4223, "step": 1472 }, { "epoch": 1.9110966904607398, "grad_norm": 0.2469892137078129, "learning_rate": 2.0153920153920156e-05, "loss": 0.4278, "step": 1473 }, { "epoch": 1.9123945489941596, "grad_norm": 0.2302509090632293, "learning_rate": 2.012987012987013e-05, "loss": 0.4286, "step": 1474 }, { "epoch": 1.9136924075275794, "grad_norm": 0.21560820581873713, "learning_rate": 2.0105820105820105e-05, "loss": 0.4036, "step": 1475 }, { "epoch": 1.9149902660609994, "grad_norm": 0.21761526673837062, "learning_rate": 2.0081770081770083e-05, "loss": 0.4383, "step": 1476 }, { "epoch": 1.9162881245944192, "grad_norm": 0.25419859148323953, "learning_rate": 2.0057720057720058e-05, "loss": 0.4168, "step": 1477 }, { "epoch": 1.917585983127839, "grad_norm": 0.21447148417291215, "learning_rate": 2.0033670033670036e-05, "loss": 0.4233, "step": 1478 }, { "epoch": 1.918883841661259, "grad_norm": 0.22177181102304355, "learning_rate": 2.000962000962001e-05, "loss": 0.3973, "step": 1479 }, { "epoch": 1.9201817001946788, "grad_norm": 0.2361964777550035, "learning_rate": 1.9985569985569986e-05, "loss": 0.4062, "step": 1480 }, { "epoch": 1.9214795587280986, "grad_norm": 0.2268625180335479, "learning_rate": 1.996151996151996e-05, "loss": 0.4288, "step": 1481 }, { "epoch": 1.9227774172615186, "grad_norm": 0.22109891172640594, "learning_rate": 1.993746993746994e-05, "loss": 0.4168, "step": 1482 }, { "epoch": 1.9240752757949382, "grad_norm": 0.24199558362942594, "learning_rate": 1.9913419913419914e-05, "loss": 0.4368, "step": 1483 }, { "epoch": 1.9253731343283582, "grad_norm": 0.23386196057480746, "learning_rate": 1.9889369889369892e-05, "loss": 0.4929, "step": 1484 }, { "epoch": 1.9266709928617782, "grad_norm": 2.3863223206635507, "learning_rate": 1.9865319865319866e-05, "loss": 0.4321, "step": 1485 }, { "epoch": 1.9279688513951978, "grad_norm": 0.2117744722116347, "learning_rate": 1.984126984126984e-05, "loss": 0.4046, "step": 1486 }, { "epoch": 1.9292667099286178, "grad_norm": 0.23753639694866985, "learning_rate": 1.9817219817219816e-05, "loss": 0.4088, "step": 1487 }, { "epoch": 1.9305645684620376, "grad_norm": 0.2029549567060751, "learning_rate": 1.9793169793169794e-05, "loss": 0.4129, "step": 1488 }, { "epoch": 1.9318624269954574, "grad_norm": 0.20999056789664505, "learning_rate": 1.976911976911977e-05, "loss": 0.4149, "step": 1489 }, { "epoch": 1.9331602855288774, "grad_norm": 0.25609740431868805, "learning_rate": 1.9745069745069747e-05, "loss": 0.4422, "step": 1490 }, { "epoch": 1.9344581440622972, "grad_norm": 0.23240223312760538, "learning_rate": 1.9721019721019722e-05, "loss": 0.4376, "step": 1491 }, { "epoch": 1.935756002595717, "grad_norm": 0.22288941915151747, "learning_rate": 1.9696969696969697e-05, "loss": 0.4031, "step": 1492 }, { "epoch": 1.937053861129137, "grad_norm": 0.20823811668022293, "learning_rate": 1.9672919672919675e-05, "loss": 0.4141, "step": 1493 }, { "epoch": 1.9383517196625568, "grad_norm": 0.22958571482808876, "learning_rate": 1.964886964886965e-05, "loss": 0.4205, "step": 1494 }, { "epoch": 1.9396495781959766, "grad_norm": 0.24269527070284858, "learning_rate": 1.9624819624819628e-05, "loss": 0.414, "step": 1495 }, { "epoch": 1.9409474367293966, "grad_norm": 0.2070554761070819, "learning_rate": 1.9600769600769603e-05, "loss": 0.4045, "step": 1496 }, { "epoch": 1.9422452952628162, "grad_norm": 0.24376293095622897, "learning_rate": 1.9576719576719577e-05, "loss": 0.4257, "step": 1497 }, { "epoch": 1.9435431537962362, "grad_norm": 0.2254498669697948, "learning_rate": 1.9552669552669552e-05, "loss": 0.421, "step": 1498 }, { "epoch": 1.9448410123296562, "grad_norm": 0.21748513808130843, "learning_rate": 1.952861952861953e-05, "loss": 0.4062, "step": 1499 }, { "epoch": 1.9461388708630758, "grad_norm": 0.2148376810996354, "learning_rate": 1.9504569504569505e-05, "loss": 0.4203, "step": 1500 }, { "epoch": 1.9474367293964958, "grad_norm": 0.25871259714383205, "learning_rate": 1.9480519480519483e-05, "loss": 0.4229, "step": 1501 }, { "epoch": 1.9487345879299156, "grad_norm": 0.19582347887373358, "learning_rate": 1.9456469456469455e-05, "loss": 0.4081, "step": 1502 }, { "epoch": 1.9500324464633354, "grad_norm": 0.22789399470009464, "learning_rate": 1.9432419432419433e-05, "loss": 0.4245, "step": 1503 }, { "epoch": 1.9513303049967554, "grad_norm": 0.23018173092515049, "learning_rate": 1.9408369408369408e-05, "loss": 0.4216, "step": 1504 }, { "epoch": 1.9526281635301752, "grad_norm": 0.21444832133823605, "learning_rate": 1.9384319384319386e-05, "loss": 0.4147, "step": 1505 }, { "epoch": 1.953926022063595, "grad_norm": 0.20562647584839736, "learning_rate": 1.936026936026936e-05, "loss": 0.4008, "step": 1506 }, { "epoch": 1.955223880597015, "grad_norm": 0.23110467483063488, "learning_rate": 1.933621933621934e-05, "loss": 0.3972, "step": 1507 }, { "epoch": 1.9565217391304348, "grad_norm": 0.21834150518213843, "learning_rate": 1.9312169312169313e-05, "loss": 0.4256, "step": 1508 }, { "epoch": 1.9578195976638546, "grad_norm": 0.21958450668275112, "learning_rate": 1.9288119288119288e-05, "loss": 0.4088, "step": 1509 }, { "epoch": 1.9591174561972746, "grad_norm": 0.20052094185224426, "learning_rate": 1.9264069264069266e-05, "loss": 0.4121, "step": 1510 }, { "epoch": 1.9604153147306942, "grad_norm": 0.24326880305407378, "learning_rate": 1.924001924001924e-05, "loss": 0.422, "step": 1511 }, { "epoch": 1.9617131732641142, "grad_norm": 0.2410106190975958, "learning_rate": 1.921596921596922e-05, "loss": 0.4085, "step": 1512 }, { "epoch": 1.9630110317975342, "grad_norm": 0.20377491892233185, "learning_rate": 1.919191919191919e-05, "loss": 0.4312, "step": 1513 }, { "epoch": 1.9643088903309538, "grad_norm": 0.22992091739225845, "learning_rate": 1.916786916786917e-05, "loss": 0.4283, "step": 1514 }, { "epoch": 1.9656067488643738, "grad_norm": 0.23320180740415136, "learning_rate": 1.9143819143819144e-05, "loss": 0.4167, "step": 1515 }, { "epoch": 1.9669046073977936, "grad_norm": 0.21478096347520134, "learning_rate": 1.9119769119769122e-05, "loss": 0.4373, "step": 1516 }, { "epoch": 1.9682024659312134, "grad_norm": 0.24312143244424492, "learning_rate": 1.9095719095719097e-05, "loss": 0.4384, "step": 1517 }, { "epoch": 1.9695003244646334, "grad_norm": 0.22013684407762923, "learning_rate": 1.9071669071669075e-05, "loss": 0.4244, "step": 1518 }, { "epoch": 1.9707981829980532, "grad_norm": 0.22394887240003014, "learning_rate": 1.9047619047619046e-05, "loss": 0.419, "step": 1519 }, { "epoch": 1.972096041531473, "grad_norm": 0.2319362635386066, "learning_rate": 1.9023569023569024e-05, "loss": 0.4059, "step": 1520 }, { "epoch": 1.973393900064893, "grad_norm": 0.1980072863895625, "learning_rate": 1.8999518999519e-05, "loss": 0.4012, "step": 1521 }, { "epoch": 1.9746917585983128, "grad_norm": 0.21340412052310542, "learning_rate": 1.8975468975468977e-05, "loss": 0.4252, "step": 1522 }, { "epoch": 1.9759896171317326, "grad_norm": 0.20523875562201788, "learning_rate": 1.8951418951418952e-05, "loss": 0.3954, "step": 1523 }, { "epoch": 1.9772874756651526, "grad_norm": 0.21593958433489607, "learning_rate": 1.8927368927368927e-05, "loss": 0.4204, "step": 1524 }, { "epoch": 1.9785853341985724, "grad_norm": 0.1982941991349422, "learning_rate": 1.8903318903318905e-05, "loss": 0.3982, "step": 1525 }, { "epoch": 1.9798831927319922, "grad_norm": 0.20466190375196575, "learning_rate": 1.887926887926888e-05, "loss": 0.4134, "step": 1526 }, { "epoch": 1.9811810512654122, "grad_norm": 0.21442210205444864, "learning_rate": 1.8855218855218858e-05, "loss": 0.4221, "step": 1527 }, { "epoch": 1.9824789097988318, "grad_norm": 0.2077434816627499, "learning_rate": 1.8831168831168833e-05, "loss": 0.4168, "step": 1528 }, { "epoch": 1.9837767683322518, "grad_norm": 0.20932779718622976, "learning_rate": 1.880711880711881e-05, "loss": 0.4129, "step": 1529 }, { "epoch": 1.9850746268656716, "grad_norm": 0.20073142812922465, "learning_rate": 1.8783068783068782e-05, "loss": 0.4282, "step": 1530 }, { "epoch": 1.9863724853990914, "grad_norm": 0.21070295646641607, "learning_rate": 1.875901875901876e-05, "loss": 0.414, "step": 1531 }, { "epoch": 1.9876703439325114, "grad_norm": 0.1983254407503139, "learning_rate": 1.8734968734968735e-05, "loss": 0.4246, "step": 1532 }, { "epoch": 1.9889682024659312, "grad_norm": 0.2063440750783136, "learning_rate": 1.8710918710918713e-05, "loss": 0.4127, "step": 1533 }, { "epoch": 1.990266060999351, "grad_norm": 0.20062950578015543, "learning_rate": 1.8686868686868688e-05, "loss": 0.404, "step": 1534 }, { "epoch": 1.991563919532771, "grad_norm": 0.22191712136507424, "learning_rate": 1.8662818662818663e-05, "loss": 0.4119, "step": 1535 }, { "epoch": 1.9928617780661908, "grad_norm": 0.2215336165604822, "learning_rate": 1.8638768638768638e-05, "loss": 0.4121, "step": 1536 }, { "epoch": 1.9941596365996106, "grad_norm": 0.20271253230410582, "learning_rate": 1.8614718614718616e-05, "loss": 0.4023, "step": 1537 }, { "epoch": 1.9954574951330306, "grad_norm": 0.26159702346568764, "learning_rate": 1.859066859066859e-05, "loss": 0.4102, "step": 1538 }, { "epoch": 1.9967553536664504, "grad_norm": 0.21830457585192162, "learning_rate": 1.856661856661857e-05, "loss": 0.4129, "step": 1539 }, { "epoch": 1.9980532121998702, "grad_norm": 0.21634003518886286, "learning_rate": 1.8542568542568544e-05, "loss": 0.3979, "step": 1540 }, { "epoch": 1.9993510707332902, "grad_norm": 0.21520818627840688, "learning_rate": 1.8518518518518518e-05, "loss": 0.4072, "step": 1541 }, { "epoch": 2.0, "grad_norm": 0.3162004070666462, "learning_rate": 1.8494468494468496e-05, "loss": 0.3731, "step": 1542 }, { "epoch": 2.00129785853342, "grad_norm": 0.3002746540509754, "learning_rate": 1.847041847041847e-05, "loss": 0.351, "step": 1543 }, { "epoch": 2.0025957170668396, "grad_norm": 0.22179389161987628, "learning_rate": 1.844636844636845e-05, "loss": 0.3591, "step": 1544 }, { "epoch": 2.0038935756002596, "grad_norm": 0.29853309659990307, "learning_rate": 1.8422318422318424e-05, "loss": 0.3502, "step": 1545 }, { "epoch": 2.0051914341336796, "grad_norm": 0.27887815631774426, "learning_rate": 1.83982683982684e-05, "loss": 0.3397, "step": 1546 }, { "epoch": 2.006489292667099, "grad_norm": 0.21973812049478386, "learning_rate": 1.8374218374218374e-05, "loss": 0.3429, "step": 1547 }, { "epoch": 2.007787151200519, "grad_norm": 0.26933885808676494, "learning_rate": 1.8350168350168352e-05, "loss": 0.3436, "step": 1548 }, { "epoch": 2.009085009733939, "grad_norm": 0.2996171251030684, "learning_rate": 1.8326118326118327e-05, "loss": 0.3409, "step": 1549 }, { "epoch": 2.010382868267359, "grad_norm": 0.24083124258461439, "learning_rate": 1.8302068302068305e-05, "loss": 0.3386, "step": 1550 }, { "epoch": 2.011680726800779, "grad_norm": 0.30787012297971555, "learning_rate": 1.827801827801828e-05, "loss": 0.3478, "step": 1551 }, { "epoch": 2.0129785853341984, "grad_norm": 0.2857849577396285, "learning_rate": 1.8253968253968254e-05, "loss": 0.347, "step": 1552 }, { "epoch": 2.0142764438676184, "grad_norm": 0.2458814691129703, "learning_rate": 1.822991822991823e-05, "loss": 0.338, "step": 1553 }, { "epoch": 2.0155743024010384, "grad_norm": 0.24217238914022393, "learning_rate": 1.8205868205868207e-05, "loss": 0.3527, "step": 1554 }, { "epoch": 2.016872160934458, "grad_norm": 0.2708034381508514, "learning_rate": 1.8181818181818182e-05, "loss": 0.343, "step": 1555 }, { "epoch": 2.018170019467878, "grad_norm": 0.2771300303365467, "learning_rate": 1.815776815776816e-05, "loss": 0.3578, "step": 1556 }, { "epoch": 2.019467878001298, "grad_norm": 0.24314761286723605, "learning_rate": 1.8133718133718135e-05, "loss": 0.3628, "step": 1557 }, { "epoch": 2.0207657365347176, "grad_norm": 0.24528635873262158, "learning_rate": 1.810966810966811e-05, "loss": 0.3478, "step": 1558 }, { "epoch": 2.0220635950681376, "grad_norm": 0.2441718287388452, "learning_rate": 1.8085618085618085e-05, "loss": 0.3463, "step": 1559 }, { "epoch": 2.0233614536015576, "grad_norm": 0.2429476875932917, "learning_rate": 1.8061568061568063e-05, "loss": 0.3533, "step": 1560 }, { "epoch": 2.024659312134977, "grad_norm": 0.2584288721917747, "learning_rate": 1.8037518037518038e-05, "loss": 0.3443, "step": 1561 }, { "epoch": 2.025957170668397, "grad_norm": 0.21619552093380776, "learning_rate": 1.8013468013468016e-05, "loss": 0.3313, "step": 1562 }, { "epoch": 2.027255029201817, "grad_norm": 0.22416123032351348, "learning_rate": 1.798941798941799e-05, "loss": 0.3396, "step": 1563 }, { "epoch": 2.028552887735237, "grad_norm": 0.23762867221262482, "learning_rate": 1.7965367965367965e-05, "loss": 0.3328, "step": 1564 }, { "epoch": 2.029850746268657, "grad_norm": 0.23910726203342522, "learning_rate": 1.7941317941317943e-05, "loss": 0.3373, "step": 1565 }, { "epoch": 2.0311486048020764, "grad_norm": 0.2270224344033579, "learning_rate": 1.7917267917267918e-05, "loss": 0.3414, "step": 1566 }, { "epoch": 2.0324464633354964, "grad_norm": 0.22685492291746387, "learning_rate": 1.7893217893217896e-05, "loss": 0.334, "step": 1567 }, { "epoch": 2.0337443218689164, "grad_norm": 0.23079013731214076, "learning_rate": 1.7869167869167868e-05, "loss": 0.3373, "step": 1568 }, { "epoch": 2.035042180402336, "grad_norm": 0.23832074407215584, "learning_rate": 1.7845117845117846e-05, "loss": 0.3402, "step": 1569 }, { "epoch": 2.036340038935756, "grad_norm": 0.2413146931238051, "learning_rate": 1.782106782106782e-05, "loss": 0.3476, "step": 1570 }, { "epoch": 2.037637897469176, "grad_norm": 0.23685355784727574, "learning_rate": 1.77970177970178e-05, "loss": 0.3397, "step": 1571 }, { "epoch": 2.0389357560025956, "grad_norm": 0.24437850977020956, "learning_rate": 1.7772967772967774e-05, "loss": 0.3445, "step": 1572 }, { "epoch": 2.0402336145360156, "grad_norm": 0.22724458516557208, "learning_rate": 1.7748917748917752e-05, "loss": 0.3443, "step": 1573 }, { "epoch": 2.0415314730694356, "grad_norm": 0.23475541449011794, "learning_rate": 1.7724867724867723e-05, "loss": 0.3432, "step": 1574 }, { "epoch": 2.042829331602855, "grad_norm": 0.21469511225658197, "learning_rate": 1.77008177008177e-05, "loss": 0.3473, "step": 1575 }, { "epoch": 2.044127190136275, "grad_norm": 0.22486022557380209, "learning_rate": 1.7676767676767676e-05, "loss": 0.3566, "step": 1576 }, { "epoch": 2.045425048669695, "grad_norm": 0.22895350371242218, "learning_rate": 1.7652717652717654e-05, "loss": 0.3478, "step": 1577 }, { "epoch": 2.046722907203115, "grad_norm": 0.24538812579393815, "learning_rate": 1.762866762866763e-05, "loss": 0.3332, "step": 1578 }, { "epoch": 2.048020765736535, "grad_norm": 0.25912360209705504, "learning_rate": 1.7604617604617604e-05, "loss": 0.344, "step": 1579 }, { "epoch": 2.0493186242699544, "grad_norm": 0.19959244601082998, "learning_rate": 1.7580567580567582e-05, "loss": 0.3337, "step": 1580 }, { "epoch": 2.0506164828033744, "grad_norm": 0.22265382752478494, "learning_rate": 1.7556517556517557e-05, "loss": 0.3385, "step": 1581 }, { "epoch": 2.0519143413367944, "grad_norm": 0.2165757161328648, "learning_rate": 1.7532467532467535e-05, "loss": 0.3339, "step": 1582 }, { "epoch": 2.053212199870214, "grad_norm": 0.21372021503164076, "learning_rate": 1.750841750841751e-05, "loss": 0.3507, "step": 1583 }, { "epoch": 2.054510058403634, "grad_norm": 0.2336377004408556, "learning_rate": 1.7484367484367488e-05, "loss": 0.3491, "step": 1584 }, { "epoch": 2.055807916937054, "grad_norm": 0.2117993328839063, "learning_rate": 1.746031746031746e-05, "loss": 0.3407, "step": 1585 }, { "epoch": 2.0571057754704736, "grad_norm": 0.21231266244658922, "learning_rate": 1.7436267436267437e-05, "loss": 0.3444, "step": 1586 }, { "epoch": 2.0584036340038936, "grad_norm": 0.21032023819015722, "learning_rate": 1.7412217412217412e-05, "loss": 0.3398, "step": 1587 }, { "epoch": 2.0597014925373136, "grad_norm": 0.2371048058409055, "learning_rate": 1.738816738816739e-05, "loss": 0.3445, "step": 1588 }, { "epoch": 2.060999351070733, "grad_norm": 0.2059222075267882, "learning_rate": 1.7364117364117365e-05, "loss": 0.3443, "step": 1589 }, { "epoch": 2.062297209604153, "grad_norm": 0.22719406397240552, "learning_rate": 1.734006734006734e-05, "loss": 0.3495, "step": 1590 }, { "epoch": 2.063595068137573, "grad_norm": 0.2222707506963988, "learning_rate": 1.7316017316017315e-05, "loss": 0.3399, "step": 1591 }, { "epoch": 2.064892926670993, "grad_norm": 0.22555682797470167, "learning_rate": 1.7291967291967293e-05, "loss": 0.3383, "step": 1592 }, { "epoch": 2.066190785204413, "grad_norm": 0.22889368520998704, "learning_rate": 1.7267917267917268e-05, "loss": 0.3418, "step": 1593 }, { "epoch": 2.0674886437378324, "grad_norm": 0.22102058324621057, "learning_rate": 1.7243867243867246e-05, "loss": 0.3325, "step": 1594 }, { "epoch": 2.0687865022712524, "grad_norm": 0.23774221641545448, "learning_rate": 1.721981721981722e-05, "loss": 0.3369, "step": 1595 }, { "epoch": 2.0700843608046724, "grad_norm": 0.24890061582412498, "learning_rate": 1.7195767195767195e-05, "loss": 0.3441, "step": 1596 }, { "epoch": 2.071382219338092, "grad_norm": 0.2100873376295878, "learning_rate": 1.7171717171717173e-05, "loss": 0.3303, "step": 1597 }, { "epoch": 2.072680077871512, "grad_norm": 0.22680230775268373, "learning_rate": 1.7147667147667148e-05, "loss": 0.3371, "step": 1598 }, { "epoch": 2.073977936404932, "grad_norm": 0.20382351560763964, "learning_rate": 1.7123617123617126e-05, "loss": 0.3337, "step": 1599 }, { "epoch": 2.0752757949383516, "grad_norm": 0.21534525112062658, "learning_rate": 1.70995670995671e-05, "loss": 0.3371, "step": 1600 }, { "epoch": 2.0765736534717716, "grad_norm": 0.21827103094501965, "learning_rate": 1.7075517075517076e-05, "loss": 0.3357, "step": 1601 }, { "epoch": 2.0778715120051916, "grad_norm": 0.21047818536264323, "learning_rate": 1.705146705146705e-05, "loss": 0.3358, "step": 1602 }, { "epoch": 2.079169370538611, "grad_norm": 0.24359393587806577, "learning_rate": 1.702741702741703e-05, "loss": 0.3409, "step": 1603 }, { "epoch": 2.080467229072031, "grad_norm": 0.32771668536616283, "learning_rate": 1.7003367003367004e-05, "loss": 0.3296, "step": 1604 }, { "epoch": 2.081765087605451, "grad_norm": 0.2105738510166506, "learning_rate": 1.6979316979316982e-05, "loss": 0.3471, "step": 1605 }, { "epoch": 2.0830629461388708, "grad_norm": 0.2609805732511619, "learning_rate": 1.6955266955266957e-05, "loss": 0.3395, "step": 1606 }, { "epoch": 2.084360804672291, "grad_norm": 0.20813077643429093, "learning_rate": 1.693121693121693e-05, "loss": 0.3323, "step": 1607 }, { "epoch": 2.0856586632057104, "grad_norm": 0.20588845509767667, "learning_rate": 1.6907166907166906e-05, "loss": 0.3395, "step": 1608 }, { "epoch": 2.0869565217391304, "grad_norm": 0.2231300777445713, "learning_rate": 1.6883116883116884e-05, "loss": 0.3448, "step": 1609 }, { "epoch": 2.0882543802725504, "grad_norm": 0.2262598516285643, "learning_rate": 1.685906685906686e-05, "loss": 0.3524, "step": 1610 }, { "epoch": 2.08955223880597, "grad_norm": 0.2112479505042923, "learning_rate": 1.6835016835016837e-05, "loss": 0.335, "step": 1611 }, { "epoch": 2.09085009733939, "grad_norm": 0.22189847243097133, "learning_rate": 1.6810966810966812e-05, "loss": 0.3312, "step": 1612 }, { "epoch": 2.09214795587281, "grad_norm": 0.21750458672162346, "learning_rate": 1.6786916786916787e-05, "loss": 0.3382, "step": 1613 }, { "epoch": 2.0934458144062296, "grad_norm": 0.22791119863516698, "learning_rate": 1.6762866762866765e-05, "loss": 0.3346, "step": 1614 }, { "epoch": 2.0947436729396496, "grad_norm": 0.2253801306495037, "learning_rate": 1.673881673881674e-05, "loss": 0.3587, "step": 1615 }, { "epoch": 2.0960415314730696, "grad_norm": 0.21941342953368462, "learning_rate": 1.6714766714766718e-05, "loss": 0.322, "step": 1616 }, { "epoch": 2.097339390006489, "grad_norm": 0.24442422654973892, "learning_rate": 1.6690716690716693e-05, "loss": 0.3613, "step": 1617 }, { "epoch": 2.098637248539909, "grad_norm": 0.20892381899190737, "learning_rate": 1.6666666666666667e-05, "loss": 0.3372, "step": 1618 }, { "epoch": 2.099935107073329, "grad_norm": 0.2204239619365003, "learning_rate": 1.6642616642616642e-05, "loss": 0.345, "step": 1619 }, { "epoch": 2.1012329656067488, "grad_norm": 0.2433539902885864, "learning_rate": 1.661856661856662e-05, "loss": 0.3399, "step": 1620 }, { "epoch": 2.102530824140169, "grad_norm": 0.21756713439856754, "learning_rate": 1.6594516594516595e-05, "loss": 0.3258, "step": 1621 }, { "epoch": 2.103828682673589, "grad_norm": 0.21185754465198486, "learning_rate": 1.6570466570466573e-05, "loss": 0.3338, "step": 1622 }, { "epoch": 2.1051265412070084, "grad_norm": 0.19886634840357484, "learning_rate": 1.6546416546416545e-05, "loss": 0.3379, "step": 1623 }, { "epoch": 2.1064243997404284, "grad_norm": 0.23378633487014983, "learning_rate": 1.6522366522366523e-05, "loss": 0.3518, "step": 1624 }, { "epoch": 2.107722258273848, "grad_norm": 0.22367396102680723, "learning_rate": 1.6498316498316498e-05, "loss": 0.3341, "step": 1625 }, { "epoch": 2.109020116807268, "grad_norm": 0.2063911182342616, "learning_rate": 1.6474266474266476e-05, "loss": 0.3446, "step": 1626 }, { "epoch": 2.110317975340688, "grad_norm": 0.22050227302851286, "learning_rate": 1.645021645021645e-05, "loss": 0.3359, "step": 1627 }, { "epoch": 2.1116158338741076, "grad_norm": 0.24896315784423037, "learning_rate": 1.642616642616643e-05, "loss": 0.3311, "step": 1628 }, { "epoch": 2.1129136924075276, "grad_norm": 0.24014540903069778, "learning_rate": 1.6402116402116404e-05, "loss": 0.3455, "step": 1629 }, { "epoch": 2.1142115509409476, "grad_norm": 0.22208241595868494, "learning_rate": 1.637806637806638e-05, "loss": 0.3609, "step": 1630 }, { "epoch": 2.115509409474367, "grad_norm": 0.25185975208100675, "learning_rate": 1.6354016354016356e-05, "loss": 0.3355, "step": 1631 }, { "epoch": 2.116807268007787, "grad_norm": 0.25068577932779473, "learning_rate": 1.632996632996633e-05, "loss": 0.3378, "step": 1632 }, { "epoch": 2.118105126541207, "grad_norm": 0.23743465262529753, "learning_rate": 1.630591630591631e-05, "loss": 0.3453, "step": 1633 }, { "epoch": 2.1194029850746268, "grad_norm": 0.2873478730260964, "learning_rate": 1.628186628186628e-05, "loss": 0.3469, "step": 1634 }, { "epoch": 2.120700843608047, "grad_norm": 0.23190098964145786, "learning_rate": 1.625781625781626e-05, "loss": 0.3299, "step": 1635 }, { "epoch": 2.1219987021414664, "grad_norm": 0.24692649408648704, "learning_rate": 1.6233766233766234e-05, "loss": 0.3413, "step": 1636 }, { "epoch": 2.1232965606748864, "grad_norm": 0.2601882707422891, "learning_rate": 1.6209716209716212e-05, "loss": 0.3237, "step": 1637 }, { "epoch": 2.1245944192083064, "grad_norm": 0.2307540235499652, "learning_rate": 1.6185666185666187e-05, "loss": 0.3479, "step": 1638 }, { "epoch": 2.125892277741726, "grad_norm": 0.24207343026907496, "learning_rate": 1.6161616161616165e-05, "loss": 0.342, "step": 1639 }, { "epoch": 2.127190136275146, "grad_norm": 0.25704183477977455, "learning_rate": 1.6137566137566136e-05, "loss": 0.347, "step": 1640 }, { "epoch": 2.128487994808566, "grad_norm": 0.2204667533876293, "learning_rate": 1.6113516113516114e-05, "loss": 0.3421, "step": 1641 }, { "epoch": 2.1297858533419856, "grad_norm": 0.209970904431735, "learning_rate": 1.608946608946609e-05, "loss": 0.3434, "step": 1642 }, { "epoch": 2.1310837118754056, "grad_norm": 0.2322772777553896, "learning_rate": 1.6065416065416067e-05, "loss": 0.3462, "step": 1643 }, { "epoch": 2.1323815704088256, "grad_norm": 0.24539006715088627, "learning_rate": 1.6041366041366042e-05, "loss": 0.3474, "step": 1644 }, { "epoch": 2.133679428942245, "grad_norm": 0.2071657985362404, "learning_rate": 1.6017316017316017e-05, "loss": 0.315, "step": 1645 }, { "epoch": 2.134977287475665, "grad_norm": 0.22559015132345064, "learning_rate": 1.5993265993265995e-05, "loss": 0.3435, "step": 1646 }, { "epoch": 2.136275146009085, "grad_norm": 0.21866568313609377, "learning_rate": 1.596921596921597e-05, "loss": 0.3214, "step": 1647 }, { "epoch": 2.1375730045425048, "grad_norm": 0.22054227732700918, "learning_rate": 1.5945165945165948e-05, "loss": 0.3366, "step": 1648 }, { "epoch": 2.1388708630759248, "grad_norm": 0.2513837156975892, "learning_rate": 1.5921115921115923e-05, "loss": 0.3661, "step": 1649 }, { "epoch": 2.140168721609345, "grad_norm": 0.22217224038750824, "learning_rate": 1.5897065897065898e-05, "loss": 0.3377, "step": 1650 }, { "epoch": 2.1414665801427644, "grad_norm": 0.21936038896279167, "learning_rate": 1.5873015873015872e-05, "loss": 0.3326, "step": 1651 }, { "epoch": 2.1427644386761844, "grad_norm": 0.2064910202725993, "learning_rate": 1.584896584896585e-05, "loss": 0.3365, "step": 1652 }, { "epoch": 2.144062297209604, "grad_norm": 0.20100161814145587, "learning_rate": 1.5824915824915825e-05, "loss": 0.3278, "step": 1653 }, { "epoch": 2.145360155743024, "grad_norm": 0.21879340248910953, "learning_rate": 1.5800865800865803e-05, "loss": 0.3431, "step": 1654 }, { "epoch": 2.146658014276444, "grad_norm": 0.2129168285291938, "learning_rate": 1.5776815776815778e-05, "loss": 0.3503, "step": 1655 }, { "epoch": 2.1479558728098636, "grad_norm": 0.2091423056576906, "learning_rate": 1.5752765752765753e-05, "loss": 0.3492, "step": 1656 }, { "epoch": 2.1492537313432836, "grad_norm": 0.2828239816378975, "learning_rate": 1.5728715728715728e-05, "loss": 0.3564, "step": 1657 }, { "epoch": 2.1505515898767036, "grad_norm": 0.21413983501492062, "learning_rate": 1.5704665704665706e-05, "loss": 0.3544, "step": 1658 }, { "epoch": 2.151849448410123, "grad_norm": 0.21542714936607865, "learning_rate": 1.568061568061568e-05, "loss": 0.3399, "step": 1659 }, { "epoch": 2.153147306943543, "grad_norm": 0.23220715953725188, "learning_rate": 1.565656565656566e-05, "loss": 0.3322, "step": 1660 }, { "epoch": 2.154445165476963, "grad_norm": 0.20221618002166372, "learning_rate": 1.563251563251563e-05, "loss": 0.3297, "step": 1661 }, { "epoch": 2.1557430240103828, "grad_norm": 0.23075242395941706, "learning_rate": 1.560846560846561e-05, "loss": 0.3372, "step": 1662 }, { "epoch": 2.1570408825438028, "grad_norm": 0.2065260094429175, "learning_rate": 1.5584415584415583e-05, "loss": 0.3396, "step": 1663 }, { "epoch": 2.158338741077223, "grad_norm": 0.2048397557697161, "learning_rate": 1.556036556036556e-05, "loss": 0.338, "step": 1664 }, { "epoch": 2.1596365996106424, "grad_norm": 0.21058822127057414, "learning_rate": 1.5536315536315536e-05, "loss": 0.3475, "step": 1665 }, { "epoch": 2.1609344581440624, "grad_norm": 0.20856117486455514, "learning_rate": 1.5512265512265514e-05, "loss": 0.3565, "step": 1666 }, { "epoch": 2.162232316677482, "grad_norm": 0.2101578604301147, "learning_rate": 1.548821548821549e-05, "loss": 0.3341, "step": 1667 }, { "epoch": 2.163530175210902, "grad_norm": 0.1948843188896066, "learning_rate": 1.5464165464165464e-05, "loss": 0.3307, "step": 1668 }, { "epoch": 2.164828033744322, "grad_norm": 0.22409902293345668, "learning_rate": 1.5440115440115442e-05, "loss": 0.3484, "step": 1669 }, { "epoch": 2.1661258922777415, "grad_norm": 0.22671547696900604, "learning_rate": 1.5416065416065417e-05, "loss": 0.3523, "step": 1670 }, { "epoch": 2.1674237508111616, "grad_norm": 0.21140422198147263, "learning_rate": 1.5392015392015395e-05, "loss": 0.3455, "step": 1671 }, { "epoch": 2.1687216093445816, "grad_norm": 0.21245427765806485, "learning_rate": 1.5367965367965366e-05, "loss": 0.3429, "step": 1672 }, { "epoch": 2.170019467878001, "grad_norm": 0.20475755840127652, "learning_rate": 1.5343915343915344e-05, "loss": 0.334, "step": 1673 }, { "epoch": 2.171317326411421, "grad_norm": 0.22161947726300274, "learning_rate": 1.531986531986532e-05, "loss": 0.3445, "step": 1674 }, { "epoch": 2.172615184944841, "grad_norm": 0.20686901638420155, "learning_rate": 1.5295815295815297e-05, "loss": 0.3423, "step": 1675 }, { "epoch": 2.1739130434782608, "grad_norm": 0.21241410102955396, "learning_rate": 1.5271765271765272e-05, "loss": 0.3272, "step": 1676 }, { "epoch": 2.1752109020116808, "grad_norm": 0.20178219626778365, "learning_rate": 1.524771524771525e-05, "loss": 0.3429, "step": 1677 }, { "epoch": 2.176508760545101, "grad_norm": 0.24450118577688046, "learning_rate": 1.5223665223665223e-05, "loss": 0.3481, "step": 1678 }, { "epoch": 2.1778066190785204, "grad_norm": 0.20454351303651527, "learning_rate": 1.51996151996152e-05, "loss": 0.3581, "step": 1679 }, { "epoch": 2.1791044776119404, "grad_norm": 0.21613091983231933, "learning_rate": 1.5175565175565176e-05, "loss": 0.3299, "step": 1680 }, { "epoch": 2.1804023361453604, "grad_norm": 0.23413467617760164, "learning_rate": 1.5151515151515153e-05, "loss": 0.3622, "step": 1681 }, { "epoch": 2.18170019467878, "grad_norm": 0.21133908102093862, "learning_rate": 1.512746512746513e-05, "loss": 0.3323, "step": 1682 }, { "epoch": 2.1829980532122, "grad_norm": 0.2117083520838165, "learning_rate": 1.5103415103415102e-05, "loss": 0.3394, "step": 1683 }, { "epoch": 2.1842959117456195, "grad_norm": 0.21732760719194733, "learning_rate": 1.5079365079365079e-05, "loss": 0.3419, "step": 1684 }, { "epoch": 2.1855937702790396, "grad_norm": 0.20428681560868892, "learning_rate": 1.5055315055315055e-05, "loss": 0.3346, "step": 1685 }, { "epoch": 2.1868916288124596, "grad_norm": 0.21590103314227366, "learning_rate": 1.5031265031265032e-05, "loss": 0.3662, "step": 1686 }, { "epoch": 2.188189487345879, "grad_norm": 0.21412757670479102, "learning_rate": 1.5007215007215008e-05, "loss": 0.3334, "step": 1687 }, { "epoch": 2.189487345879299, "grad_norm": 0.2161794768756431, "learning_rate": 1.4983164983164985e-05, "loss": 0.334, "step": 1688 }, { "epoch": 2.190785204412719, "grad_norm": 0.2121354087724828, "learning_rate": 1.495911495911496e-05, "loss": 0.344, "step": 1689 }, { "epoch": 2.1920830629461387, "grad_norm": 0.20921645686294296, "learning_rate": 1.4935064935064936e-05, "loss": 0.3397, "step": 1690 }, { "epoch": 2.1933809214795588, "grad_norm": 0.20966308146376983, "learning_rate": 1.4911014911014912e-05, "loss": 0.3464, "step": 1691 }, { "epoch": 2.194678780012979, "grad_norm": 0.21908775100267822, "learning_rate": 1.4886964886964889e-05, "loss": 0.3317, "step": 1692 }, { "epoch": 2.1959766385463984, "grad_norm": 0.22468058324603687, "learning_rate": 1.4862914862914865e-05, "loss": 0.3407, "step": 1693 }, { "epoch": 2.1972744970798184, "grad_norm": 0.22075013149066078, "learning_rate": 1.4838864838864838e-05, "loss": 0.3283, "step": 1694 }, { "epoch": 2.198572355613238, "grad_norm": 0.21551594408964253, "learning_rate": 1.4814814814814815e-05, "loss": 0.3199, "step": 1695 }, { "epoch": 2.199870214146658, "grad_norm": 0.21485802609507748, "learning_rate": 1.4790764790764791e-05, "loss": 0.3287, "step": 1696 }, { "epoch": 2.201168072680078, "grad_norm": 0.2151673200316436, "learning_rate": 1.4766714766714768e-05, "loss": 0.3426, "step": 1697 }, { "epoch": 2.2024659312134975, "grad_norm": 0.22612559865377715, "learning_rate": 1.4742664742664744e-05, "loss": 0.3573, "step": 1698 }, { "epoch": 2.2037637897469176, "grad_norm": 0.21837527709726925, "learning_rate": 1.471861471861472e-05, "loss": 0.3325, "step": 1699 }, { "epoch": 2.2050616482803376, "grad_norm": 0.21236356177018392, "learning_rate": 1.4694564694564694e-05, "loss": 0.3553, "step": 1700 }, { "epoch": 2.206359506813757, "grad_norm": 0.19590709294592334, "learning_rate": 1.467051467051467e-05, "loss": 0.3215, "step": 1701 }, { "epoch": 2.207657365347177, "grad_norm": 0.22771176110161379, "learning_rate": 1.4646464646464647e-05, "loss": 0.3465, "step": 1702 }, { "epoch": 2.208955223880597, "grad_norm": 0.22229793471063694, "learning_rate": 1.4622414622414623e-05, "loss": 0.3432, "step": 1703 }, { "epoch": 2.2102530824140167, "grad_norm": 0.21252084935676113, "learning_rate": 1.45983645983646e-05, "loss": 0.3508, "step": 1704 }, { "epoch": 2.2115509409474368, "grad_norm": 0.2202632087669158, "learning_rate": 1.4574314574314573e-05, "loss": 0.3369, "step": 1705 }, { "epoch": 2.2128487994808568, "grad_norm": 0.21520394343961766, "learning_rate": 1.455026455026455e-05, "loss": 0.3327, "step": 1706 }, { "epoch": 2.2141466580142763, "grad_norm": 0.23322877877504564, "learning_rate": 1.4526214526214526e-05, "loss": 0.3491, "step": 1707 }, { "epoch": 2.2154445165476964, "grad_norm": 0.22519715240573737, "learning_rate": 1.4502164502164502e-05, "loss": 0.3403, "step": 1708 }, { "epoch": 2.2167423750811164, "grad_norm": 0.22210131676178194, "learning_rate": 1.4478114478114479e-05, "loss": 0.344, "step": 1709 }, { "epoch": 2.218040233614536, "grad_norm": 0.2119128486555464, "learning_rate": 1.4454064454064455e-05, "loss": 0.3562, "step": 1710 }, { "epoch": 2.219338092147956, "grad_norm": 0.26673890205097306, "learning_rate": 1.443001443001443e-05, "loss": 0.3284, "step": 1711 }, { "epoch": 2.2206359506813755, "grad_norm": 0.22633092379431619, "learning_rate": 1.4405964405964406e-05, "loss": 0.3383, "step": 1712 }, { "epoch": 2.2219338092147956, "grad_norm": 0.21248117786007845, "learning_rate": 1.4381914381914383e-05, "loss": 0.3458, "step": 1713 }, { "epoch": 2.2232316677482156, "grad_norm": 0.23708025124779677, "learning_rate": 1.435786435786436e-05, "loss": 0.3521, "step": 1714 }, { "epoch": 2.224529526281635, "grad_norm": 0.2216080470200345, "learning_rate": 1.4333814333814336e-05, "loss": 0.338, "step": 1715 }, { "epoch": 2.225827384815055, "grad_norm": 0.20064012115244553, "learning_rate": 1.4309764309764309e-05, "loss": 0.3323, "step": 1716 }, { "epoch": 2.227125243348475, "grad_norm": 0.22946959688466415, "learning_rate": 1.4285714285714285e-05, "loss": 0.3533, "step": 1717 }, { "epoch": 2.2284231018818947, "grad_norm": 0.23129226313271936, "learning_rate": 1.4261664261664262e-05, "loss": 0.3477, "step": 1718 }, { "epoch": 2.2297209604153148, "grad_norm": 0.23801346245682223, "learning_rate": 1.4237614237614238e-05, "loss": 0.3525, "step": 1719 }, { "epoch": 2.2310188189487348, "grad_norm": 0.20874849017749872, "learning_rate": 1.4213564213564215e-05, "loss": 0.3414, "step": 1720 }, { "epoch": 2.2323166774821543, "grad_norm": 0.21629318124043379, "learning_rate": 1.4189514189514191e-05, "loss": 0.3379, "step": 1721 }, { "epoch": 2.2336145360155744, "grad_norm": 0.22905806827791111, "learning_rate": 1.4165464165464164e-05, "loss": 0.337, "step": 1722 }, { "epoch": 2.234912394548994, "grad_norm": 0.2279391640803175, "learning_rate": 1.4141414141414141e-05, "loss": 0.3536, "step": 1723 }, { "epoch": 2.236210253082414, "grad_norm": 0.20743141809266266, "learning_rate": 1.4117364117364117e-05, "loss": 0.3331, "step": 1724 }, { "epoch": 2.237508111615834, "grad_norm": 0.21930312729137855, "learning_rate": 1.4093314093314094e-05, "loss": 0.3426, "step": 1725 }, { "epoch": 2.2388059701492535, "grad_norm": 0.21584060808323735, "learning_rate": 1.406926406926407e-05, "loss": 0.3326, "step": 1726 }, { "epoch": 2.2401038286826735, "grad_norm": 0.20452984796446286, "learning_rate": 1.4045214045214045e-05, "loss": 0.3407, "step": 1727 }, { "epoch": 2.2414016872160936, "grad_norm": 0.21278894400954876, "learning_rate": 1.4021164021164022e-05, "loss": 0.3272, "step": 1728 }, { "epoch": 2.242699545749513, "grad_norm": 0.19989940211340443, "learning_rate": 1.3997113997113998e-05, "loss": 0.3268, "step": 1729 }, { "epoch": 2.243997404282933, "grad_norm": 0.20756531216735427, "learning_rate": 1.3973063973063974e-05, "loss": 0.3428, "step": 1730 }, { "epoch": 2.245295262816353, "grad_norm": 0.2289163023889471, "learning_rate": 1.3949013949013951e-05, "loss": 0.3531, "step": 1731 }, { "epoch": 2.2465931213497727, "grad_norm": 0.19474714109733912, "learning_rate": 1.3924963924963927e-05, "loss": 0.3232, "step": 1732 }, { "epoch": 2.2478909798831928, "grad_norm": 0.20474938955793603, "learning_rate": 1.39009139009139e-05, "loss": 0.3368, "step": 1733 }, { "epoch": 2.2491888384166128, "grad_norm": 0.2134278581088506, "learning_rate": 1.3876863876863877e-05, "loss": 0.3332, "step": 1734 }, { "epoch": 2.2504866969500323, "grad_norm": 0.20557863182364947, "learning_rate": 1.3852813852813853e-05, "loss": 0.3189, "step": 1735 }, { "epoch": 2.2517845554834524, "grad_norm": 0.20536064005062202, "learning_rate": 1.382876382876383e-05, "loss": 0.3432, "step": 1736 }, { "epoch": 2.2530824140168724, "grad_norm": 0.20167055126244063, "learning_rate": 1.3804713804713806e-05, "loss": 0.3333, "step": 1737 }, { "epoch": 2.254380272550292, "grad_norm": 0.21070060115934763, "learning_rate": 1.378066378066378e-05, "loss": 0.3447, "step": 1738 }, { "epoch": 2.255678131083712, "grad_norm": 0.2207628970020607, "learning_rate": 1.3756613756613756e-05, "loss": 0.3583, "step": 1739 }, { "epoch": 2.256975989617132, "grad_norm": 0.2127435360633616, "learning_rate": 1.3732563732563732e-05, "loss": 0.3348, "step": 1740 }, { "epoch": 2.2582738481505515, "grad_norm": 0.21284122659552568, "learning_rate": 1.3708513708513709e-05, "loss": 0.3625, "step": 1741 }, { "epoch": 2.2595717066839716, "grad_norm": 0.1950008896432417, "learning_rate": 1.3684463684463685e-05, "loss": 0.3324, "step": 1742 }, { "epoch": 2.260869565217391, "grad_norm": 0.2020508328430615, "learning_rate": 1.3660413660413662e-05, "loss": 0.3479, "step": 1743 }, { "epoch": 2.262167423750811, "grad_norm": 0.20145545550530372, "learning_rate": 1.3636363636363637e-05, "loss": 0.3499, "step": 1744 }, { "epoch": 2.263465282284231, "grad_norm": 0.20947296790103498, "learning_rate": 1.3612313612313613e-05, "loss": 0.3499, "step": 1745 }, { "epoch": 2.2647631408176507, "grad_norm": 0.20527034801018748, "learning_rate": 1.358826358826359e-05, "loss": 0.3311, "step": 1746 }, { "epoch": 2.2660609993510707, "grad_norm": 0.21571814965895064, "learning_rate": 1.3564213564213566e-05, "loss": 0.3445, "step": 1747 }, { "epoch": 2.2673588578844908, "grad_norm": 0.1951070244580519, "learning_rate": 1.3540163540163542e-05, "loss": 0.3317, "step": 1748 }, { "epoch": 2.2686567164179103, "grad_norm": 0.20164098805440273, "learning_rate": 1.3516113516113516e-05, "loss": 0.3489, "step": 1749 }, { "epoch": 2.2699545749513304, "grad_norm": 0.2037235219687979, "learning_rate": 1.3492063492063492e-05, "loss": 0.3447, "step": 1750 }, { "epoch": 2.27125243348475, "grad_norm": 0.21220292515247122, "learning_rate": 1.3468013468013468e-05, "loss": 0.3395, "step": 1751 }, { "epoch": 2.27255029201817, "grad_norm": 0.2037471406983462, "learning_rate": 1.3443963443963445e-05, "loss": 0.3415, "step": 1752 }, { "epoch": 2.27384815055159, "grad_norm": 0.20298389579886292, "learning_rate": 1.3419913419913421e-05, "loss": 0.3374, "step": 1753 }, { "epoch": 2.2751460090850095, "grad_norm": 0.20691493521870025, "learning_rate": 1.3395863395863398e-05, "loss": 0.3519, "step": 1754 }, { "epoch": 2.2764438676184295, "grad_norm": 0.2013802963124298, "learning_rate": 1.3371813371813371e-05, "loss": 0.3411, "step": 1755 }, { "epoch": 2.2777417261518496, "grad_norm": 0.20436433898344716, "learning_rate": 1.3347763347763347e-05, "loss": 0.3447, "step": 1756 }, { "epoch": 2.279039584685269, "grad_norm": 0.1926683227358606, "learning_rate": 1.3323713323713324e-05, "loss": 0.3326, "step": 1757 }, { "epoch": 2.280337443218689, "grad_norm": 0.22362998880849946, "learning_rate": 1.32996632996633e-05, "loss": 0.338, "step": 1758 }, { "epoch": 2.281635301752109, "grad_norm": 0.19696154548860742, "learning_rate": 1.3275613275613277e-05, "loss": 0.3302, "step": 1759 }, { "epoch": 2.2829331602855287, "grad_norm": 0.21290597699073446, "learning_rate": 1.3251563251563252e-05, "loss": 0.3341, "step": 1760 }, { "epoch": 2.2842310188189487, "grad_norm": 0.2025599154856036, "learning_rate": 1.3227513227513228e-05, "loss": 0.3292, "step": 1761 }, { "epoch": 2.2855288773523688, "grad_norm": 0.20827197895684288, "learning_rate": 1.3203463203463205e-05, "loss": 0.348, "step": 1762 }, { "epoch": 2.2868267358857883, "grad_norm": 0.2031367197949487, "learning_rate": 1.3179413179413181e-05, "loss": 0.3353, "step": 1763 }, { "epoch": 2.2881245944192083, "grad_norm": 0.20119905187117781, "learning_rate": 1.3155363155363157e-05, "loss": 0.3291, "step": 1764 }, { "epoch": 2.2894224529526284, "grad_norm": 0.19920483051203258, "learning_rate": 1.3131313131313134e-05, "loss": 0.3475, "step": 1765 }, { "epoch": 2.290720311486048, "grad_norm": 0.3011556934952649, "learning_rate": 1.3107263107263107e-05, "loss": 0.3535, "step": 1766 }, { "epoch": 2.292018170019468, "grad_norm": 0.21100649854372194, "learning_rate": 1.3083213083213083e-05, "loss": 0.3469, "step": 1767 }, { "epoch": 2.293316028552888, "grad_norm": 0.1984331828091875, "learning_rate": 1.305916305916306e-05, "loss": 0.346, "step": 1768 }, { "epoch": 2.2946138870863075, "grad_norm": 0.2128841448679342, "learning_rate": 1.3035113035113036e-05, "loss": 0.3426, "step": 1769 }, { "epoch": 2.2959117456197276, "grad_norm": 0.20032623160831234, "learning_rate": 1.3011063011063013e-05, "loss": 0.3323, "step": 1770 }, { "epoch": 2.297209604153147, "grad_norm": 0.21020072070413953, "learning_rate": 1.2987012987012986e-05, "loss": 0.345, "step": 1771 }, { "epoch": 2.298507462686567, "grad_norm": 0.220140017454239, "learning_rate": 1.2962962962962962e-05, "loss": 0.338, "step": 1772 }, { "epoch": 2.299805321219987, "grad_norm": 0.20602192883284073, "learning_rate": 1.2938912938912939e-05, "loss": 0.3295, "step": 1773 }, { "epoch": 2.3011031797534067, "grad_norm": 0.19782214494971023, "learning_rate": 1.2914862914862915e-05, "loss": 0.3443, "step": 1774 }, { "epoch": 2.3024010382868267, "grad_norm": 0.2212349266901331, "learning_rate": 1.2890812890812892e-05, "loss": 0.3372, "step": 1775 }, { "epoch": 2.3036988968202468, "grad_norm": 0.20379287350155195, "learning_rate": 1.2866762866762868e-05, "loss": 0.3298, "step": 1776 }, { "epoch": 2.3049967553536663, "grad_norm": 0.22633645234017427, "learning_rate": 1.2842712842712843e-05, "loss": 0.3358, "step": 1777 }, { "epoch": 2.3062946138870863, "grad_norm": 0.22571485246122885, "learning_rate": 1.281866281866282e-05, "loss": 0.3535, "step": 1778 }, { "epoch": 2.3075924724205064, "grad_norm": 0.1971894196957368, "learning_rate": 1.2794612794612796e-05, "loss": 0.3437, "step": 1779 }, { "epoch": 2.308890330953926, "grad_norm": 0.21039001676553398, "learning_rate": 1.2770562770562773e-05, "loss": 0.3436, "step": 1780 }, { "epoch": 2.310188189487346, "grad_norm": 0.21586183237415976, "learning_rate": 1.2746512746512749e-05, "loss": 0.3371, "step": 1781 }, { "epoch": 2.3114860480207655, "grad_norm": 0.22449090420078008, "learning_rate": 1.2722462722462722e-05, "loss": 0.3426, "step": 1782 }, { "epoch": 2.3127839065541855, "grad_norm": 0.20000822387430392, "learning_rate": 1.2698412698412699e-05, "loss": 0.3273, "step": 1783 }, { "epoch": 2.3140817650876055, "grad_norm": 0.19796366453881245, "learning_rate": 1.2674362674362675e-05, "loss": 0.341, "step": 1784 }, { "epoch": 2.315379623621025, "grad_norm": 0.20625398962095803, "learning_rate": 1.2650312650312651e-05, "loss": 0.3327, "step": 1785 }, { "epoch": 2.316677482154445, "grad_norm": 0.2018595792474862, "learning_rate": 1.2626262626262628e-05, "loss": 0.3425, "step": 1786 }, { "epoch": 2.317975340687865, "grad_norm": 0.21810379177370443, "learning_rate": 1.2602212602212604e-05, "loss": 0.3411, "step": 1787 }, { "epoch": 2.3192731992212847, "grad_norm": 0.21122946645750976, "learning_rate": 1.2578162578162577e-05, "loss": 0.3519, "step": 1788 }, { "epoch": 2.3205710577547047, "grad_norm": 0.21682608106667023, "learning_rate": 1.2554112554112554e-05, "loss": 0.3391, "step": 1789 }, { "epoch": 2.3218689162881248, "grad_norm": 0.21596334841986267, "learning_rate": 1.253006253006253e-05, "loss": 0.3478, "step": 1790 }, { "epoch": 2.3231667748215443, "grad_norm": 0.21752697824305056, "learning_rate": 1.2506012506012507e-05, "loss": 0.3327, "step": 1791 }, { "epoch": 2.3244646333549643, "grad_norm": 0.19911928426673434, "learning_rate": 1.2481962481962482e-05, "loss": 0.3337, "step": 1792 }, { "epoch": 2.3257624918883844, "grad_norm": 0.21067483324446745, "learning_rate": 1.2457912457912458e-05, "loss": 0.3383, "step": 1793 }, { "epoch": 2.327060350421804, "grad_norm": 0.21855693528672904, "learning_rate": 1.2433862433862433e-05, "loss": 0.3392, "step": 1794 }, { "epoch": 2.328358208955224, "grad_norm": 0.2313881551522217, "learning_rate": 1.240981240981241e-05, "loss": 0.344, "step": 1795 }, { "epoch": 2.329656067488644, "grad_norm": 0.21647217328647403, "learning_rate": 1.2385762385762386e-05, "loss": 0.3498, "step": 1796 }, { "epoch": 2.3309539260220635, "grad_norm": 0.23563572241047098, "learning_rate": 1.2361712361712362e-05, "loss": 0.3492, "step": 1797 }, { "epoch": 2.3322517845554835, "grad_norm": 0.22879253600129817, "learning_rate": 1.2337662337662339e-05, "loss": 0.3476, "step": 1798 }, { "epoch": 2.3335496430889036, "grad_norm": 0.2194770239864021, "learning_rate": 1.2313612313612315e-05, "loss": 0.3499, "step": 1799 }, { "epoch": 2.334847501622323, "grad_norm": 0.2323432486506921, "learning_rate": 1.228956228956229e-05, "loss": 0.3498, "step": 1800 }, { "epoch": 2.336145360155743, "grad_norm": 0.2379829541757471, "learning_rate": 1.2265512265512267e-05, "loss": 0.3455, "step": 1801 }, { "epoch": 2.3374432186891627, "grad_norm": 0.22305188673020696, "learning_rate": 1.2241462241462243e-05, "loss": 0.3464, "step": 1802 }, { "epoch": 2.3387410772225827, "grad_norm": 0.20626742501221118, "learning_rate": 1.2217412217412218e-05, "loss": 0.3383, "step": 1803 }, { "epoch": 2.3400389357560027, "grad_norm": 0.20476597434634727, "learning_rate": 1.2193362193362194e-05, "loss": 0.3271, "step": 1804 }, { "epoch": 2.3413367942894223, "grad_norm": 0.2361285638480301, "learning_rate": 1.2169312169312169e-05, "loss": 0.363, "step": 1805 }, { "epoch": 2.3426346528228423, "grad_norm": 0.21885176640576573, "learning_rate": 1.2145262145262145e-05, "loss": 0.3322, "step": 1806 }, { "epoch": 2.3439325113562623, "grad_norm": 0.2186287737946408, "learning_rate": 1.2121212121212122e-05, "loss": 0.3517, "step": 1807 }, { "epoch": 2.345230369889682, "grad_norm": 0.22242925263530489, "learning_rate": 1.2097162097162097e-05, "loss": 0.3556, "step": 1808 }, { "epoch": 2.346528228423102, "grad_norm": 0.21839105455863506, "learning_rate": 1.2073112073112073e-05, "loss": 0.3537, "step": 1809 }, { "epoch": 2.3478260869565215, "grad_norm": 0.19288692731778082, "learning_rate": 1.204906204906205e-05, "loss": 0.3267, "step": 1810 }, { "epoch": 2.3491239454899415, "grad_norm": 0.21299009964604498, "learning_rate": 1.2025012025012024e-05, "loss": 0.3343, "step": 1811 }, { "epoch": 2.3504218040233615, "grad_norm": 0.20073387684513502, "learning_rate": 1.2000962000962001e-05, "loss": 0.3366, "step": 1812 }, { "epoch": 2.351719662556781, "grad_norm": 0.20924433976474296, "learning_rate": 1.1976911976911977e-05, "loss": 0.3503, "step": 1813 }, { "epoch": 2.353017521090201, "grad_norm": 0.2024959718616962, "learning_rate": 1.1952861952861954e-05, "loss": 0.3398, "step": 1814 }, { "epoch": 2.354315379623621, "grad_norm": 0.20136147992617448, "learning_rate": 1.192881192881193e-05, "loss": 0.3329, "step": 1815 }, { "epoch": 2.3556132381570407, "grad_norm": 0.2023856480954257, "learning_rate": 1.1904761904761905e-05, "loss": 0.3431, "step": 1816 }, { "epoch": 2.3569110966904607, "grad_norm": 0.19571106019001464, "learning_rate": 1.1880711880711882e-05, "loss": 0.3409, "step": 1817 }, { "epoch": 2.3582089552238807, "grad_norm": 0.20043918045056716, "learning_rate": 1.1856661856661858e-05, "loss": 0.3457, "step": 1818 }, { "epoch": 2.3595068137573003, "grad_norm": 0.20301799957042976, "learning_rate": 1.1832611832611833e-05, "loss": 0.358, "step": 1819 }, { "epoch": 2.3608046722907203, "grad_norm": 0.1985053306211951, "learning_rate": 1.180856180856181e-05, "loss": 0.3476, "step": 1820 }, { "epoch": 2.3621025308241403, "grad_norm": 0.1946833217729552, "learning_rate": 1.1784511784511786e-05, "loss": 0.3381, "step": 1821 }, { "epoch": 2.36340038935756, "grad_norm": 0.20694128635780762, "learning_rate": 1.176046176046176e-05, "loss": 0.3484, "step": 1822 }, { "epoch": 2.36469824789098, "grad_norm": 0.19680244677531286, "learning_rate": 1.1736411736411737e-05, "loss": 0.3412, "step": 1823 }, { "epoch": 2.3659961064244, "grad_norm": 0.19475058414044913, "learning_rate": 1.1712361712361713e-05, "loss": 0.3338, "step": 1824 }, { "epoch": 2.3672939649578195, "grad_norm": 0.20017845696117334, "learning_rate": 1.1688311688311688e-05, "loss": 0.3292, "step": 1825 }, { "epoch": 2.3685918234912395, "grad_norm": 0.21405181485690658, "learning_rate": 1.1664261664261665e-05, "loss": 0.3319, "step": 1826 }, { "epoch": 2.3698896820246595, "grad_norm": 0.2255906580166369, "learning_rate": 1.164021164021164e-05, "loss": 0.3583, "step": 1827 }, { "epoch": 2.371187540558079, "grad_norm": 0.19990250671178067, "learning_rate": 1.1616161616161616e-05, "loss": 0.337, "step": 1828 }, { "epoch": 2.372485399091499, "grad_norm": 0.19827952220037648, "learning_rate": 1.1592111592111592e-05, "loss": 0.3432, "step": 1829 }, { "epoch": 2.3737832576249187, "grad_norm": 0.19939279257051523, "learning_rate": 1.1568061568061569e-05, "loss": 0.3374, "step": 1830 }, { "epoch": 2.3750811161583387, "grad_norm": 0.1906800360211246, "learning_rate": 1.1544011544011545e-05, "loss": 0.3423, "step": 1831 }, { "epoch": 2.3763789746917587, "grad_norm": 0.2027119176012166, "learning_rate": 1.1519961519961522e-05, "loss": 0.3431, "step": 1832 }, { "epoch": 2.3776768332251783, "grad_norm": 0.20771653103434248, "learning_rate": 1.1495911495911497e-05, "loss": 0.3621, "step": 1833 }, { "epoch": 2.3789746917585983, "grad_norm": 0.18554441127828813, "learning_rate": 1.1471861471861473e-05, "loss": 0.326, "step": 1834 }, { "epoch": 2.3802725502920183, "grad_norm": 0.19747340923777565, "learning_rate": 1.144781144781145e-05, "loss": 0.3247, "step": 1835 }, { "epoch": 2.381570408825438, "grad_norm": 0.20647886448091093, "learning_rate": 1.1423761423761424e-05, "loss": 0.3341, "step": 1836 }, { "epoch": 2.382868267358858, "grad_norm": 0.1957627449624196, "learning_rate": 1.13997113997114e-05, "loss": 0.3328, "step": 1837 }, { "epoch": 2.3841661258922775, "grad_norm": 0.19525704689585352, "learning_rate": 1.1375661375661376e-05, "loss": 0.347, "step": 1838 }, { "epoch": 2.3854639844256975, "grad_norm": 0.20804623050610585, "learning_rate": 1.1351611351611352e-05, "loss": 0.3313, "step": 1839 }, { "epoch": 2.3867618429591175, "grad_norm": 0.21139171271078994, "learning_rate": 1.1327561327561329e-05, "loss": 0.3295, "step": 1840 }, { "epoch": 2.388059701492537, "grad_norm": 0.19695078516317913, "learning_rate": 1.1303511303511303e-05, "loss": 0.3244, "step": 1841 }, { "epoch": 2.389357560025957, "grad_norm": 0.21329704668287522, "learning_rate": 1.127946127946128e-05, "loss": 0.3496, "step": 1842 }, { "epoch": 2.390655418559377, "grad_norm": 0.19692253824786773, "learning_rate": 1.1255411255411256e-05, "loss": 0.3414, "step": 1843 }, { "epoch": 2.3919532770927967, "grad_norm": 0.20145116731281673, "learning_rate": 1.1231361231361231e-05, "loss": 0.3411, "step": 1844 }, { "epoch": 2.3932511356262167, "grad_norm": 0.21094040457540925, "learning_rate": 1.1207311207311207e-05, "loss": 0.3425, "step": 1845 }, { "epoch": 2.3945489941596367, "grad_norm": 0.20933806797553264, "learning_rate": 1.1183261183261184e-05, "loss": 0.3447, "step": 1846 }, { "epoch": 2.3958468526930563, "grad_norm": 0.22025133979918343, "learning_rate": 1.1159211159211159e-05, "loss": 0.3276, "step": 1847 }, { "epoch": 2.3971447112264763, "grad_norm": 0.2033688921914023, "learning_rate": 1.1135161135161135e-05, "loss": 0.3427, "step": 1848 }, { "epoch": 2.3984425697598963, "grad_norm": 0.20001119671379927, "learning_rate": 1.1111111111111112e-05, "loss": 0.3291, "step": 1849 }, { "epoch": 2.399740428293316, "grad_norm": 0.2096895946679153, "learning_rate": 1.1087061087061088e-05, "loss": 0.3529, "step": 1850 }, { "epoch": 2.401038286826736, "grad_norm": 0.20086557980683176, "learning_rate": 1.1063011063011065e-05, "loss": 0.3386, "step": 1851 }, { "epoch": 2.402336145360156, "grad_norm": 0.19654561939004062, "learning_rate": 1.103896103896104e-05, "loss": 0.347, "step": 1852 }, { "epoch": 2.4036340038935755, "grad_norm": 0.20190747910646842, "learning_rate": 1.1014911014911016e-05, "loss": 0.3355, "step": 1853 }, { "epoch": 2.4049318624269955, "grad_norm": 0.20073723518377382, "learning_rate": 1.0990860990860992e-05, "loss": 0.3506, "step": 1854 }, { "epoch": 2.4062297209604155, "grad_norm": 0.19812022485550956, "learning_rate": 1.0966810966810967e-05, "loss": 0.3326, "step": 1855 }, { "epoch": 2.407527579493835, "grad_norm": 0.20293093391898026, "learning_rate": 1.0942760942760944e-05, "loss": 0.3437, "step": 1856 }, { "epoch": 2.408825438027255, "grad_norm": 0.18445064925773152, "learning_rate": 1.091871091871092e-05, "loss": 0.3412, "step": 1857 }, { "epoch": 2.4101232965606747, "grad_norm": 0.18982196010225733, "learning_rate": 1.0894660894660895e-05, "loss": 0.3334, "step": 1858 }, { "epoch": 2.4114211550940947, "grad_norm": 0.19790842980140105, "learning_rate": 1.0870610870610871e-05, "loss": 0.341, "step": 1859 }, { "epoch": 2.4127190136275147, "grad_norm": 0.21825470803362326, "learning_rate": 1.0846560846560846e-05, "loss": 0.3463, "step": 1860 }, { "epoch": 2.4140168721609343, "grad_norm": 0.19742393864907667, "learning_rate": 1.0822510822510823e-05, "loss": 0.3275, "step": 1861 }, { "epoch": 2.4153147306943543, "grad_norm": 0.19224523357142126, "learning_rate": 1.0798460798460799e-05, "loss": 0.3325, "step": 1862 }, { "epoch": 2.4166125892277743, "grad_norm": 0.19938049624693138, "learning_rate": 1.0774410774410774e-05, "loss": 0.3262, "step": 1863 }, { "epoch": 2.417910447761194, "grad_norm": 0.19485806663699845, "learning_rate": 1.075036075036075e-05, "loss": 0.3357, "step": 1864 }, { "epoch": 2.419208306294614, "grad_norm": 0.20583844260408463, "learning_rate": 1.0726310726310727e-05, "loss": 0.3408, "step": 1865 }, { "epoch": 2.420506164828034, "grad_norm": 0.19685425374116253, "learning_rate": 1.0702260702260703e-05, "loss": 0.34, "step": 1866 }, { "epoch": 2.4218040233614535, "grad_norm": 0.19475698760728785, "learning_rate": 1.067821067821068e-05, "loss": 0.3415, "step": 1867 }, { "epoch": 2.4231018818948735, "grad_norm": 0.197170239994665, "learning_rate": 1.0654160654160656e-05, "loss": 0.3328, "step": 1868 }, { "epoch": 2.424399740428293, "grad_norm": 0.2312526818007591, "learning_rate": 1.0630110630110631e-05, "loss": 0.3517, "step": 1869 }, { "epoch": 2.425697598961713, "grad_norm": 0.21684364271507464, "learning_rate": 1.0606060606060607e-05, "loss": 0.3383, "step": 1870 }, { "epoch": 2.426995457495133, "grad_norm": 0.1985082584320083, "learning_rate": 1.0582010582010582e-05, "loss": 0.3246, "step": 1871 }, { "epoch": 2.4282933160285527, "grad_norm": 0.21714639429646007, "learning_rate": 1.0557960557960559e-05, "loss": 0.3497, "step": 1872 }, { "epoch": 2.4295911745619727, "grad_norm": 0.22321524923011035, "learning_rate": 1.0533910533910535e-05, "loss": 0.3553, "step": 1873 }, { "epoch": 2.4308890330953927, "grad_norm": 0.23372261304917366, "learning_rate": 1.050986050986051e-05, "loss": 0.3686, "step": 1874 }, { "epoch": 2.4321868916288123, "grad_norm": 0.20001574330722693, "learning_rate": 1.0485810485810486e-05, "loss": 0.3351, "step": 1875 }, { "epoch": 2.4334847501622323, "grad_norm": 0.20673565072923414, "learning_rate": 1.0461760461760463e-05, "loss": 0.3596, "step": 1876 }, { "epoch": 2.4347826086956523, "grad_norm": 0.20095320206649409, "learning_rate": 1.0437710437710438e-05, "loss": 0.3328, "step": 1877 }, { "epoch": 2.436080467229072, "grad_norm": 0.2789190666720269, "learning_rate": 1.0413660413660414e-05, "loss": 0.3509, "step": 1878 }, { "epoch": 2.437378325762492, "grad_norm": 0.2079166722723446, "learning_rate": 1.038961038961039e-05, "loss": 0.3492, "step": 1879 }, { "epoch": 2.438676184295912, "grad_norm": 0.21591855675091434, "learning_rate": 1.0365560365560365e-05, "loss": 0.3375, "step": 1880 }, { "epoch": 2.4399740428293315, "grad_norm": 0.21397550791993689, "learning_rate": 1.0341510341510342e-05, "loss": 0.3495, "step": 1881 }, { "epoch": 2.4412719013627515, "grad_norm": 0.3807729201699182, "learning_rate": 1.0317460317460318e-05, "loss": 0.3449, "step": 1882 }, { "epoch": 2.4425697598961715, "grad_norm": 0.19473145374740133, "learning_rate": 1.0293410293410295e-05, "loss": 0.3412, "step": 1883 }, { "epoch": 2.443867618429591, "grad_norm": 0.20643794857809838, "learning_rate": 1.0269360269360271e-05, "loss": 0.3548, "step": 1884 }, { "epoch": 2.445165476963011, "grad_norm": 0.19967818102155932, "learning_rate": 1.0245310245310246e-05, "loss": 0.3251, "step": 1885 }, { "epoch": 2.446463335496431, "grad_norm": 0.2068701964008534, "learning_rate": 1.0221260221260222e-05, "loss": 0.3503, "step": 1886 }, { "epoch": 2.4477611940298507, "grad_norm": 0.19485423370421984, "learning_rate": 1.0197210197210199e-05, "loss": 0.3316, "step": 1887 }, { "epoch": 2.4490590525632707, "grad_norm": 0.20945089504608722, "learning_rate": 1.0173160173160174e-05, "loss": 0.3723, "step": 1888 }, { "epoch": 2.4503569110966903, "grad_norm": 0.21961631414794303, "learning_rate": 1.014911014911015e-05, "loss": 0.3555, "step": 1889 }, { "epoch": 2.4516547696301103, "grad_norm": 0.20913009484803424, "learning_rate": 1.0125060125060125e-05, "loss": 0.3433, "step": 1890 }, { "epoch": 2.4529526281635303, "grad_norm": 0.1975805461078145, "learning_rate": 1.0101010101010101e-05, "loss": 0.3522, "step": 1891 }, { "epoch": 2.45425048669695, "grad_norm": 0.18824749115573988, "learning_rate": 1.0076960076960078e-05, "loss": 0.3262, "step": 1892 }, { "epoch": 2.45554834523037, "grad_norm": 0.19363390712933798, "learning_rate": 1.0052910052910053e-05, "loss": 0.3304, "step": 1893 }, { "epoch": 2.45684620376379, "grad_norm": 0.20877531448498393, "learning_rate": 1.0028860028860029e-05, "loss": 0.3502, "step": 1894 }, { "epoch": 2.4581440622972095, "grad_norm": 0.2061242277033731, "learning_rate": 1.0004810004810006e-05, "loss": 0.3382, "step": 1895 }, { "epoch": 2.4594419208306295, "grad_norm": 0.20527048632536887, "learning_rate": 9.98075998075998e-06, "loss": 0.3391, "step": 1896 }, { "epoch": 2.460739779364049, "grad_norm": 0.20055534262640298, "learning_rate": 9.956709956709957e-06, "loss": 0.3564, "step": 1897 }, { "epoch": 2.462037637897469, "grad_norm": 0.19785197665929594, "learning_rate": 9.932659932659933e-06, "loss": 0.3443, "step": 1898 }, { "epoch": 2.463335496430889, "grad_norm": 0.2037702638037453, "learning_rate": 9.908609908609908e-06, "loss": 0.348, "step": 1899 }, { "epoch": 2.4646333549643087, "grad_norm": 0.21220856247877268, "learning_rate": 9.884559884559884e-06, "loss": 0.3428, "step": 1900 }, { "epoch": 2.4659312134977287, "grad_norm": 0.2219575251336399, "learning_rate": 9.860509860509861e-06, "loss": 0.3418, "step": 1901 }, { "epoch": 2.4672290720311487, "grad_norm": 0.19647517582803717, "learning_rate": 9.836459836459837e-06, "loss": 0.3345, "step": 1902 }, { "epoch": 2.4685269305645683, "grad_norm": 0.19654210291881336, "learning_rate": 9.812409812409814e-06, "loss": 0.3439, "step": 1903 }, { "epoch": 2.4698247890979883, "grad_norm": 0.20215632118085225, "learning_rate": 9.788359788359789e-06, "loss": 0.343, "step": 1904 }, { "epoch": 2.4711226476314083, "grad_norm": 0.21700717313986337, "learning_rate": 9.764309764309765e-06, "loss": 0.3422, "step": 1905 }, { "epoch": 2.472420506164828, "grad_norm": 0.22639821333763582, "learning_rate": 9.740259740259742e-06, "loss": 0.3545, "step": 1906 }, { "epoch": 2.473718364698248, "grad_norm": 0.20194895913978017, "learning_rate": 9.716209716209716e-06, "loss": 0.3389, "step": 1907 }, { "epoch": 2.475016223231668, "grad_norm": 0.20577729584744323, "learning_rate": 9.692159692159693e-06, "loss": 0.3399, "step": 1908 }, { "epoch": 2.4763140817650875, "grad_norm": 0.19912299573383097, "learning_rate": 9.66810966810967e-06, "loss": 0.3373, "step": 1909 }, { "epoch": 2.4776119402985075, "grad_norm": 0.21269657354433175, "learning_rate": 9.644059644059644e-06, "loss": 0.3266, "step": 1910 }, { "epoch": 2.4789097988319275, "grad_norm": 0.20736564955023581, "learning_rate": 9.62000962000962e-06, "loss": 0.3778, "step": 1911 }, { "epoch": 2.480207657365347, "grad_norm": 0.20499397029892405, "learning_rate": 9.595959595959595e-06, "loss": 0.3476, "step": 1912 }, { "epoch": 2.481505515898767, "grad_norm": 0.21222717155575171, "learning_rate": 9.571909571909572e-06, "loss": 0.3614, "step": 1913 }, { "epoch": 2.482803374432187, "grad_norm": 0.19863505520154515, "learning_rate": 9.547859547859548e-06, "loss": 0.3405, "step": 1914 }, { "epoch": 2.4841012329656067, "grad_norm": 0.18907377927162114, "learning_rate": 9.523809523809523e-06, "loss": 0.3286, "step": 1915 }, { "epoch": 2.4853990914990267, "grad_norm": 0.21676854818994787, "learning_rate": 9.4997594997595e-06, "loss": 0.3464, "step": 1916 }, { "epoch": 2.4866969500324463, "grad_norm": 0.20682249845168457, "learning_rate": 9.475709475709476e-06, "loss": 0.3482, "step": 1917 }, { "epoch": 2.4879948085658663, "grad_norm": 0.20359288013276863, "learning_rate": 9.451659451659452e-06, "loss": 0.3361, "step": 1918 }, { "epoch": 2.4892926670992863, "grad_norm": 0.20782718092266766, "learning_rate": 9.427609427609429e-06, "loss": 0.3437, "step": 1919 }, { "epoch": 2.490590525632706, "grad_norm": 0.19751302818390967, "learning_rate": 9.403559403559405e-06, "loss": 0.3291, "step": 1920 }, { "epoch": 2.491888384166126, "grad_norm": 0.21156577333606844, "learning_rate": 9.37950937950938e-06, "loss": 0.3402, "step": 1921 }, { "epoch": 2.493186242699546, "grad_norm": 0.20413132600430678, "learning_rate": 9.355459355459357e-06, "loss": 0.3483, "step": 1922 }, { "epoch": 2.4944841012329655, "grad_norm": 0.1966838578810161, "learning_rate": 9.331409331409331e-06, "loss": 0.3412, "step": 1923 }, { "epoch": 2.4957819597663855, "grad_norm": 0.19973328891467468, "learning_rate": 9.307359307359308e-06, "loss": 0.3458, "step": 1924 }, { "epoch": 2.497079818299805, "grad_norm": 0.2020638046963019, "learning_rate": 9.283309283309284e-06, "loss": 0.3393, "step": 1925 }, { "epoch": 2.498377676833225, "grad_norm": 0.22766804942928376, "learning_rate": 9.259259259259259e-06, "loss": 0.3255, "step": 1926 }, { "epoch": 2.499675535366645, "grad_norm": 0.1997890343589566, "learning_rate": 9.235209235209236e-06, "loss": 0.3379, "step": 1927 }, { "epoch": 2.5009733939000647, "grad_norm": 0.19417057565689014, "learning_rate": 9.211159211159212e-06, "loss": 0.3362, "step": 1928 }, { "epoch": 2.5022712524334847, "grad_norm": 0.2159121005299441, "learning_rate": 9.187109187109187e-06, "loss": 0.333, "step": 1929 }, { "epoch": 2.5035691109669047, "grad_norm": 0.2152179356087276, "learning_rate": 9.163059163059163e-06, "loss": 0.3299, "step": 1930 }, { "epoch": 2.5048669695003243, "grad_norm": 0.19905359149185228, "learning_rate": 9.13900913900914e-06, "loss": 0.3402, "step": 1931 }, { "epoch": 2.5061648280337443, "grad_norm": 0.20691948697032309, "learning_rate": 9.114959114959115e-06, "loss": 0.3513, "step": 1932 }, { "epoch": 2.5074626865671643, "grad_norm": 0.19894235448528472, "learning_rate": 9.090909090909091e-06, "loss": 0.3288, "step": 1933 }, { "epoch": 2.508760545100584, "grad_norm": 0.21082966995035995, "learning_rate": 9.066859066859068e-06, "loss": 0.3332, "step": 1934 }, { "epoch": 2.510058403634004, "grad_norm": 0.20169901454411296, "learning_rate": 9.042809042809042e-06, "loss": 0.3223, "step": 1935 }, { "epoch": 2.511356262167424, "grad_norm": 0.21866454354506568, "learning_rate": 9.018759018759019e-06, "loss": 0.3392, "step": 1936 }, { "epoch": 2.5126541207008435, "grad_norm": 0.21257413698744998, "learning_rate": 8.994708994708995e-06, "loss": 0.3452, "step": 1937 }, { "epoch": 2.5139519792342635, "grad_norm": 0.19946752360820091, "learning_rate": 8.970658970658972e-06, "loss": 0.3426, "step": 1938 }, { "epoch": 2.5152498377676835, "grad_norm": 0.2048047114149695, "learning_rate": 8.946608946608948e-06, "loss": 0.3496, "step": 1939 }, { "epoch": 2.516547696301103, "grad_norm": 0.2060713206544726, "learning_rate": 8.922558922558923e-06, "loss": 0.3466, "step": 1940 }, { "epoch": 2.517845554834523, "grad_norm": 0.21499517524206285, "learning_rate": 8.8985088985089e-06, "loss": 0.3391, "step": 1941 }, { "epoch": 2.519143413367943, "grad_norm": 0.2043639715896453, "learning_rate": 8.874458874458876e-06, "loss": 0.3463, "step": 1942 }, { "epoch": 2.5204412719013627, "grad_norm": 0.2183414837407522, "learning_rate": 8.85040885040885e-06, "loss": 0.3427, "step": 1943 }, { "epoch": 2.5217391304347827, "grad_norm": 0.20285948319637043, "learning_rate": 8.826358826358827e-06, "loss": 0.3377, "step": 1944 }, { "epoch": 2.5230369889682027, "grad_norm": 0.19391653715006987, "learning_rate": 8.802308802308802e-06, "loss": 0.3431, "step": 1945 }, { "epoch": 2.5243348475016223, "grad_norm": 0.19725005677345353, "learning_rate": 8.778258778258778e-06, "loss": 0.3402, "step": 1946 }, { "epoch": 2.5256327060350423, "grad_norm": 0.20776675755315593, "learning_rate": 8.754208754208755e-06, "loss": 0.3432, "step": 1947 }, { "epoch": 2.526930564568462, "grad_norm": 0.20770257485418178, "learning_rate": 8.73015873015873e-06, "loss": 0.3309, "step": 1948 }, { "epoch": 2.528228423101882, "grad_norm": 0.19473645944952384, "learning_rate": 8.706108706108706e-06, "loss": 0.3335, "step": 1949 }, { "epoch": 2.529526281635302, "grad_norm": 0.20760911826948453, "learning_rate": 8.682058682058683e-06, "loss": 0.3418, "step": 1950 }, { "epoch": 2.5308241401687215, "grad_norm": 0.2094780858154728, "learning_rate": 8.658008658008657e-06, "loss": 0.3425, "step": 1951 }, { "epoch": 2.5321219987021415, "grad_norm": 0.19840367931370975, "learning_rate": 8.633958633958634e-06, "loss": 0.3417, "step": 1952 }, { "epoch": 2.533419857235561, "grad_norm": 0.19624410854082408, "learning_rate": 8.60990860990861e-06, "loss": 0.3456, "step": 1953 }, { "epoch": 2.534717715768981, "grad_norm": 0.19653675310930965, "learning_rate": 8.585858585858587e-06, "loss": 0.3287, "step": 1954 }, { "epoch": 2.536015574302401, "grad_norm": 0.2033148019089131, "learning_rate": 8.561808561808563e-06, "loss": 0.3416, "step": 1955 }, { "epoch": 2.5373134328358207, "grad_norm": 0.19360893151616687, "learning_rate": 8.537758537758538e-06, "loss": 0.3554, "step": 1956 }, { "epoch": 2.5386112913692407, "grad_norm": 0.18776659365965748, "learning_rate": 8.513708513708514e-06, "loss": 0.3394, "step": 1957 }, { "epoch": 2.5399091499026607, "grad_norm": 0.21687377756718906, "learning_rate": 8.489658489658491e-06, "loss": 0.3447, "step": 1958 }, { "epoch": 2.5412070084360803, "grad_norm": 0.19788163493461905, "learning_rate": 8.465608465608466e-06, "loss": 0.3383, "step": 1959 }, { "epoch": 2.5425048669695003, "grad_norm": 0.20991331629937682, "learning_rate": 8.441558441558442e-06, "loss": 0.3405, "step": 1960 }, { "epoch": 2.5438027255029203, "grad_norm": 0.20315647768031084, "learning_rate": 8.417508417508419e-06, "loss": 0.3454, "step": 1961 }, { "epoch": 2.54510058403634, "grad_norm": 0.23523075305367147, "learning_rate": 8.393458393458393e-06, "loss": 0.3618, "step": 1962 }, { "epoch": 2.54639844256976, "grad_norm": 0.2157971364103013, "learning_rate": 8.36940836940837e-06, "loss": 0.3497, "step": 1963 }, { "epoch": 2.54769630110318, "grad_norm": 0.20499915960426765, "learning_rate": 8.345358345358346e-06, "loss": 0.3309, "step": 1964 }, { "epoch": 2.5489941596365995, "grad_norm": 0.20698252650858404, "learning_rate": 8.321308321308321e-06, "loss": 0.3331, "step": 1965 }, { "epoch": 2.5502920181700195, "grad_norm": 0.20188066664017376, "learning_rate": 8.297258297258298e-06, "loss": 0.346, "step": 1966 }, { "epoch": 2.5515898767034395, "grad_norm": 0.22092684369504248, "learning_rate": 8.273208273208272e-06, "loss": 0.3487, "step": 1967 }, { "epoch": 2.552887735236859, "grad_norm": 0.20340970119145418, "learning_rate": 8.249158249158249e-06, "loss": 0.3341, "step": 1968 }, { "epoch": 2.554185593770279, "grad_norm": 0.20252572395415291, "learning_rate": 8.225108225108225e-06, "loss": 0.3332, "step": 1969 }, { "epoch": 2.555483452303699, "grad_norm": 0.1879834295873596, "learning_rate": 8.201058201058202e-06, "loss": 0.3212, "step": 1970 }, { "epoch": 2.5567813108371187, "grad_norm": 0.19299674074621231, "learning_rate": 8.177008177008178e-06, "loss": 0.3404, "step": 1971 }, { "epoch": 2.5580791693705387, "grad_norm": 0.20796973098772337, "learning_rate": 8.152958152958155e-06, "loss": 0.3389, "step": 1972 }, { "epoch": 2.5593770279039587, "grad_norm": 0.21058263937992205, "learning_rate": 8.12890812890813e-06, "loss": 0.3382, "step": 1973 }, { "epoch": 2.5606748864373783, "grad_norm": 0.19698041890440957, "learning_rate": 8.104858104858106e-06, "loss": 0.3406, "step": 1974 }, { "epoch": 2.5619727449707983, "grad_norm": 0.20441051012340986, "learning_rate": 8.080808080808082e-06, "loss": 0.3568, "step": 1975 }, { "epoch": 2.5632706035042183, "grad_norm": 0.20023706231769592, "learning_rate": 8.056758056758057e-06, "loss": 0.3453, "step": 1976 }, { "epoch": 2.564568462037638, "grad_norm": 0.19384483347770198, "learning_rate": 8.032708032708034e-06, "loss": 0.3541, "step": 1977 }, { "epoch": 2.565866320571058, "grad_norm": 0.1902457140768143, "learning_rate": 8.008658008658008e-06, "loss": 0.3259, "step": 1978 }, { "epoch": 2.5671641791044775, "grad_norm": 0.2065028347871094, "learning_rate": 7.984607984607985e-06, "loss": 0.3543, "step": 1979 }, { "epoch": 2.5684620376378975, "grad_norm": 0.19772657046385608, "learning_rate": 7.960557960557961e-06, "loss": 0.3353, "step": 1980 }, { "epoch": 2.569759896171317, "grad_norm": 0.18849021172503813, "learning_rate": 7.936507936507936e-06, "loss": 0.3223, "step": 1981 }, { "epoch": 2.571057754704737, "grad_norm": 0.208888741548615, "learning_rate": 7.912457912457913e-06, "loss": 0.3444, "step": 1982 }, { "epoch": 2.572355613238157, "grad_norm": 0.22525608656131163, "learning_rate": 7.888407888407889e-06, "loss": 0.3361, "step": 1983 }, { "epoch": 2.5736534717715767, "grad_norm": 0.207515663353734, "learning_rate": 7.864357864357864e-06, "loss": 0.3368, "step": 1984 }, { "epoch": 2.5749513303049967, "grad_norm": 0.2154517901626009, "learning_rate": 7.84030784030784e-06, "loss": 0.3345, "step": 1985 }, { "epoch": 2.5762491888384167, "grad_norm": 0.2059272342950417, "learning_rate": 7.816257816257815e-06, "loss": 0.3657, "step": 1986 }, { "epoch": 2.5775470473718363, "grad_norm": 0.20247815650907755, "learning_rate": 7.792207792207792e-06, "loss": 0.3248, "step": 1987 }, { "epoch": 2.5788449059052563, "grad_norm": 0.18967243424535923, "learning_rate": 7.768157768157768e-06, "loss": 0.3345, "step": 1988 }, { "epoch": 2.5801427644386763, "grad_norm": 0.20514510031749197, "learning_rate": 7.744107744107745e-06, "loss": 0.3407, "step": 1989 }, { "epoch": 2.581440622972096, "grad_norm": 0.2047332936097657, "learning_rate": 7.720057720057721e-06, "loss": 0.3404, "step": 1990 }, { "epoch": 2.582738481505516, "grad_norm": 0.19001160907023656, "learning_rate": 7.696007696007697e-06, "loss": 0.3363, "step": 1991 }, { "epoch": 2.584036340038936, "grad_norm": 0.2007791373982576, "learning_rate": 7.671957671957672e-06, "loss": 0.345, "step": 1992 }, { "epoch": 2.5853341985723555, "grad_norm": 0.1995014007677766, "learning_rate": 7.647907647907649e-06, "loss": 0.3373, "step": 1993 }, { "epoch": 2.5866320571057755, "grad_norm": 0.22960201823541218, "learning_rate": 7.623857623857625e-06, "loss": 0.35, "step": 1994 }, { "epoch": 2.5879299156391955, "grad_norm": 0.2041708033798508, "learning_rate": 7.5998075998076e-06, "loss": 0.3299, "step": 1995 }, { "epoch": 2.589227774172615, "grad_norm": 0.19021132164070584, "learning_rate": 7.5757575757575764e-06, "loss": 0.3434, "step": 1996 }, { "epoch": 2.590525632706035, "grad_norm": 0.19160256744066273, "learning_rate": 7.551707551707551e-06, "loss": 0.3376, "step": 1997 }, { "epoch": 2.591823491239455, "grad_norm": 0.1955416018816942, "learning_rate": 7.527657527657528e-06, "loss": 0.3407, "step": 1998 }, { "epoch": 2.5931213497728747, "grad_norm": 0.1994363091424892, "learning_rate": 7.503607503607504e-06, "loss": 0.3271, "step": 1999 }, { "epoch": 2.5944192083062947, "grad_norm": 0.19719206828799127, "learning_rate": 7.47955747955748e-06, "loss": 0.3491, "step": 2000 }, { "epoch": 2.5957170668397147, "grad_norm": 0.21854928408023744, "learning_rate": 7.455507455507456e-06, "loss": 0.3559, "step": 2001 }, { "epoch": 2.5970149253731343, "grad_norm": 0.2004433126098414, "learning_rate": 7.431457431457433e-06, "loss": 0.353, "step": 2002 }, { "epoch": 2.5983127839065543, "grad_norm": 0.21521520240395595, "learning_rate": 7.4074074074074075e-06, "loss": 0.3533, "step": 2003 }, { "epoch": 2.5996106424399743, "grad_norm": 0.1904741360498635, "learning_rate": 7.383357383357384e-06, "loss": 0.3375, "step": 2004 }, { "epoch": 2.600908500973394, "grad_norm": 0.20148594815991572, "learning_rate": 7.35930735930736e-06, "loss": 0.3404, "step": 2005 }, { "epoch": 2.602206359506814, "grad_norm": 0.20222565117203797, "learning_rate": 7.335257335257335e-06, "loss": 0.3325, "step": 2006 }, { "epoch": 2.6035042180402335, "grad_norm": 0.2086022602926794, "learning_rate": 7.311207311207312e-06, "loss": 0.3323, "step": 2007 }, { "epoch": 2.6048020765736535, "grad_norm": 0.19695846789935645, "learning_rate": 7.2871572871572864e-06, "loss": 0.3419, "step": 2008 }, { "epoch": 2.6060999351070735, "grad_norm": 0.2003548511519371, "learning_rate": 7.263107263107263e-06, "loss": 0.3356, "step": 2009 }, { "epoch": 2.607397793640493, "grad_norm": 0.20970980037255266, "learning_rate": 7.239057239057239e-06, "loss": 0.3385, "step": 2010 }, { "epoch": 2.608695652173913, "grad_norm": 0.19898685255635887, "learning_rate": 7.215007215007215e-06, "loss": 0.3427, "step": 2011 }, { "epoch": 2.6099935107073327, "grad_norm": 0.20226511821516943, "learning_rate": 7.1909571909571915e-06, "loss": 0.3272, "step": 2012 }, { "epoch": 2.6112913692407527, "grad_norm": 0.18832236422091425, "learning_rate": 7.166907166907168e-06, "loss": 0.3354, "step": 2013 }, { "epoch": 2.6125892277741727, "grad_norm": 0.18327010064211902, "learning_rate": 7.142857142857143e-06, "loss": 0.3286, "step": 2014 }, { "epoch": 2.6138870863075923, "grad_norm": 0.19895899041097867, "learning_rate": 7.118807118807119e-06, "loss": 0.3297, "step": 2015 }, { "epoch": 2.6151849448410123, "grad_norm": 0.2069075643758518, "learning_rate": 7.094757094757096e-06, "loss": 0.3302, "step": 2016 }, { "epoch": 2.6164828033744323, "grad_norm": 0.19524313780033972, "learning_rate": 7.0707070707070704e-06, "loss": 0.3365, "step": 2017 }, { "epoch": 2.617780661907852, "grad_norm": 0.19938059659766813, "learning_rate": 7.046657046657047e-06, "loss": 0.3338, "step": 2018 }, { "epoch": 2.619078520441272, "grad_norm": 0.19493636421814658, "learning_rate": 7.0226070226070225e-06, "loss": 0.3315, "step": 2019 }, { "epoch": 2.620376378974692, "grad_norm": 0.19839871511366805, "learning_rate": 6.998556998556999e-06, "loss": 0.3328, "step": 2020 }, { "epoch": 2.6216742375081115, "grad_norm": 0.20131069199570611, "learning_rate": 6.9745069745069755e-06, "loss": 0.3509, "step": 2021 }, { "epoch": 2.6229720960415315, "grad_norm": 0.19462988449384028, "learning_rate": 6.95045695045695e-06, "loss": 0.3491, "step": 2022 }, { "epoch": 2.6242699545749515, "grad_norm": 0.21282789287292148, "learning_rate": 6.926406926406927e-06, "loss": 0.34, "step": 2023 }, { "epoch": 2.625567813108371, "grad_norm": 0.1993074561130325, "learning_rate": 6.902356902356903e-06, "loss": 0.3466, "step": 2024 }, { "epoch": 2.626865671641791, "grad_norm": 0.20351150350698377, "learning_rate": 6.878306878306878e-06, "loss": 0.3598, "step": 2025 }, { "epoch": 2.628163530175211, "grad_norm": 0.19771094464637703, "learning_rate": 6.854256854256854e-06, "loss": 0.336, "step": 2026 }, { "epoch": 2.6294613887086307, "grad_norm": 0.21482701918153888, "learning_rate": 6.830206830206831e-06, "loss": 0.34, "step": 2027 }, { "epoch": 2.6307592472420507, "grad_norm": 0.2003506016053636, "learning_rate": 6.8061568061568065e-06, "loss": 0.3337, "step": 2028 }, { "epoch": 2.6320571057754707, "grad_norm": 0.19105711872310324, "learning_rate": 6.782106782106783e-06, "loss": 0.3384, "step": 2029 }, { "epoch": 2.6333549643088903, "grad_norm": 0.20159758006212594, "learning_rate": 6.758056758056758e-06, "loss": 0.3375, "step": 2030 }, { "epoch": 2.6346528228423103, "grad_norm": 0.20143060805909305, "learning_rate": 6.734006734006734e-06, "loss": 0.3491, "step": 2031 }, { "epoch": 2.6359506813757303, "grad_norm": 0.19866829942887484, "learning_rate": 6.709956709956711e-06, "loss": 0.3376, "step": 2032 }, { "epoch": 2.63724853990915, "grad_norm": 0.2037924476485615, "learning_rate": 6.6859066859066855e-06, "loss": 0.3353, "step": 2033 }, { "epoch": 2.63854639844257, "grad_norm": 0.19840737231971423, "learning_rate": 6.661856661856662e-06, "loss": 0.3419, "step": 2034 }, { "epoch": 2.6398442569759895, "grad_norm": 0.2012069800711804, "learning_rate": 6.637806637806638e-06, "loss": 0.3374, "step": 2035 }, { "epoch": 2.6411421155094095, "grad_norm": 0.2083568986903432, "learning_rate": 6.613756613756614e-06, "loss": 0.3535, "step": 2036 }, { "epoch": 2.6424399740428295, "grad_norm": 0.19697725314574688, "learning_rate": 6.5897065897065905e-06, "loss": 0.3449, "step": 2037 }, { "epoch": 2.643737832576249, "grad_norm": 0.19818150221871475, "learning_rate": 6.565656565656567e-06, "loss": 0.3409, "step": 2038 }, { "epoch": 2.645035691109669, "grad_norm": 0.1978527267292534, "learning_rate": 6.541606541606542e-06, "loss": 0.3373, "step": 2039 }, { "epoch": 2.6463335496430886, "grad_norm": 0.18635435314872623, "learning_rate": 6.517556517556518e-06, "loss": 0.3342, "step": 2040 }, { "epoch": 2.6476314081765087, "grad_norm": 0.19790356410543516, "learning_rate": 6.493506493506493e-06, "loss": 0.3436, "step": 2041 }, { "epoch": 2.6489292667099287, "grad_norm": 0.20316872420087156, "learning_rate": 6.4694564694564695e-06, "loss": 0.3448, "step": 2042 }, { "epoch": 2.6502271252433482, "grad_norm": 0.19838963723019196, "learning_rate": 6.445406445406446e-06, "loss": 0.3467, "step": 2043 }, { "epoch": 2.6515249837767683, "grad_norm": 0.19768166770591664, "learning_rate": 6.4213564213564216e-06, "loss": 0.3412, "step": 2044 }, { "epoch": 2.6528228423101883, "grad_norm": 0.18870632230633216, "learning_rate": 6.397306397306398e-06, "loss": 0.3451, "step": 2045 }, { "epoch": 2.654120700843608, "grad_norm": 0.19129582515190627, "learning_rate": 6.3732563732563745e-06, "loss": 0.3454, "step": 2046 }, { "epoch": 2.655418559377028, "grad_norm": 0.20596889818566627, "learning_rate": 6.349206349206349e-06, "loss": 0.3534, "step": 2047 }, { "epoch": 2.656716417910448, "grad_norm": 0.1951677844464869, "learning_rate": 6.325156325156326e-06, "loss": 0.3465, "step": 2048 }, { "epoch": 2.6580142764438675, "grad_norm": 0.1839023960232542, "learning_rate": 6.301106301106302e-06, "loss": 0.3237, "step": 2049 }, { "epoch": 2.6593121349772875, "grad_norm": 0.18411994909099402, "learning_rate": 6.277056277056277e-06, "loss": 0.3402, "step": 2050 }, { "epoch": 2.6606099935107075, "grad_norm": 0.19031776842501638, "learning_rate": 6.2530062530062535e-06, "loss": 0.3354, "step": 2051 }, { "epoch": 2.661907852044127, "grad_norm": 0.21384183187184516, "learning_rate": 6.228956228956229e-06, "loss": 0.342, "step": 2052 }, { "epoch": 2.663205710577547, "grad_norm": 0.18478906620074495, "learning_rate": 6.204906204906205e-06, "loss": 0.3348, "step": 2053 }, { "epoch": 2.664503569110967, "grad_norm": 0.20269895193238666, "learning_rate": 6.180856180856181e-06, "loss": 0.3351, "step": 2054 }, { "epoch": 2.6658014276443867, "grad_norm": 0.20051102629513898, "learning_rate": 6.156806156806158e-06, "loss": 0.3385, "step": 2055 }, { "epoch": 2.6670992861778067, "grad_norm": 0.18318606924324213, "learning_rate": 6.132756132756133e-06, "loss": 0.3364, "step": 2056 }, { "epoch": 2.6683971447112267, "grad_norm": 0.19493171826087835, "learning_rate": 6.108706108706109e-06, "loss": 0.3348, "step": 2057 }, { "epoch": 2.6696950032446463, "grad_norm": 0.18393836124851373, "learning_rate": 6.0846560846560845e-06, "loss": 0.3364, "step": 2058 }, { "epoch": 2.6709928617780663, "grad_norm": 0.19573395789299228, "learning_rate": 6.060606060606061e-06, "loss": 0.3501, "step": 2059 }, { "epoch": 2.6722907203114863, "grad_norm": 0.19289300424261566, "learning_rate": 6.036556036556037e-06, "loss": 0.3331, "step": 2060 }, { "epoch": 2.673588578844906, "grad_norm": 0.20391450990957627, "learning_rate": 6.012506012506012e-06, "loss": 0.3523, "step": 2061 }, { "epoch": 2.674886437378326, "grad_norm": 0.20029237980281211, "learning_rate": 5.988455988455989e-06, "loss": 0.3333, "step": 2062 }, { "epoch": 2.676184295911746, "grad_norm": 0.1866620677693766, "learning_rate": 5.964405964405965e-06, "loss": 0.3311, "step": 2063 }, { "epoch": 2.6774821544451655, "grad_norm": 0.19121945839733376, "learning_rate": 5.940355940355941e-06, "loss": 0.3411, "step": 2064 }, { "epoch": 2.6787800129785855, "grad_norm": 0.19768175445567718, "learning_rate": 5.916305916305916e-06, "loss": 0.3331, "step": 2065 }, { "epoch": 2.680077871512005, "grad_norm": 0.19564286501355985, "learning_rate": 5.892255892255893e-06, "loss": 0.3494, "step": 2066 }, { "epoch": 2.681375730045425, "grad_norm": 0.18680263316102796, "learning_rate": 5.8682058682058685e-06, "loss": 0.3261, "step": 2067 }, { "epoch": 2.6826735885788446, "grad_norm": 0.18888920820250896, "learning_rate": 5.844155844155844e-06, "loss": 0.3319, "step": 2068 }, { "epoch": 2.6839714471122647, "grad_norm": 0.19397052933336428, "learning_rate": 5.82010582010582e-06, "loss": 0.3361, "step": 2069 }, { "epoch": 2.6852693056456847, "grad_norm": 0.19077138187186174, "learning_rate": 5.796055796055796e-06, "loss": 0.3299, "step": 2070 }, { "epoch": 2.6865671641791042, "grad_norm": 0.21127811092361148, "learning_rate": 5.772005772005773e-06, "loss": 0.3351, "step": 2071 }, { "epoch": 2.6878650227125243, "grad_norm": 0.19439235797880994, "learning_rate": 5.747955747955748e-06, "loss": 0.3311, "step": 2072 }, { "epoch": 2.6891628812459443, "grad_norm": 0.18768581482039637, "learning_rate": 5.723905723905725e-06, "loss": 0.3408, "step": 2073 }, { "epoch": 2.690460739779364, "grad_norm": 0.20150498398104075, "learning_rate": 5.6998556998557e-06, "loss": 0.3297, "step": 2074 }, { "epoch": 2.691758598312784, "grad_norm": 0.2030708714806736, "learning_rate": 5.675805675805676e-06, "loss": 0.3464, "step": 2075 }, { "epoch": 2.693056456846204, "grad_norm": 0.19331515757587614, "learning_rate": 5.651755651755652e-06, "loss": 0.341, "step": 2076 }, { "epoch": 2.6943543153796234, "grad_norm": 0.20425020368275043, "learning_rate": 5.627705627705628e-06, "loss": 0.3533, "step": 2077 }, { "epoch": 2.6956521739130435, "grad_norm": 0.2332987934303732, "learning_rate": 5.603655603655604e-06, "loss": 0.3395, "step": 2078 }, { "epoch": 2.6969500324464635, "grad_norm": 0.19691484544610982, "learning_rate": 5.579605579605579e-06, "loss": 0.3493, "step": 2079 }, { "epoch": 2.698247890979883, "grad_norm": 0.2052051327790077, "learning_rate": 5.555555555555556e-06, "loss": 0.3622, "step": 2080 }, { "epoch": 2.699545749513303, "grad_norm": 0.19298325427314583, "learning_rate": 5.531505531505532e-06, "loss": 0.3275, "step": 2081 }, { "epoch": 2.700843608046723, "grad_norm": 0.19205634748728384, "learning_rate": 5.507455507455508e-06, "loss": 0.3364, "step": 2082 }, { "epoch": 2.7021414665801426, "grad_norm": 0.199120001609843, "learning_rate": 5.4834054834054835e-06, "loss": 0.3604, "step": 2083 }, { "epoch": 2.7034393251135627, "grad_norm": 0.19279039644707233, "learning_rate": 5.45935545935546e-06, "loss": 0.3531, "step": 2084 }, { "epoch": 2.7047371836469827, "grad_norm": 0.19816454924229257, "learning_rate": 5.435305435305436e-06, "loss": 0.3386, "step": 2085 }, { "epoch": 2.7060350421804023, "grad_norm": 0.1978192250026057, "learning_rate": 5.411255411255411e-06, "loss": 0.3353, "step": 2086 }, { "epoch": 2.7073329007138223, "grad_norm": 0.1866947813546459, "learning_rate": 5.387205387205387e-06, "loss": 0.3361, "step": 2087 }, { "epoch": 2.7086307592472423, "grad_norm": 0.19130243354442364, "learning_rate": 5.363155363155363e-06, "loss": 0.3339, "step": 2088 }, { "epoch": 2.709928617780662, "grad_norm": 0.20123966523314857, "learning_rate": 5.33910533910534e-06, "loss": 0.352, "step": 2089 }, { "epoch": 2.711226476314082, "grad_norm": 0.204082189254048, "learning_rate": 5.3150553150553154e-06, "loss": 0.3465, "step": 2090 }, { "epoch": 2.712524334847502, "grad_norm": 0.18978137246025184, "learning_rate": 5.291005291005291e-06, "loss": 0.3436, "step": 2091 }, { "epoch": 2.7138221933809215, "grad_norm": 0.18714999050678918, "learning_rate": 5.2669552669552675e-06, "loss": 0.3191, "step": 2092 }, { "epoch": 2.7151200519143415, "grad_norm": 0.19155333104247205, "learning_rate": 5.242905242905243e-06, "loss": 0.3298, "step": 2093 }, { "epoch": 2.716417910447761, "grad_norm": 0.19864886304787258, "learning_rate": 5.218855218855219e-06, "loss": 0.337, "step": 2094 }, { "epoch": 2.717715768981181, "grad_norm": 0.20900011549166994, "learning_rate": 5.194805194805195e-06, "loss": 0.3351, "step": 2095 }, { "epoch": 2.719013627514601, "grad_norm": 0.19283552005981475, "learning_rate": 5.170755170755171e-06, "loss": 0.3415, "step": 2096 }, { "epoch": 2.7203114860480206, "grad_norm": 0.19735002707288396, "learning_rate": 5.146705146705147e-06, "loss": 0.3511, "step": 2097 }, { "epoch": 2.7216093445814407, "grad_norm": 0.19007560174944538, "learning_rate": 5.122655122655123e-06, "loss": 0.3337, "step": 2098 }, { "epoch": 2.7229072031148602, "grad_norm": 0.1945528470127734, "learning_rate": 5.0986050986050994e-06, "loss": 0.3333, "step": 2099 }, { "epoch": 2.7242050616482802, "grad_norm": 0.19541816607187268, "learning_rate": 5.074555074555075e-06, "loss": 0.3375, "step": 2100 }, { "epoch": 2.7255029201817003, "grad_norm": 0.1965347736989204, "learning_rate": 5.050505050505051e-06, "loss": 0.3374, "step": 2101 }, { "epoch": 2.72680077871512, "grad_norm": 0.20176319781405136, "learning_rate": 5.026455026455026e-06, "loss": 0.3292, "step": 2102 }, { "epoch": 2.72809863724854, "grad_norm": 0.20334136636920452, "learning_rate": 5.002405002405003e-06, "loss": 0.3546, "step": 2103 }, { "epoch": 2.72939649578196, "grad_norm": 0.20179646454271286, "learning_rate": 4.978354978354978e-06, "loss": 0.3401, "step": 2104 }, { "epoch": 2.7306943543153794, "grad_norm": 0.20239489126680496, "learning_rate": 4.954304954304954e-06, "loss": 0.362, "step": 2105 }, { "epoch": 2.7319922128487995, "grad_norm": 0.1870990703654753, "learning_rate": 4.9302549302549305e-06, "loss": 0.3443, "step": 2106 }, { "epoch": 2.7332900713822195, "grad_norm": 0.18557759206233268, "learning_rate": 4.906204906204907e-06, "loss": 0.3313, "step": 2107 }, { "epoch": 2.734587929915639, "grad_norm": 0.19525861207810466, "learning_rate": 4.8821548821548826e-06, "loss": 0.3471, "step": 2108 }, { "epoch": 2.735885788449059, "grad_norm": 0.1863895368451734, "learning_rate": 4.858104858104858e-06, "loss": 0.3302, "step": 2109 }, { "epoch": 2.737183646982479, "grad_norm": 0.19520260521522104, "learning_rate": 4.834054834054835e-06, "loss": 0.3285, "step": 2110 }, { "epoch": 2.7384815055158986, "grad_norm": 0.19296947043696586, "learning_rate": 4.81000481000481e-06, "loss": 0.3493, "step": 2111 }, { "epoch": 2.7397793640493187, "grad_norm": 0.19207052807282113, "learning_rate": 4.785954785954786e-06, "loss": 0.3557, "step": 2112 }, { "epoch": 2.7410772225827387, "grad_norm": 0.19274032451154754, "learning_rate": 4.7619047619047615e-06, "loss": 0.3363, "step": 2113 }, { "epoch": 2.7423750811161582, "grad_norm": 0.1991025079179398, "learning_rate": 4.737854737854738e-06, "loss": 0.3383, "step": 2114 }, { "epoch": 2.7436729396495783, "grad_norm": 0.20322726939044952, "learning_rate": 4.7138047138047145e-06, "loss": 0.3309, "step": 2115 }, { "epoch": 2.7449707981829983, "grad_norm": 0.1953261545822402, "learning_rate": 4.68975468975469e-06, "loss": 0.3324, "step": 2116 }, { "epoch": 2.746268656716418, "grad_norm": 0.20572870381079225, "learning_rate": 4.665704665704666e-06, "loss": 0.3519, "step": 2117 }, { "epoch": 2.747566515249838, "grad_norm": 0.1896514715456808, "learning_rate": 4.641654641654642e-06, "loss": 0.3269, "step": 2118 }, { "epoch": 2.748864373783258, "grad_norm": 0.20395631514698148, "learning_rate": 4.617604617604618e-06, "loss": 0.3378, "step": 2119 }, { "epoch": 2.7501622323166774, "grad_norm": 0.18572879939359394, "learning_rate": 4.5935545935545934e-06, "loss": 0.3577, "step": 2120 }, { "epoch": 2.7514600908500975, "grad_norm": 0.1888314312446457, "learning_rate": 4.56950456950457e-06, "loss": 0.3323, "step": 2121 }, { "epoch": 2.752757949383517, "grad_norm": 0.19430952155026918, "learning_rate": 4.5454545454545455e-06, "loss": 0.3439, "step": 2122 }, { "epoch": 2.754055807916937, "grad_norm": 0.42636074237494337, "learning_rate": 4.521404521404521e-06, "loss": 0.3473, "step": 2123 }, { "epoch": 2.755353666450357, "grad_norm": 0.2086969030917497, "learning_rate": 4.497354497354498e-06, "loss": 0.3651, "step": 2124 }, { "epoch": 2.7566515249837766, "grad_norm": 0.18713017378956393, "learning_rate": 4.473304473304474e-06, "loss": 0.3391, "step": 2125 }, { "epoch": 2.7579493835171967, "grad_norm": 0.1997677484748378, "learning_rate": 4.44925444925445e-06, "loss": 0.3296, "step": 2126 }, { "epoch": 2.7592472420506162, "grad_norm": 0.19085510775569323, "learning_rate": 4.425204425204425e-06, "loss": 0.3306, "step": 2127 }, { "epoch": 2.7605451005840362, "grad_norm": 0.1897332096553123, "learning_rate": 4.401154401154401e-06, "loss": 0.3319, "step": 2128 }, { "epoch": 2.7618429591174563, "grad_norm": 0.18447234964642742, "learning_rate": 4.377104377104377e-06, "loss": 0.3381, "step": 2129 }, { "epoch": 2.763140817650876, "grad_norm": 0.19659685588771536, "learning_rate": 4.353054353054353e-06, "loss": 0.3411, "step": 2130 }, { "epoch": 2.764438676184296, "grad_norm": 0.19285636988233915, "learning_rate": 4.329004329004329e-06, "loss": 0.3363, "step": 2131 }, { "epoch": 2.765736534717716, "grad_norm": 0.1873294345390938, "learning_rate": 4.304954304954305e-06, "loss": 0.3479, "step": 2132 }, { "epoch": 2.7670343932511354, "grad_norm": 0.20641352605961297, "learning_rate": 4.280904280904282e-06, "loss": 0.3479, "step": 2133 }, { "epoch": 2.7683322517845554, "grad_norm": 0.19611830886976564, "learning_rate": 4.256854256854257e-06, "loss": 0.3598, "step": 2134 }, { "epoch": 2.7696301103179755, "grad_norm": 0.19362363231720492, "learning_rate": 4.232804232804233e-06, "loss": 0.3587, "step": 2135 }, { "epoch": 2.770927968851395, "grad_norm": 0.18330247717729053, "learning_rate": 4.208754208754209e-06, "loss": 0.3395, "step": 2136 }, { "epoch": 2.772225827384815, "grad_norm": 0.194933200111678, "learning_rate": 4.184704184704185e-06, "loss": 0.3508, "step": 2137 }, { "epoch": 2.773523685918235, "grad_norm": 0.21351784132569623, "learning_rate": 4.1606541606541606e-06, "loss": 0.3569, "step": 2138 }, { "epoch": 2.7748215444516546, "grad_norm": 0.2022298125861802, "learning_rate": 4.136604136604136e-06, "loss": 0.3452, "step": 2139 }, { "epoch": 2.7761194029850746, "grad_norm": 0.19008703004655647, "learning_rate": 4.112554112554113e-06, "loss": 0.3261, "step": 2140 }, { "epoch": 2.7774172615184947, "grad_norm": 0.1996334755494925, "learning_rate": 4.088504088504089e-06, "loss": 0.3348, "step": 2141 }, { "epoch": 2.7787151200519142, "grad_norm": 0.2016162252782046, "learning_rate": 4.064454064454065e-06, "loss": 0.339, "step": 2142 }, { "epoch": 2.7800129785853342, "grad_norm": 0.20145152124370821, "learning_rate": 4.040404040404041e-06, "loss": 0.3384, "step": 2143 }, { "epoch": 2.7813108371187543, "grad_norm": 0.20921251143803626, "learning_rate": 4.016354016354017e-06, "loss": 0.3449, "step": 2144 }, { "epoch": 2.782608695652174, "grad_norm": 0.18838640289015987, "learning_rate": 3.9923039923039925e-06, "loss": 0.3249, "step": 2145 }, { "epoch": 2.783906554185594, "grad_norm": 0.203141862957814, "learning_rate": 3.968253968253968e-06, "loss": 0.3474, "step": 2146 }, { "epoch": 2.785204412719014, "grad_norm": 0.18959922111635913, "learning_rate": 3.9442039442039446e-06, "loss": 0.3358, "step": 2147 }, { "epoch": 2.7865022712524334, "grad_norm": 0.2034563222680623, "learning_rate": 3.92015392015392e-06, "loss": 0.3399, "step": 2148 }, { "epoch": 2.7878001297858535, "grad_norm": 0.197654010004629, "learning_rate": 3.896103896103896e-06, "loss": 0.3384, "step": 2149 }, { "epoch": 2.7890979883192735, "grad_norm": 0.20680720078235312, "learning_rate": 3.872053872053872e-06, "loss": 0.3326, "step": 2150 }, { "epoch": 2.790395846852693, "grad_norm": 0.1927827514450044, "learning_rate": 3.848003848003849e-06, "loss": 0.3276, "step": 2151 }, { "epoch": 2.791693705386113, "grad_norm": 0.1899459712119537, "learning_rate": 3.823953823953824e-06, "loss": 0.3368, "step": 2152 }, { "epoch": 2.7929915639195326, "grad_norm": 0.18276333170806777, "learning_rate": 3.7999037999038e-06, "loss": 0.3252, "step": 2153 }, { "epoch": 2.7942894224529526, "grad_norm": 0.19256844716061763, "learning_rate": 3.7758537758537756e-06, "loss": 0.3483, "step": 2154 }, { "epoch": 2.795587280986372, "grad_norm": 0.18473354170932832, "learning_rate": 3.751803751803752e-06, "loss": 0.3329, "step": 2155 }, { "epoch": 2.7968851395197922, "grad_norm": 0.20272211241013205, "learning_rate": 3.727753727753728e-06, "loss": 0.363, "step": 2156 }, { "epoch": 2.7981829980532122, "grad_norm": 0.210319586938317, "learning_rate": 3.7037037037037037e-06, "loss": 0.3547, "step": 2157 }, { "epoch": 2.799480856586632, "grad_norm": 0.18619320502174647, "learning_rate": 3.67965367965368e-06, "loss": 0.3282, "step": 2158 }, { "epoch": 2.800778715120052, "grad_norm": 0.1772781571540208, "learning_rate": 3.655603655603656e-06, "loss": 0.33, "step": 2159 }, { "epoch": 2.802076573653472, "grad_norm": 0.19401337158226317, "learning_rate": 3.6315536315536315e-06, "loss": 0.3418, "step": 2160 }, { "epoch": 2.8033744321868914, "grad_norm": 0.20093342296511638, "learning_rate": 3.6075036075036075e-06, "loss": 0.3324, "step": 2161 }, { "epoch": 2.8046722907203114, "grad_norm": 0.18887641527097687, "learning_rate": 3.583453583453584e-06, "loss": 0.3256, "step": 2162 }, { "epoch": 2.8059701492537314, "grad_norm": 0.17961558157640115, "learning_rate": 3.5594035594035596e-06, "loss": 0.3328, "step": 2163 }, { "epoch": 2.807268007787151, "grad_norm": 0.20347532828593798, "learning_rate": 3.5353535353535352e-06, "loss": 0.3421, "step": 2164 }, { "epoch": 2.808565866320571, "grad_norm": 0.18010386527024905, "learning_rate": 3.5113035113035113e-06, "loss": 0.326, "step": 2165 }, { "epoch": 2.809863724853991, "grad_norm": 0.18682472145471216, "learning_rate": 3.4872534872534877e-06, "loss": 0.3277, "step": 2166 }, { "epoch": 2.8111615833874106, "grad_norm": 0.18668496331694528, "learning_rate": 3.4632034632034634e-06, "loss": 0.3441, "step": 2167 }, { "epoch": 2.8124594419208306, "grad_norm": 0.18876447344150002, "learning_rate": 3.439153439153439e-06, "loss": 0.3543, "step": 2168 }, { "epoch": 2.8137573004542507, "grad_norm": 0.1880026989268264, "learning_rate": 3.4151034151034154e-06, "loss": 0.3419, "step": 2169 }, { "epoch": 2.8150551589876702, "grad_norm": 0.19326058199934312, "learning_rate": 3.3910533910533915e-06, "loss": 0.3332, "step": 2170 }, { "epoch": 2.8163530175210902, "grad_norm": 0.18329023377490067, "learning_rate": 3.367003367003367e-06, "loss": 0.3632, "step": 2171 }, { "epoch": 2.8176508760545103, "grad_norm": 0.19371890235019304, "learning_rate": 3.3429533429533427e-06, "loss": 0.3377, "step": 2172 }, { "epoch": 2.81894873458793, "grad_norm": 0.18600523979644987, "learning_rate": 3.318903318903319e-06, "loss": 0.3469, "step": 2173 }, { "epoch": 2.82024659312135, "grad_norm": 0.19389227471137455, "learning_rate": 3.2948532948532953e-06, "loss": 0.3387, "step": 2174 }, { "epoch": 2.82154445165477, "grad_norm": 0.18954299093028096, "learning_rate": 3.270803270803271e-06, "loss": 0.3279, "step": 2175 }, { "epoch": 2.8228423101881894, "grad_norm": 0.19152410986871543, "learning_rate": 3.2467532467532465e-06, "loss": 0.3374, "step": 2176 }, { "epoch": 2.8241401687216094, "grad_norm": 0.196063104407719, "learning_rate": 3.222703222703223e-06, "loss": 0.3385, "step": 2177 }, { "epoch": 2.8254380272550295, "grad_norm": 0.18850542953314792, "learning_rate": 3.198653198653199e-06, "loss": 0.349, "step": 2178 }, { "epoch": 2.826735885788449, "grad_norm": 0.20124590955928826, "learning_rate": 3.1746031746031746e-06, "loss": 0.3415, "step": 2179 }, { "epoch": 2.828033744321869, "grad_norm": 0.20248550914830157, "learning_rate": 3.150553150553151e-06, "loss": 0.3461, "step": 2180 }, { "epoch": 2.8293316028552886, "grad_norm": 0.20035006058516966, "learning_rate": 3.1265031265031267e-06, "loss": 0.3432, "step": 2181 }, { "epoch": 2.8306294613887086, "grad_norm": 0.1855009910859687, "learning_rate": 3.1024531024531023e-06, "loss": 0.3187, "step": 2182 }, { "epoch": 2.8319273199221286, "grad_norm": 0.2143582348750643, "learning_rate": 3.078403078403079e-06, "loss": 0.3405, "step": 2183 }, { "epoch": 2.833225178455548, "grad_norm": 0.19882736391050926, "learning_rate": 3.0543530543530544e-06, "loss": 0.3437, "step": 2184 }, { "epoch": 2.8345230369889682, "grad_norm": 0.19373263047996803, "learning_rate": 3.0303030303030305e-06, "loss": 0.3391, "step": 2185 }, { "epoch": 2.835820895522388, "grad_norm": 0.19378870740750723, "learning_rate": 3.006253006253006e-06, "loss": 0.3478, "step": 2186 }, { "epoch": 2.837118754055808, "grad_norm": 0.24107367256249349, "learning_rate": 2.9822029822029826e-06, "loss": 0.3458, "step": 2187 }, { "epoch": 2.838416612589228, "grad_norm": 0.18521053064833032, "learning_rate": 2.958152958152958e-06, "loss": 0.3383, "step": 2188 }, { "epoch": 2.8397144711226474, "grad_norm": 0.1881021394765785, "learning_rate": 2.9341029341029342e-06, "loss": 0.3517, "step": 2189 }, { "epoch": 2.8410123296560674, "grad_norm": 0.20171700206005888, "learning_rate": 2.91005291005291e-06, "loss": 0.3689, "step": 2190 }, { "epoch": 2.8423101881894874, "grad_norm": 0.18467275960567497, "learning_rate": 2.8860028860028863e-06, "loss": 0.3271, "step": 2191 }, { "epoch": 2.843608046722907, "grad_norm": 0.1935758058556294, "learning_rate": 2.8619528619528624e-06, "loss": 0.3457, "step": 2192 }, { "epoch": 2.844905905256327, "grad_norm": 0.20313128506718564, "learning_rate": 2.837902837902838e-06, "loss": 0.3315, "step": 2193 }, { "epoch": 2.846203763789747, "grad_norm": 0.2694894225052093, "learning_rate": 2.813852813852814e-06, "loss": 0.3651, "step": 2194 }, { "epoch": 2.8475016223231666, "grad_norm": 0.191890934429557, "learning_rate": 2.7898027898027897e-06, "loss": 0.3457, "step": 2195 }, { "epoch": 2.8487994808565866, "grad_norm": 0.18753053839949832, "learning_rate": 2.765752765752766e-06, "loss": 0.3438, "step": 2196 }, { "epoch": 2.8500973393900066, "grad_norm": 0.18148036317355992, "learning_rate": 2.7417027417027418e-06, "loss": 0.3271, "step": 2197 }, { "epoch": 2.851395197923426, "grad_norm": 0.20235355112701617, "learning_rate": 2.717652717652718e-06, "loss": 0.3468, "step": 2198 }, { "epoch": 2.8526930564568462, "grad_norm": 0.19882040438487505, "learning_rate": 2.6936026936026934e-06, "loss": 0.3649, "step": 2199 }, { "epoch": 2.8539909149902662, "grad_norm": 0.1919921579971501, "learning_rate": 2.66955266955267e-06, "loss": 0.3373, "step": 2200 }, { "epoch": 2.855288773523686, "grad_norm": 0.19166967508036267, "learning_rate": 2.6455026455026455e-06, "loss": 0.3407, "step": 2201 }, { "epoch": 2.856586632057106, "grad_norm": 0.18413982998209266, "learning_rate": 2.6214526214526216e-06, "loss": 0.3281, "step": 2202 }, { "epoch": 2.857884490590526, "grad_norm": 0.1963183914870018, "learning_rate": 2.5974025974025976e-06, "loss": 0.3519, "step": 2203 }, { "epoch": 2.8591823491239454, "grad_norm": 0.19124327719338702, "learning_rate": 2.5733525733525737e-06, "loss": 0.3546, "step": 2204 }, { "epoch": 2.8604802076573654, "grad_norm": 0.1908455405574935, "learning_rate": 2.5493025493025497e-06, "loss": 0.3441, "step": 2205 }, { "epoch": 2.8617780661907855, "grad_norm": 0.20074510401322135, "learning_rate": 2.5252525252525253e-06, "loss": 0.3623, "step": 2206 }, { "epoch": 2.863075924724205, "grad_norm": 0.19215908226190542, "learning_rate": 2.5012025012025014e-06, "loss": 0.3323, "step": 2207 }, { "epoch": 2.864373783257625, "grad_norm": 0.18434791337783116, "learning_rate": 2.477152477152477e-06, "loss": 0.3319, "step": 2208 }, { "epoch": 2.8656716417910446, "grad_norm": 0.19538722948283108, "learning_rate": 2.4531024531024535e-06, "loss": 0.3275, "step": 2209 }, { "epoch": 2.8669695003244646, "grad_norm": 0.1882905413491712, "learning_rate": 2.429052429052429e-06, "loss": 0.3301, "step": 2210 }, { "epoch": 2.8682673588578846, "grad_norm": 0.18413079624889964, "learning_rate": 2.405002405002405e-06, "loss": 0.3316, "step": 2211 }, { "epoch": 2.869565217391304, "grad_norm": 0.18114340544426985, "learning_rate": 2.3809523809523808e-06, "loss": 0.3293, "step": 2212 }, { "epoch": 2.8708630759247242, "grad_norm": 0.18339952929530665, "learning_rate": 2.3569023569023572e-06, "loss": 0.3385, "step": 2213 }, { "epoch": 2.872160934458144, "grad_norm": 0.19149046899206099, "learning_rate": 2.332852332852333e-06, "loss": 0.3362, "step": 2214 }, { "epoch": 2.873458792991564, "grad_norm": 0.21144971266274312, "learning_rate": 2.308802308802309e-06, "loss": 0.3426, "step": 2215 }, { "epoch": 2.874756651524984, "grad_norm": 0.18252025886810355, "learning_rate": 2.284752284752285e-06, "loss": 0.3461, "step": 2216 }, { "epoch": 2.8760545100584034, "grad_norm": 0.18661014168815393, "learning_rate": 2.2607022607022606e-06, "loss": 0.3532, "step": 2217 }, { "epoch": 2.8773523685918234, "grad_norm": 0.19921497008286657, "learning_rate": 2.236652236652237e-06, "loss": 0.3456, "step": 2218 }, { "epoch": 2.8786502271252434, "grad_norm": 0.19199558097836697, "learning_rate": 2.2126022126022127e-06, "loss": 0.336, "step": 2219 }, { "epoch": 2.879948085658663, "grad_norm": 0.18697397277980365, "learning_rate": 2.1885521885521887e-06, "loss": 0.3274, "step": 2220 }, { "epoch": 2.881245944192083, "grad_norm": 0.18651465996633548, "learning_rate": 2.1645021645021643e-06, "loss": 0.3362, "step": 2221 }, { "epoch": 2.882543802725503, "grad_norm": 0.18811253057615676, "learning_rate": 2.140452140452141e-06, "loss": 0.3441, "step": 2222 }, { "epoch": 2.8838416612589226, "grad_norm": 0.1807114251046355, "learning_rate": 2.1164021164021164e-06, "loss": 0.3237, "step": 2223 }, { "epoch": 2.8851395197923426, "grad_norm": 0.18668302514006135, "learning_rate": 2.0923520923520925e-06, "loss": 0.3556, "step": 2224 }, { "epoch": 2.8864373783257626, "grad_norm": 0.1951670359448047, "learning_rate": 2.068302068302068e-06, "loss": 0.3181, "step": 2225 }, { "epoch": 2.887735236859182, "grad_norm": 0.1874121175894903, "learning_rate": 2.0442520442520446e-06, "loss": 0.3447, "step": 2226 }, { "epoch": 2.8890330953926022, "grad_norm": 0.18533011275342226, "learning_rate": 2.0202020202020206e-06, "loss": 0.3363, "step": 2227 }, { "epoch": 2.8903309539260222, "grad_norm": 0.18793474012414535, "learning_rate": 1.9961519961519962e-06, "loss": 0.345, "step": 2228 }, { "epoch": 2.891628812459442, "grad_norm": 0.18922319032385707, "learning_rate": 1.9721019721019723e-06, "loss": 0.3386, "step": 2229 }, { "epoch": 2.892926670992862, "grad_norm": 0.19294630111421893, "learning_rate": 1.948051948051948e-06, "loss": 0.3513, "step": 2230 }, { "epoch": 2.894224529526282, "grad_norm": 0.1831585914628477, "learning_rate": 1.9240019240019244e-06, "loss": 0.3185, "step": 2231 }, { "epoch": 2.8955223880597014, "grad_norm": 0.18959540248425852, "learning_rate": 1.8999518999519e-06, "loss": 0.3448, "step": 2232 }, { "epoch": 2.8968202465931214, "grad_norm": 0.18655567267427287, "learning_rate": 1.875901875901876e-06, "loss": 0.3262, "step": 2233 }, { "epoch": 2.8981181051265414, "grad_norm": 0.18952603893507794, "learning_rate": 1.8518518518518519e-06, "loss": 0.3282, "step": 2234 }, { "epoch": 2.899415963659961, "grad_norm": 0.1876885696047965, "learning_rate": 1.827801827801828e-06, "loss": 0.3435, "step": 2235 }, { "epoch": 2.900713822193381, "grad_norm": 0.1865624408496654, "learning_rate": 1.8037518037518038e-06, "loss": 0.3331, "step": 2236 }, { "epoch": 2.902011680726801, "grad_norm": 0.19122160432352084, "learning_rate": 1.7797017797017798e-06, "loss": 0.3371, "step": 2237 }, { "epoch": 2.9033095392602206, "grad_norm": 0.19352943277773518, "learning_rate": 1.7556517556517556e-06, "loss": 0.3405, "step": 2238 }, { "epoch": 2.9046073977936406, "grad_norm": 0.19319000412284978, "learning_rate": 1.7316017316017317e-06, "loss": 0.3399, "step": 2239 }, { "epoch": 2.90590525632706, "grad_norm": 0.19327464804923486, "learning_rate": 1.7075517075517077e-06, "loss": 0.3319, "step": 2240 }, { "epoch": 2.90720311486048, "grad_norm": 0.19976992892290674, "learning_rate": 1.6835016835016836e-06, "loss": 0.3432, "step": 2241 }, { "epoch": 2.9085009733939, "grad_norm": 0.1892168913000648, "learning_rate": 1.6594516594516596e-06, "loss": 0.3463, "step": 2242 }, { "epoch": 2.90979883192732, "grad_norm": 0.19443589296751324, "learning_rate": 1.6354016354016354e-06, "loss": 0.357, "step": 2243 }, { "epoch": 2.91109669046074, "grad_norm": 0.18449321307823713, "learning_rate": 1.6113516113516115e-06, "loss": 0.345, "step": 2244 }, { "epoch": 2.9123945489941594, "grad_norm": 0.1858254182171351, "learning_rate": 1.5873015873015873e-06, "loss": 0.3447, "step": 2245 }, { "epoch": 2.9136924075275794, "grad_norm": 0.2004597208671706, "learning_rate": 1.5632515632515634e-06, "loss": 0.3366, "step": 2246 }, { "epoch": 2.9149902660609994, "grad_norm": 0.18322062527491037, "learning_rate": 1.5392015392015394e-06, "loss": 0.3375, "step": 2247 }, { "epoch": 2.916288124594419, "grad_norm": 0.1905983562403602, "learning_rate": 1.5151515151515152e-06, "loss": 0.3411, "step": 2248 }, { "epoch": 2.917585983127839, "grad_norm": 0.1867573407190652, "learning_rate": 1.4911014911014913e-06, "loss": 0.336, "step": 2249 }, { "epoch": 2.918883841661259, "grad_norm": 0.19058206255135468, "learning_rate": 1.4670514670514671e-06, "loss": 0.3474, "step": 2250 }, { "epoch": 2.9201817001946786, "grad_norm": 0.18685316734046567, "learning_rate": 1.4430014430014432e-06, "loss": 0.3502, "step": 2251 }, { "epoch": 2.9214795587280986, "grad_norm": 0.18640666171276632, "learning_rate": 1.418951418951419e-06, "loss": 0.3317, "step": 2252 }, { "epoch": 2.9227774172615186, "grad_norm": 0.19155464358225485, "learning_rate": 1.3949013949013948e-06, "loss": 0.3352, "step": 2253 }, { "epoch": 2.924075275794938, "grad_norm": 0.1868855983928718, "learning_rate": 1.3708513708513709e-06, "loss": 0.328, "step": 2254 }, { "epoch": 2.925373134328358, "grad_norm": 0.18433362766279043, "learning_rate": 1.3468013468013467e-06, "loss": 0.3452, "step": 2255 }, { "epoch": 2.9266709928617782, "grad_norm": 0.19259127006608057, "learning_rate": 1.3227513227513228e-06, "loss": 0.3404, "step": 2256 }, { "epoch": 2.927968851395198, "grad_norm": 0.18080624245350022, "learning_rate": 1.2987012987012988e-06, "loss": 0.3266, "step": 2257 }, { "epoch": 2.929266709928618, "grad_norm": 0.17871124072334996, "learning_rate": 1.2746512746512749e-06, "loss": 0.3395, "step": 2258 }, { "epoch": 2.930564568462038, "grad_norm": 0.18709418548907147, "learning_rate": 1.2506012506012507e-06, "loss": 0.346, "step": 2259 }, { "epoch": 2.9318624269954574, "grad_norm": 0.18683092960850883, "learning_rate": 1.2265512265512267e-06, "loss": 0.336, "step": 2260 }, { "epoch": 2.9331602855288774, "grad_norm": 0.18777575130149565, "learning_rate": 1.2025012025012026e-06, "loss": 0.3368, "step": 2261 }, { "epoch": 2.9344581440622974, "grad_norm": 0.18324636658598714, "learning_rate": 1.1784511784511786e-06, "loss": 0.3292, "step": 2262 }, { "epoch": 2.935756002595717, "grad_norm": 0.1851227917603969, "learning_rate": 1.1544011544011545e-06, "loss": 0.3378, "step": 2263 }, { "epoch": 2.937053861129137, "grad_norm": 0.19847788748302606, "learning_rate": 1.1303511303511303e-06, "loss": 0.3754, "step": 2264 }, { "epoch": 2.938351719662557, "grad_norm": 0.177806134860686, "learning_rate": 1.1063011063011063e-06, "loss": 0.327, "step": 2265 }, { "epoch": 2.9396495781959766, "grad_norm": 0.197095005642012, "learning_rate": 1.0822510822510822e-06, "loss": 0.344, "step": 2266 }, { "epoch": 2.9409474367293966, "grad_norm": 0.18894739122645604, "learning_rate": 1.0582010582010582e-06, "loss": 0.3437, "step": 2267 }, { "epoch": 2.942245295262816, "grad_norm": 0.1763401949490533, "learning_rate": 1.034151034151034e-06, "loss": 0.3285, "step": 2268 }, { "epoch": 2.943543153796236, "grad_norm": 0.1786777654803748, "learning_rate": 1.0101010101010103e-06, "loss": 0.3264, "step": 2269 }, { "epoch": 2.9448410123296562, "grad_norm": 0.18511698934832105, "learning_rate": 9.860509860509861e-07, "loss": 0.3403, "step": 2270 }, { "epoch": 2.946138870863076, "grad_norm": 0.18872890425471747, "learning_rate": 9.620009620009622e-07, "loss": 0.3288, "step": 2271 }, { "epoch": 2.947436729396496, "grad_norm": 0.18279278752067737, "learning_rate": 9.37950937950938e-07, "loss": 0.3422, "step": 2272 }, { "epoch": 2.9487345879299154, "grad_norm": 0.18006141885171842, "learning_rate": 9.13900913900914e-07, "loss": 0.3446, "step": 2273 }, { "epoch": 2.9500324464633354, "grad_norm": 0.19260565452121156, "learning_rate": 8.898508898508899e-07, "loss": 0.3398, "step": 2274 }, { "epoch": 2.9513303049967554, "grad_norm": 0.21921401490874187, "learning_rate": 8.658008658008658e-07, "loss": 0.3568, "step": 2275 }, { "epoch": 2.952628163530175, "grad_norm": 0.18842161857636638, "learning_rate": 8.417508417508418e-07, "loss": 0.3498, "step": 2276 }, { "epoch": 2.953926022063595, "grad_norm": 0.17891619649785445, "learning_rate": 8.177008177008177e-07, "loss": 0.3272, "step": 2277 }, { "epoch": 2.955223880597015, "grad_norm": 0.17755022636746284, "learning_rate": 7.936507936507937e-07, "loss": 0.3231, "step": 2278 }, { "epoch": 2.9565217391304346, "grad_norm": 0.1863595372909174, "learning_rate": 7.696007696007697e-07, "loss": 0.3408, "step": 2279 }, { "epoch": 2.9578195976638546, "grad_norm": 0.17943740925178142, "learning_rate": 7.455507455507456e-07, "loss": 0.3222, "step": 2280 }, { "epoch": 2.9591174561972746, "grad_norm": 0.1875857112899972, "learning_rate": 7.215007215007216e-07, "loss": 0.33, "step": 2281 }, { "epoch": 2.960415314730694, "grad_norm": 0.18672707365536773, "learning_rate": 6.974506974506974e-07, "loss": 0.3377, "step": 2282 }, { "epoch": 2.961713173264114, "grad_norm": 0.18854762948864245, "learning_rate": 6.734006734006734e-07, "loss": 0.3511, "step": 2283 }, { "epoch": 2.9630110317975342, "grad_norm": 0.1846633145163194, "learning_rate": 6.493506493506494e-07, "loss": 0.3301, "step": 2284 }, { "epoch": 2.964308890330954, "grad_norm": 0.1782259793697707, "learning_rate": 6.253006253006253e-07, "loss": 0.3327, "step": 2285 }, { "epoch": 2.965606748864374, "grad_norm": 0.19406554087810526, "learning_rate": 6.012506012506013e-07, "loss": 0.3374, "step": 2286 }, { "epoch": 2.966904607397794, "grad_norm": 0.18974085077422986, "learning_rate": 5.772005772005772e-07, "loss": 0.3193, "step": 2287 }, { "epoch": 2.9682024659312134, "grad_norm": 0.19068994562305627, "learning_rate": 5.531505531505532e-07, "loss": 0.3387, "step": 2288 }, { "epoch": 2.9695003244646334, "grad_norm": 0.17821215477258306, "learning_rate": 5.291005291005291e-07, "loss": 0.3328, "step": 2289 }, { "epoch": 2.9707981829980534, "grad_norm": 0.18413236462451124, "learning_rate": 5.050505050505052e-07, "loss": 0.3362, "step": 2290 }, { "epoch": 2.972096041531473, "grad_norm": 0.18085396718066815, "learning_rate": 4.810004810004811e-07, "loss": 0.3302, "step": 2291 }, { "epoch": 2.973393900064893, "grad_norm": 0.18231587065998014, "learning_rate": 4.56950456950457e-07, "loss": 0.3338, "step": 2292 }, { "epoch": 2.974691758598313, "grad_norm": 0.18433591352078926, "learning_rate": 4.329004329004329e-07, "loss": 0.3373, "step": 2293 }, { "epoch": 2.9759896171317326, "grad_norm": 0.17897899773682865, "learning_rate": 4.0885040885040886e-07, "loss": 0.3303, "step": 2294 }, { "epoch": 2.9772874756651526, "grad_norm": 0.18401504997308174, "learning_rate": 3.8480038480038485e-07, "loss": 0.3295, "step": 2295 }, { "epoch": 2.9785853341985726, "grad_norm": 0.18601416230069387, "learning_rate": 3.607503607503608e-07, "loss": 0.3384, "step": 2296 }, { "epoch": 2.979883192731992, "grad_norm": 0.18294283749703283, "learning_rate": 3.367003367003367e-07, "loss": 0.3414, "step": 2297 }, { "epoch": 2.981181051265412, "grad_norm": 0.18248918456066632, "learning_rate": 3.1265031265031267e-07, "loss": 0.3522, "step": 2298 }, { "epoch": 2.982478909798832, "grad_norm": 0.18259396477447506, "learning_rate": 2.886002886002886e-07, "loss": 0.3194, "step": 2299 }, { "epoch": 2.983776768332252, "grad_norm": 0.1885739190189894, "learning_rate": 2.6455026455026455e-07, "loss": 0.3407, "step": 2300 }, { "epoch": 2.9850746268656714, "grad_norm": 0.18564276265962515, "learning_rate": 2.4050024050024055e-07, "loss": 0.3431, "step": 2301 }, { "epoch": 2.9863724853990914, "grad_norm": 0.17794361853046303, "learning_rate": 2.1645021645021646e-07, "loss": 0.3228, "step": 2302 }, { "epoch": 2.9876703439325114, "grad_norm": 0.18130600626680995, "learning_rate": 1.9240019240019243e-07, "loss": 0.3336, "step": 2303 }, { "epoch": 2.988968202465931, "grad_norm": 0.18435788490293198, "learning_rate": 1.6835016835016834e-07, "loss": 0.3308, "step": 2304 }, { "epoch": 2.990266060999351, "grad_norm": 0.18127043211913135, "learning_rate": 1.443001443001443e-07, "loss": 0.3449, "step": 2305 }, { "epoch": 2.991563919532771, "grad_norm": 0.1799885551759602, "learning_rate": 1.2025012025012027e-07, "loss": 0.3267, "step": 2306 }, { "epoch": 2.9928617780661906, "grad_norm": 0.18020390891152432, "learning_rate": 9.620009620009621e-08, "loss": 0.3376, "step": 2307 }, { "epoch": 2.9941596365996106, "grad_norm": 0.18299772015112434, "learning_rate": 7.215007215007215e-08, "loss": 0.3404, "step": 2308 }, { "epoch": 2.9954574951330306, "grad_norm": 0.17991344814248053, "learning_rate": 4.8100048100048107e-08, "loss": 0.3293, "step": 2309 }, { "epoch": 2.99675535366645, "grad_norm": 0.18545369820383786, "learning_rate": 2.4050024050024053e-08, "loss": 0.3376, "step": 2310 }, { "epoch": 2.99675535366645, "step": 2310, "total_flos": 2.5679880641918796e+19, "train_loss": 0.5017085661361744, "train_runtime": 66070.1568, "train_samples_per_second": 0.56, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 2310, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5679880641918796e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }