{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6805035726437564, "eval_steps": 1000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "crossentropy": 11.309896469116211, "epoch": 8.506294658046955e-05, "grad_norm": 18.974912643432617, "learning_rate": 6.25e-05, "loss": 11.3099, "step": 1 }, { "crossentropy": 11.142946243286133, "epoch": 0.0001701258931609391, "grad_norm": 17.902788162231445, "learning_rate": 0.000125, "loss": 11.1429, "step": 2 }, { "crossentropy": 10.845649719238281, "epoch": 0.0002551888397414086, "grad_norm": 16.72595977783203, "learning_rate": 0.0001875, "loss": 10.8456, "step": 3 }, { "crossentropy": 11.160599708557129, "epoch": 0.0003402517863218782, "grad_norm": 19.85663414001465, "learning_rate": 0.00025, "loss": 11.1606, "step": 4 }, { "crossentropy": 10.84029769897461, "epoch": 0.00042531473290234774, "grad_norm": 15.51010799407959, "learning_rate": 0.0003125, "loss": 10.8403, "step": 5 }, { "crossentropy": 10.718737602233887, "epoch": 0.0005103776794828172, "grad_norm": 13.594999313354492, "learning_rate": 0.000375, "loss": 10.7187, "step": 6 }, { "crossentropy": 10.684952735900879, "epoch": 0.0005954406260632869, "grad_norm": 13.211255073547363, "learning_rate": 0.00043749999999999995, "loss": 10.685, "step": 7 }, { "crossentropy": 10.314162254333496, "epoch": 0.0006805035726437564, "grad_norm": 9.28316879272461, "learning_rate": 0.0005, "loss": 10.3142, "step": 8 }, { "crossentropy": 9.983610153198242, "epoch": 0.000765566519224226, "grad_norm": 5.95074987411499, "learning_rate": 0.0005625000000000001, "loss": 9.9836, "step": 9 }, { "crossentropy": 9.924571990966797, "epoch": 0.0008506294658046955, "grad_norm": 5.6699700355529785, "learning_rate": 0.000625, "loss": 9.9246, "step": 10 }, { "crossentropy": 9.79047679901123, "epoch": 0.000935692412385165, "grad_norm": 5.157408237457275, "learning_rate": 0.0006875000000000001, "loss": 9.7905, "step": 11 }, { "crossentropy": 9.82883071899414, "epoch": 0.0010207553589656345, "grad_norm": 3.55379056930542, "learning_rate": 0.00075, "loss": 9.8288, "step": 12 }, { "crossentropy": 9.727521896362305, "epoch": 0.0011058183055461042, "grad_norm": 4.279326438903809, "learning_rate": 0.0008125000000000001, "loss": 9.7275, "step": 13 }, { "crossentropy": 9.527801513671875, "epoch": 0.0011908812521265737, "grad_norm": 3.041395902633667, "learning_rate": 0.0008749999999999999, "loss": 9.5278, "step": 14 }, { "crossentropy": 9.41383171081543, "epoch": 0.0012759441987070432, "grad_norm": 2.9315736293792725, "learning_rate": 0.0009375, "loss": 9.4138, "step": 15 }, { "crossentropy": 9.280156135559082, "epoch": 0.0013610071452875127, "grad_norm": 2.897942543029785, "grad_norm_var": 41.40489251350094, "learning_rate": 0.001, "loss": 9.2802, "step": 16 }, { "crossentropy": 9.173386573791504, "epoch": 0.0014460700918679822, "grad_norm": 2.652008533477783, "grad_norm_var": 38.3260067150703, "learning_rate": 0.0010625, "loss": 9.1734, "step": 17 }, { "crossentropy": 8.977143287658691, "epoch": 0.001531133038448452, "grad_norm": 2.85750150680542, "grad_norm_var": 34.390926827206826, "learning_rate": 0.0011250000000000001, "loss": 8.9771, "step": 18 }, { "crossentropy": 9.044631004333496, "epoch": 0.0016161959850289215, "grad_norm": 2.647768259048462, "grad_norm_var": 30.30178380168377, "learning_rate": 0.0011875, "loss": 9.0446, "step": 19 }, { "crossentropy": 9.014820098876953, "epoch": 0.001701258931609391, "grad_norm": 2.1930229663848877, "grad_norm_var": 19.683971983228396, "learning_rate": 0.00125, "loss": 9.0148, "step": 20 }, { "crossentropy": 8.815771102905273, "epoch": 0.0017863218781898605, "grad_norm": 2.1837751865386963, "grad_norm_var": 13.82235760319115, "learning_rate": 0.0013125, "loss": 8.8158, "step": 21 }, { "crossentropy": 8.811899185180664, "epoch": 0.00187138482477033, "grad_norm": 1.9939371347427368, "grad_norm_var": 9.14264710632777, "learning_rate": 0.0013750000000000001, "loss": 8.8119, "step": 22 }, { "crossentropy": 8.60848617553711, "epoch": 0.0019564477713507997, "grad_norm": 1.8629437685012817, "grad_norm_var": 3.869167065478945, "learning_rate": 0.0014375, "loss": 8.6085, "step": 23 }, { "crossentropy": 8.543891906738281, "epoch": 0.002041510717931269, "grad_norm": 1.5073816776275635, "grad_norm_var": 1.8567924566902014, "learning_rate": 0.0015, "loss": 8.5439, "step": 24 }, { "crossentropy": 8.424093246459961, "epoch": 0.0021265736645117387, "grad_norm": 1.4603967666625977, "grad_norm_var": 1.4768392296806987, "learning_rate": 0.0015625, "loss": 8.4241, "step": 25 }, { "crossentropy": 8.605175018310547, "epoch": 0.0022116366110922084, "grad_norm": 1.239119052886963, "grad_norm_var": 1.0855214234333324, "learning_rate": 0.0016250000000000001, "loss": 8.6052, "step": 26 }, { "crossentropy": 8.274113655090332, "epoch": 0.0022966995576726777, "grad_norm": 0.9894035458564758, "grad_norm_var": 0.7798953785683899, "learning_rate": 0.0016875000000000002, "loss": 8.2741, "step": 27 }, { "crossentropy": 8.239933013916016, "epoch": 0.0023817625042531474, "grad_norm": 0.931402325630188, "grad_norm_var": 0.8039022546582192, "learning_rate": 0.0017499999999999998, "loss": 8.2399, "step": 28 }, { "crossentropy": 8.093734741210938, "epoch": 0.0024668254508336167, "grad_norm": 0.8903639316558838, "grad_norm_var": 0.5953933716226523, "learning_rate": 0.0018124999999999999, "loss": 8.0937, "step": 29 }, { "crossentropy": 8.059680938720703, "epoch": 0.0025518883974140864, "grad_norm": 0.8093271851539612, "grad_norm_var": 0.6020545653558498, "learning_rate": 0.001875, "loss": 8.0597, "step": 30 }, { "crossentropy": 7.993825435638428, "epoch": 0.002636951343994556, "grad_norm": 0.6597331166267395, "grad_norm_var": 0.6054906065439066, "learning_rate": 0.0019375000000000002, "loss": 7.9938, "step": 31 }, { "crossentropy": 7.935140132904053, "epoch": 0.0027220142905750254, "grad_norm": 0.5547610521316528, "grad_norm_var": 0.5856283941988302, "learning_rate": 0.002, "loss": 7.9351, "step": 32 }, { "crossentropy": 7.898117542266846, "epoch": 0.002807077237155495, "grad_norm": 0.5901926755905151, "grad_norm_var": 0.5592427938622041, "learning_rate": 0.0020625, "loss": 7.8981, "step": 33 }, { "crossentropy": 7.876916408538818, "epoch": 0.0028921401837359645, "grad_norm": 0.5221530199050903, "grad_norm_var": 0.4651695017220639, "learning_rate": 0.002125, "loss": 7.8769, "step": 34 }, { "crossentropy": 7.952791690826416, "epoch": 0.002977203130316434, "grad_norm": 0.47528332471847534, "grad_norm_var": 0.3740161349433449, "learning_rate": 0.0021875, "loss": 7.9528, "step": 35 }, { "crossentropy": 7.869116306304932, "epoch": 0.003062266076896904, "grad_norm": 0.4856501519680023, "grad_norm_var": 0.325357793603863, "learning_rate": 0.0022500000000000003, "loss": 7.8691, "step": 36 }, { "crossentropy": 7.877625465393066, "epoch": 0.003147329023477373, "grad_norm": 0.4792061150074005, "grad_norm_var": 0.25432966416154995, "learning_rate": 0.0023125000000000003, "loss": 7.8776, "step": 37 }, { "crossentropy": 7.706593990325928, "epoch": 0.003232391970057843, "grad_norm": 0.5315346717834473, "grad_norm_var": 0.18750127365180594, "learning_rate": 0.002375, "loss": 7.7066, "step": 38 }, { "crossentropy": 7.7198076248168945, "epoch": 0.003317454916638312, "grad_norm": 0.4393047094345093, "grad_norm_var": 0.12651073783459457, "learning_rate": 0.0024375, "loss": 7.7198, "step": 39 }, { "crossentropy": 7.777041912078857, "epoch": 0.003402517863218782, "grad_norm": 0.42561084032058716, "grad_norm_var": 0.09550346939757308, "learning_rate": 0.0025, "loss": 7.777, "step": 40 }, { "crossentropy": 7.754281520843506, "epoch": 0.0034875808097992516, "grad_norm": 0.35795411467552185, "grad_norm_var": 0.06229618893163468, "learning_rate": 0.0025624999999999997, "loss": 7.7543, "step": 41 }, { "crossentropy": 7.802982807159424, "epoch": 0.003572643756379721, "grad_norm": 0.3135263919830322, "grad_norm_var": 0.042990176778044224, "learning_rate": 0.002625, "loss": 7.803, "step": 42 }, { "crossentropy": 7.7398247718811035, "epoch": 0.0036577067029601906, "grad_norm": 0.3644159734249115, "grad_norm_var": 0.03420054547204523, "learning_rate": 0.0026875, "loss": 7.7398, "step": 43 }, { "crossentropy": 7.641439914703369, "epoch": 0.00374276964954066, "grad_norm": 0.30564576387405396, "grad_norm_var": 0.027010376278331923, "learning_rate": 0.0027500000000000003, "loss": 7.6414, "step": 44 }, { "crossentropy": 7.619621276855469, "epoch": 0.0038278325961211297, "grad_norm": 0.3500343859195709, "grad_norm_var": 0.0180558176753346, "learning_rate": 0.0028125, "loss": 7.6196, "step": 45 }, { "crossentropy": 7.564472198486328, "epoch": 0.003912895542701599, "grad_norm": 0.29853832721710205, "grad_norm_var": 0.011866823044508002, "learning_rate": 0.002875, "loss": 7.5645, "step": 46 }, { "crossentropy": 7.703369140625, "epoch": 0.003997958489282069, "grad_norm": 0.2995685935020447, "grad_norm_var": 0.009763008097976533, "learning_rate": 0.0029375, "loss": 7.7034, "step": 47 }, { "crossentropy": 7.6040568351745605, "epoch": 0.004083021435862538, "grad_norm": 0.3057446777820587, "grad_norm_var": 0.009316492863184677, "learning_rate": 0.003, "loss": 7.6041, "step": 48 }, { "crossentropy": 7.565240383148193, "epoch": 0.004168084382443008, "grad_norm": 0.28561165928840637, "grad_norm_var": 0.0077571359061762045, "learning_rate": 0.0030625, "loss": 7.5652, "step": 49 }, { "crossentropy": 7.631986141204834, "epoch": 0.004253147329023477, "grad_norm": 0.27788108587265015, "grad_norm_var": 0.007181822387785645, "learning_rate": 0.003125, "loss": 7.632, "step": 50 }, { "crossentropy": 7.62631368637085, "epoch": 0.004338210275603947, "grad_norm": 0.2542114853858948, "grad_norm_var": 0.007272123576704485, "learning_rate": 0.0031875, "loss": 7.6263, "step": 51 }, { "crossentropy": 7.516110897064209, "epoch": 0.004423273222184417, "grad_norm": 0.3250061273574829, "grad_norm_var": 0.006213033266452387, "learning_rate": 0.0032500000000000003, "loss": 7.5161, "step": 52 }, { "crossentropy": 7.551267147064209, "epoch": 0.004508336168764886, "grad_norm": 0.27453434467315674, "grad_norm_var": 0.005328740969118443, "learning_rate": 0.0033125, "loss": 7.5513, "step": 53 }, { "crossentropy": 7.584129810333252, "epoch": 0.004593399115345355, "grad_norm": 0.26441776752471924, "grad_norm_var": 0.0028978551255909126, "learning_rate": 0.0033750000000000004, "loss": 7.5841, "step": 54 }, { "crossentropy": 7.555300712585449, "epoch": 0.004678462061925825, "grad_norm": 0.31277361512184143, "grad_norm_var": 0.0019089240532428775, "learning_rate": 0.0034375, "loss": 7.5553, "step": 55 }, { "crossentropy": 7.565512180328369, "epoch": 0.004763525008506295, "grad_norm": 0.26766854524612427, "grad_norm_var": 0.0011064046710686762, "learning_rate": 0.0034999999999999996, "loss": 7.5655, "step": 56 }, { "crossentropy": 7.577280521392822, "epoch": 0.004848587955086765, "grad_norm": 0.2576090097427368, "grad_norm_var": 0.0010084472035153428, "learning_rate": 0.0035625, "loss": 7.5773, "step": 57 }, { "crossentropy": 7.520791530609131, "epoch": 0.004933650901667233, "grad_norm": 0.2548196613788605, "grad_norm_var": 0.0010970287921503155, "learning_rate": 0.0036249999999999998, "loss": 7.5208, "step": 58 }, { "crossentropy": 7.514278411865234, "epoch": 0.005018713848247703, "grad_norm": 0.30607619881629944, "grad_norm_var": 0.0007593259722254923, "learning_rate": 0.0036875000000000002, "loss": 7.5143, "step": 59 }, { "crossentropy": 7.490321636199951, "epoch": 0.005103776794828173, "grad_norm": 0.2747018337249756, "grad_norm_var": 0.0007546556313660672, "learning_rate": 0.00375, "loss": 7.4903, "step": 60 }, { "crossentropy": 7.504281044006348, "epoch": 0.005188839741408643, "grad_norm": 0.27712082862854004, "grad_norm_var": 0.0004845709480581643, "learning_rate": 0.0038125, "loss": 7.5043, "step": 61 }, { "crossentropy": 7.443078994750977, "epoch": 0.005273902687989112, "grad_norm": 0.24445556104183197, "grad_norm_var": 0.0005590660248915145, "learning_rate": 0.0038750000000000004, "loss": 7.4431, "step": 62 }, { "crossentropy": 7.412005424499512, "epoch": 0.005358965634569581, "grad_norm": 0.28495848178863525, "grad_norm_var": 0.0005345550467412761, "learning_rate": 0.0039375, "loss": 7.412, "step": 63 }, { "crossentropy": 7.385380744934082, "epoch": 0.005444028581150051, "grad_norm": 0.24084094166755676, "grad_norm_var": 0.0005683342285025159, "learning_rate": 0.004, "loss": 7.3854, "step": 64 }, { "crossentropy": 7.411404609680176, "epoch": 0.005529091527730521, "grad_norm": 0.313495934009552, "grad_norm_var": 0.0006557587404904188, "learning_rate": 0.0040625, "loss": 7.4114, "step": 65 }, { "crossentropy": 7.306937217712402, "epoch": 0.00561415447431099, "grad_norm": 0.2802676558494568, "grad_norm_var": 0.0006564235041801434, "learning_rate": 0.004125, "loss": 7.3069, "step": 66 }, { "crossentropy": 7.328421115875244, "epoch": 0.00569921742089146, "grad_norm": 0.31991204619407654, "grad_norm_var": 0.0007260551377454692, "learning_rate": 0.0041875, "loss": 7.3284, "step": 67 }, { "crossentropy": 7.320525646209717, "epoch": 0.005784280367471929, "grad_norm": 0.2681311070919037, "grad_norm_var": 0.0005957749257016855, "learning_rate": 0.00425, "loss": 7.3205, "step": 68 }, { "crossentropy": 7.274356842041016, "epoch": 0.005869343314052399, "grad_norm": 0.2656392753124237, "grad_norm_var": 0.0006043695669067326, "learning_rate": 0.0043125, "loss": 7.2744, "step": 69 }, { "crossentropy": 7.19435453414917, "epoch": 0.005954406260632868, "grad_norm": 0.2741522192955017, "grad_norm_var": 0.0005938891483218199, "learning_rate": 0.004375, "loss": 7.1944, "step": 70 }, { "crossentropy": 7.194360256195068, "epoch": 0.006039469207213338, "grad_norm": 0.28835028409957886, "grad_norm_var": 0.000516837620823558, "learning_rate": 0.0044375, "loss": 7.1944, "step": 71 }, { "crossentropy": 7.233011722564697, "epoch": 0.006124532153793808, "grad_norm": 0.43178847432136536, "grad_norm_var": 0.002014974401098301, "learning_rate": 0.0045000000000000005, "loss": 7.233, "step": 72 }, { "crossentropy": 7.183064937591553, "epoch": 0.006209595100374277, "grad_norm": 0.4894546568393707, "grad_norm_var": 0.00448464639179468, "learning_rate": 0.0045625, "loss": 7.1831, "step": 73 }, { "crossentropy": 7.119131565093994, "epoch": 0.006294658046954746, "grad_norm": 0.4337690472602844, "grad_norm_var": 0.005386953658534024, "learning_rate": 0.004625000000000001, "loss": 7.1191, "step": 74 }, { "crossentropy": 7.114971160888672, "epoch": 0.006379720993535216, "grad_norm": 0.3881780207157135, "grad_norm_var": 0.0057426381129141914, "learning_rate": 0.0046875, "loss": 7.115, "step": 75 }, { "crossentropy": 7.138244152069092, "epoch": 0.006464783940115686, "grad_norm": 0.4313463568687439, "grad_norm_var": 0.006388596912491317, "learning_rate": 0.00475, "loss": 7.1382, "step": 76 }, { "crossentropy": 7.056614875793457, "epoch": 0.0065498468866961555, "grad_norm": 0.5524854063987732, "grad_norm_var": 0.00929669169954923, "learning_rate": 0.0048125, "loss": 7.0566, "step": 77 }, { "crossentropy": 6.966063499450684, "epoch": 0.006634909833276624, "grad_norm": 0.8406451344490051, "grad_norm_var": 0.023582811361168684, "learning_rate": 0.004875, "loss": 6.9661, "step": 78 }, { "crossentropy": 7.061474800109863, "epoch": 0.006719972779857094, "grad_norm": 0.7252867221832275, "grad_norm_var": 0.030035023516989497, "learning_rate": 0.0049375, "loss": 7.0615, "step": 79 }, { "crossentropy": 6.991960525512695, "epoch": 0.006805035726437564, "grad_norm": 0.6852894425392151, "grad_norm_var": 0.032416806516942606, "learning_rate": 0.005, "loss": 6.992, "step": 80 }, { "crossentropy": 6.947705268859863, "epoch": 0.0068900986730180335, "grad_norm": 0.6264182925224304, "grad_norm_var": 0.0333938044893615, "learning_rate": 0.0050625, "loss": 6.9477, "step": 81 }, { "crossentropy": 6.975273132324219, "epoch": 0.006975161619598503, "grad_norm": 0.5516034364700317, "grad_norm_var": 0.03162602181982486, "learning_rate": 0.005124999999999999, "loss": 6.9753, "step": 82 }, { "crossentropy": 6.895355701446533, "epoch": 0.007060224566178972, "grad_norm": 0.4920431673526764, "grad_norm_var": 0.029957965431353494, "learning_rate": 0.0051875, "loss": 6.8954, "step": 83 }, { "crossentropy": 6.926131725311279, "epoch": 0.007145287512759442, "grad_norm": 0.5053110718727112, "grad_norm_var": 0.02664607612382147, "learning_rate": 0.00525, "loss": 6.9261, "step": 84 }, { "crossentropy": 6.787765026092529, "epoch": 0.0072303504593399116, "grad_norm": 0.5696502923965454, "grad_norm_var": 0.022968936263326713, "learning_rate": 0.0053125, "loss": 6.7878, "step": 85 }, { "crossentropy": 6.724362373352051, "epoch": 0.007315413405920381, "grad_norm": 0.645754337310791, "grad_norm_var": 0.019524430407763305, "learning_rate": 0.005375, "loss": 6.7244, "step": 86 }, { "crossentropy": 6.736844539642334, "epoch": 0.007400476352500851, "grad_norm": 0.5248293876647949, "grad_norm_var": 0.015050686562859506, "learning_rate": 0.0054375, "loss": 6.7368, "step": 87 }, { "crossentropy": 6.771926403045654, "epoch": 0.00748553929908132, "grad_norm": 0.5884330868721008, "grad_norm_var": 0.013992809279913691, "learning_rate": 0.0055000000000000005, "loss": 6.7719, "step": 88 }, { "crossentropy": 6.7114434242248535, "epoch": 0.00757060224566179, "grad_norm": 0.42808547616004944, "grad_norm_var": 0.014851718118260046, "learning_rate": 0.005562500000000001, "loss": 6.7114, "step": 89 }, { "crossentropy": 6.748940944671631, "epoch": 0.007655665192242259, "grad_norm": 0.31629714369773865, "grad_norm_var": 0.017719856511875718, "learning_rate": 0.005625, "loss": 6.7489, "step": 90 }, { "crossentropy": 6.607691287994385, "epoch": 0.007740728138822729, "grad_norm": 0.34043949842453003, "grad_norm_var": 0.01892081744559789, "learning_rate": 0.0056875, "loss": 6.6077, "step": 91 }, { "crossentropy": 6.689177513122559, "epoch": 0.007825791085403199, "grad_norm": 0.377320796251297, "grad_norm_var": 0.01996871894117239, "learning_rate": 0.00575, "loss": 6.6892, "step": 92 }, { "crossentropy": 6.527837753295898, "epoch": 0.007910854031983668, "grad_norm": 0.38037019968032837, "grad_norm_var": 0.0217199771716522, "learning_rate": 0.005812500000000001, "loss": 6.5278, "step": 93 }, { "crossentropy": 6.586463451385498, "epoch": 0.007995916978564138, "grad_norm": 0.40682509541511536, "grad_norm_var": 0.015939707012279043, "learning_rate": 0.005875, "loss": 6.5865, "step": 94 }, { "crossentropy": 6.577503681182861, "epoch": 0.008080979925144607, "grad_norm": 0.41826921701431274, "grad_norm_var": 0.013028160692259254, "learning_rate": 0.0059375, "loss": 6.5775, "step": 95 }, { "crossentropy": 6.75349235534668, "epoch": 0.008166042871725076, "grad_norm": 0.2977065443992615, "grad_norm_var": 0.01237954264066297, "learning_rate": 0.006, "loss": 6.7535, "step": 96 }, { "crossentropy": 6.5777974128723145, "epoch": 0.008251105818305546, "grad_norm": 0.27614033222198486, "grad_norm_var": 0.012594814909013681, "learning_rate": 0.006062499999999999, "loss": 6.5778, "step": 97 }, { "crossentropy": 6.519662380218506, "epoch": 0.008336168764886015, "grad_norm": 0.2539125978946686, "grad_norm_var": 0.013899954529264728, "learning_rate": 0.006125, "loss": 6.5197, "step": 98 }, { "crossentropy": 6.48069429397583, "epoch": 0.008421231711466486, "grad_norm": 0.36278197169303894, "grad_norm_var": 0.013811794660797464, "learning_rate": 0.0061875, "loss": 6.4807, "step": 99 }, { "crossentropy": 6.523020267486572, "epoch": 0.008506294658046955, "grad_norm": 0.3279649317264557, "grad_norm_var": 0.013719051423065691, "learning_rate": 0.00625, "loss": 6.523, "step": 100 }, { "crossentropy": 6.498292922973633, "epoch": 0.008591357604627424, "grad_norm": 0.5352258682250977, "grad_norm_var": 0.013047361889885064, "learning_rate": 0.0063124999999999995, "loss": 6.4983, "step": 101 }, { "crossentropy": 6.436935901641846, "epoch": 0.008676420551207894, "grad_norm": 0.44642096757888794, "grad_norm_var": 0.009132599624153578, "learning_rate": 0.006375, "loss": 6.4369, "step": 102 }, { "crossentropy": 6.35798454284668, "epoch": 0.008761483497788363, "grad_norm": 0.33586880564689636, "grad_norm_var": 0.008031836959865032, "learning_rate": 0.0064375000000000005, "loss": 6.358, "step": 103 }, { "crossentropy": 6.283843517303467, "epoch": 0.008846546444368834, "grad_norm": 0.43269214034080505, "grad_norm_var": 0.005235236032229636, "learning_rate": 0.006500000000000001, "loss": 6.2838, "step": 104 }, { "crossentropy": 6.404956340789795, "epoch": 0.008931609390949303, "grad_norm": 0.47406354546546936, "grad_norm_var": 0.005717194075354555, "learning_rate": 0.0065625, "loss": 6.405, "step": 105 }, { "crossentropy": 6.324512481689453, "epoch": 0.009016672337529771, "grad_norm": 0.3121030628681183, "grad_norm_var": 0.005750502100924666, "learning_rate": 0.006625, "loss": 6.3245, "step": 106 }, { "crossentropy": 6.285346984863281, "epoch": 0.009101735284110242, "grad_norm": 0.24542127549648285, "grad_norm_var": 0.0067352949332925155, "learning_rate": 0.0066875, "loss": 6.2853, "step": 107 }, { "crossentropy": 6.25586462020874, "epoch": 0.00918679823069071, "grad_norm": 0.3192988932132721, "grad_norm_var": 0.006871220372202638, "learning_rate": 0.006750000000000001, "loss": 6.2559, "step": 108 }, { "crossentropy": 6.30493688583374, "epoch": 0.009271861177271181, "grad_norm": 0.262833833694458, "grad_norm_var": 0.007479142942736253, "learning_rate": 0.0068125, "loss": 6.3049, "step": 109 }, { "crossentropy": 6.2640509605407715, "epoch": 0.00935692412385165, "grad_norm": 0.27229687571525574, "grad_norm_var": 0.007711528339213861, "learning_rate": 0.006875, "loss": 6.2641, "step": 110 }, { "crossentropy": 6.289525985717773, "epoch": 0.009441987070432119, "grad_norm": 0.3042278587818146, "grad_norm_var": 0.007460640751159167, "learning_rate": 0.0069375, "loss": 6.2895, "step": 111 }, { "crossentropy": 6.188907623291016, "epoch": 0.00952705001701259, "grad_norm": 0.2872515022754669, "grad_norm_var": 0.007528081663446384, "learning_rate": 0.006999999999999999, "loss": 6.1889, "step": 112 }, { "crossentropy": 6.163388252258301, "epoch": 0.009612112963593059, "grad_norm": 0.29698604345321655, "grad_norm_var": 0.007376269937595147, "learning_rate": 0.0070625, "loss": 6.1634, "step": 113 }, { "crossentropy": 6.185665607452393, "epoch": 0.00969717591017353, "grad_norm": 0.31410831212997437, "grad_norm_var": 0.006897071545086171, "learning_rate": 0.007125, "loss": 6.1857, "step": 114 }, { "crossentropy": 6.133439540863037, "epoch": 0.009782238856753998, "grad_norm": 0.3000437617301941, "grad_norm_var": 0.006999319621642071, "learning_rate": 0.0071875, "loss": 6.1334, "step": 115 }, { "crossentropy": 6.099243640899658, "epoch": 0.009867301803334467, "grad_norm": 0.28948813676834106, "grad_norm_var": 0.007162186999458889, "learning_rate": 0.0072499999999999995, "loss": 6.0992, "step": 116 }, { "crossentropy": 5.968860626220703, "epoch": 0.009952364749914937, "grad_norm": 0.22912771999835968, "grad_norm_var": 0.005020655746346862, "learning_rate": 0.0073124999999999996, "loss": 5.9689, "step": 117 }, { "crossentropy": 5.994052886962891, "epoch": 0.010037427696495406, "grad_norm": 0.3315988779067993, "grad_norm_var": 0.0039113432011718356, "learning_rate": 0.0073750000000000005, "loss": 5.9941, "step": 118 }, { "crossentropy": 5.999678134918213, "epoch": 0.010122490643075877, "grad_norm": 0.31148216128349304, "grad_norm_var": 0.0038740335837602834, "learning_rate": 0.0074375000000000005, "loss": 5.9997, "step": 119 }, { "crossentropy": 5.952829837799072, "epoch": 0.010207553589656346, "grad_norm": 0.26099321246147156, "grad_norm_var": 0.002940694973577873, "learning_rate": 0.0075, "loss": 5.9528, "step": 120 }, { "crossentropy": 5.962660789489746, "epoch": 0.010292616536236815, "grad_norm": 0.23667429387569427, "grad_norm_var": 0.0009757603056199133, "learning_rate": 0.0075625, "loss": 5.9627, "step": 121 }, { "crossentropy": 5.9423747062683105, "epoch": 0.010377679482817285, "grad_norm": 0.2990190088748932, "grad_norm_var": 0.0009406969185574183, "learning_rate": 0.007625, "loss": 5.9424, "step": 122 }, { "crossentropy": 5.875547409057617, "epoch": 0.010462742429397754, "grad_norm": 0.29126814007759094, "grad_norm_var": 0.000829801041900513, "learning_rate": 0.007687500000000001, "loss": 5.8755, "step": 123 }, { "crossentropy": 5.838551998138428, "epoch": 0.010547805375978225, "grad_norm": 0.255105584859848, "grad_norm_var": 0.0008187630846316462, "learning_rate": 0.007750000000000001, "loss": 5.8386, "step": 124 }, { "crossentropy": 5.766818046569824, "epoch": 0.010632868322558693, "grad_norm": 0.23496288061141968, "grad_norm_var": 0.0009456214745828657, "learning_rate": 0.0078125, "loss": 5.7668, "step": 125 }, { "crossentropy": 5.681275844573975, "epoch": 0.010717931269139162, "grad_norm": 0.25278884172439575, "grad_norm_var": 0.0009950734652114416, "learning_rate": 0.007875, "loss": 5.6813, "step": 126 }, { "crossentropy": 5.7617645263671875, "epoch": 0.010802994215719633, "grad_norm": 0.320692241191864, "grad_norm_var": 0.0010631265575339259, "learning_rate": 0.0079375, "loss": 5.7618, "step": 127 }, { "crossentropy": 5.767834663391113, "epoch": 0.010888057162300102, "grad_norm": 0.38363513350486755, "grad_norm_var": 0.0017115559114857515, "learning_rate": 0.008, "loss": 5.7678, "step": 128 }, { "crossentropy": 5.657352924346924, "epoch": 0.010973120108880572, "grad_norm": 0.29541271924972534, "grad_norm_var": 0.001709825223221406, "learning_rate": 0.0080625, "loss": 5.6574, "step": 129 }, { "crossentropy": 5.558316707611084, "epoch": 0.011058183055461041, "grad_norm": 0.24206119775772095, "grad_norm_var": 0.0017824855725882957, "learning_rate": 0.008125, "loss": 5.5583, "step": 130 }, { "crossentropy": 5.54618501663208, "epoch": 0.01114324600204151, "grad_norm": 0.26013559103012085, "grad_norm_var": 0.0017934486811256682, "learning_rate": 0.0081875, "loss": 5.5462, "step": 131 }, { "crossentropy": 5.549060821533203, "epoch": 0.01122830894862198, "grad_norm": 0.2480442076921463, "grad_norm_var": 0.001853357614672697, "learning_rate": 0.00825, "loss": 5.5491, "step": 132 }, { "crossentropy": 5.536707401275635, "epoch": 0.01131337189520245, "grad_norm": 0.26030927896499634, "grad_norm_var": 0.0017096374959784703, "learning_rate": 0.0083125, "loss": 5.5367, "step": 133 }, { "crossentropy": 5.438579082489014, "epoch": 0.01139843484178292, "grad_norm": 0.22871343791484833, "grad_norm_var": 0.0016669761550276432, "learning_rate": 0.008375, "loss": 5.4386, "step": 134 }, { "crossentropy": 5.44108247756958, "epoch": 0.011483497788363389, "grad_norm": 0.24029290676116943, "grad_norm_var": 0.0016263405926016559, "learning_rate": 0.0084375, "loss": 5.4411, "step": 135 }, { "crossentropy": 5.454643249511719, "epoch": 0.011568560734943858, "grad_norm": 0.3056889772415161, "grad_norm_var": 0.0017012063556831076, "learning_rate": 0.0085, "loss": 5.4546, "step": 136 }, { "crossentropy": 5.331473350524902, "epoch": 0.011653623681524328, "grad_norm": 0.23304922878742218, "grad_norm_var": 0.0017191867911240306, "learning_rate": 0.008562499999999999, "loss": 5.3315, "step": 137 }, { "crossentropy": 5.177455902099609, "epoch": 0.011738686628104797, "grad_norm": 0.24541275203227997, "grad_norm_var": 0.0017053037357671942, "learning_rate": 0.008625, "loss": 5.1775, "step": 138 }, { "crossentropy": 5.26737642288208, "epoch": 0.011823749574685268, "grad_norm": 0.28021836280822754, "grad_norm_var": 0.0016795353059726756, "learning_rate": 0.0086875, "loss": 5.2674, "step": 139 }, { "crossentropy": 5.236880302429199, "epoch": 0.011908812521265737, "grad_norm": 0.2545812427997589, "grad_norm_var": 0.0016804475149961196, "learning_rate": 0.00875, "loss": 5.2369, "step": 140 }, { "crossentropy": 5.189611911773682, "epoch": 0.011993875467846206, "grad_norm": 0.23174633085727692, "grad_norm_var": 0.0016952092544838897, "learning_rate": 0.0088125, "loss": 5.1896, "step": 141 }, { "crossentropy": 5.174455165863037, "epoch": 0.012078938414426676, "grad_norm": 0.24460504949092865, "grad_norm_var": 0.0017156373246144606, "learning_rate": 0.008875, "loss": 5.1745, "step": 142 }, { "crossentropy": 5.0488457679748535, "epoch": 0.012164001361007145, "grad_norm": 0.18707337975502014, "grad_norm_var": 0.001877833095825828, "learning_rate": 0.008937500000000001, "loss": 5.0488, "step": 143 }, { "crossentropy": 5.016098976135254, "epoch": 0.012249064307587616, "grad_norm": 0.20294660329818726, "grad_norm_var": 0.0009111218095388137, "learning_rate": 0.009000000000000001, "loss": 5.0161, "step": 144 }, { "crossentropy": 5.012355327606201, "epoch": 0.012334127254168084, "grad_norm": 0.24907968938350677, "grad_norm_var": 0.0007494139299521206, "learning_rate": 0.0090625, "loss": 5.0124, "step": 145 }, { "crossentropy": 4.989967346191406, "epoch": 0.012419190200748553, "grad_norm": 0.20837673544883728, "grad_norm_var": 0.0008318321010291053, "learning_rate": 0.009125, "loss": 4.99, "step": 146 }, { "crossentropy": 4.905921459197998, "epoch": 0.012504253147329024, "grad_norm": 0.19255036115646362, "grad_norm_var": 0.0009585507697384745, "learning_rate": 0.0091875, "loss": 4.9059, "step": 147 }, { "crossentropy": 4.819963455200195, "epoch": 0.012589316093909493, "grad_norm": 0.21177802979946136, "grad_norm_var": 0.0009936012919035332, "learning_rate": 0.009250000000000001, "loss": 4.82, "step": 148 }, { "crossentropy": 4.869275093078613, "epoch": 0.012674379040489963, "grad_norm": 0.24839143455028534, "grad_norm_var": 0.0009638918672519759, "learning_rate": 0.0093125, "loss": 4.8693, "step": 149 }, { "crossentropy": 4.831873893737793, "epoch": 0.012759441987070432, "grad_norm": 0.27071788907051086, "grad_norm_var": 0.0010373800085692396, "learning_rate": 0.009375, "loss": 4.8319, "step": 150 }, { "crossentropy": 4.783388137817383, "epoch": 0.012844504933650901, "grad_norm": 0.19062751531600952, "grad_norm_var": 0.001175744850972633, "learning_rate": 0.0094375, "loss": 4.7834, "step": 151 }, { "crossentropy": 4.702752113342285, "epoch": 0.012929567880231372, "grad_norm": 0.18635216355323792, "grad_norm_var": 0.0009379125964575454, "learning_rate": 0.0095, "loss": 4.7028, "step": 152 }, { "crossentropy": 4.742338180541992, "epoch": 0.01301463082681184, "grad_norm": 0.3315200209617615, "grad_norm_var": 0.0016188478350393572, "learning_rate": 0.0095625, "loss": 4.7423, "step": 153 }, { "crossentropy": 4.707618236541748, "epoch": 0.013099693773392311, "grad_norm": 0.2781056761741638, "grad_norm_var": 0.0017375840139002498, "learning_rate": 0.009625, "loss": 4.7076, "step": 154 }, { "crossentropy": 4.6902241706848145, "epoch": 0.01318475671997278, "grad_norm": 0.19503560662269592, "grad_norm_var": 0.0016836685473057714, "learning_rate": 0.0096875, "loss": 4.6902, "step": 155 }, { "crossentropy": 4.647319316864014, "epoch": 0.013269819666553249, "grad_norm": 0.15114480257034302, "grad_norm_var": 0.0020163556049931576, "learning_rate": 0.00975, "loss": 4.6473, "step": 156 }, { "crossentropy": 4.612910747528076, "epoch": 0.01335488261313372, "grad_norm": 0.1459556370973587, "grad_norm_var": 0.002384926865025937, "learning_rate": 0.0098125, "loss": 4.6129, "step": 157 }, { "crossentropy": 4.552581310272217, "epoch": 0.013439945559714188, "grad_norm": 0.14018110930919647, "grad_norm_var": 0.0027014700733364086, "learning_rate": 0.009875, "loss": 4.5526, "step": 158 }, { "crossentropy": 4.658143997192383, "epoch": 0.013525008506294659, "grad_norm": 0.21442651748657227, "grad_norm_var": 0.0026578158229162937, "learning_rate": 0.0099375, "loss": 4.6581, "step": 159 }, { "crossentropy": 4.499566555023193, "epoch": 0.013610071452875128, "grad_norm": 0.17341655492782593, "grad_norm_var": 0.0027541624048017144, "learning_rate": 0.01, "loss": 4.4996, "step": 160 }, { "crossentropy": 4.460829257965088, "epoch": 0.013695134399455597, "grad_norm": 0.14353439211845398, "grad_norm_var": 0.0029247714900103804, "learning_rate": 0.009999999598572036, "loss": 4.4608, "step": 161 }, { "crossentropy": 4.388937950134277, "epoch": 0.013780197346036067, "grad_norm": 0.12398557364940643, "grad_norm_var": 0.003333379706263202, "learning_rate": 0.009999998394288208, "loss": 4.3889, "step": 162 }, { "crossentropy": 4.3460693359375, "epoch": 0.013865260292616536, "grad_norm": 0.12452977150678635, "grad_norm_var": 0.0036888280588562227, "learning_rate": 0.00999999638714871, "loss": 4.3461, "step": 163 }, { "crossentropy": 4.337122917175293, "epoch": 0.013950323239197007, "grad_norm": 0.1483049988746643, "grad_norm_var": 0.0038037681703845375, "learning_rate": 0.009999993577153863, "loss": 4.3371, "step": 164 }, { "crossentropy": 4.4350690841674805, "epoch": 0.014035386185777475, "grad_norm": 0.1338140219449997, "grad_norm_var": 0.003757266264385156, "learning_rate": 0.00999998996430412, "loss": 4.4351, "step": 165 }, { "crossentropy": 4.4221601486206055, "epoch": 0.014120449132357944, "grad_norm": 0.20258711278438568, "grad_norm_var": 0.0032639692667963123, "learning_rate": 0.009999985548600059, "loss": 4.4222, "step": 166 }, { "crossentropy": 4.269092559814453, "epoch": 0.014205512078938415, "grad_norm": 0.14014071226119995, "grad_norm_var": 0.003353218260682169, "learning_rate": 0.009999980330042391, "loss": 4.2691, "step": 167 }, { "crossentropy": 4.380237102508545, "epoch": 0.014290575025518884, "grad_norm": 0.1251642405986786, "grad_norm_var": 0.0035114448638484245, "learning_rate": 0.009999974308631953, "loss": 4.3802, "step": 168 }, { "crossentropy": 4.307499408721924, "epoch": 0.014375637972099354, "grad_norm": 0.11807546019554138, "grad_norm_var": 0.0018543335437038748, "learning_rate": 0.009999967484369711, "loss": 4.3075, "step": 169 }, { "crossentropy": 4.19040584564209, "epoch": 0.014460700918679823, "grad_norm": 0.1393699198961258, "grad_norm_var": 0.000870731185454045, "learning_rate": 0.009999959857256765, "loss": 4.1904, "step": 170 }, { "crossentropy": 4.207085609436035, "epoch": 0.014545763865260292, "grad_norm": 0.12155484408140182, "grad_norm_var": 0.0007790041973492858, "learning_rate": 0.009999951427294334, "loss": 4.2071, "step": 171 }, { "crossentropy": 4.258392333984375, "epoch": 0.014630826811840763, "grad_norm": 0.11116243898868561, "grad_norm_var": 0.0008548829118862751, "learning_rate": 0.009999942194483772, "loss": 4.2584, "step": 172 }, { "crossentropy": 4.216152191162109, "epoch": 0.014715889758421231, "grad_norm": 0.11542423069477081, "grad_norm_var": 0.0009057428054977631, "learning_rate": 0.009999932158826568, "loss": 4.2162, "step": 173 }, { "crossentropy": 4.185194969177246, "epoch": 0.014800952705001702, "grad_norm": 0.1103348433971405, "grad_norm_var": 0.0009695693298621772, "learning_rate": 0.009999921320324326, "loss": 4.1852, "step": 174 }, { "crossentropy": 4.215174198150635, "epoch": 0.014886015651582171, "grad_norm": 0.11477025598287582, "grad_norm_var": 0.000606175525209466, "learning_rate": 0.009999909678978791, "loss": 4.2152, "step": 175 }, { "crossentropy": 4.0929365158081055, "epoch": 0.01497107859816264, "grad_norm": 0.11929339170455933, "grad_norm_var": 0.0005057897841458526, "learning_rate": 0.00999989723479183, "loss": 4.0929, "step": 176 }, { "crossentropy": 4.172039031982422, "epoch": 0.01505614154474311, "grad_norm": 0.11944177001714706, "grad_norm_var": 0.0005010095269496251, "learning_rate": 0.009999883987765442, "loss": 4.172, "step": 177 }, { "crossentropy": 4.212802886962891, "epoch": 0.01514120449132358, "grad_norm": 0.11632092297077179, "grad_norm_var": 0.0005100582403374484, "learning_rate": 0.009999869937901754, "loss": 4.2128, "step": 178 }, { "crossentropy": 4.116163730621338, "epoch": 0.01522626743790405, "grad_norm": 0.15290944278240204, "grad_norm_var": 0.0005443586069996118, "learning_rate": 0.009999855085203022, "loss": 4.1162, "step": 179 }, { "crossentropy": 4.20488166809082, "epoch": 0.015311330384484519, "grad_norm": 0.12184792011976242, "grad_norm_var": 0.0005254454811261994, "learning_rate": 0.009999839429671632, "loss": 4.2049, "step": 180 }, { "crossentropy": 4.101900100708008, "epoch": 0.015396393331064987, "grad_norm": 0.10514309257268906, "grad_norm_var": 0.0005579915607911081, "learning_rate": 0.009999822971310095, "loss": 4.1019, "step": 181 }, { "crossentropy": 4.147430896759033, "epoch": 0.015481456277645458, "grad_norm": 0.10256848484277725, "grad_norm_var": 0.00017649259533883502, "learning_rate": 0.009999805710121054, "loss": 4.1474, "step": 182 }, { "crossentropy": 4.123279571533203, "epoch": 0.015566519224225927, "grad_norm": 0.12785078585147858, "grad_norm_var": 0.00015431388924728542, "learning_rate": 0.009999787646107285, "loss": 4.1233, "step": 183 }, { "crossentropy": 3.998288869857788, "epoch": 0.015651582170806397, "grad_norm": 0.09751972556114197, "grad_norm_var": 0.00018332636037562042, "learning_rate": 0.009999768779271685, "loss": 3.9983, "step": 184 }, { "crossentropy": 4.0406575202941895, "epoch": 0.015736645117386865, "grad_norm": 0.13931429386138916, "grad_norm_var": 0.00021074411694036455, "learning_rate": 0.009999749109617284, "loss": 4.0407, "step": 185 }, { "crossentropy": 4.019528865814209, "epoch": 0.015821708063967335, "grad_norm": 0.10983426868915558, "grad_norm_var": 0.0001877124694070716, "learning_rate": 0.00999972863714724, "loss": 4.0195, "step": 186 }, { "crossentropy": 4.026954650878906, "epoch": 0.015906771010547806, "grad_norm": 0.15860959887504578, "grad_norm_var": 0.0002919281811279981, "learning_rate": 0.00999970736186484, "loss": 4.027, "step": 187 }, { "crossentropy": 4.001175880432129, "epoch": 0.015991833957128276, "grad_norm": 0.11410552263259888, "grad_norm_var": 0.00028894405824863, "learning_rate": 0.009999685283773502, "loss": 4.0012, "step": 188 }, { "crossentropy": 3.971679449081421, "epoch": 0.016076896903708743, "grad_norm": 0.14522969722747803, "grad_norm_var": 0.00032496896679390906, "learning_rate": 0.009999662402876771, "loss": 3.9717, "step": 189 }, { "crossentropy": 3.9547159671783447, "epoch": 0.016161959850289214, "grad_norm": 0.10799132287502289, "grad_norm_var": 0.0003290176509447811, "learning_rate": 0.00999963871917832, "loss": 3.9547, "step": 190 }, { "crossentropy": 3.814692735671997, "epoch": 0.016247022796869685, "grad_norm": 0.0903182402253151, "grad_norm_var": 0.00039011030300730327, "learning_rate": 0.00999961423268195, "loss": 3.8147, "step": 191 }, { "crossentropy": 3.890299081802368, "epoch": 0.016332085743450152, "grad_norm": 0.11423495411872864, "grad_norm_var": 0.0003925359290211629, "learning_rate": 0.009999588943391596, "loss": 3.8903, "step": 192 }, { "crossentropy": 3.8698418140411377, "epoch": 0.016417148690030622, "grad_norm": 0.11010842770338058, "grad_norm_var": 0.000398927074416344, "learning_rate": 0.009999562851311318, "loss": 3.8698, "step": 193 }, { "crossentropy": 3.842388153076172, "epoch": 0.016502211636611093, "grad_norm": 0.10201792418956757, "grad_norm_var": 0.0004180030344562063, "learning_rate": 0.009999535956445305, "loss": 3.8424, "step": 194 }, { "crossentropy": 3.940546751022339, "epoch": 0.01658727458319156, "grad_norm": 0.124986432492733, "grad_norm_var": 0.000339463796046871, "learning_rate": 0.009999508258797876, "loss": 3.9405, "step": 195 }, { "crossentropy": 3.7210769653320312, "epoch": 0.01667233752977203, "grad_norm": 0.13131988048553467, "grad_norm_var": 0.0003512189513695897, "learning_rate": 0.009999479758373477, "loss": 3.7211, "step": 196 }, { "crossentropy": 3.9723284244537354, "epoch": 0.0167574004763525, "grad_norm": 0.11966034770011902, "grad_norm_var": 0.00034033297498466954, "learning_rate": 0.009999450455176686, "loss": 3.9723, "step": 197 }, { "crossentropy": 3.890979290008545, "epoch": 0.016842463422932972, "grad_norm": 0.0953628346323967, "grad_norm_var": 0.0003588644978060061, "learning_rate": 0.009999420349212208, "loss": 3.891, "step": 198 }, { "crossentropy": 3.8522298336029053, "epoch": 0.01692752636951344, "grad_norm": 0.091729536652565, "grad_norm_var": 0.000393107758856806, "learning_rate": 0.009999389440484877, "loss": 3.8522, "step": 199 }, { "crossentropy": 3.8179471492767334, "epoch": 0.01701258931609391, "grad_norm": 0.12048850953578949, "grad_norm_var": 0.000370184621677927, "learning_rate": 0.009999357728999656, "loss": 3.8179, "step": 200 }, { "crossentropy": 3.745457172393799, "epoch": 0.01709765226267438, "grad_norm": 0.0836581289768219, "grad_norm_var": 0.00039973077595621226, "learning_rate": 0.009999325214761637, "loss": 3.7455, "step": 201 }, { "crossentropy": 3.8777382373809814, "epoch": 0.017182715209254847, "grad_norm": 0.08524348586797714, "grad_norm_var": 0.0004502931548633254, "learning_rate": 0.009999291897776041, "loss": 3.8777, "step": 202 }, { "crossentropy": 3.9465324878692627, "epoch": 0.017267778155835318, "grad_norm": 0.15514503419399261, "grad_norm_var": 0.00042960091230655634, "learning_rate": 0.009999257778048217, "loss": 3.9465, "step": 203 }, { "crossentropy": 3.7918105125427246, "epoch": 0.01735284110241579, "grad_norm": 0.08989336341619492, "grad_norm_var": 0.00045936231914681195, "learning_rate": 0.009999222855583646, "loss": 3.7918, "step": 204 }, { "crossentropy": 3.86405873298645, "epoch": 0.017437904048996256, "grad_norm": 0.08256359398365021, "grad_norm_var": 0.0004142995102138817, "learning_rate": 0.00999918713038793, "loss": 3.8641, "step": 205 }, { "crossentropy": 3.829869270324707, "epoch": 0.017522966995576726, "grad_norm": 0.0981428250670433, "grad_norm_var": 0.000418462518216674, "learning_rate": 0.00999915060246681, "loss": 3.8299, "step": 206 }, { "crossentropy": 3.806807518005371, "epoch": 0.017608029942157197, "grad_norm": 0.08989384770393372, "grad_norm_var": 0.0004193571539892564, "learning_rate": 0.009999113271826154, "loss": 3.8068, "step": 207 }, { "crossentropy": 3.7843382358551025, "epoch": 0.017693092888737667, "grad_norm": 0.08628932386636734, "grad_norm_var": 0.0004371217458069156, "learning_rate": 0.00999907513847195, "loss": 3.7843, "step": 208 }, { "crossentropy": 3.7964184284210205, "epoch": 0.017778155835318134, "grad_norm": 0.08445479720830917, "grad_norm_var": 0.0004578949616388807, "learning_rate": 0.009999036202410324, "loss": 3.7964, "step": 209 }, { "crossentropy": 3.790006399154663, "epoch": 0.017863218781898605, "grad_norm": 0.08108890056610107, "grad_norm_var": 0.0004867649375152159, "learning_rate": 0.00999899646364753, "loss": 3.79, "step": 210 }, { "crossentropy": 3.804021120071411, "epoch": 0.017948281728479076, "grad_norm": 0.08059185743331909, "grad_norm_var": 0.0004694130349900294, "learning_rate": 0.009998955922189943, "loss": 3.804, "step": 211 }, { "crossentropy": 3.7429869174957275, "epoch": 0.018033344675059543, "grad_norm": 0.08009009063243866, "grad_norm_var": 0.000409060757917748, "learning_rate": 0.009998914578044079, "loss": 3.743, "step": 212 }, { "crossentropy": 3.8774008750915527, "epoch": 0.018118407621640013, "grad_norm": 0.0828937441110611, "grad_norm_var": 0.0003739732977234178, "learning_rate": 0.009998872431216574, "loss": 3.8774, "step": 213 }, { "crossentropy": 3.772819757461548, "epoch": 0.018203470568220484, "grad_norm": 0.09708669781684875, "grad_norm_var": 0.0003747088766683816, "learning_rate": 0.009998829481714195, "loss": 3.7728, "step": 214 }, { "crossentropy": 3.7592010498046875, "epoch": 0.01828853351480095, "grad_norm": 0.08220867067575455, "grad_norm_var": 0.00038208656758103774, "learning_rate": 0.00999878572954384, "loss": 3.7592, "step": 215 }, { "crossentropy": 3.8342199325561523, "epoch": 0.01837359646138142, "grad_norm": 0.10131965577602386, "grad_norm_var": 0.00033347485310502765, "learning_rate": 0.009998741174712533, "loss": 3.8342, "step": 216 }, { "crossentropy": 3.842776298522949, "epoch": 0.018458659407961892, "grad_norm": 0.09545695781707764, "grad_norm_var": 0.0003301768112966896, "learning_rate": 0.009998695817227428, "loss": 3.8428, "step": 217 }, { "crossentropy": 3.605769157409668, "epoch": 0.018543722354542363, "grad_norm": 0.08969026058912277, "grad_norm_var": 0.000327393268844219, "learning_rate": 0.00999864965709581, "loss": 3.6058, "step": 218 }, { "crossentropy": 3.730006217956543, "epoch": 0.01862878530112283, "grad_norm": 0.09575316309928894, "grad_norm_var": 5.01956215526983e-05, "learning_rate": 0.009998602694325091, "loss": 3.73, "step": 219 }, { "crossentropy": 3.756810188293457, "epoch": 0.0187138482477033, "grad_norm": 0.1145874485373497, "grad_norm_var": 9.260394758046722e-05, "learning_rate": 0.00999855492892281, "loss": 3.7568, "step": 220 }, { "crossentropy": 3.7972190380096436, "epoch": 0.01879891119428377, "grad_norm": 0.1837984323501587, "grad_norm_var": 0.0006309766867688397, "learning_rate": 0.009998506360896636, "loss": 3.7972, "step": 221 }, { "crossentropy": 3.611988067626953, "epoch": 0.018883974140864238, "grad_norm": 0.08491060882806778, "grad_norm_var": 0.0006389494382201368, "learning_rate": 0.009998456990254371, "loss": 3.612, "step": 222 }, { "crossentropy": 3.7341792583465576, "epoch": 0.01896903708744471, "grad_norm": 0.0942736268043518, "grad_norm_var": 0.0006367973406707685, "learning_rate": 0.00999840681700394, "loss": 3.7342, "step": 223 }, { "crossentropy": 3.6458802223205566, "epoch": 0.01905410003402518, "grad_norm": 0.08655840158462524, "grad_norm_var": 0.0006364568520326128, "learning_rate": 0.009998355841153399, "loss": 3.6459, "step": 224 }, { "crossentropy": 3.7632675170898438, "epoch": 0.019139162980605647, "grad_norm": 0.09640248119831085, "grad_norm_var": 0.0006271098872668005, "learning_rate": 0.009998304062710935, "loss": 3.7633, "step": 225 }, { "crossentropy": 3.736149549484253, "epoch": 0.019224225927186117, "grad_norm": 0.09715108573436737, "grad_norm_var": 0.0006098668370152336, "learning_rate": 0.009998251481684862, "loss": 3.7361, "step": 226 }, { "crossentropy": 3.7409703731536865, "epoch": 0.019309288873766588, "grad_norm": 0.07847946137189865, "grad_norm_var": 0.00061495676859394, "learning_rate": 0.009998198098083623, "loss": 3.741, "step": 227 }, { "crossentropy": 3.6764204502105713, "epoch": 0.01939435182034706, "grad_norm": 0.0893683210015297, "grad_norm_var": 0.0005987482715977525, "learning_rate": 0.009998143911915787, "loss": 3.6764, "step": 228 }, { "crossentropy": 3.7373223304748535, "epoch": 0.019479414766927525, "grad_norm": 0.08607456833124161, "grad_norm_var": 0.0005929225143768792, "learning_rate": 0.009998088923190058, "loss": 3.7373, "step": 229 }, { "crossentropy": 3.7029168605804443, "epoch": 0.019564477713507996, "grad_norm": 0.09379161894321442, "grad_norm_var": 0.0005941429503031704, "learning_rate": 0.009998033131915265, "loss": 3.7029, "step": 230 }, { "crossentropy": 3.79892897605896, "epoch": 0.019649540660088467, "grad_norm": 0.08676975220441818, "grad_norm_var": 0.0005857704040219141, "learning_rate": 0.009997976538100365, "loss": 3.7989, "step": 231 }, { "crossentropy": 3.6350975036621094, "epoch": 0.019734603606668934, "grad_norm": 0.10037481784820557, "grad_norm_var": 0.0005854582739481304, "learning_rate": 0.009997919141754448, "loss": 3.6351, "step": 232 }, { "crossentropy": 3.651282787322998, "epoch": 0.019819666553249404, "grad_norm": 0.09800085425376892, "grad_norm_var": 0.0005848848275860516, "learning_rate": 0.009997860942886728, "loss": 3.6513, "step": 233 }, { "crossentropy": 3.6006598472595215, "epoch": 0.019904729499829875, "grad_norm": 0.07604920864105225, "grad_norm_var": 0.0006125362200926438, "learning_rate": 0.00999780194150655, "loss": 3.6007, "step": 234 }, { "crossentropy": 3.6477065086364746, "epoch": 0.019989792446410342, "grad_norm": 0.07809572666883469, "grad_norm_var": 0.0006364802945882724, "learning_rate": 0.009997742137623389, "loss": 3.6477, "step": 235 }, { "crossentropy": 3.677593231201172, "epoch": 0.020074855392990813, "grad_norm": 0.08077288419008255, "grad_norm_var": 0.0006265885398918053, "learning_rate": 0.009997681531246846, "loss": 3.6776, "step": 236 }, { "crossentropy": 3.663299560546875, "epoch": 0.020159918339571283, "grad_norm": 0.0985582023859024, "grad_norm_var": 6.499653755637746e-05, "learning_rate": 0.009997620122386657, "loss": 3.6633, "step": 237 }, { "crossentropy": 3.6120262145996094, "epoch": 0.020244981286151754, "grad_norm": 0.10855399072170258, "grad_norm_var": 8.6721551687118e-05, "learning_rate": 0.009997557911052677, "loss": 3.612, "step": 238 }, { "crossentropy": 3.721298933029175, "epoch": 0.02033004423273222, "grad_norm": 0.10942047834396362, "grad_norm_var": 0.00010852095262064525, "learning_rate": 0.0099974948972549, "loss": 3.7213, "step": 239 }, { "crossentropy": 3.6205689907073975, "epoch": 0.02041510717931269, "grad_norm": 0.09240198135375977, "grad_norm_var": 0.00010678440726499168, "learning_rate": 0.00999743108100344, "loss": 3.6206, "step": 240 }, { "crossentropy": 3.5810821056365967, "epoch": 0.020500170125893162, "grad_norm": 0.08026450872421265, "grad_norm_var": 0.0001133553226739544, "learning_rate": 0.009997366462308546, "loss": 3.5811, "step": 241 }, { "crossentropy": 3.6048641204833984, "epoch": 0.02058523307247363, "grad_norm": 0.10290330648422241, "grad_norm_var": 0.00012023073962312744, "learning_rate": 0.009997301041180595, "loss": 3.6049, "step": 242 }, { "crossentropy": 3.568984270095825, "epoch": 0.0206702960190541, "grad_norm": 0.10370510816574097, "grad_norm_var": 0.00011707418116691677, "learning_rate": 0.009997234817630091, "loss": 3.569, "step": 243 }, { "crossentropy": 3.6555705070495605, "epoch": 0.02075535896563457, "grad_norm": 0.09491326659917831, "grad_norm_var": 0.00011644459416961216, "learning_rate": 0.009997167791667667, "loss": 3.6556, "step": 244 }, { "crossentropy": 3.628356456756592, "epoch": 0.020840421912215037, "grad_norm": 0.07784529030323029, "grad_norm_var": 0.00012845774674933585, "learning_rate": 0.009997099963304086, "loss": 3.6284, "step": 245 }, { "crossentropy": 3.5227437019348145, "epoch": 0.020925484858795508, "grad_norm": 0.08868250995874405, "grad_norm_var": 0.00012931239041609732, "learning_rate": 0.009997031332550237, "loss": 3.5227, "step": 246 }, { "crossentropy": 3.5749337673187256, "epoch": 0.02101054780537598, "grad_norm": 0.08452091366052628, "grad_norm_var": 0.00013129628093480592, "learning_rate": 0.009996961899417145, "loss": 3.5749, "step": 247 }, { "crossentropy": 3.563178300857544, "epoch": 0.02109561075195645, "grad_norm": 0.07731559127569199, "grad_norm_var": 0.00013936896297229255, "learning_rate": 0.009996891663915954, "loss": 3.5632, "step": 248 }, { "crossentropy": 3.6829638481140137, "epoch": 0.021180673698536916, "grad_norm": 0.08901867270469666, "grad_norm_var": 0.00013572792454846308, "learning_rate": 0.009996820626057945, "loss": 3.683, "step": 249 }, { "crossentropy": 3.608712911605835, "epoch": 0.021265736645117387, "grad_norm": 0.08982384204864502, "grad_norm_var": 0.00012161758594943228, "learning_rate": 0.009996748785854524, "loss": 3.6087, "step": 250 }, { "crossentropy": 3.5996549129486084, "epoch": 0.021350799591697858, "grad_norm": 0.07837523519992828, "grad_norm_var": 0.00012113970012145661, "learning_rate": 0.009996676143317226, "loss": 3.5997, "step": 251 }, { "crossentropy": 3.527463674545288, "epoch": 0.021435862538278325, "grad_norm": 0.08441566675901413, "grad_norm_var": 0.0001169690551109009, "learning_rate": 0.009996602698457716, "loss": 3.5275, "step": 252 }, { "crossentropy": 3.571465253829956, "epoch": 0.021520925484858795, "grad_norm": 0.08478222042322159, "grad_norm_var": 0.00011548896260258993, "learning_rate": 0.009996528451287784, "loss": 3.5715, "step": 253 }, { "crossentropy": 3.4826247692108154, "epoch": 0.021605988431439266, "grad_norm": 0.1029534637928009, "grad_norm_var": 0.00010391839919274096, "learning_rate": 0.009996453401819356, "loss": 3.4826, "step": 254 }, { "crossentropy": 3.585174798965454, "epoch": 0.021691051378019733, "grad_norm": 0.08049716055393219, "grad_norm_var": 8.163281067414726e-05, "learning_rate": 0.00999637755006448, "loss": 3.5852, "step": 255 }, { "crossentropy": 3.535442590713501, "epoch": 0.021776114324600204, "grad_norm": 0.08524977415800095, "grad_norm_var": 8.089545329609237e-05, "learning_rate": 0.00999630089603534, "loss": 3.5354, "step": 256 }, { "crossentropy": 3.6108410358428955, "epoch": 0.021861177271180674, "grad_norm": 0.0802326500415802, "grad_norm_var": 8.092765003421797e-05, "learning_rate": 0.009996223439744238, "loss": 3.6108, "step": 257 }, { "crossentropy": 3.530984401702881, "epoch": 0.021946240217761145, "grad_norm": 0.07257720082998276, "grad_norm_var": 7.744711297881975e-05, "learning_rate": 0.009996145181203616, "loss": 3.531, "step": 258 }, { "crossentropy": 3.451094150543213, "epoch": 0.022031303164341612, "grad_norm": 0.08926261216402054, "grad_norm_var": 5.6258232929451195e-05, "learning_rate": 0.009996066120426037, "loss": 3.4511, "step": 259 }, { "crossentropy": 3.5156898498535156, "epoch": 0.022116366110922082, "grad_norm": 0.0752675011754036, "grad_norm_var": 5.4489630356779665e-05, "learning_rate": 0.009995986257424198, "loss": 3.5157, "step": 260 }, { "crossentropy": 3.625443935394287, "epoch": 0.022201429057502553, "grad_norm": 0.12317869067192078, "grad_norm_var": 0.00014693381869234056, "learning_rate": 0.009995905592210924, "loss": 3.6254, "step": 261 }, { "crossentropy": 3.6402292251586914, "epoch": 0.02228649200408302, "grad_norm": 0.0878577008843422, "grad_norm_var": 0.00014675112083016099, "learning_rate": 0.009995824124799163, "loss": 3.6402, "step": 262 }, { "crossentropy": 3.572868585586548, "epoch": 0.02237155495066349, "grad_norm": 0.08385153859853745, "grad_norm_var": 0.000146963170988771, "learning_rate": 0.009995741855202, "loss": 3.5729, "step": 263 }, { "crossentropy": 3.4440550804138184, "epoch": 0.02245661789724396, "grad_norm": 0.07007579505443573, "grad_norm_var": 0.00015914464291252596, "learning_rate": 0.009995658783432644, "loss": 3.4441, "step": 264 }, { "crossentropy": 3.540539026260376, "epoch": 0.02254168084382443, "grad_norm": 0.08477947115898132, "grad_norm_var": 0.00015861173901108954, "learning_rate": 0.009995574909504435, "loss": 3.5405, "step": 265 }, { "crossentropy": 3.491117000579834, "epoch": 0.0226267437904049, "grad_norm": 0.07929136604070663, "grad_norm_var": 0.00015992765083466972, "learning_rate": 0.00999549023343084, "loss": 3.4911, "step": 266 }, { "crossentropy": 3.6149203777313232, "epoch": 0.02271180673698537, "grad_norm": 0.07434512674808502, "grad_norm_var": 0.0001645914971216372, "learning_rate": 0.009995404755225453, "loss": 3.6149, "step": 267 }, { "crossentropy": 3.5075907707214355, "epoch": 0.02279686968356584, "grad_norm": 0.08488232642412186, "grad_norm_var": 0.00016457412445786853, "learning_rate": 0.009995318474902003, "loss": 3.5076, "step": 268 }, { "crossentropy": 3.601724624633789, "epoch": 0.022881932630146307, "grad_norm": 0.07680029422044754, "grad_norm_var": 0.00016872695559954978, "learning_rate": 0.009995231392474341, "loss": 3.6017, "step": 269 }, { "crossentropy": 3.525110960006714, "epoch": 0.022966995576726778, "grad_norm": 0.11252494156360626, "grad_norm_var": 0.00019807460848476598, "learning_rate": 0.009995143507956454, "loss": 3.5251, "step": 270 }, { "crossentropy": 3.5688138008117676, "epoch": 0.02305205852330725, "grad_norm": 0.09293940663337708, "grad_norm_var": 0.00020021024372955364, "learning_rate": 0.009995054821362452, "loss": 3.5688, "step": 271 }, { "crossentropy": 3.6104443073272705, "epoch": 0.023137121469887716, "grad_norm": 0.08931928128004074, "grad_norm_var": 0.0002009360163681156, "learning_rate": 0.009994965332706574, "loss": 3.6104, "step": 272 }, { "crossentropy": 3.477769136428833, "epoch": 0.023222184416468186, "grad_norm": 0.09165553003549576, "grad_norm_var": 0.0002001942999397192, "learning_rate": 0.00999487504200319, "loss": 3.4778, "step": 273 }, { "crossentropy": 3.6074936389923096, "epoch": 0.023307247363048657, "grad_norm": 0.09292537719011307, "grad_norm_var": 0.00018751700496249615, "learning_rate": 0.009994783949266797, "loss": 3.6075, "step": 274 }, { "crossentropy": 3.5472676753997803, "epoch": 0.023392310309629124, "grad_norm": 0.08159862458705902, "grad_norm_var": 0.00018995894760556904, "learning_rate": 0.009994692054512025, "loss": 3.5473, "step": 275 }, { "crossentropy": 3.5271053314208984, "epoch": 0.023477373256209595, "grad_norm": 0.09892002493143082, "grad_norm_var": 0.00018609195209105142, "learning_rate": 0.009994599357753627, "loss": 3.5271, "step": 276 }, { "crossentropy": 3.5045650005340576, "epoch": 0.023562436202790065, "grad_norm": 0.10940872877836227, "grad_norm_var": 0.00013529928570224595, "learning_rate": 0.009994505859006488, "loss": 3.5046, "step": 277 }, { "crossentropy": 3.6498360633850098, "epoch": 0.023647499149370536, "grad_norm": 0.0860615149140358, "grad_norm_var": 0.0001355825399601838, "learning_rate": 0.009994411558285623, "loss": 3.6498, "step": 278 }, { "crossentropy": 3.5342750549316406, "epoch": 0.023732562095951003, "grad_norm": 0.07256685942411423, "grad_norm_var": 0.0001499131256076212, "learning_rate": 0.009994316455606172, "loss": 3.5343, "step": 279 }, { "crossentropy": 3.549914836883545, "epoch": 0.023817625042531473, "grad_norm": 0.07973713427782059, "grad_norm_var": 0.00013345488095232335, "learning_rate": 0.009994220550983405, "loss": 3.5499, "step": 280 }, { "crossentropy": 3.541752338409424, "epoch": 0.023902687989111944, "grad_norm": 0.076323002576828, "grad_norm_var": 0.00014153841814460607, "learning_rate": 0.009994123844432724, "loss": 3.5418, "step": 281 }, { "crossentropy": 3.5795979499816895, "epoch": 0.02398775093569241, "grad_norm": 0.07945823669433594, "grad_norm_var": 0.000141358495211825, "learning_rate": 0.009994026335969654, "loss": 3.5796, "step": 282 }, { "crossentropy": 3.5370681285858154, "epoch": 0.02407281388227288, "grad_norm": 0.07364124804735184, "grad_norm_var": 0.00014262092203046358, "learning_rate": 0.009993928025609855, "loss": 3.5371, "step": 283 }, { "crossentropy": 3.418961524963379, "epoch": 0.024157876828853352, "grad_norm": 0.10033980756998062, "grad_norm_var": 0.0001523186622886324, "learning_rate": 0.009993828913369111, "loss": 3.419, "step": 284 }, { "crossentropy": 3.3490023612976074, "epoch": 0.02424293977543382, "grad_norm": 0.07508822530508041, "grad_norm_var": 0.00015514722587399698, "learning_rate": 0.009993728999263339, "loss": 3.349, "step": 285 }, { "crossentropy": 3.5191688537597656, "epoch": 0.02432800272201429, "grad_norm": 0.11542336642742157, "grad_norm_var": 0.00016504122396023257, "learning_rate": 0.00999362828330858, "loss": 3.5192, "step": 286 }, { "crossentropy": 3.5324935913085938, "epoch": 0.02441306566859476, "grad_norm": 0.08360631763935089, "grad_norm_var": 0.00016491476293716996, "learning_rate": 0.009993526765521009, "loss": 3.5325, "step": 287 }, { "crossentropy": 3.529761552810669, "epoch": 0.02449812861517523, "grad_norm": 0.08233854919672012, "grad_norm_var": 0.00016662040417254957, "learning_rate": 0.009993424445916922, "loss": 3.5298, "step": 288 }, { "crossentropy": 3.4978270530700684, "epoch": 0.0245831915617557, "grad_norm": 0.07960892468690872, "grad_norm_var": 0.00016892467956059823, "learning_rate": 0.009993321324512751, "loss": 3.4978, "step": 289 }, { "crossentropy": 3.4276275634765625, "epoch": 0.02466825450833617, "grad_norm": 0.07632949203252792, "grad_norm_var": 0.00017234191695971525, "learning_rate": 0.009993217401325057, "loss": 3.4276, "step": 290 }, { "crossentropy": 3.454660654067993, "epoch": 0.02475331745491664, "grad_norm": 0.07289869338274002, "grad_norm_var": 0.00018177565457171417, "learning_rate": 0.009993112676370522, "loss": 3.4547, "step": 291 }, { "crossentropy": 3.340754985809326, "epoch": 0.024838380401497107, "grad_norm": 0.07507588714361191, "grad_norm_var": 0.00017340253927574195, "learning_rate": 0.009993007149665967, "loss": 3.3408, "step": 292 }, { "crossentropy": 3.4556376934051514, "epoch": 0.024923443348077577, "grad_norm": 0.07598789781332016, "grad_norm_var": 0.00012829070055607277, "learning_rate": 0.009992900821228333, "loss": 3.4556, "step": 293 }, { "crossentropy": 3.437713384628296, "epoch": 0.025008506294658048, "grad_norm": 0.08666319400072098, "grad_norm_var": 0.00012867683650659805, "learning_rate": 0.009992793691074694, "loss": 3.4377, "step": 294 }, { "crossentropy": 3.467104911804199, "epoch": 0.025093569241238515, "grad_norm": 0.07546163350343704, "grad_norm_var": 0.00012572642776815062, "learning_rate": 0.00999268575922225, "loss": 3.4671, "step": 295 }, { "crossentropy": 3.480400323867798, "epoch": 0.025178632187818985, "grad_norm": 0.07586372643709183, "grad_norm_var": 0.00012770309301289492, "learning_rate": 0.009992577025688338, "loss": 3.4804, "step": 296 }, { "crossentropy": 3.5148491859436035, "epoch": 0.025263695134399456, "grad_norm": 0.0862354263663292, "grad_norm_var": 0.00012699295187211778, "learning_rate": 0.009992467490490412, "loss": 3.5148, "step": 297 }, { "crossentropy": 3.425670862197876, "epoch": 0.025348758080979927, "grad_norm": 0.07200814038515091, "grad_norm_var": 0.00013311224750147676, "learning_rate": 0.009992357153646061, "loss": 3.4257, "step": 298 }, { "crossentropy": 3.384868621826172, "epoch": 0.025433821027560394, "grad_norm": 0.07531536370515823, "grad_norm_var": 0.00013149735795759583, "learning_rate": 0.009992246015173003, "loss": 3.3849, "step": 299 }, { "crossentropy": 3.52479887008667, "epoch": 0.025518883974140864, "grad_norm": 0.07066698372364044, "grad_norm_var": 0.00011303935397841853, "learning_rate": 0.009992134075089084, "loss": 3.5248, "step": 300 }, { "crossentropy": 3.532233238220215, "epoch": 0.025603946920721335, "grad_norm": 0.096543088555336, "grad_norm_var": 0.00012801328545252176, "learning_rate": 0.009992021333412275, "loss": 3.5322, "step": 301 }, { "crossentropy": 3.4258174896240234, "epoch": 0.025689009867301802, "grad_norm": 0.08765491098165512, "grad_norm_var": 4.9686858251670137e-05, "learning_rate": 0.009991907790160685, "loss": 3.4258, "step": 302 }, { "crossentropy": 3.5529799461364746, "epoch": 0.025774072813882273, "grad_norm": 0.08938015252351761, "grad_norm_var": 5.491923321185125e-05, "learning_rate": 0.00999179344535254, "loss": 3.553, "step": 303 }, { "crossentropy": 3.5132548809051514, "epoch": 0.025859135760462743, "grad_norm": 0.12648236751556396, "grad_norm_var": 0.00019119979372923696, "learning_rate": 0.009991678299006206, "loss": 3.5133, "step": 304 }, { "crossentropy": 3.5836241245269775, "epoch": 0.02594419870704321, "grad_norm": 0.1147625520825386, "grad_norm_var": 0.00025424756599656237, "learning_rate": 0.009991562351140166, "loss": 3.5836, "step": 305 }, { "crossentropy": 3.337860345840454, "epoch": 0.02602926165362368, "grad_norm": 0.07373955100774765, "grad_norm_var": 0.00025760331363219847, "learning_rate": 0.009991445601773041, "loss": 3.3379, "step": 306 }, { "crossentropy": 3.509267568588257, "epoch": 0.02611432460020415, "grad_norm": 0.08863765746355057, "grad_norm_var": 0.0002483805109421975, "learning_rate": 0.00999132805092358, "loss": 3.5093, "step": 307 }, { "crossentropy": 3.488011360168457, "epoch": 0.026199387546784622, "grad_norm": 0.0789242833852768, "grad_norm_var": 0.00024387784349665654, "learning_rate": 0.009991209698610655, "loss": 3.488, "step": 308 }, { "crossentropy": 3.4435012340545654, "epoch": 0.02628445049336509, "grad_norm": 0.07613440603017807, "grad_norm_var": 0.00024368564699416562, "learning_rate": 0.00999109054485327, "loss": 3.4435, "step": 309 }, { "crossentropy": 3.5588912963867188, "epoch": 0.02636951343994556, "grad_norm": 0.10676809400320053, "grad_norm_var": 0.00027098213948537617, "learning_rate": 0.009990970589670559, "loss": 3.5589, "step": 310 }, { "crossentropy": 3.3790409564971924, "epoch": 0.02645457638652603, "grad_norm": 0.09749291837215424, "grad_norm_var": 0.00026695086138771377, "learning_rate": 0.009990849833081784, "loss": 3.379, "step": 311 }, { "crossentropy": 3.5360941886901855, "epoch": 0.026539639333106498, "grad_norm": 0.07725343853235245, "grad_norm_var": 0.0002647230699930704, "learning_rate": 0.009990728275106331, "loss": 3.5361, "step": 312 }, { "crossentropy": 3.431347131729126, "epoch": 0.026624702279686968, "grad_norm": 0.07425813376903534, "grad_norm_var": 0.0002775050577173199, "learning_rate": 0.009990605915763723, "loss": 3.4313, "step": 313 }, { "crossentropy": 3.498661518096924, "epoch": 0.02670976522626744, "grad_norm": 0.13098299503326416, "grad_norm_var": 0.00037010521894856297, "learning_rate": 0.009990482755073606, "loss": 3.4987, "step": 314 }, { "crossentropy": 3.460193157196045, "epoch": 0.026794828172847906, "grad_norm": 0.09570879489183426, "grad_norm_var": 0.0003519210177639803, "learning_rate": 0.009990358793055756, "loss": 3.4602, "step": 315 }, { "crossentropy": 3.4394986629486084, "epoch": 0.026879891119428376, "grad_norm": 0.07162744551897049, "grad_norm_var": 0.00034913955942484455, "learning_rate": 0.009990234029730078, "loss": 3.4395, "step": 316 }, { "crossentropy": 3.3987925052642822, "epoch": 0.026964954066008847, "grad_norm": 0.07328823208808899, "grad_norm_var": 0.00037163336123620475, "learning_rate": 0.009990108465116606, "loss": 3.3988, "step": 317 }, { "crossentropy": 3.447909116744995, "epoch": 0.027050017012589318, "grad_norm": 0.0619593970477581, "grad_norm_var": 0.0004258795272906748, "learning_rate": 0.0099899820992355, "loss": 3.4479, "step": 318 }, { "crossentropy": 3.389164447784424, "epoch": 0.027135079959169785, "grad_norm": 0.07080056518316269, "grad_norm_var": 0.00044858763579861213, "learning_rate": 0.009989854932107052, "loss": 3.3892, "step": 319 }, { "crossentropy": 3.479830741882324, "epoch": 0.027220142905750255, "grad_norm": 0.0748809427022934, "grad_norm_var": 0.0003548939129804376, "learning_rate": 0.009989726963751682, "loss": 3.4798, "step": 320 }, { "crossentropy": 3.385645627975464, "epoch": 0.027305205852330726, "grad_norm": 0.07538603991270065, "grad_norm_var": 0.0002979103295921501, "learning_rate": 0.009989598194189937, "loss": 3.3856, "step": 321 }, { "crossentropy": 3.4681553840637207, "epoch": 0.027390268798911193, "grad_norm": 0.07154525071382523, "grad_norm_var": 0.00030091775219883565, "learning_rate": 0.009989468623442493, "loss": 3.4682, "step": 322 }, { "crossentropy": 3.398301839828491, "epoch": 0.027475331745491664, "grad_norm": 0.07245306670665741, "grad_norm_var": 0.00030480616836073277, "learning_rate": 0.009989338251530158, "loss": 3.3983, "step": 323 }, { "crossentropy": 3.4214978218078613, "epoch": 0.027560394692072134, "grad_norm": 0.07606002688407898, "grad_norm_var": 0.00030643300383902437, "learning_rate": 0.009989207078473863, "loss": 3.4215, "step": 324 }, { "crossentropy": 3.51798152923584, "epoch": 0.0276454576386526, "grad_norm": 0.07427560538053513, "grad_norm_var": 0.00030801902945535074, "learning_rate": 0.009989075104294674, "loss": 3.518, "step": 325 }, { "crossentropy": 3.319744110107422, "epoch": 0.027730520585233072, "grad_norm": 0.09521228820085526, "grad_norm_var": 0.0002775039969383126, "learning_rate": 0.009988942329013777, "loss": 3.3197, "step": 326 }, { "crossentropy": 3.4836528301239014, "epoch": 0.027815583531813542, "grad_norm": 0.123762346804142, "grad_norm_var": 0.0003790183209862495, "learning_rate": 0.009988808752652498, "loss": 3.4837, "step": 327 }, { "crossentropy": 3.367030143737793, "epoch": 0.027900646478394013, "grad_norm": 0.10005807131528854, "grad_norm_var": 0.00039567239095029304, "learning_rate": 0.00998867437523228, "loss": 3.367, "step": 328 }, { "crossentropy": 3.435622453689575, "epoch": 0.02798570942497448, "grad_norm": 0.0900360569357872, "grad_norm_var": 0.00039096601553628745, "learning_rate": 0.009988539196774704, "loss": 3.4356, "step": 329 }, { "crossentropy": 3.348193407058716, "epoch": 0.02807077237155495, "grad_norm": 0.07593764364719391, "grad_norm_var": 0.00024195335955588162, "learning_rate": 0.009988403217301475, "loss": 3.3482, "step": 330 }, { "crossentropy": 3.436574935913086, "epoch": 0.02815583531813542, "grad_norm": 0.07328229397535324, "grad_norm_var": 0.00023071204093708334, "learning_rate": 0.009988266436834427, "loss": 3.4366, "step": 331 }, { "crossentropy": 3.415771961212158, "epoch": 0.02824089826471589, "grad_norm": 0.07088818401098251, "grad_norm_var": 0.00023157494766569506, "learning_rate": 0.009988128855395522, "loss": 3.4158, "step": 332 }, { "crossentropy": 3.3944084644317627, "epoch": 0.02832596121129636, "grad_norm": 0.08037582784891129, "grad_norm_var": 0.00022838214348076344, "learning_rate": 0.009987990473006853, "loss": 3.3944, "step": 333 }, { "crossentropy": 3.4725983142852783, "epoch": 0.02841102415787683, "grad_norm": 0.0726630762219429, "grad_norm_var": 0.0002091792381230584, "learning_rate": 0.009987851289690639, "loss": 3.4726, "step": 334 }, { "crossentropy": 3.4168498516082764, "epoch": 0.028496087104457297, "grad_norm": 0.06942179054021835, "grad_norm_var": 0.00021119166373534393, "learning_rate": 0.00998771130546923, "loss": 3.4168, "step": 335 }, { "crossentropy": 3.5203051567077637, "epoch": 0.028581150051037767, "grad_norm": 0.07842343300580978, "grad_norm_var": 0.00020907872321523335, "learning_rate": 0.009987570520365103, "loss": 3.5203, "step": 336 }, { "crossentropy": 3.46624755859375, "epoch": 0.028666212997618238, "grad_norm": 0.12891621887683868, "grad_norm_var": 0.0003464157408773853, "learning_rate": 0.009987428934400864, "loss": 3.4662, "step": 337 }, { "crossentropy": 3.335681676864624, "epoch": 0.02875127594419871, "grad_norm": 0.1162722185254097, "grad_norm_var": 0.00039370149712253757, "learning_rate": 0.009987286547599249, "loss": 3.3357, "step": 338 }, { "crossentropy": 3.4212839603424072, "epoch": 0.028836338890779176, "grad_norm": 0.07289907336235046, "grad_norm_var": 0.0003928264170795235, "learning_rate": 0.009987143359983117, "loss": 3.4213, "step": 339 }, { "crossentropy": 3.387260675430298, "epoch": 0.028921401837359646, "grad_norm": 0.07665447890758514, "grad_norm_var": 0.0003919492766894332, "learning_rate": 0.009986999371575465, "loss": 3.3873, "step": 340 }, { "crossentropy": 3.426910877227783, "epoch": 0.029006464783940117, "grad_norm": 0.07234711199998856, "grad_norm_var": 0.00039556733311388407, "learning_rate": 0.00998685458239941, "loss": 3.4269, "step": 341 }, { "crossentropy": 3.499580144882202, "epoch": 0.029091527730520584, "grad_norm": 0.07294967025518417, "grad_norm_var": 0.0004031223635554822, "learning_rate": 0.009986708992478202, "loss": 3.4996, "step": 342 }, { "crossentropy": 3.3696582317352295, "epoch": 0.029176590677101055, "grad_norm": 0.08247341960668564, "grad_norm_var": 0.00030139914374975043, "learning_rate": 0.009986562601835218, "loss": 3.3697, "step": 343 }, { "crossentropy": 3.2916512489318848, "epoch": 0.029261653623681525, "grad_norm": 0.07415365427732468, "grad_norm_var": 0.0002856303815935407, "learning_rate": 0.009986415410493965, "loss": 3.2917, "step": 344 }, { "crossentropy": 3.413924217224121, "epoch": 0.029346716570261992, "grad_norm": 0.0708424523472786, "grad_norm_var": 0.00028740087572565187, "learning_rate": 0.009986267418478079, "loss": 3.4139, "step": 345 }, { "crossentropy": 3.370905876159668, "epoch": 0.029431779516842463, "grad_norm": 0.06906882673501968, "grad_norm_var": 0.00029455671622077086, "learning_rate": 0.00998611862581132, "loss": 3.3709, "step": 346 }, { "crossentropy": 3.4571080207824707, "epoch": 0.029516842463422933, "grad_norm": 0.08012387901544571, "grad_norm_var": 0.0002912611737901691, "learning_rate": 0.009985969032517581, "loss": 3.4571, "step": 347 }, { "crossentropy": 3.2782251834869385, "epoch": 0.029601905410003404, "grad_norm": 0.08949696272611618, "grad_norm_var": 0.0002889821363466917, "learning_rate": 0.009985818638620884, "loss": 3.2782, "step": 348 }, { "crossentropy": 3.3532207012176514, "epoch": 0.02968696835658387, "grad_norm": 0.11449147015810013, "grad_norm_var": 0.00035573464110658993, "learning_rate": 0.009985667444145377, "loss": 3.3532, "step": 349 }, { "crossentropy": 3.4292449951171875, "epoch": 0.029772031303164342, "grad_norm": 0.1435619294643402, "grad_norm_var": 0.000564385760897157, "learning_rate": 0.009985515449115337, "loss": 3.4292, "step": 350 }, { "crossentropy": 3.297581911087036, "epoch": 0.029857094249744812, "grad_norm": 0.11012925207614899, "grad_norm_var": 0.0005657284355766047, "learning_rate": 0.00998536265355517, "loss": 3.2976, "step": 351 }, { "crossentropy": 3.4212913513183594, "epoch": 0.02994215719632528, "grad_norm": 0.06968950480222702, "grad_norm_var": 0.0005849091306486455, "learning_rate": 0.00998520905748941, "loss": 3.4213, "step": 352 }, { "crossentropy": 3.295607805252075, "epoch": 0.03002722014290575, "grad_norm": 0.07366588711738586, "grad_norm_var": 0.0004908860080631384, "learning_rate": 0.00998505466094272, "loss": 3.2956, "step": 353 }, { "crossentropy": 3.419577121734619, "epoch": 0.03011228308948622, "grad_norm": 0.07332951575517654, "grad_norm_var": 0.0004373989270834745, "learning_rate": 0.009984899463939895, "loss": 3.4196, "step": 354 }, { "crossentropy": 3.284982204437256, "epoch": 0.030197346036066688, "grad_norm": 0.0650143250823021, "grad_norm_var": 0.00045307824360321474, "learning_rate": 0.009984743466505852, "loss": 3.285, "step": 355 }, { "crossentropy": 3.4085898399353027, "epoch": 0.03028240898264716, "grad_norm": 0.06718350201845169, "grad_norm_var": 0.00046748620432329724, "learning_rate": 0.00998458666866564, "loss": 3.4086, "step": 356 }, { "crossentropy": 3.365543842315674, "epoch": 0.03036747192922763, "grad_norm": 0.07749520242214203, "grad_norm_var": 0.0004618079938686681, "learning_rate": 0.009984429070444436, "loss": 3.3655, "step": 357 }, { "crossentropy": 3.3450300693511963, "epoch": 0.0304525348758081, "grad_norm": 0.07689516991376877, "grad_norm_var": 0.0004573073794284473, "learning_rate": 0.009984270671867548, "loss": 3.345, "step": 358 }, { "crossentropy": 3.4556262493133545, "epoch": 0.030537597822388567, "grad_norm": 0.072667695581913, "grad_norm_var": 0.00046479104018096277, "learning_rate": 0.009984111472960407, "loss": 3.4556, "step": 359 }, { "crossentropy": 3.42598032951355, "epoch": 0.030622660768969037, "grad_norm": 0.06897289305925369, "grad_norm_var": 0.00047257109594774175, "learning_rate": 0.009983951473748578, "loss": 3.426, "step": 360 }, { "crossentropy": 3.4205896854400635, "epoch": 0.030707723715549508, "grad_norm": 0.06717023998498917, "grad_norm_var": 0.0004792022186195381, "learning_rate": 0.009983790674257751, "loss": 3.4206, "step": 361 }, { "crossentropy": 3.4069275856018066, "epoch": 0.030792786662129975, "grad_norm": 0.06680091470479965, "grad_norm_var": 0.000483565385939506, "learning_rate": 0.009983629074513748, "loss": 3.4069, "step": 362 }, { "crossentropy": 3.393822431564331, "epoch": 0.030877849608710446, "grad_norm": 0.07394885271787643, "grad_norm_var": 0.0004877345052124819, "learning_rate": 0.009983466674542515, "loss": 3.3938, "step": 363 }, { "crossentropy": 3.3875539302825928, "epoch": 0.030962912555290916, "grad_norm": 0.09052139520645142, "grad_norm_var": 0.0004888368059123365, "learning_rate": 0.009983303474370129, "loss": 3.3876, "step": 364 }, { "crossentropy": 3.4004530906677246, "epoch": 0.031047975501871383, "grad_norm": 0.07625739276409149, "grad_norm_var": 0.0004144172992605564, "learning_rate": 0.009983139474022795, "loss": 3.4005, "step": 365 }, { "crossentropy": 3.3034257888793945, "epoch": 0.031133038448451854, "grad_norm": 0.07273928821086884, "grad_norm_var": 0.00012373911278111086, "learning_rate": 0.00998297467352685, "loss": 3.3034, "step": 366 }, { "crossentropy": 3.350733995437622, "epoch": 0.031218101395032324, "grad_norm": 0.0644666999578476, "grad_norm_var": 4.1121149116389126e-05, "learning_rate": 0.009982809072908749, "loss": 3.3507, "step": 367 }, { "crossentropy": 3.291994333267212, "epoch": 0.031303164341612795, "grad_norm": 0.07443951070308685, "grad_norm_var": 4.087726171769941e-05, "learning_rate": 0.009982642672195091, "loss": 3.292, "step": 368 }, { "crossentropy": 3.383469343185425, "epoch": 0.031388227288193266, "grad_norm": 0.11697887629270554, "grad_norm_var": 0.00016429514379914152, "learning_rate": 0.00998247547141259, "loss": 3.3835, "step": 369 }, { "crossentropy": 3.45406436920166, "epoch": 0.03147329023477373, "grad_norm": 0.11302178353071213, "grad_norm_var": 0.000252307053743391, "learning_rate": 0.009982307470588097, "loss": 3.4541, "step": 370 }, { "crossentropy": 3.2305753231048584, "epoch": 0.0315583531813542, "grad_norm": 0.07006682455539703, "grad_norm_var": 0.0002452987824687778, "learning_rate": 0.009982138669748584, "loss": 3.2306, "step": 371 }, { "crossentropy": 3.3380119800567627, "epoch": 0.03164341612793467, "grad_norm": 0.07602626085281372, "grad_norm_var": 0.0002373130698579316, "learning_rate": 0.009981969068921157, "loss": 3.338, "step": 372 }, { "crossentropy": 3.3998255729675293, "epoch": 0.03172847907451514, "grad_norm": 0.06918134540319443, "grad_norm_var": 0.0002429179736088025, "learning_rate": 0.00998179866813305, "loss": 3.3998, "step": 373 }, { "crossentropy": 3.338010549545288, "epoch": 0.03181354202109561, "grad_norm": 0.06526299566030502, "grad_norm_var": 0.0002532971428199765, "learning_rate": 0.009981627467411627, "loss": 3.338, "step": 374 }, { "crossentropy": 3.3464598655700684, "epoch": 0.03189860496767608, "grad_norm": 0.06360694020986557, "grad_norm_var": 0.00026415460861922306, "learning_rate": 0.009981455466784373, "loss": 3.3465, "step": 375 }, { "crossentropy": 3.302626371383667, "epoch": 0.03198366791425655, "grad_norm": 0.07620560377836227, "grad_norm_var": 0.0002598360417636019, "learning_rate": 0.009981282666278908, "loss": 3.3026, "step": 376 }, { "crossentropy": 3.34964919090271, "epoch": 0.032068730860837016, "grad_norm": 0.07659807056188583, "grad_norm_var": 0.00025266599155359277, "learning_rate": 0.009981109065922979, "loss": 3.3496, "step": 377 }, { "crossentropy": 3.326112985610962, "epoch": 0.03215379380741749, "grad_norm": 0.07312722504138947, "grad_norm_var": 0.0002458198276144056, "learning_rate": 0.009980934665744463, "loss": 3.3261, "step": 378 }, { "crossentropy": 3.403752565383911, "epoch": 0.03223885675399796, "grad_norm": 0.06946047395467758, "grad_norm_var": 0.0002496697443237809, "learning_rate": 0.009980759465771361, "loss": 3.4038, "step": 379 }, { "crossentropy": 3.2765955924987793, "epoch": 0.03232391970057843, "grad_norm": 0.06661012768745422, "grad_norm_var": 0.00024547588131238845, "learning_rate": 0.009980583466031808, "loss": 3.2766, "step": 380 }, { "crossentropy": 3.4938790798187256, "epoch": 0.0324089826471589, "grad_norm": 0.07014542818069458, "grad_norm_var": 0.0002480108629001896, "learning_rate": 0.009980406666554061, "loss": 3.4939, "step": 381 }, { "crossentropy": 3.34346342086792, "epoch": 0.03249404559373937, "grad_norm": 0.06900740414857864, "grad_norm_var": 0.00025056403056270893, "learning_rate": 0.009980229067366508, "loss": 3.3435, "step": 382 }, { "crossentropy": 3.3144760131835938, "epoch": 0.03257910854031984, "grad_norm": 0.07004904747009277, "grad_norm_var": 0.00024401078419839864, "learning_rate": 0.009980050668497673, "loss": 3.3145, "step": 383 }, { "crossentropy": 3.409323215484619, "epoch": 0.032664171486900304, "grad_norm": 0.07010595500469208, "grad_norm_var": 0.0002462229710130749, "learning_rate": 0.009979871469976196, "loss": 3.4093, "step": 384 }, { "crossentropy": 3.383810520172119, "epoch": 0.032749234433480774, "grad_norm": 0.07415691763162613, "grad_norm_var": 0.0001266630031615458, "learning_rate": 0.009979691471830851, "loss": 3.3838, "step": 385 }, { "crossentropy": 3.288189649581909, "epoch": 0.032834297380061245, "grad_norm": 0.13169844448566437, "grad_norm_var": 0.00024740622949040263, "learning_rate": 0.009979510674090544, "loss": 3.2882, "step": 386 }, { "crossentropy": 3.2909250259399414, "epoch": 0.032919360326641715, "grad_norm": 0.12848614156246185, "grad_norm_var": 0.0004265125558491937, "learning_rate": 0.009979329076784305, "loss": 3.2909, "step": 387 }, { "crossentropy": 3.23103666305542, "epoch": 0.033004423273222186, "grad_norm": 0.09697467088699341, "grad_norm_var": 0.00044812518188043585, "learning_rate": 0.00997914667994129, "loss": 3.231, "step": 388 }, { "crossentropy": 3.381491184234619, "epoch": 0.033089486219802657, "grad_norm": 0.06638453155755997, "grad_norm_var": 0.0004524311417719723, "learning_rate": 0.009978963483590791, "loss": 3.3815, "step": 389 }, { "crossentropy": 3.374422788619995, "epoch": 0.03317454916638312, "grad_norm": 0.06625670194625854, "grad_norm_var": 0.00045064065487212884, "learning_rate": 0.009978779487762221, "loss": 3.3744, "step": 390 }, { "crossentropy": 3.362058162689209, "epoch": 0.03325961211296359, "grad_norm": 0.06725726276636124, "grad_norm_var": 0.00044383325285436405, "learning_rate": 0.009978594692485125, "loss": 3.3621, "step": 391 }, { "crossentropy": 3.334821939468384, "epoch": 0.03334467505954406, "grad_norm": 0.06450778245925903, "grad_norm_var": 0.0004575750740803436, "learning_rate": 0.009978409097789177, "loss": 3.3348, "step": 392 }, { "crossentropy": 3.274158477783203, "epoch": 0.03342973800612453, "grad_norm": 0.0658506527543068, "grad_norm_var": 0.00046795194688699083, "learning_rate": 0.009978222703704178, "loss": 3.2742, "step": 393 }, { "crossentropy": 3.26657772064209, "epoch": 0.033514800952705, "grad_norm": 0.06524788588285446, "grad_norm_var": 0.0004770879231449252, "learning_rate": 0.009978035510260058, "loss": 3.2666, "step": 394 }, { "crossentropy": 3.453578233718872, "epoch": 0.03359986389928547, "grad_norm": 0.06993375718593597, "grad_norm_var": 0.000476585918638465, "learning_rate": 0.00997784751748687, "loss": 3.4536, "step": 395 }, { "crossentropy": 3.2534966468811035, "epoch": 0.033684926845865944, "grad_norm": 0.07359013706445694, "grad_norm_var": 0.00046934063410490314, "learning_rate": 0.009977658725414808, "loss": 3.2535, "step": 396 }, { "crossentropy": 3.3113813400268555, "epoch": 0.03376998979244641, "grad_norm": 0.06386815011501312, "grad_norm_var": 0.00047846389694578527, "learning_rate": 0.009977469134074182, "loss": 3.3114, "step": 397 }, { "crossentropy": 3.319552421569824, "epoch": 0.03385505273902688, "grad_norm": 0.06237108260393143, "grad_norm_var": 0.0004889177286209243, "learning_rate": 0.009977278743495435, "loss": 3.3196, "step": 398 }, { "crossentropy": 3.2209362983703613, "epoch": 0.03394011568560735, "grad_norm": 0.06902694702148438, "grad_norm_var": 0.0004899706634144571, "learning_rate": 0.009977087553709137, "loss": 3.2209, "step": 399 }, { "crossentropy": 3.3014280796051025, "epoch": 0.03402517863218782, "grad_norm": 0.07511848211288452, "grad_norm_var": 0.00048677819377383074, "learning_rate": 0.00997689556474599, "loss": 3.3014, "step": 400 }, { "crossentropy": 3.351083517074585, "epoch": 0.03411024157876829, "grad_norm": 0.07612574845552444, "grad_norm_var": 0.0004861308974123029, "learning_rate": 0.009976702776636823, "loss": 3.3511, "step": 401 }, { "crossentropy": 3.251296043395996, "epoch": 0.03419530452534876, "grad_norm": 0.07960160821676254, "grad_norm_var": 0.0002804567291335112, "learning_rate": 0.00997650918941259, "loss": 3.2513, "step": 402 }, { "crossentropy": 3.358945608139038, "epoch": 0.03428036747192923, "grad_norm": 0.10914386063814163, "grad_norm_var": 0.00016438537514334046, "learning_rate": 0.009976314803104376, "loss": 3.3589, "step": 403 }, { "crossentropy": 3.347078800201416, "epoch": 0.034365430418509695, "grad_norm": 0.0863305851817131, "grad_norm_var": 0.0001377303821221291, "learning_rate": 0.009976119617743393, "loss": 3.3471, "step": 404 }, { "crossentropy": 3.2810821533203125, "epoch": 0.034450493365090165, "grad_norm": 0.07267109304666519, "grad_norm_var": 0.00013504217194953242, "learning_rate": 0.009975923633360985, "loss": 3.2811, "step": 405 }, { "crossentropy": 3.3274447917938232, "epoch": 0.034535556311670636, "grad_norm": 0.06269153952598572, "grad_norm_var": 0.0001390094022304907, "learning_rate": 0.009975726849988618, "loss": 3.3274, "step": 406 }, { "crossentropy": 3.3019955158233643, "epoch": 0.034620619258251106, "grad_norm": 0.060417987406253815, "grad_norm_var": 0.00014690391713539547, "learning_rate": 0.009975529267657891, "loss": 3.302, "step": 407 }, { "crossentropy": 3.4511733055114746, "epoch": 0.03470568220483158, "grad_norm": 0.07773897796869278, "grad_norm_var": 0.00014413211244423385, "learning_rate": 0.00997533088640053, "loss": 3.4512, "step": 408 }, { "crossentropy": 3.3955347537994385, "epoch": 0.03479074515141205, "grad_norm": 0.08763832598924637, "grad_norm_var": 0.00015271818022043448, "learning_rate": 0.00997513170624839, "loss": 3.3955, "step": 409 }, { "crossentropy": 3.324408531188965, "epoch": 0.03487580809799251, "grad_norm": 0.07555755972862244, "grad_norm_var": 0.000146684663503537, "learning_rate": 0.009974931727233453, "loss": 3.3244, "step": 410 }, { "crossentropy": 3.4177472591400146, "epoch": 0.03496087104457298, "grad_norm": 0.08061745762825012, "grad_norm_var": 0.00014643911877260107, "learning_rate": 0.009974730949387832, "loss": 3.4177, "step": 411 }, { "crossentropy": 3.276197671890259, "epoch": 0.03504593399115345, "grad_norm": 0.06572733074426651, "grad_norm_var": 0.00015260083296303462, "learning_rate": 0.00997452937274376, "loss": 3.2762, "step": 412 }, { "crossentropy": 3.2938344478607178, "epoch": 0.03513099693773392, "grad_norm": 0.0728665441274643, "grad_norm_var": 0.00014395724716748104, "learning_rate": 0.009974326997333614, "loss": 3.2938, "step": 413 }, { "crossentropy": 3.319789171218872, "epoch": 0.035216059884314393, "grad_norm": 0.07059314101934433, "grad_norm_var": 0.00013340270242027385, "learning_rate": 0.00997412382318988, "loss": 3.3198, "step": 414 }, { "crossentropy": 3.2129480838775635, "epoch": 0.035301122830894864, "grad_norm": 0.07249385118484497, "grad_norm_var": 0.00013076108741882827, "learning_rate": 0.009973919850345188, "loss": 3.2129, "step": 415 }, { "crossentropy": 3.3157753944396973, "epoch": 0.035386185777475335, "grad_norm": 0.08282238245010376, "grad_norm_var": 0.00013296574296768986, "learning_rate": 0.009973715078832287, "loss": 3.3158, "step": 416 }, { "crossentropy": 3.244736671447754, "epoch": 0.0354712487240558, "grad_norm": 0.08251158893108368, "grad_norm_var": 0.00013471481326155475, "learning_rate": 0.00997350950868406, "loss": 3.2447, "step": 417 }, { "crossentropy": 3.171665668487549, "epoch": 0.03555631167063627, "grad_norm": 0.07086639851331711, "grad_norm_var": 0.00013699413339260922, "learning_rate": 0.009973303139933512, "loss": 3.1717, "step": 418 }, { "crossentropy": 3.3177998065948486, "epoch": 0.03564137461721674, "grad_norm": 0.07029038667678833, "grad_norm_var": 6.439897792880253e-05, "learning_rate": 0.009973095972613784, "loss": 3.3178, "step": 419 }, { "crossentropy": 3.311358690261841, "epoch": 0.03572643756379721, "grad_norm": 0.08174002915620804, "grad_norm_var": 5.8468551733955225e-05, "learning_rate": 0.00997288800675814, "loss": 3.3114, "step": 420 }, { "crossentropy": 3.251302480697632, "epoch": 0.03581150051037768, "grad_norm": 0.07124411314725876, "grad_norm_var": 5.8887244933768695e-05, "learning_rate": 0.009972679242399972, "loss": 3.2513, "step": 421 }, { "crossentropy": 3.236875534057617, "epoch": 0.03589656345695815, "grad_norm": 0.06335081160068512, "grad_norm_var": 5.791037710090781e-05, "learning_rate": 0.009972469679572802, "loss": 3.2369, "step": 422 }, { "crossentropy": 3.2707536220550537, "epoch": 0.03598162640353862, "grad_norm": 0.06215329468250275, "grad_norm_var": 5.4920236116082906e-05, "learning_rate": 0.00997225931831028, "loss": 3.2708, "step": 423 }, { "crossentropy": 3.3169848918914795, "epoch": 0.036066689350119085, "grad_norm": 0.06338908523321152, "grad_norm_var": 6.114004663563428e-05, "learning_rate": 0.009972048158646184, "loss": 3.317, "step": 424 }, { "crossentropy": 3.222294807434082, "epoch": 0.036151752296699556, "grad_norm": 0.07373842597007751, "grad_norm_var": 4.676504051964273e-05, "learning_rate": 0.009971836200614419, "loss": 3.2223, "step": 425 }, { "crossentropy": 3.206998348236084, "epoch": 0.03623681524328003, "grad_norm": 0.09200314432382584, "grad_norm_var": 7.037821927596439e-05, "learning_rate": 0.00997162344424902, "loss": 3.207, "step": 426 }, { "crossentropy": 3.2682130336761475, "epoch": 0.0363218781898605, "grad_norm": 0.07542216777801514, "grad_norm_var": 6.715252170140587e-05, "learning_rate": 0.009971409889584152, "loss": 3.2682, "step": 427 }, { "crossentropy": 3.29677414894104, "epoch": 0.03640694113644097, "grad_norm": 0.08125858008861542, "grad_norm_var": 6.67524582616763e-05, "learning_rate": 0.009971195536654102, "loss": 3.2968, "step": 428 }, { "crossentropy": 3.385305881500244, "epoch": 0.03649200408302144, "grad_norm": 0.08763643354177475, "grad_norm_var": 7.781694327044763e-05, "learning_rate": 0.009970980385493291, "loss": 3.3853, "step": 429 }, { "crossentropy": 3.2255959510803223, "epoch": 0.0365770670296019, "grad_norm": 0.07030566781759262, "grad_norm_var": 7.799464872777102e-05, "learning_rate": 0.009970764436136266, "loss": 3.2256, "step": 430 }, { "crossentropy": 3.288771867752075, "epoch": 0.03666212997618237, "grad_norm": 0.06679102778434753, "grad_norm_var": 8.199118345290241e-05, "learning_rate": 0.0099705476886177, "loss": 3.2888, "step": 431 }, { "crossentropy": 3.187129259109497, "epoch": 0.03674719292276284, "grad_norm": 0.07058535516262054, "grad_norm_var": 7.813071982761983e-05, "learning_rate": 0.0099703301429724, "loss": 3.1871, "step": 432 }, { "crossentropy": 3.339808225631714, "epoch": 0.036832255869343314, "grad_norm": 0.07832010835409164, "grad_norm_var": 7.444700889423691e-05, "learning_rate": 0.009970111799235298, "loss": 3.3398, "step": 433 }, { "crossentropy": 3.1441144943237305, "epoch": 0.036917318815923784, "grad_norm": 0.06640251725912094, "grad_norm_var": 7.737500858964555e-05, "learning_rate": 0.009969892657441448, "loss": 3.1441, "step": 434 }, { "crossentropy": 3.2620484828948975, "epoch": 0.037002381762504255, "grad_norm": 0.09616348892450333, "grad_norm_var": 0.00010843638045004595, "learning_rate": 0.009969672717626044, "loss": 3.262, "step": 435 }, { "crossentropy": 3.296464204788208, "epoch": 0.037087444709084726, "grad_norm": 0.09477391839027405, "grad_norm_var": 0.00013071242511799956, "learning_rate": 0.009969451979824398, "loss": 3.2965, "step": 436 }, { "crossentropy": 3.211941957473755, "epoch": 0.03717250765566519, "grad_norm": 0.08555970340967178, "grad_norm_var": 0.00013473684738569597, "learning_rate": 0.009969230444071955, "loss": 3.2119, "step": 437 }, { "crossentropy": 3.282749891281128, "epoch": 0.03725757060224566, "grad_norm": 0.08181913197040558, "grad_norm_var": 0.00012308205338601718, "learning_rate": 0.00996900811040429, "loss": 3.2827, "step": 438 }, { "crossentropy": 3.184140682220459, "epoch": 0.03734263354882613, "grad_norm": 0.05656908452510834, "grad_norm_var": 0.00013675177693819225, "learning_rate": 0.0099687849788571, "loss": 3.1841, "step": 439 }, { "crossentropy": 3.2615277767181396, "epoch": 0.0374276964954066, "grad_norm": 0.06240234896540642, "grad_norm_var": 0.00013867519726683627, "learning_rate": 0.009968561049466213, "loss": 3.2615, "step": 440 }, { "crossentropy": 3.237208127975464, "epoch": 0.03751275944198707, "grad_norm": 0.06648119539022446, "grad_norm_var": 0.00014559167172672002, "learning_rate": 0.009968336322267589, "loss": 3.2372, "step": 441 }, { "crossentropy": 3.252429962158203, "epoch": 0.03759782238856754, "grad_norm": 0.08408305794000626, "grad_norm_var": 0.00013370126005612468, "learning_rate": 0.00996811079729731, "loss": 3.2524, "step": 442 }, { "crossentropy": 3.352821111679077, "epoch": 0.03768288533514801, "grad_norm": 0.07029247283935547, "grad_norm_var": 0.00013610759203901696, "learning_rate": 0.009967884474591591, "loss": 3.3528, "step": 443 }, { "crossentropy": 3.2392635345458984, "epoch": 0.037767948281728476, "grad_norm": 0.07958119362592697, "grad_norm_var": 0.00013515549643182962, "learning_rate": 0.009967657354186771, "loss": 3.2393, "step": 444 }, { "crossentropy": 3.2546894550323486, "epoch": 0.03785301122830895, "grad_norm": 0.08290807157754898, "grad_norm_var": 0.0001292862786012996, "learning_rate": 0.00996742943611932, "loss": 3.2547, "step": 445 }, { "crossentropy": 3.2304694652557373, "epoch": 0.03793807417488942, "grad_norm": 0.07602987438440323, "grad_norm_var": 0.0001271293923386429, "learning_rate": 0.009967200720425836, "loss": 3.2305, "step": 446 }, { "crossentropy": 3.15134859085083, "epoch": 0.03802313712146989, "grad_norm": 0.08537574857473373, "grad_norm_var": 0.00012546904886653881, "learning_rate": 0.00996697120714304, "loss": 3.1513, "step": 447 }, { "crossentropy": 3.3129382133483887, "epoch": 0.03810820006805036, "grad_norm": 0.08384397625923157, "grad_norm_var": 0.000124525263704088, "learning_rate": 0.009966740896307792, "loss": 3.3129, "step": 448 }, { "crossentropy": 3.263798475265503, "epoch": 0.03819326301463083, "grad_norm": 0.07271149009466171, "grad_norm_var": 0.00012637371451687754, "learning_rate": 0.009966509787957066, "loss": 3.2638, "step": 449 }, { "crossentropy": 3.232168674468994, "epoch": 0.03827832596121129, "grad_norm": 0.070599265396595, "grad_norm_var": 0.00012108996010600683, "learning_rate": 0.009966277882127976, "loss": 3.2322, "step": 450 }, { "crossentropy": 3.364875078201294, "epoch": 0.038363388907791764, "grad_norm": 0.08756887912750244, "grad_norm_var": 0.00010497777210159934, "learning_rate": 0.009966045178857758, "loss": 3.3649, "step": 451 }, { "crossentropy": 3.289454936981201, "epoch": 0.038448451854372234, "grad_norm": 0.08340272307395935, "grad_norm_var": 8.692606426738732e-05, "learning_rate": 0.009965811678183776, "loss": 3.2895, "step": 452 }, { "crossentropy": 3.2192764282226562, "epoch": 0.038533514800952705, "grad_norm": 0.0756022110581398, "grad_norm_var": 8.152861841962348e-05, "learning_rate": 0.00996557738014353, "loss": 3.2193, "step": 453 }, { "crossentropy": 3.2207489013671875, "epoch": 0.038618577747533175, "grad_norm": 0.06802979856729507, "grad_norm_var": 8.308964149330156e-05, "learning_rate": 0.009965342284774633, "loss": 3.2207, "step": 454 }, { "crossentropy": 3.2457406520843506, "epoch": 0.038703640694113646, "grad_norm": 0.061204854398965836, "grad_norm_var": 7.282883753503463e-05, "learning_rate": 0.009965106392114838, "loss": 3.2457, "step": 455 }, { "crossentropy": 3.2661633491516113, "epoch": 0.03878870364069412, "grad_norm": 0.058162834495306015, "grad_norm_var": 8.14306688291235e-05, "learning_rate": 0.009964869702202022, "loss": 3.2662, "step": 456 }, { "crossentropy": 3.298088788986206, "epoch": 0.03887376658727458, "grad_norm": 0.08778776973485947, "grad_norm_var": 8.455932390202722e-05, "learning_rate": 0.009964632215074194, "loss": 3.2981, "step": 457 }, { "crossentropy": 3.3108150959014893, "epoch": 0.03895882953385505, "grad_norm": 0.0728469267487526, "grad_norm_var": 8.138757736996701e-05, "learning_rate": 0.009964393930769483, "loss": 3.3108, "step": 458 }, { "crossentropy": 3.207704782485962, "epoch": 0.03904389248043552, "grad_norm": 0.06788784265518188, "grad_norm_var": 8.357785983966471e-05, "learning_rate": 0.009964154849326152, "loss": 3.2077, "step": 459 }, { "crossentropy": 3.35791015625, "epoch": 0.03912895542701599, "grad_norm": 0.06378915160894394, "grad_norm_var": 9.130078386575258e-05, "learning_rate": 0.009963914970782592, "loss": 3.3579, "step": 460 }, { "crossentropy": 3.253526210784912, "epoch": 0.03921401837359646, "grad_norm": 0.060169726610183716, "grad_norm_var": 9.921370134284637e-05, "learning_rate": 0.00996367429517732, "loss": 3.2535, "step": 461 }, { "crossentropy": 3.3017032146453857, "epoch": 0.03929908132017693, "grad_norm": 0.11120190471410751, "grad_norm_var": 0.00018868406144632936, "learning_rate": 0.009963432822548982, "loss": 3.3017, "step": 462 }, { "crossentropy": 3.252845525741577, "epoch": 0.039384144266757404, "grad_norm": 0.09739701449871063, "grad_norm_var": 0.00021332629102547757, "learning_rate": 0.00996319055293635, "loss": 3.2528, "step": 463 }, { "crossentropy": 3.298283576965332, "epoch": 0.03946920721333787, "grad_norm": 0.06843919306993484, "grad_norm_var": 0.00021284343102516862, "learning_rate": 0.009962947486378325, "loss": 3.2983, "step": 464 }, { "crossentropy": 3.301518440246582, "epoch": 0.03955427015991834, "grad_norm": 0.055471021682024, "grad_norm_var": 0.0002376583925499169, "learning_rate": 0.009962703622913939, "loss": 3.3015, "step": 465 }, { "crossentropy": 3.127671241760254, "epoch": 0.03963933310649881, "grad_norm": 0.06733681261539459, "grad_norm_var": 0.00023995410626843344, "learning_rate": 0.009962458962582348, "loss": 3.1277, "step": 466 }, { "crossentropy": 3.1937012672424316, "epoch": 0.03972439605307928, "grad_norm": 0.07104530930519104, "grad_norm_var": 0.0002274407204081063, "learning_rate": 0.009962213505422837, "loss": 3.1937, "step": 467 }, { "crossentropy": 3.2647409439086914, "epoch": 0.03980945899965975, "grad_norm": 0.06284224987030029, "grad_norm_var": 0.00022564768860311717, "learning_rate": 0.009961967251474822, "loss": 3.2647, "step": 468 }, { "crossentropy": 3.290968418121338, "epoch": 0.03989452194624022, "grad_norm": 0.06552805751562119, "grad_norm_var": 0.00022691832448620407, "learning_rate": 0.009961720200777839, "loss": 3.291, "step": 469 }, { "crossentropy": 3.166895627975464, "epoch": 0.039979584892820684, "grad_norm": 0.06154194474220276, "grad_norm_var": 0.00023228824511917947, "learning_rate": 0.009961472353371564, "loss": 3.1669, "step": 470 }, { "crossentropy": 3.2207999229431152, "epoch": 0.040064647839401155, "grad_norm": 0.07067424058914185, "grad_norm_var": 0.0002257895199993698, "learning_rate": 0.009961223709295789, "loss": 3.2208, "step": 471 }, { "crossentropy": 3.2845070362091064, "epoch": 0.040149710785981625, "grad_norm": 0.0704517588019371, "grad_norm_var": 0.00021356718975950773, "learning_rate": 0.009960974268590439, "loss": 3.2845, "step": 472 }, { "crossentropy": 3.2071573734283447, "epoch": 0.040234773732562096, "grad_norm": 0.07283949106931686, "grad_norm_var": 0.00019636654171030862, "learning_rate": 0.009960724031295571, "loss": 3.2072, "step": 473 }, { "crossentropy": 3.277452230453491, "epoch": 0.040319836679142566, "grad_norm": 0.06233981251716614, "grad_norm_var": 0.0002009822447430332, "learning_rate": 0.009960472997451363, "loss": 3.2775, "step": 474 }, { "crossentropy": 3.242330312728882, "epoch": 0.04040489962572304, "grad_norm": 0.05847366899251938, "grad_norm_var": 0.00020987521459390427, "learning_rate": 0.009960221167098124, "loss": 3.2423, "step": 475 }, { "crossentropy": 3.2468063831329346, "epoch": 0.04048996257230351, "grad_norm": 0.06457792967557907, "grad_norm_var": 0.00020926391752808105, "learning_rate": 0.00995996854027629, "loss": 3.2468, "step": 476 }, { "crossentropy": 3.240889549255371, "epoch": 0.04057502551888397, "grad_norm": 0.0626453086733818, "grad_norm_var": 0.00020639538539497971, "learning_rate": 0.00995971511702643, "loss": 3.2409, "step": 477 }, { "crossentropy": 3.2554574012756348, "epoch": 0.04066008846546444, "grad_norm": 0.06615982204675674, "grad_norm_var": 8.680522630086548e-05, "learning_rate": 0.00995946089738923, "loss": 3.2555, "step": 478 }, { "crossentropy": 3.3518810272216797, "epoch": 0.04074515141204491, "grad_norm": 0.0676303282380104, "grad_norm_var": 2.2970952260176608e-05, "learning_rate": 0.009959205881405515, "loss": 3.3519, "step": 479 }, { "crossentropy": 3.2612438201904297, "epoch": 0.04083021435862538, "grad_norm": 0.07734574377536774, "grad_norm_var": 3.141950363570959e-05, "learning_rate": 0.009958950069116231, "loss": 3.2612, "step": 480 }, { "crossentropy": 3.1188580989837646, "epoch": 0.040915277305205854, "grad_norm": 0.07276030629873276, "grad_norm_var": 2.5699989492609237e-05, "learning_rate": 0.009958693460562454, "loss": 3.1189, "step": 481 }, { "crossentropy": 3.2721521854400635, "epoch": 0.041000340251786324, "grad_norm": 0.061666227877140045, "grad_norm_var": 2.7558673363724713e-05, "learning_rate": 0.009958436055785391, "loss": 3.2722, "step": 482 }, { "crossentropy": 3.2876710891723633, "epoch": 0.041085403198366795, "grad_norm": 0.0607600212097168, "grad_norm_var": 2.8324662636779647e-05, "learning_rate": 0.00995817785482637, "loss": 3.2877, "step": 483 }, { "crossentropy": 3.1682329177856445, "epoch": 0.04117046614494726, "grad_norm": 0.06517985463142395, "grad_norm_var": 2.7638402690539448e-05, "learning_rate": 0.009957918857726853, "loss": 3.1682, "step": 484 }, { "crossentropy": 3.2736830711364746, "epoch": 0.04125552909152773, "grad_norm": 0.07542437314987183, "grad_norm_var": 3.275947972576516e-05, "learning_rate": 0.009957659064528427, "loss": 3.2737, "step": 485 }, { "crossentropy": 3.2302584648132324, "epoch": 0.0413405920381082, "grad_norm": 0.06469649821519852, "grad_norm_var": 3.112593192336662e-05, "learning_rate": 0.009957398475272805, "loss": 3.2303, "step": 486 }, { "crossentropy": 3.273022413253784, "epoch": 0.04142565498468867, "grad_norm": 0.06303127110004425, "grad_norm_var": 3.11361102889605e-05, "learning_rate": 0.009957137090001834, "loss": 3.273, "step": 487 }, { "crossentropy": 3.18406081199646, "epoch": 0.04151071793126914, "grad_norm": 0.06285669654607773, "grad_norm_var": 3.086504519811313e-05, "learning_rate": 0.00995687490875748, "loss": 3.1841, "step": 488 }, { "crossentropy": 3.2978570461273193, "epoch": 0.04159578087784961, "grad_norm": 0.07782544195652008, "grad_norm_var": 3.6866432131759984e-05, "learning_rate": 0.009956611931581849, "loss": 3.2979, "step": 489 }, { "crossentropy": 3.2733867168426514, "epoch": 0.041680843824430075, "grad_norm": 0.09131862968206406, "grad_norm_var": 7.342920476264281e-05, "learning_rate": 0.009956348158517161, "loss": 3.2734, "step": 490 }, { "crossentropy": 3.24580979347229, "epoch": 0.041765906771010546, "grad_norm": 0.07685400545597076, "grad_norm_var": 7.05311013376432e-05, "learning_rate": 0.009956083589605771, "loss": 3.2458, "step": 491 }, { "crossentropy": 3.237799644470215, "epoch": 0.041850969717591016, "grad_norm": 0.08131671696901321, "grad_norm_var": 7.723433297240322e-05, "learning_rate": 0.009955818224890165, "loss": 3.2378, "step": 492 }, { "crossentropy": 3.0794951915740967, "epoch": 0.04193603266417149, "grad_norm": 0.06748141348361969, "grad_norm_var": 7.365257185159753e-05, "learning_rate": 0.009955552064412952, "loss": 3.0795, "step": 493 }, { "crossentropy": 3.3270926475524902, "epoch": 0.04202109561075196, "grad_norm": 0.059737566858530045, "grad_norm_var": 8.01774285498102e-05, "learning_rate": 0.009955285108216866, "loss": 3.3271, "step": 494 }, { "crossentropy": 3.2674427032470703, "epoch": 0.04210615855733243, "grad_norm": 0.05789000168442726, "grad_norm_var": 8.966225763580467e-05, "learning_rate": 0.009955017356344775, "loss": 3.2674, "step": 495 }, { "crossentropy": 3.219116687774658, "epoch": 0.0421912215039129, "grad_norm": 0.06324418634176254, "grad_norm_var": 8.78260643748056e-05, "learning_rate": 0.009954748808839673, "loss": 3.2191, "step": 496 }, { "crossentropy": 3.2286291122436523, "epoch": 0.04227628445049336, "grad_norm": 0.06821280717849731, "grad_norm_var": 8.676439591093557e-05, "learning_rate": 0.009954479465744681, "loss": 3.2286, "step": 497 }, { "crossentropy": 3.242824077606201, "epoch": 0.04236134739707383, "grad_norm": 0.0724756121635437, "grad_norm_var": 8.408315658920788e-05, "learning_rate": 0.009954209327103045, "loss": 3.2428, "step": 498 }, { "crossentropy": 3.1747167110443115, "epoch": 0.0424464103436543, "grad_norm": 0.08688803017139435, "grad_norm_var": 9.710694787424164e-05, "learning_rate": 0.009953938392958146, "loss": 3.1747, "step": 499 }, { "crossentropy": 3.242832660675049, "epoch": 0.042531473290234774, "grad_norm": 0.08735410869121552, "grad_norm_var": 0.00011091993266595194, "learning_rate": 0.009953666663353483, "loss": 3.2428, "step": 500 }, { "crossentropy": 3.230050802230835, "epoch": 0.042616536236815244, "grad_norm": 0.06767909228801727, "grad_norm_var": 0.0001114302818540867, "learning_rate": 0.009953394138332692, "loss": 3.2301, "step": 501 }, { "crossentropy": 3.270430088043213, "epoch": 0.042701599183395715, "grad_norm": 0.060683827847242355, "grad_norm_var": 0.00011623923773532204, "learning_rate": 0.00995312081793953, "loss": 3.2704, "step": 502 }, { "crossentropy": 3.197721004486084, "epoch": 0.042786662129976186, "grad_norm": 0.06282982975244522, "grad_norm_var": 0.00011647066007026552, "learning_rate": 0.009952846702217886, "loss": 3.1977, "step": 503 }, { "crossentropy": 3.2025225162506104, "epoch": 0.04287172507655665, "grad_norm": 0.06468215584754944, "grad_norm_var": 0.00011456533878674793, "learning_rate": 0.009952571791211775, "loss": 3.2025, "step": 504 }, { "crossentropy": 3.1899027824401855, "epoch": 0.04295678802313712, "grad_norm": 0.06685823947191238, "grad_norm_var": 0.00011305921046607884, "learning_rate": 0.00995229608496534, "loss": 3.1899, "step": 505 }, { "crossentropy": 3.1430914402008057, "epoch": 0.04304185096971759, "grad_norm": 0.08816234022378922, "grad_norm_var": 0.00010511799460349667, "learning_rate": 0.00995201958352285, "loss": 3.1431, "step": 506 }, { "crossentropy": 3.2408804893493652, "epoch": 0.04312691391629806, "grad_norm": 0.09276600927114487, "grad_norm_var": 0.00013384634595591059, "learning_rate": 0.009951742286928704, "loss": 3.2409, "step": 507 }, { "crossentropy": 3.2368481159210205, "epoch": 0.04321197686287853, "grad_norm": 0.08149556070566177, "grad_norm_var": 0.000134076080961451, "learning_rate": 0.009951464195227428, "loss": 3.2368, "step": 508 }, { "crossentropy": 3.0374162197113037, "epoch": 0.043297039809459, "grad_norm": 0.0678931176662445, "grad_norm_var": 0.0001338508431444243, "learning_rate": 0.009951185308463676, "loss": 3.0374, "step": 509 }, { "crossentropy": 3.1237709522247314, "epoch": 0.043382102756039466, "grad_norm": 0.07242826372385025, "grad_norm_var": 0.00012350039644993608, "learning_rate": 0.009950905626682228, "loss": 3.1238, "step": 510 }, { "crossentropy": 3.222780704498291, "epoch": 0.043467165702619937, "grad_norm": 0.0723910853266716, "grad_norm_var": 0.00010820839606182024, "learning_rate": 0.009950625149927995, "loss": 3.2228, "step": 511 }, { "crossentropy": 3.2027032375335693, "epoch": 0.04355222864920041, "grad_norm": 0.06978543102741241, "grad_norm_var": 0.00010193545415125472, "learning_rate": 0.00995034387824601, "loss": 3.2027, "step": 512 }, { "crossentropy": 3.315277576446533, "epoch": 0.04363729159578088, "grad_norm": 0.07377910614013672, "grad_norm_var": 9.964244724587524e-05, "learning_rate": 0.009950061811681439, "loss": 3.3153, "step": 513 }, { "crossentropy": 3.245925188064575, "epoch": 0.04372235454236135, "grad_norm": 0.08683416247367859, "grad_norm_var": 0.00010911276211389148, "learning_rate": 0.009949778950279575, "loss": 3.2459, "step": 514 }, { "crossentropy": 3.1223058700561523, "epoch": 0.04380741748894182, "grad_norm": 0.07218457013368607, "grad_norm_var": 9.962631370096132e-05, "learning_rate": 0.009949495294085835, "loss": 3.1223, "step": 515 }, { "crossentropy": 3.1825337409973145, "epoch": 0.04389248043552229, "grad_norm": 0.06541673094034195, "grad_norm_var": 9.133974249708228e-05, "learning_rate": 0.009949210843145768, "loss": 3.1825, "step": 516 }, { "crossentropy": 3.0967025756835938, "epoch": 0.04397754338210275, "grad_norm": 0.06675159186124802, "grad_norm_var": 9.203506096471335e-05, "learning_rate": 0.009948925597505048, "loss": 3.0967, "step": 517 }, { "crossentropy": 3.069472074508667, "epoch": 0.044062606328683224, "grad_norm": 0.08564745634794235, "grad_norm_var": 9.06259550238669e-05, "learning_rate": 0.009948639557209479, "loss": 3.0695, "step": 518 }, { "crossentropy": 3.1244266033172607, "epoch": 0.044147669275263694, "grad_norm": 0.07061419636011124, "grad_norm_var": 8.243643643984754e-05, "learning_rate": 0.009948352722304986, "loss": 3.1244, "step": 519 }, { "crossentropy": 3.202604055404663, "epoch": 0.044232732221844165, "grad_norm": 0.06146254390478134, "grad_norm_var": 8.745158876400557e-05, "learning_rate": 0.00994806509283763, "loss": 3.2026, "step": 520 }, { "crossentropy": 3.213679075241089, "epoch": 0.044317795168424635, "grad_norm": 0.06100878864526749, "grad_norm_var": 9.56705280745281e-05, "learning_rate": 0.009947776668853596, "loss": 3.2137, "step": 521 }, { "crossentropy": 3.2263100147247314, "epoch": 0.044402858115005106, "grad_norm": 0.06137204170227051, "grad_norm_var": 9.097123066671408e-05, "learning_rate": 0.009947487450399195, "loss": 3.2263, "step": 522 }, { "crossentropy": 3.1594810485839844, "epoch": 0.04448792106158558, "grad_norm": 0.06877820938825607, "grad_norm_var": 6.248232143618003e-05, "learning_rate": 0.009947197437520868, "loss": 3.1595, "step": 523 }, { "crossentropy": 3.1102406978607178, "epoch": 0.04457298400816604, "grad_norm": 0.06512235850095749, "grad_norm_var": 5.657608464380547e-05, "learning_rate": 0.009946906630265184, "loss": 3.1102, "step": 524 }, { "crossentropy": 3.203032970428467, "epoch": 0.04465804695474651, "grad_norm": 0.06988143175840378, "grad_norm_var": 5.624026807355224e-05, "learning_rate": 0.009946615028678836, "loss": 3.203, "step": 525 }, { "crossentropy": 3.2757935523986816, "epoch": 0.04474310990132698, "grad_norm": 0.06065011024475098, "grad_norm_var": 6.143658324597895e-05, "learning_rate": 0.009946322632808648, "loss": 3.2758, "step": 526 }, { "crossentropy": 3.258782386779785, "epoch": 0.04482817284790745, "grad_norm": 0.060153644531965256, "grad_norm_var": 6.60463512691486e-05, "learning_rate": 0.00994602944270157, "loss": 3.2588, "step": 527 }, { "crossentropy": 3.1232731342315674, "epoch": 0.04491323579448792, "grad_norm": 0.05896924436092377, "grad_norm_var": 7.181470271366974e-05, "learning_rate": 0.009945735458404681, "loss": 3.1233, "step": 528 }, { "crossentropy": 3.166743755340576, "epoch": 0.04499829874106839, "grad_norm": 0.05763143673539162, "grad_norm_var": 7.575312057170233e-05, "learning_rate": 0.009945440679965185, "loss": 3.1667, "step": 529 }, { "crossentropy": 3.1591875553131104, "epoch": 0.04508336168764886, "grad_norm": 0.07633521407842636, "grad_norm_var": 5.491918509497017e-05, "learning_rate": 0.009945145107430416, "loss": 3.1592, "step": 530 }, { "crossentropy": 3.243839979171753, "epoch": 0.04516842463422933, "grad_norm": 0.07514523714780807, "grad_norm_var": 5.776089633883185e-05, "learning_rate": 0.009944848740847834, "loss": 3.2438, "step": 531 }, { "crossentropy": 3.1444246768951416, "epoch": 0.0452534875808098, "grad_norm": 0.09499816596508026, "grad_norm_var": 0.0001079478274878772, "learning_rate": 0.009944551580265025, "loss": 3.1444, "step": 532 }, { "crossentropy": 3.224369764328003, "epoch": 0.04533855052739027, "grad_norm": 0.0732896700501442, "grad_norm_var": 0.00010917586298168543, "learning_rate": 0.009944253625729708, "loss": 3.2244, "step": 533 }, { "crossentropy": 3.196533203125, "epoch": 0.04542361347397074, "grad_norm": 0.05922657623887062, "grad_norm_var": 9.351203481769981e-05, "learning_rate": 0.009943954877289723, "loss": 3.1965, "step": 534 }, { "crossentropy": 3.1770877838134766, "epoch": 0.04550867642055121, "grad_norm": 0.05596734583377838, "grad_norm_var": 0.00010018405443929784, "learning_rate": 0.009943655334993043, "loss": 3.1771, "step": 535 }, { "crossentropy": 3.153810977935791, "epoch": 0.04559373936713168, "grad_norm": 0.06393677741289139, "grad_norm_var": 9.898746222185417e-05, "learning_rate": 0.009943354998887762, "loss": 3.1538, "step": 536 }, { "crossentropy": 3.19108247756958, "epoch": 0.045678802313712144, "grad_norm": 0.07549943029880524, "grad_norm_var": 0.00010168684876143969, "learning_rate": 0.00994305386902211, "loss": 3.1911, "step": 537 }, { "crossentropy": 3.2152020931243896, "epoch": 0.045763865260292615, "grad_norm": 0.06720729917287827, "grad_norm_var": 9.919520374226243e-05, "learning_rate": 0.009942751945444438, "loss": 3.2152, "step": 538 }, { "crossentropy": 3.1665217876434326, "epoch": 0.045848928206873085, "grad_norm": 0.06663482636213303, "grad_norm_var": 9.916691419785169e-05, "learning_rate": 0.009942449228203223, "loss": 3.1665, "step": 539 }, { "crossentropy": 3.115779399871826, "epoch": 0.045933991153453556, "grad_norm": 0.05426739156246185, "grad_norm_var": 0.00011003122407121409, "learning_rate": 0.009942145717347077, "loss": 3.1158, "step": 540 }, { "crossentropy": 3.27826189994812, "epoch": 0.046019054100034026, "grad_norm": 0.06319809705018997, "grad_norm_var": 0.00011013235985266872, "learning_rate": 0.009941841412924732, "loss": 3.2783, "step": 541 }, { "crossentropy": 3.138817548751831, "epoch": 0.0461041170466145, "grad_norm": 0.059702951461076736, "grad_norm_var": 0.00011092017806687097, "learning_rate": 0.009941536314985053, "loss": 3.1388, "step": 542 }, { "crossentropy": 3.0840513706207275, "epoch": 0.04618917999319497, "grad_norm": 0.06472934037446976, "grad_norm_var": 0.00010842690897342784, "learning_rate": 0.00994123042357703, "loss": 3.0841, "step": 543 }, { "crossentropy": 3.196929931640625, "epoch": 0.04627424293977543, "grad_norm": 0.08242134004831314, "grad_norm_var": 0.00011871839668240082, "learning_rate": 0.00994092373874978, "loss": 3.1969, "step": 544 }, { "crossentropy": 3.225580930709839, "epoch": 0.0463593058863559, "grad_norm": 0.08205101639032364, "grad_norm_var": 0.00012178279658448138, "learning_rate": 0.009940616260552544, "loss": 3.2256, "step": 545 }, { "crossentropy": 3.061109781265259, "epoch": 0.04644436883293637, "grad_norm": 0.06112368777394295, "grad_norm_var": 0.00012271243652546738, "learning_rate": 0.009940307989034699, "loss": 3.0611, "step": 546 }, { "crossentropy": 3.095165729522705, "epoch": 0.04652943177951684, "grad_norm": 0.062123626470565796, "grad_norm_var": 0.00012214137800578326, "learning_rate": 0.009939998924245743, "loss": 3.0952, "step": 547 }, { "crossentropy": 3.1073191165924072, "epoch": 0.046614494726097314, "grad_norm": 0.058417681604623795, "grad_norm_var": 7.359923369199365e-05, "learning_rate": 0.009939689066235302, "loss": 3.1073, "step": 548 }, { "crossentropy": 3.264578104019165, "epoch": 0.046699557672677784, "grad_norm": 0.06318086385726929, "grad_norm_var": 6.963813031241212e-05, "learning_rate": 0.009939378415053131, "loss": 3.2646, "step": 549 }, { "crossentropy": 3.144163131713867, "epoch": 0.04678462061925825, "grad_norm": 0.05851880833506584, "grad_norm_var": 7.021243260227164e-05, "learning_rate": 0.009939066970749111, "loss": 3.1442, "step": 550 }, { "crossentropy": 3.280022621154785, "epoch": 0.04686968356583872, "grad_norm": 0.06006484478712082, "grad_norm_var": 6.636174962210301e-05, "learning_rate": 0.009938754733373252, "loss": 3.28, "step": 551 }, { "crossentropy": 3.237403154373169, "epoch": 0.04695474651241919, "grad_norm": 0.07637914270162582, "grad_norm_var": 7.395451591774733e-05, "learning_rate": 0.009938441702975689, "loss": 3.2374, "step": 552 }, { "crossentropy": 3.1865155696868896, "epoch": 0.04703980945899966, "grad_norm": 0.08593980222940445, "grad_norm_var": 9.403251054442008e-05, "learning_rate": 0.009938127879606686, "loss": 3.1865, "step": 553 }, { "crossentropy": 3.2006537914276123, "epoch": 0.04712487240558013, "grad_norm": 0.064642995595932, "grad_norm_var": 9.424355722151626e-05, "learning_rate": 0.009937813263316636, "loss": 3.2007, "step": 554 }, { "crossentropy": 3.2059261798858643, "epoch": 0.0472099353521606, "grad_norm": 0.06321319192647934, "grad_norm_var": 9.489656055882862e-05, "learning_rate": 0.009937497854156055, "loss": 3.2059, "step": 555 }, { "crossentropy": 3.145968437194824, "epoch": 0.04729499829874107, "grad_norm": 0.06507620960474014, "grad_norm_var": 8.493169666391002e-05, "learning_rate": 0.009937181652175591, "loss": 3.146, "step": 556 }, { "crossentropy": 3.0524747371673584, "epoch": 0.047380061245321535, "grad_norm": 0.0698874294757843, "grad_norm_var": 8.440524317976719e-05, "learning_rate": 0.009936864657426013, "loss": 3.0525, "step": 557 }, { "crossentropy": 3.240402936935425, "epoch": 0.047465124191902006, "grad_norm": 0.08925572037696838, "grad_norm_var": 0.00010888972641429576, "learning_rate": 0.009936546869958224, "loss": 3.2404, "step": 558 }, { "crossentropy": 3.1717662811279297, "epoch": 0.047550187138482476, "grad_norm": 0.059148143976926804, "grad_norm_var": 0.00011415536329504108, "learning_rate": 0.009936228289823252, "loss": 3.1718, "step": 559 }, { "crossentropy": 3.184075117111206, "epoch": 0.04763525008506295, "grad_norm": 0.062373436987400055, "grad_norm_var": 0.00010297236519376321, "learning_rate": 0.009935908917072252, "loss": 3.1841, "step": 560 }, { "crossentropy": 3.2069149017333984, "epoch": 0.04772031303164342, "grad_norm": 0.06091253459453583, "grad_norm_var": 9.01340762057797e-05, "learning_rate": 0.009935588751756503, "loss": 3.2069, "step": 561 }, { "crossentropy": 3.0430381298065186, "epoch": 0.04780537597822389, "grad_norm": 0.06290073692798615, "grad_norm_var": 8.911299490900002e-05, "learning_rate": 0.009935267793927416, "loss": 3.043, "step": 562 }, { "crossentropy": 3.200119733810425, "epoch": 0.04789043892480436, "grad_norm": 0.06356833130121231, "grad_norm_var": 8.842408909968488e-05, "learning_rate": 0.009934946043636528, "loss": 3.2001, "step": 563 }, { "crossentropy": 3.0607547760009766, "epoch": 0.04797550187138482, "grad_norm": 0.05456225574016571, "grad_norm_var": 9.349116808728316e-05, "learning_rate": 0.009934623500935503, "loss": 3.0608, "step": 564 }, { "crossentropy": 3.1326711177825928, "epoch": 0.04806056481796529, "grad_norm": 0.06544360518455505, "grad_norm_var": 9.289229462405568e-05, "learning_rate": 0.00993430016587613, "loss": 3.1327, "step": 565 }, { "crossentropy": 3.210108518600464, "epoch": 0.04814562776454576, "grad_norm": 0.06482771039009094, "grad_norm_var": 8.877734002038871e-05, "learning_rate": 0.009933976038510333, "loss": 3.2101, "step": 566 }, { "crossentropy": 3.1812918186187744, "epoch": 0.048230690711126234, "grad_norm": 0.06326117366552353, "grad_norm_var": 8.656158867654893e-05, "learning_rate": 0.00993365111889015, "loss": 3.1813, "step": 567 }, { "crossentropy": 3.2414212226867676, "epoch": 0.048315753657706705, "grad_norm": 0.07063822448253632, "grad_norm_var": 8.141308652948407e-05, "learning_rate": 0.00993332540706776, "loss": 3.2414, "step": 568 }, { "crossentropy": 3.1168391704559326, "epoch": 0.048400816604287175, "grad_norm": 0.07024037092924118, "grad_norm_var": 5.634114727616716e-05, "learning_rate": 0.009932998903095456, "loss": 3.1168, "step": 569 }, { "crossentropy": 3.23638916015625, "epoch": 0.04848587955086764, "grad_norm": 0.05533446744084358, "grad_norm_var": 6.297177499657302e-05, "learning_rate": 0.009932671607025673, "loss": 3.2364, "step": 570 }, { "crossentropy": 3.0348470211029053, "epoch": 0.04857094249744811, "grad_norm": 0.06304553896188736, "grad_norm_var": 6.301437263821948e-05, "learning_rate": 0.00993234351891096, "loss": 3.0348, "step": 571 }, { "crossentropy": 3.1962671279907227, "epoch": 0.04865600544402858, "grad_norm": 0.07810916751623154, "grad_norm_var": 7.371124321896756e-05, "learning_rate": 0.009932014638804, "loss": 3.1963, "step": 572 }, { "crossentropy": 3.148603677749634, "epoch": 0.04874106839060905, "grad_norm": 0.06005309522151947, "grad_norm_var": 7.445434872804524e-05, "learning_rate": 0.009931684966757604, "loss": 3.1486, "step": 573 }, { "crossentropy": 3.172536611557007, "epoch": 0.04882613133718952, "grad_norm": 0.061448223888874054, "grad_norm_var": 3.370227986619137e-05, "learning_rate": 0.009931354502824704, "loss": 3.1725, "step": 574 }, { "crossentropy": 3.1477558612823486, "epoch": 0.04891119428376999, "grad_norm": 0.05679318681359291, "grad_norm_var": 3.541274201434852e-05, "learning_rate": 0.009931023247058364, "loss": 3.1478, "step": 575 }, { "crossentropy": 3.108724355697632, "epoch": 0.04899625723035046, "grad_norm": 0.059740301221609116, "grad_norm_var": 3.618700651273985e-05, "learning_rate": 0.009930691199511774, "loss": 3.1087, "step": 576 }, { "crossentropy": 3.130723476409912, "epoch": 0.049081320176930926, "grad_norm": 0.06693211942911148, "grad_norm_var": 3.663187969072107e-05, "learning_rate": 0.009930358360238254, "loss": 3.1307, "step": 577 }, { "crossentropy": 3.2568211555480957, "epoch": 0.0491663831235114, "grad_norm": 0.061447951942682266, "grad_norm_var": 3.6890749094430536e-05, "learning_rate": 0.009930024729291246, "loss": 3.2568, "step": 578 }, { "crossentropy": 3.1540920734405518, "epoch": 0.04925144607009187, "grad_norm": 0.062290292233228683, "grad_norm_var": 3.697528835749497e-05, "learning_rate": 0.009929690306724321, "loss": 3.1541, "step": 579 }, { "crossentropy": 3.170461416244507, "epoch": 0.04933650901667234, "grad_norm": 0.057889096438884735, "grad_norm_var": 3.375323515695984e-05, "learning_rate": 0.009929355092591179, "loss": 3.1705, "step": 580 }, { "crossentropy": 3.113471269607544, "epoch": 0.04942157196325281, "grad_norm": 0.06151947006583214, "grad_norm_var": 3.3747605860800826e-05, "learning_rate": 0.009929019086945646, "loss": 3.1135, "step": 581 }, { "crossentropy": 3.0647330284118652, "epoch": 0.04950663490983328, "grad_norm": 0.05453195795416832, "grad_norm_var": 3.834167078135686e-05, "learning_rate": 0.009928682289841673, "loss": 3.0647, "step": 582 }, { "crossentropy": 3.172018051147461, "epoch": 0.04959169785641375, "grad_norm": 0.06191813573241234, "grad_norm_var": 3.835475022549971e-05, "learning_rate": 0.009928344701333343, "loss": 3.172, "step": 583 }, { "crossentropy": 3.223353147506714, "epoch": 0.04967676080299421, "grad_norm": 0.07921001315116882, "grad_norm_var": 5.211021426261078e-05, "learning_rate": 0.009928006321474859, "loss": 3.2234, "step": 584 }, { "crossentropy": 3.1121890544891357, "epoch": 0.049761823749574684, "grad_norm": 0.078638456761837, "grad_norm_var": 6.445037485897434e-05, "learning_rate": 0.009927667150320556, "loss": 3.1122, "step": 585 }, { "crossentropy": 3.0927178859710693, "epoch": 0.049846886696155154, "grad_norm": 0.06874630600214005, "grad_norm_var": 6.076645502141962e-05, "learning_rate": 0.009927327187924898, "loss": 3.0927, "step": 586 }, { "crossentropy": 3.20103120803833, "epoch": 0.049931949642735625, "grad_norm": 0.06641366332769394, "grad_norm_var": 6.0813503306526795e-05, "learning_rate": 0.00992698643434247, "loss": 3.201, "step": 587 }, { "crossentropy": 3.1671886444091797, "epoch": 0.050017012589316096, "grad_norm": 0.0665605366230011, "grad_norm_var": 4.854784574761095e-05, "learning_rate": 0.009926644889627991, "loss": 3.1672, "step": 588 }, { "crossentropy": 3.160092830657959, "epoch": 0.050102075535896566, "grad_norm": 0.05463789403438568, "grad_norm_var": 5.323638524663132e-05, "learning_rate": 0.0099263025538363, "loss": 3.1601, "step": 589 }, { "crossentropy": 3.1434950828552246, "epoch": 0.05018713848247703, "grad_norm": 0.053421568125486374, "grad_norm_var": 5.9640716197944854e-05, "learning_rate": 0.009925959427022365, "loss": 3.1435, "step": 590 }, { "crossentropy": 3.1052093505859375, "epoch": 0.0502722014290575, "grad_norm": 0.05432531610131264, "grad_norm_var": 6.211905461381261e-05, "learning_rate": 0.009925615509241285, "loss": 3.1052, "step": 591 }, { "crossentropy": 3.0265896320343018, "epoch": 0.05035726437563797, "grad_norm": 0.06510204076766968, "grad_norm_var": 6.157549875007526e-05, "learning_rate": 0.009925270800548284, "loss": 3.0266, "step": 592 }, { "crossentropy": 3.173037528991699, "epoch": 0.05044232732221844, "grad_norm": 0.06751763075590134, "grad_norm_var": 6.187664882654527e-05, "learning_rate": 0.009924925300998711, "loss": 3.173, "step": 593 }, { "crossentropy": 3.0827980041503906, "epoch": 0.05052739026879891, "grad_norm": 0.05533879995346069, "grad_norm_var": 6.578761281600931e-05, "learning_rate": 0.00992457901064804, "loss": 3.0828, "step": 594 }, { "crossentropy": 3.087204933166504, "epoch": 0.05061245321537938, "grad_norm": 0.061524808406829834, "grad_norm_var": 6.589706189122536e-05, "learning_rate": 0.009924231929551882, "loss": 3.0872, "step": 595 }, { "crossentropy": 3.1490416526794434, "epoch": 0.05069751616195985, "grad_norm": 0.06381415575742722, "grad_norm_var": 6.408832859753349e-05, "learning_rate": 0.009923884057765963, "loss": 3.149, "step": 596 }, { "crossentropy": 3.0780317783355713, "epoch": 0.05078257910854032, "grad_norm": 0.06051252782344818, "grad_norm_var": 6.439428211805486e-05, "learning_rate": 0.009923535395346144, "loss": 3.078, "step": 597 }, { "crossentropy": 3.0936577320098877, "epoch": 0.05086764205512079, "grad_norm": 0.05820344015955925, "grad_norm_var": 6.096247526627222e-05, "learning_rate": 0.009923185942348407, "loss": 3.0937, "step": 598 }, { "crossentropy": 3.0888845920562744, "epoch": 0.05095270500170126, "grad_norm": 0.067149318754673, "grad_norm_var": 6.157446922770005e-05, "learning_rate": 0.009922835698828868, "loss": 3.0889, "step": 599 }, { "crossentropy": 3.0827927589416504, "epoch": 0.05103776794828173, "grad_norm": 0.06864902377128601, "grad_norm_var": 4.687389644770171e-05, "learning_rate": 0.009922484664843762, "loss": 3.0828, "step": 600 }, { "crossentropy": 3.1191036701202393, "epoch": 0.0511228308948622, "grad_norm": 0.08387736976146698, "grad_norm_var": 5.9401520003427887e-05, "learning_rate": 0.009922132840449458, "loss": 3.1191, "step": 601 }, { "crossentropy": 3.184844493865967, "epoch": 0.05120789384144267, "grad_norm": 0.08071059733629227, "grad_norm_var": 7.673764663060165e-05, "learning_rate": 0.009921780225702448, "loss": 3.1848, "step": 602 }, { "crossentropy": 3.184297800064087, "epoch": 0.05129295678802314, "grad_norm": 0.057271163910627365, "grad_norm_var": 7.930583748352662e-05, "learning_rate": 0.009921426820659352, "loss": 3.1843, "step": 603 }, { "crossentropy": 3.180743455886841, "epoch": 0.051378019734603604, "grad_norm": 0.060499463230371475, "grad_norm_var": 7.92606651379155e-05, "learning_rate": 0.009921072625376916, "loss": 3.1807, "step": 604 }, { "crossentropy": 3.0871171951293945, "epoch": 0.051463082681184075, "grad_norm": 0.07380412518978119, "grad_norm_var": 8.01228132558275e-05, "learning_rate": 0.009920717639912013, "loss": 3.0871, "step": 605 }, { "crossentropy": 3.1481714248657227, "epoch": 0.051548145627764545, "grad_norm": 0.07513435930013657, "grad_norm_var": 7.756607311077944e-05, "learning_rate": 0.009920361864321645, "loss": 3.1482, "step": 606 }, { "crossentropy": 3.1753013134002686, "epoch": 0.051633208574345016, "grad_norm": 0.06793764978647232, "grad_norm_var": 6.824881857306197e-05, "learning_rate": 0.009920005298662939, "loss": 3.1753, "step": 607 }, { "crossentropy": 3.0893962383270264, "epoch": 0.051718271520925486, "grad_norm": 0.06450506299734116, "grad_norm_var": 6.839752153521648e-05, "learning_rate": 0.009919647942993149, "loss": 3.0894, "step": 608 }, { "crossentropy": 3.1099512577056885, "epoch": 0.05180333446750596, "grad_norm": 0.05845556780695915, "grad_norm_var": 7.248548435384744e-05, "learning_rate": 0.009919289797369654, "loss": 3.11, "step": 609 }, { "crossentropy": 3.088318347930908, "epoch": 0.05188839741408642, "grad_norm": 0.06151672080159187, "grad_norm_var": 6.601760133221653e-05, "learning_rate": 0.009918930861849966, "loss": 3.0883, "step": 610 }, { "crossentropy": 3.118070125579834, "epoch": 0.05197346036066689, "grad_norm": 0.06535844504833221, "grad_norm_var": 6.44069581437885e-05, "learning_rate": 0.009918571136491717, "loss": 3.1181, "step": 611 }, { "crossentropy": 3.1633412837982178, "epoch": 0.05205852330724736, "grad_norm": 0.060223162174224854, "grad_norm_var": 6.660060488831899e-05, "learning_rate": 0.009918210621352667, "loss": 3.1633, "step": 612 }, { "crossentropy": 3.251969575881958, "epoch": 0.05214358625382783, "grad_norm": 0.06856393814086914, "grad_norm_var": 6.423738342745163e-05, "learning_rate": 0.009917849316490706, "loss": 3.252, "step": 613 }, { "crossentropy": 3.1598799228668213, "epoch": 0.0522286492004083, "grad_norm": 0.06157307326793671, "grad_norm_var": 6.099882575305559e-05, "learning_rate": 0.009917487221963851, "loss": 3.1599, "step": 614 }, { "crossentropy": 3.1790497303009033, "epoch": 0.052313712146988774, "grad_norm": 0.059671495109796524, "grad_norm_var": 6.454603225865223e-05, "learning_rate": 0.009917124337830242, "loss": 3.179, "step": 615 }, { "crossentropy": 3.192744731903076, "epoch": 0.052398775093569244, "grad_norm": 0.05990449711680412, "grad_norm_var": 6.709293301097596e-05, "learning_rate": 0.009916760664148148, "loss": 3.1927, "step": 616 }, { "crossentropy": 3.1892287731170654, "epoch": 0.05248383804014971, "grad_norm": 0.05694003403186798, "grad_norm_var": 4.890996039660922e-05, "learning_rate": 0.009916396200975966, "loss": 3.1892, "step": 617 }, { "crossentropy": 3.08316707611084, "epoch": 0.05256890098673018, "grad_norm": 0.06276710331439972, "grad_norm_var": 3.026009128023176e-05, "learning_rate": 0.009916030948372215, "loss": 3.0832, "step": 618 }, { "crossentropy": 3.113513946533203, "epoch": 0.05265396393331065, "grad_norm": 0.06605713069438934, "grad_norm_var": 2.792503846123084e-05, "learning_rate": 0.009915664906395545, "loss": 3.1135, "step": 619 }, { "crossentropy": 3.0262796878814697, "epoch": 0.05273902687989112, "grad_norm": 0.06357202678918839, "grad_norm_var": 2.7108858305697574e-05, "learning_rate": 0.009915298075104734, "loss": 3.0263, "step": 620 }, { "crossentropy": 3.0544493198394775, "epoch": 0.05282408982647159, "grad_norm": 0.06560079008340836, "grad_norm_var": 2.07268961459633e-05, "learning_rate": 0.009914930454558683, "loss": 3.0544, "step": 621 }, { "crossentropy": 3.0792863368988037, "epoch": 0.05290915277305206, "grad_norm": 0.06561029702425003, "grad_norm_var": 1.1763307056020939e-05, "learning_rate": 0.009914562044816423, "loss": 3.0793, "step": 622 }, { "crossentropy": 3.2157723903656006, "epoch": 0.05299421571963253, "grad_norm": 0.0615137554705143, "grad_norm_var": 1.0127023668618912e-05, "learning_rate": 0.009914192845937107, "loss": 3.2158, "step": 623 }, { "crossentropy": 3.155122995376587, "epoch": 0.053079278666212995, "grad_norm": 0.08635905385017395, "grad_norm_var": 4.548547681934758e-05, "learning_rate": 0.00991382285798002, "loss": 3.1551, "step": 624 }, { "crossentropy": 2.9701874256134033, "epoch": 0.053164341612793466, "grad_norm": 0.07240045070648193, "grad_norm_var": 4.736671309844657e-05, "learning_rate": 0.00991345208100457, "loss": 2.9702, "step": 625 }, { "crossentropy": 3.2031068801879883, "epoch": 0.053249404559373936, "grad_norm": 0.0582190603017807, "grad_norm_var": 4.951285513433567e-05, "learning_rate": 0.009913080515070295, "loss": 3.2031, "step": 626 }, { "crossentropy": 3.062178373336792, "epoch": 0.05333446750595441, "grad_norm": 0.06332788616418839, "grad_norm_var": 4.9577636460226776e-05, "learning_rate": 0.009912708160236854, "loss": 3.0622, "step": 627 }, { "crossentropy": 3.127454996109009, "epoch": 0.05341953045253488, "grad_norm": 0.06009446084499359, "grad_norm_var": 4.96523887871011e-05, "learning_rate": 0.00991233501656404, "loss": 3.1275, "step": 628 }, { "crossentropy": 3.0582869052886963, "epoch": 0.05350459339911535, "grad_norm": 0.055286552757024765, "grad_norm_var": 5.349535460991058e-05, "learning_rate": 0.009911961084111768, "loss": 3.0583, "step": 629 }, { "crossentropy": 3.111429214477539, "epoch": 0.05358965634569581, "grad_norm": 0.0745406225323677, "grad_norm_var": 6.036038861656282e-05, "learning_rate": 0.00991158636294008, "loss": 3.1114, "step": 630 }, { "crossentropy": 3.0206096172332764, "epoch": 0.05367471929227628, "grad_norm": 0.07367957383394241, "grad_norm_var": 6.362185402828997e-05, "learning_rate": 0.009911210853109148, "loss": 3.0206, "step": 631 }, { "crossentropy": 3.1037914752960205, "epoch": 0.05375978223885675, "grad_norm": 0.06820553541183472, "grad_norm_var": 6.188254062636234e-05, "learning_rate": 0.009910834554679266, "loss": 3.1038, "step": 632 }, { "crossentropy": 3.129155158996582, "epoch": 0.05384484518543722, "grad_norm": 0.05703002214431763, "grad_norm_var": 6.17757105799116e-05, "learning_rate": 0.009910457467710855, "loss": 3.1292, "step": 633 }, { "crossentropy": 3.109654426574707, "epoch": 0.053929908132017694, "grad_norm": 0.059308011084795, "grad_norm_var": 6.396456247338555e-05, "learning_rate": 0.009910079592264469, "loss": 3.1097, "step": 634 }, { "crossentropy": 3.0231049060821533, "epoch": 0.054014971078598165, "grad_norm": 0.06417690962553024, "grad_norm_var": 6.408979767810365e-05, "learning_rate": 0.00990970092840078, "loss": 3.0231, "step": 635 }, { "crossentropy": 3.146134614944458, "epoch": 0.054100034025178635, "grad_norm": 0.06179177761077881, "grad_norm_var": 6.47592373694887e-05, "learning_rate": 0.009909321476180592, "loss": 3.1461, "step": 636 }, { "crossentropy": 3.133150100708008, "epoch": 0.0541850969717591, "grad_norm": 0.06914098560810089, "grad_norm_var": 6.56153554283537e-05, "learning_rate": 0.009908941235664834, "loss": 3.1332, "step": 637 }, { "crossentropy": 3.180995464324951, "epoch": 0.05427015991833957, "grad_norm": 0.06271430850028992, "grad_norm_var": 6.616173474170469e-05, "learning_rate": 0.00990856020691456, "loss": 3.181, "step": 638 }, { "crossentropy": 3.030651569366455, "epoch": 0.05435522286492004, "grad_norm": 0.05736298859119415, "grad_norm_var": 6.943736882354205e-05, "learning_rate": 0.009908178389990956, "loss": 3.0307, "step": 639 }, { "crossentropy": 3.2052416801452637, "epoch": 0.05444028581150051, "grad_norm": 0.06488339602947235, "grad_norm_var": 3.7753753077508315e-05, "learning_rate": 0.009907795784955327, "loss": 3.2052, "step": 640 }, { "crossentropy": 3.151552677154541, "epoch": 0.05452534875808098, "grad_norm": 0.06181960552930832, "grad_norm_var": 3.27376979123511e-05, "learning_rate": 0.009907412391869111, "loss": 3.1516, "step": 641 }, { "crossentropy": 3.118748426437378, "epoch": 0.05461041170466145, "grad_norm": 0.056506410241127014, "grad_norm_var": 3.406388288069139e-05, "learning_rate": 0.009907028210793867, "loss": 3.1187, "step": 642 }, { "crossentropy": 3.2163851261138916, "epoch": 0.05469547465124192, "grad_norm": 0.06330523639917374, "grad_norm_var": 3.406327751685825e-05, "learning_rate": 0.009906643241791286, "loss": 3.2164, "step": 643 }, { "crossentropy": 3.1319565773010254, "epoch": 0.054780537597822386, "grad_norm": 0.07220965623855591, "grad_norm_var": 3.8356997891575195e-05, "learning_rate": 0.009906257484923183, "loss": 3.132, "step": 644 }, { "crossentropy": 3.0932343006134033, "epoch": 0.05486560054440286, "grad_norm": 0.05800836160778999, "grad_norm_var": 3.5704069366457317e-05, "learning_rate": 0.009905870940251497, "loss": 3.0932, "step": 645 }, { "crossentropy": 3.0682294368743896, "epoch": 0.05495066349098333, "grad_norm": 0.05972297117114067, "grad_norm_var": 2.8686161195803395e-05, "learning_rate": 0.009905483607838298, "loss": 3.0682, "step": 646 }, { "crossentropy": 3.0917279720306396, "epoch": 0.0550357264375638, "grad_norm": 0.07417721301317215, "grad_norm_var": 2.9402511656203592e-05, "learning_rate": 0.00990509548774578, "loss": 3.0917, "step": 647 }, { "crossentropy": 3.1394171714782715, "epoch": 0.05512078938414427, "grad_norm": 0.08026926964521408, "grad_norm_var": 4.663386553089051e-05, "learning_rate": 0.009904706580036263, "loss": 3.1394, "step": 648 }, { "crossentropy": 3.180701494216919, "epoch": 0.05520585233072474, "grad_norm": 0.06149878725409508, "grad_norm_var": 4.3787596180977266e-05, "learning_rate": 0.009904316884772196, "loss": 3.1807, "step": 649 }, { "crossentropy": 3.0716116428375244, "epoch": 0.0552909152773052, "grad_norm": 0.06079305708408356, "grad_norm_var": 4.296055097400362e-05, "learning_rate": 0.009903926402016152, "loss": 3.0716, "step": 650 }, { "crossentropy": 3.08490252494812, "epoch": 0.05537597822388567, "grad_norm": 0.06587185710668564, "grad_norm_var": 4.311820546829542e-05, "learning_rate": 0.009903535131830832, "loss": 3.0849, "step": 651 }, { "crossentropy": 3.198612689971924, "epoch": 0.055461041170466144, "grad_norm": 0.05707905441522598, "grad_norm_var": 4.613249714888046e-05, "learning_rate": 0.00990314307427906, "loss": 3.1986, "step": 652 }, { "crossentropy": 3.13834810256958, "epoch": 0.055546104117046614, "grad_norm": 0.06015634164214134, "grad_norm_var": 4.512114208221965e-05, "learning_rate": 0.009902750229423794, "loss": 3.1383, "step": 653 }, { "crossentropy": 3.1119277477264404, "epoch": 0.055631167063627085, "grad_norm": 0.06091351807117462, "grad_norm_var": 4.5518148942478434e-05, "learning_rate": 0.009902356597328108, "loss": 3.1119, "step": 654 }, { "crossentropy": 2.995866060256958, "epoch": 0.055716230010207556, "grad_norm": 0.057171955704689026, "grad_norm_var": 4.567448174505213e-05, "learning_rate": 0.009901962178055211, "loss": 2.9959, "step": 655 }, { "crossentropy": 3.0832114219665527, "epoch": 0.055801292956788026, "grad_norm": 0.06576905399560928, "grad_norm_var": 4.589877521589125e-05, "learning_rate": 0.009901566971668437, "loss": 3.0832, "step": 656 }, { "crossentropy": 3.1908047199249268, "epoch": 0.05588635590336849, "grad_norm": 0.05592362582683563, "grad_norm_var": 4.935669402528741e-05, "learning_rate": 0.00990117097823124, "loss": 3.1908, "step": 657 }, { "crossentropy": 3.221153974533081, "epoch": 0.05597141884994896, "grad_norm": 0.07336770743131638, "grad_norm_var": 5.2333545276554244e-05, "learning_rate": 0.00990077419780721, "loss": 3.2212, "step": 658 }, { "crossentropy": 3.208343029022217, "epoch": 0.05605648179652943, "grad_norm": 0.08822660893201828, "grad_norm_var": 8.8377411472354e-05, "learning_rate": 0.009900376630460055, "loss": 3.2083, "step": 659 }, { "crossentropy": 3.1393589973449707, "epoch": 0.0561415447431099, "grad_norm": 0.05968687683343887, "grad_norm_var": 8.730518864856e-05, "learning_rate": 0.009899978276253616, "loss": 3.1394, "step": 660 }, { "crossentropy": 3.0916924476623535, "epoch": 0.05622660768969037, "grad_norm": 0.06230948865413666, "grad_norm_var": 8.450070968492267e-05, "learning_rate": 0.009899579135251855, "loss": 3.0917, "step": 661 }, { "crossentropy": 3.1009204387664795, "epoch": 0.05631167063627084, "grad_norm": 0.05698639899492264, "grad_norm_var": 8.696121051626852e-05, "learning_rate": 0.009899179207518862, "loss": 3.1009, "step": 662 }, { "crossentropy": 3.1185970306396484, "epoch": 0.05639673358285131, "grad_norm": 0.055839285254478455, "grad_norm_var": 8.557056363419241e-05, "learning_rate": 0.009898778493118855, "loss": 3.1186, "step": 663 }, { "crossentropy": 3.139530897140503, "epoch": 0.05648179652943178, "grad_norm": 0.05510038137435913, "grad_norm_var": 7.011712517216254e-05, "learning_rate": 0.009898376992116178, "loss": 3.1395, "step": 664 }, { "crossentropy": 3.093919515609741, "epoch": 0.05656685947601225, "grad_norm": 0.06412984430789948, "grad_norm_var": 7.027103160384448e-05, "learning_rate": 0.0098979747045753, "loss": 3.0939, "step": 665 }, { "crossentropy": 3.243880271911621, "epoch": 0.05665192242259272, "grad_norm": 0.0728764757514, "grad_norm_var": 7.671446403114859e-05, "learning_rate": 0.009897571630560816, "loss": 3.2439, "step": 666 }, { "crossentropy": 3.0902743339538574, "epoch": 0.05673698536917319, "grad_norm": 0.07639763504266739, "grad_norm_var": 8.737046109648043e-05, "learning_rate": 0.009897167770137447, "loss": 3.0903, "step": 667 }, { "crossentropy": 3.134528636932373, "epoch": 0.05682204831575366, "grad_norm": 0.08354643732309341, "grad_norm_var": 0.0001071848240303832, "learning_rate": 0.009896763123370043, "loss": 3.1345, "step": 668 }, { "crossentropy": 3.0774128437042236, "epoch": 0.05690711126233413, "grad_norm": 0.07326079159975052, "grad_norm_var": 0.00010853711653815411, "learning_rate": 0.00989635769032358, "loss": 3.0774, "step": 669 }, { "crossentropy": 3.0469555854797363, "epoch": 0.056992174208914594, "grad_norm": 0.06375575810670853, "grad_norm_var": 0.00010698399825809502, "learning_rate": 0.009895951471063156, "loss": 3.047, "step": 670 }, { "crossentropy": 3.1645748615264893, "epoch": 0.057077237155495064, "grad_norm": 0.06264977157115936, "grad_norm_var": 0.00010203052737773055, "learning_rate": 0.009895544465654, "loss": 3.1646, "step": 671 }, { "crossentropy": 3.1185107231140137, "epoch": 0.057162300102075535, "grad_norm": 0.06869278103113174, "grad_norm_var": 0.00010213789335164943, "learning_rate": 0.009895136674161465, "loss": 3.1185, "step": 672 }, { "crossentropy": 3.0596227645874023, "epoch": 0.057247363048656005, "grad_norm": 0.05930137261748314, "grad_norm_var": 9.784143450970445e-05, "learning_rate": 0.00989472809665103, "loss": 3.0596, "step": 673 }, { "crossentropy": 3.093726634979248, "epoch": 0.057332425995236476, "grad_norm": 0.06068730354309082, "grad_norm_var": 9.756112626565315e-05, "learning_rate": 0.009894318733188301, "loss": 3.0937, "step": 674 }, { "crossentropy": 3.102799892425537, "epoch": 0.057417488941816947, "grad_norm": 0.05845452472567558, "grad_norm_var": 6.65763505334406e-05, "learning_rate": 0.00989390858383901, "loss": 3.1028, "step": 675 }, { "crossentropy": 3.1593947410583496, "epoch": 0.05750255188839742, "grad_norm": 0.05869746580719948, "grad_norm_var": 6.728629977765747e-05, "learning_rate": 0.009893497648669014, "loss": 3.1594, "step": 676 }, { "crossentropy": 3.1950016021728516, "epoch": 0.05758761483497788, "grad_norm": 0.06891778856515884, "grad_norm_var": 6.804781559357713e-05, "learning_rate": 0.009893085927744301, "loss": 3.195, "step": 677 }, { "crossentropy": 3.1031482219696045, "epoch": 0.05767267778155835, "grad_norm": 0.0571833960711956, "grad_norm_var": 6.784091259545433e-05, "learning_rate": 0.009892673421130978, "loss": 3.1031, "step": 678 }, { "crossentropy": 3.128129243850708, "epoch": 0.05775774072813882, "grad_norm": 0.05571357533335686, "grad_norm_var": 6.799491277276234e-05, "learning_rate": 0.009892260128895282, "loss": 3.1281, "step": 679 }, { "crossentropy": 3.1001017093658447, "epoch": 0.05784280367471929, "grad_norm": 0.2048487365245819, "grad_norm_var": 0.001272662356102958, "learning_rate": 0.009891846051103577, "loss": 3.1001, "step": 680 }, { "crossentropy": 3.1051418781280518, "epoch": 0.05792786662129976, "grad_norm": 0.07708574831485748, "grad_norm_var": 0.0012655509825090906, "learning_rate": 0.00989143118782235, "loss": 3.1051, "step": 681 }, { "crossentropy": 3.1125755310058594, "epoch": 0.058012929567880234, "grad_norm": 0.07189701497554779, "grad_norm_var": 0.00126590515475615, "learning_rate": 0.00989101553911822, "loss": 3.1126, "step": 682 }, { "crossentropy": 3.119439125061035, "epoch": 0.058097992514460704, "grad_norm": 0.06317123770713806, "grad_norm_var": 0.0012744941479819055, "learning_rate": 0.009890599105057926, "loss": 3.1194, "step": 683 }, { "crossentropy": 3.0380215644836426, "epoch": 0.05818305546104117, "grad_norm": 0.06324312090873718, "grad_norm_var": 0.0012750686607334236, "learning_rate": 0.009890181885708334, "loss": 3.038, "step": 684 }, { "crossentropy": 3.1282379627227783, "epoch": 0.05826811840762164, "grad_norm": 0.061158571392297745, "grad_norm_var": 0.0012837574873856737, "learning_rate": 0.00988976388113644, "loss": 3.1282, "step": 685 }, { "crossentropy": 3.0471155643463135, "epoch": 0.05835318135420211, "grad_norm": 0.06866273283958435, "grad_norm_var": 0.0012797270730696324, "learning_rate": 0.00988934509140936, "loss": 3.0471, "step": 686 }, { "crossentropy": 3.0930216312408447, "epoch": 0.05843824430078258, "grad_norm": 0.05346294492483139, "grad_norm_var": 0.00129709553415595, "learning_rate": 0.009888925516594342, "loss": 3.093, "step": 687 }, { "crossentropy": 3.062594175338745, "epoch": 0.05852330724736305, "grad_norm": 0.07305099070072174, "grad_norm_var": 0.0012963906937532963, "learning_rate": 0.009888505156758758, "loss": 3.0626, "step": 688 }, { "crossentropy": 3.0833683013916016, "epoch": 0.05860837019394352, "grad_norm": 0.08002721518278122, "grad_norm_var": 0.0012875354490460664, "learning_rate": 0.009888084011970104, "loss": 3.0834, "step": 689 }, { "crossentropy": 3.0675363540649414, "epoch": 0.058693433140523985, "grad_norm": 0.08474676311016083, "grad_norm_var": 0.0012825592382267547, "learning_rate": 0.009887662082296005, "loss": 3.0675, "step": 690 }, { "crossentropy": 3.1227071285247803, "epoch": 0.058778496087104455, "grad_norm": 0.07704239338636398, "grad_norm_var": 0.0012630976752208542, "learning_rate": 0.00988723936780421, "loss": 3.1227, "step": 691 }, { "crossentropy": 3.0289628505706787, "epoch": 0.058863559033684926, "grad_norm": 0.057386357337236404, "grad_norm_var": 0.0012662616373333361, "learning_rate": 0.009886815868562597, "loss": 3.029, "step": 692 }, { "crossentropy": 3.0629541873931885, "epoch": 0.058948621980265396, "grad_norm": 0.0569617785513401, "grad_norm_var": 0.0012866450447891742, "learning_rate": 0.009886391584639163, "loss": 3.063, "step": 693 }, { "crossentropy": 3.060515880584717, "epoch": 0.05903368492684587, "grad_norm": 0.054563332349061966, "grad_norm_var": 0.001293421374938993, "learning_rate": 0.00988596651610204, "loss": 3.0605, "step": 694 }, { "crossentropy": 3.0622055530548096, "epoch": 0.05911874787342634, "grad_norm": 0.0605032742023468, "grad_norm_var": 0.0012824177376234663, "learning_rate": 0.009885540663019481, "loss": 3.0622, "step": 695 }, { "crossentropy": 3.1298203468322754, "epoch": 0.05920381082000681, "grad_norm": 0.059198617935180664, "grad_norm_var": 9.610761700748644e-05, "learning_rate": 0.009885114025459866, "loss": 3.1298, "step": 696 }, { "crossentropy": 3.072157382965088, "epoch": 0.05928887376658727, "grad_norm": 0.05669621005654335, "grad_norm_var": 9.300019321879e-05, "learning_rate": 0.009884686603491698, "loss": 3.0722, "step": 697 }, { "crossentropy": 3.092668294906616, "epoch": 0.05937393671316774, "grad_norm": 0.06727812439203262, "grad_norm_var": 9.015426360590803e-05, "learning_rate": 0.00988425839718361, "loss": 3.0927, "step": 698 }, { "crossentropy": 3.073640823364258, "epoch": 0.05945899965974821, "grad_norm": 0.0673714354634285, "grad_norm_var": 9.033233874300202e-05, "learning_rate": 0.009883829406604362, "loss": 3.0736, "step": 699 }, { "crossentropy": 3.008303642272949, "epoch": 0.059544062606328683, "grad_norm": 0.05502724647521973, "grad_norm_var": 9.656839190679874e-05, "learning_rate": 0.009883399631822836, "loss": 3.0083, "step": 700 }, { "crossentropy": 3.0481462478637695, "epoch": 0.059629125552909154, "grad_norm": 0.05708926543593407, "grad_norm_var": 9.945490799171576e-05, "learning_rate": 0.009882969072908039, "loss": 3.0481, "step": 701 }, { "crossentropy": 3.11576509475708, "epoch": 0.059714188499489625, "grad_norm": 0.05947606638073921, "grad_norm_var": 9.940629179689015e-05, "learning_rate": 0.00988253772992911, "loss": 3.1158, "step": 702 }, { "crossentropy": 3.0984280109405518, "epoch": 0.059799251446070095, "grad_norm": 0.06791076809167862, "grad_norm_var": 9.264998120072989e-05, "learning_rate": 0.009882105602955308, "loss": 3.0984, "step": 703 }, { "crossentropy": 3.062105417251587, "epoch": 0.05988431439265056, "grad_norm": 0.0679122805595398, "grad_norm_var": 8.854133894927521e-05, "learning_rate": 0.009881672692056021, "loss": 3.0621, "step": 704 }, { "crossentropy": 3.064069986343384, "epoch": 0.05996937733923103, "grad_norm": 0.06619619578123093, "grad_norm_var": 7.153936604074202e-05, "learning_rate": 0.009881238997300762, "loss": 3.0641, "step": 705 }, { "crossentropy": 3.0242667198181152, "epoch": 0.0600544402858115, "grad_norm": 0.057740695774555206, "grad_norm_var": 4.047280252269617e-05, "learning_rate": 0.00988080451875917, "loss": 3.0243, "step": 706 }, { "crossentropy": 3.1607472896575928, "epoch": 0.06013950323239197, "grad_norm": 0.057130809873342514, "grad_norm_var": 2.471156032960205e-05, "learning_rate": 0.009880369256501008, "loss": 3.1607, "step": 707 }, { "crossentropy": 3.2335777282714844, "epoch": 0.06022456617897244, "grad_norm": 0.05957750231027603, "grad_norm_var": 2.4093892009898154e-05, "learning_rate": 0.009879933210596171, "loss": 3.2336, "step": 708 }, { "crossentropy": 3.1513097286224365, "epoch": 0.06030962912555291, "grad_norm": 0.06039979308843613, "grad_norm_var": 2.313525787398975e-05, "learning_rate": 0.009879496381114669, "loss": 3.1513, "step": 709 }, { "crossentropy": 3.007822275161743, "epoch": 0.060394692072133375, "grad_norm": 0.05254704877734184, "grad_norm_var": 2.508736359954396e-05, "learning_rate": 0.00987905876812665, "loss": 3.0078, "step": 710 }, { "crossentropy": 3.021573543548584, "epoch": 0.060479755018713846, "grad_norm": 0.05601416528224945, "grad_norm_var": 2.649661700308923e-05, "learning_rate": 0.009878620371702378, "loss": 3.0216, "step": 711 }, { "crossentropy": 3.1177685260772705, "epoch": 0.06056481796529432, "grad_norm": 0.07035257667303085, "grad_norm_var": 3.237720265045479e-05, "learning_rate": 0.00987818119191225, "loss": 3.1178, "step": 712 }, { "crossentropy": 3.037036180496216, "epoch": 0.06064988091187479, "grad_norm": 0.06082640588283539, "grad_norm_var": 3.097966984394809e-05, "learning_rate": 0.009877741228826785, "loss": 3.037, "step": 713 }, { "crossentropy": 3.0322091579437256, "epoch": 0.06073494385845526, "grad_norm": 0.05715866759419441, "grad_norm_var": 2.9486739294991235e-05, "learning_rate": 0.009877300482516624, "loss": 3.0322, "step": 714 }, { "crossentropy": 2.974689245223999, "epoch": 0.06082000680503573, "grad_norm": 0.05400117486715317, "grad_norm_var": 2.8936877356121733e-05, "learning_rate": 0.009876858953052545, "loss": 2.9747, "step": 715 }, { "crossentropy": 3.0816924571990967, "epoch": 0.0609050697516162, "grad_norm": 0.058231789618730545, "grad_norm_var": 2.747104867097186e-05, "learning_rate": 0.009876416640505441, "loss": 3.0817, "step": 716 }, { "crossentropy": 3.1616272926330566, "epoch": 0.06099013269819666, "grad_norm": 0.05203327536582947, "grad_norm_var": 3.113903820246468e-05, "learning_rate": 0.009875973544946334, "loss": 3.1616, "step": 717 }, { "crossentropy": 3.196659564971924, "epoch": 0.06107519564477713, "grad_norm": 0.06186644732952118, "grad_norm_var": 3.137878750114227e-05, "learning_rate": 0.009875529666446375, "loss": 3.1967, "step": 718 }, { "crossentropy": 3.1673951148986816, "epoch": 0.061160258591357604, "grad_norm": 0.05702856555581093, "grad_norm_var": 2.7292867448092522e-05, "learning_rate": 0.009875085005076835, "loss": 3.1674, "step": 719 }, { "crossentropy": 3.0838358402252197, "epoch": 0.061245321537938074, "grad_norm": 0.059667956084012985, "grad_norm_var": 2.208886701766481e-05, "learning_rate": 0.009874639560909117, "loss": 3.0838, "step": 720 }, { "crossentropy": 3.1290414333343506, "epoch": 0.061330384484518545, "grad_norm": 0.0657302662730217, "grad_norm_var": 2.164284981720314e-05, "learning_rate": 0.009874193334014746, "loss": 3.129, "step": 721 }, { "crossentropy": 3.093641519546509, "epoch": 0.061415447431099016, "grad_norm": 0.0628686398267746, "grad_norm_var": 2.2583125553798702e-05, "learning_rate": 0.009873746324465372, "loss": 3.0936, "step": 722 }, { "crossentropy": 3.152629852294922, "epoch": 0.061500510377679486, "grad_norm": 0.05988690257072449, "grad_norm_var": 2.2338030107153738e-05, "learning_rate": 0.00987329853233277, "loss": 3.1526, "step": 723 }, { "crossentropy": 3.057389497756958, "epoch": 0.06158557332425995, "grad_norm": 0.06047016382217407, "grad_norm_var": 2.2425390583746762e-05, "learning_rate": 0.009872849957688846, "loss": 3.0574, "step": 724 }, { "crossentropy": 3.0120465755462646, "epoch": 0.06167063627084042, "grad_norm": 0.06185392290353775, "grad_norm_var": 2.2767339199838455e-05, "learning_rate": 0.009872400600605626, "loss": 3.012, "step": 725 }, { "crossentropy": 3.0383541584014893, "epoch": 0.06175569921742089, "grad_norm": 0.06849031150341034, "grad_norm_var": 2.4067948383974663e-05, "learning_rate": 0.009871950461155266, "loss": 3.0384, "step": 726 }, { "crossentropy": 3.0852224826812744, "epoch": 0.06184076216400136, "grad_norm": 0.05836061015725136, "grad_norm_var": 2.303832348294014e-05, "learning_rate": 0.009871499539410043, "loss": 3.0852, "step": 727 }, { "crossentropy": 3.1050541400909424, "epoch": 0.06192582511058183, "grad_norm": 0.06359013170003891, "grad_norm_var": 1.7059464792886816e-05, "learning_rate": 0.009871047835442364, "loss": 3.1051, "step": 728 }, { "crossentropy": 3.080281972885132, "epoch": 0.0620108880571623, "grad_norm": 0.06343652307987213, "grad_norm_var": 1.772794064770992e-05, "learning_rate": 0.009870595349324758, "loss": 3.0803, "step": 729 }, { "crossentropy": 3.076554536819458, "epoch": 0.062095951003742766, "grad_norm": 0.05702982842922211, "grad_norm_var": 1.7782807837344457e-05, "learning_rate": 0.009870142081129882, "loss": 3.0766, "step": 730 }, { "crossentropy": 3.082230806350708, "epoch": 0.06218101395032324, "grad_norm": 0.05605310574173927, "grad_norm_var": 1.6326993153735422e-05, "learning_rate": 0.009869688030930517, "loss": 3.0822, "step": 731 }, { "crossentropy": 3.0726728439331055, "epoch": 0.06226607689690371, "grad_norm": 0.06124085187911987, "grad_norm_var": 1.601801664505483e-05, "learning_rate": 0.009869233198799572, "loss": 3.0727, "step": 732 }, { "crossentropy": 3.0992698669433594, "epoch": 0.06235113984348418, "grad_norm": 0.07059626281261444, "grad_norm_var": 1.6350187303059335e-05, "learning_rate": 0.00986877758481008, "loss": 3.0993, "step": 733 }, { "crossentropy": 3.0759010314941406, "epoch": 0.06243620279006465, "grad_norm": 0.0693092793226242, "grad_norm_var": 1.9917406896181414e-05, "learning_rate": 0.009868321189035197, "loss": 3.0759, "step": 734 }, { "crossentropy": 3.1024820804595947, "epoch": 0.06252126573664511, "grad_norm": 0.062225013971328735, "grad_norm_var": 1.8004121805704983e-05, "learning_rate": 0.009867864011548209, "loss": 3.1025, "step": 735 }, { "crossentropy": 3.112173318862915, "epoch": 0.06260632868322559, "grad_norm": 0.05509059503674507, "grad_norm_var": 2.1072963191294478e-05, "learning_rate": 0.009867406052422523, "loss": 3.1122, "step": 736 }, { "crossentropy": 3.052995443344116, "epoch": 0.06269139162980605, "grad_norm": 0.054820988327264786, "grad_norm_var": 2.3470070948616862e-05, "learning_rate": 0.00986694731173168, "loss": 3.053, "step": 737 }, { "crossentropy": 3.0943713188171387, "epoch": 0.06277645457638653, "grad_norm": 0.06846628338098526, "grad_norm_var": 2.6388189369315705e-05, "learning_rate": 0.009866487789549334, "loss": 3.0944, "step": 738 }, { "crossentropy": 3.0764310359954834, "epoch": 0.062861517522967, "grad_norm": 0.0621754415333271, "grad_norm_var": 2.609132216835937e-05, "learning_rate": 0.009866027485949273, "loss": 3.0764, "step": 739 }, { "crossentropy": 2.9698708057403564, "epoch": 0.06294658046954746, "grad_norm": 0.05917016789317131, "grad_norm_var": 2.6475218136906335e-05, "learning_rate": 0.00986556640100541, "loss": 2.9699, "step": 740 }, { "crossentropy": 3.057067632675171, "epoch": 0.06303164341612794, "grad_norm": 0.053823694586753845, "grad_norm_var": 3.065583955636127e-05, "learning_rate": 0.009865104534791782, "loss": 3.0571, "step": 741 }, { "crossentropy": 3.0574636459350586, "epoch": 0.0631167063627084, "grad_norm": 0.051265351474285126, "grad_norm_var": 3.312780871652816e-05, "learning_rate": 0.00986464188738255, "loss": 3.0575, "step": 742 }, { "crossentropy": 3.0898802280426025, "epoch": 0.06320176930928888, "grad_norm": 0.05747964605689049, "grad_norm_var": 3.3417731047519214e-05, "learning_rate": 0.009864178458852003, "loss": 3.0899, "step": 743 }, { "crossentropy": 3.0505621433258057, "epoch": 0.06328683225586934, "grad_norm": 0.06218014284968376, "grad_norm_var": 3.2934880064953e-05, "learning_rate": 0.009863714249274553, "loss": 3.0506, "step": 744 }, { "crossentropy": 3.1088550090789795, "epoch": 0.06337189520244982, "grad_norm": 0.06428556144237518, "grad_norm_var": 3.333809532151347e-05, "learning_rate": 0.009863249258724739, "loss": 3.1089, "step": 745 }, { "crossentropy": 3.0553486347198486, "epoch": 0.06345695814903028, "grad_norm": 0.05572955682873726, "grad_norm_var": 3.401517921680618e-05, "learning_rate": 0.009862783487277225, "loss": 3.0553, "step": 746 }, { "crossentropy": 3.0562126636505127, "epoch": 0.06354202109561075, "grad_norm": 0.05905952677130699, "grad_norm_var": 3.289994499218093e-05, "learning_rate": 0.009862316935006802, "loss": 3.0562, "step": 747 }, { "crossentropy": 2.9643664360046387, "epoch": 0.06362708404219122, "grad_norm": 0.05803784355521202, "grad_norm_var": 3.3195884184490594e-05, "learning_rate": 0.009861849601988383, "loss": 2.9644, "step": 748 }, { "crossentropy": 3.010713815689087, "epoch": 0.06371214698877169, "grad_norm": 0.053928814828395844, "grad_norm_var": 2.752631434664998e-05, "learning_rate": 0.00986138148829701, "loss": 3.0107, "step": 749 }, { "crossentropy": 3.0823705196380615, "epoch": 0.06379720993535216, "grad_norm": 0.0583949089050293, "grad_norm_var": 2.0246176140126192e-05, "learning_rate": 0.009860912594007846, "loss": 3.0824, "step": 750 }, { "crossentropy": 3.0495898723602295, "epoch": 0.06388227288193263, "grad_norm": 0.05850107967853546, "grad_norm_var": 1.9267489623310483e-05, "learning_rate": 0.009860442919196185, "loss": 3.0496, "step": 751 }, { "crossentropy": 3.129120111465454, "epoch": 0.0639673358285131, "grad_norm": 0.05730432644486427, "grad_norm_var": 1.8633678083973478e-05, "learning_rate": 0.00985997246393744, "loss": 3.1291, "step": 752 }, { "crossentropy": 3.225325345993042, "epoch": 0.06405239877509357, "grad_norm": 0.07135695219039917, "grad_norm_var": 2.7801796263001353e-05, "learning_rate": 0.009859501228307156, "loss": 3.2253, "step": 753 }, { "crossentropy": 3.159531831741333, "epoch": 0.06413746172167403, "grad_norm": 0.07077691704034805, "grad_norm_var": 3.091404627590573e-05, "learning_rate": 0.009859029212380995, "loss": 3.1595, "step": 754 }, { "crossentropy": 3.0391457080841064, "epoch": 0.06422252466825451, "grad_norm": 0.056867457926273346, "grad_norm_var": 3.0846490887302624e-05, "learning_rate": 0.009858556416234756, "loss": 3.0391, "step": 755 }, { "crossentropy": 3.0651416778564453, "epoch": 0.06430758761483497, "grad_norm": 0.05805139243602753, "grad_norm_var": 3.093813797173083e-05, "learning_rate": 0.00985808283994435, "loss": 3.0651, "step": 756 }, { "crossentropy": 3.0429344177246094, "epoch": 0.06439265056141545, "grad_norm": 0.055470023304224014, "grad_norm_var": 2.9929533969082154e-05, "learning_rate": 0.009857608483585823, "loss": 3.0429, "step": 757 }, { "crossentropy": 2.9755821228027344, "epoch": 0.06447771350799592, "grad_norm": 0.056293439120054245, "grad_norm_var": 2.6127745747672737e-05, "learning_rate": 0.009857133347235343, "loss": 2.9756, "step": 758 }, { "crossentropy": 3.063309669494629, "epoch": 0.06456277645457639, "grad_norm": 0.06133190169930458, "grad_norm_var": 2.5962376915193013e-05, "learning_rate": 0.009856657430969203, "loss": 3.0633, "step": 759 }, { "crossentropy": 3.0509426593780518, "epoch": 0.06464783940115686, "grad_norm": 0.06324402987957001, "grad_norm_var": 2.636391973746769e-05, "learning_rate": 0.00985618073486382, "loss": 3.0509, "step": 760 }, { "crossentropy": 3.0631794929504395, "epoch": 0.06473290234773732, "grad_norm": 0.06600529700517654, "grad_norm_var": 2.7551014040216812e-05, "learning_rate": 0.009855703258995738, "loss": 3.0632, "step": 761 }, { "crossentropy": 3.100247383117676, "epoch": 0.0648179652943178, "grad_norm": 0.054450761526823044, "grad_norm_var": 2.838512450719175e-05, "learning_rate": 0.009855225003441628, "loss": 3.1002, "step": 762 }, { "crossentropy": 3.117356300354004, "epoch": 0.06490302824089826, "grad_norm": 0.05258641391992569, "grad_norm_var": 3.176573953916698e-05, "learning_rate": 0.009854745968278282, "loss": 3.1174, "step": 763 }, { "crossentropy": 3.136849880218506, "epoch": 0.06498809118747874, "grad_norm": 0.05163827911019325, "grad_norm_var": 3.560509401442831e-05, "learning_rate": 0.009854266153582618, "loss": 3.1368, "step": 764 }, { "crossentropy": 3.1726675033569336, "epoch": 0.0650731541340592, "grad_norm": 0.0552322119474411, "grad_norm_var": 3.480605206551667e-05, "learning_rate": 0.009853785559431685, "loss": 3.1727, "step": 765 }, { "crossentropy": 3.0661139488220215, "epoch": 0.06515821708063968, "grad_norm": 0.061399780213832855, "grad_norm_var": 3.5040173737937994e-05, "learning_rate": 0.009853304185902648, "loss": 3.0661, "step": 766 }, { "crossentropy": 3.1631572246551514, "epoch": 0.06524328002722014, "grad_norm": 0.06498821079730988, "grad_norm_var": 3.688687047859e-05, "learning_rate": 0.009852822033072804, "loss": 3.1632, "step": 767 }, { "crossentropy": 3.025454521179199, "epoch": 0.06532834297380061, "grad_norm": 0.05678630992770195, "grad_norm_var": 3.7076867261522324e-05, "learning_rate": 0.009852339101019575, "loss": 3.0255, "step": 768 }, { "crossentropy": 3.074666976928711, "epoch": 0.06541340592038108, "grad_norm": 0.05723511800169945, "grad_norm_var": 2.7742558389475384e-05, "learning_rate": 0.009851855389820498, "loss": 3.0747, "step": 769 }, { "crossentropy": 3.1218278408050537, "epoch": 0.06549846886696155, "grad_norm": 0.060708194971084595, "grad_norm_var": 1.8130476924409627e-05, "learning_rate": 0.009851370899553253, "loss": 3.1218, "step": 770 }, { "crossentropy": 3.094623327255249, "epoch": 0.06558353181354203, "grad_norm": 0.07282959669828415, "grad_norm_var": 3.107398182328232e-05, "learning_rate": 0.00985088563029563, "loss": 3.0946, "step": 771 }, { "crossentropy": 3.0080344676971436, "epoch": 0.06566859476012249, "grad_norm": 0.0737462043762207, "grad_norm_var": 4.392834774083656e-05, "learning_rate": 0.009850399582125548, "loss": 3.008, "step": 772 }, { "crossentropy": 3.1333494186401367, "epoch": 0.06575365770670297, "grad_norm": 0.05993351340293884, "grad_norm_var": 4.233081911096554e-05, "learning_rate": 0.009849912755121055, "loss": 3.1333, "step": 773 }, { "crossentropy": 3.1206748485565186, "epoch": 0.06583872065328343, "grad_norm": 0.0556190051138401, "grad_norm_var": 4.273982113709163e-05, "learning_rate": 0.009849425149360321, "loss": 3.1207, "step": 774 }, { "crossentropy": 3.008094072341919, "epoch": 0.0659237835998639, "grad_norm": 0.06495123356580734, "grad_norm_var": 4.396799880634159e-05, "learning_rate": 0.00984893676492164, "loss": 3.0081, "step": 775 }, { "crossentropy": 3.127744436264038, "epoch": 0.06600884654644437, "grad_norm": 0.07398544251918793, "grad_norm_var": 5.480885118452714e-05, "learning_rate": 0.009848447601883435, "loss": 3.1277, "step": 776 }, { "crossentropy": 3.0832481384277344, "epoch": 0.06609390949302484, "grad_norm": 0.05707647651433945, "grad_norm_var": 5.4286290082504406e-05, "learning_rate": 0.00984795766032425, "loss": 3.0832, "step": 777 }, { "crossentropy": 3.0958597660064697, "epoch": 0.06617897243960531, "grad_norm": 0.05388646572828293, "grad_norm_var": 5.478562972897431e-05, "learning_rate": 0.009847466940322753, "loss": 3.0959, "step": 778 }, { "crossentropy": 3.059068202972412, "epoch": 0.06626403538618578, "grad_norm": 0.07629682868719101, "grad_norm_var": 6.399480948997097e-05, "learning_rate": 0.009846975441957744, "loss": 3.0591, "step": 779 }, { "crossentropy": 3.158196210861206, "epoch": 0.06634909833276624, "grad_norm": 0.07431437075138092, "grad_norm_var": 6.398919366075815e-05, "learning_rate": 0.009846483165308142, "loss": 3.1582, "step": 780 }, { "crossentropy": 3.0902624130249023, "epoch": 0.06643416127934672, "grad_norm": 0.05165253207087517, "grad_norm_var": 6.882537610963778e-05, "learning_rate": 0.00984599011045299, "loss": 3.0903, "step": 781 }, { "crossentropy": 3.082587957382202, "epoch": 0.06651922422592718, "grad_norm": 0.05354264751076698, "grad_norm_var": 7.484533408640872e-05, "learning_rate": 0.009845496277471461, "loss": 3.0826, "step": 782 }, { "crossentropy": 2.930307149887085, "epoch": 0.06660428717250766, "grad_norm": 0.054857708513736725, "grad_norm_var": 7.853617534080605e-05, "learning_rate": 0.00984500166644285, "loss": 2.9303, "step": 783 }, { "crossentropy": 3.1923322677612305, "epoch": 0.06668935011908812, "grad_norm": 0.06308749318122864, "grad_norm_var": 7.63527200323568e-05, "learning_rate": 0.009844506277446577, "loss": 3.1923, "step": 784 }, { "crossentropy": 3.0714595317840576, "epoch": 0.0667744130656686, "grad_norm": 0.06536412239074707, "grad_norm_var": 7.452414041781285e-05, "learning_rate": 0.009844010110562186, "loss": 3.0715, "step": 785 }, { "crossentropy": 3.06917405128479, "epoch": 0.06685947601224906, "grad_norm": 0.05792236328125, "grad_norm_var": 7.594989318894373e-05, "learning_rate": 0.00984351316586935, "loss": 3.0692, "step": 786 }, { "crossentropy": 3.0253868103027344, "epoch": 0.06694453895882953, "grad_norm": 0.061566442251205444, "grad_norm_var": 6.921697801925136e-05, "learning_rate": 0.00984301544344786, "loss": 3.0254, "step": 787 }, { "crossentropy": 3.0526044368743896, "epoch": 0.06702960190541, "grad_norm": 0.06900369375944138, "grad_norm_var": 6.342449173436371e-05, "learning_rate": 0.00984251694337764, "loss": 3.0526, "step": 788 }, { "crossentropy": 3.10251522064209, "epoch": 0.06711466485199047, "grad_norm": 0.056597739458084106, "grad_norm_var": 6.506853985908523e-05, "learning_rate": 0.009842017665738733, "loss": 3.1025, "step": 789 }, { "crossentropy": 3.06075382232666, "epoch": 0.06719972779857095, "grad_norm": 0.05955670773983002, "grad_norm_var": 6.2762105566333e-05, "learning_rate": 0.009841517610611307, "loss": 3.0608, "step": 790 }, { "crossentropy": 3.116706609725952, "epoch": 0.06728479074515141, "grad_norm": 0.05366816371679306, "grad_norm_var": 6.643526708966508e-05, "learning_rate": 0.00984101677807566, "loss": 3.1167, "step": 791 }, { "crossentropy": 3.0452685356140137, "epoch": 0.06736985369173189, "grad_norm": 0.05382014438509941, "grad_norm_var": 5.800816442683704e-05, "learning_rate": 0.009840515168212207, "loss": 3.0453, "step": 792 }, { "crossentropy": 3.082880973815918, "epoch": 0.06745491663831235, "grad_norm": 0.06645223498344421, "grad_norm_var": 5.967454241961132e-05, "learning_rate": 0.009840012781101495, "loss": 3.0829, "step": 793 }, { "crossentropy": 2.982604503631592, "epoch": 0.06753997958489281, "grad_norm": 0.057036373764276505, "grad_norm_var": 5.7422833377221426e-05, "learning_rate": 0.009839509616824192, "loss": 2.9826, "step": 794 }, { "crossentropy": 3.0784218311309814, "epoch": 0.06762504253147329, "grad_norm": 0.058751512318849564, "grad_norm_var": 4.06934005691042e-05, "learning_rate": 0.009839005675461093, "loss": 3.0784, "step": 795 }, { "crossentropy": 3.098069429397583, "epoch": 0.06771010547805376, "grad_norm": 0.06504110246896744, "grad_norm_var": 2.8152373974679675e-05, "learning_rate": 0.009838500957093113, "loss": 3.0981, "step": 796 }, { "crossentropy": 3.0823090076446533, "epoch": 0.06779516842463423, "grad_norm": 0.06083005666732788, "grad_norm_var": 2.4125808922742287e-05, "learning_rate": 0.009837995461801299, "loss": 3.0823, "step": 797 }, { "crossentropy": 3.045271158218384, "epoch": 0.0678802313712147, "grad_norm": 0.06693819165229797, "grad_norm_var": 2.4131438940756755e-05, "learning_rate": 0.009837489189666817, "loss": 3.0453, "step": 798 }, { "crossentropy": 3.03605055809021, "epoch": 0.06796529431779517, "grad_norm": 0.07208141684532166, "grad_norm_var": 2.9356982608673364e-05, "learning_rate": 0.00983698214077096, "loss": 3.0361, "step": 799 }, { "crossentropy": 3.0273725986480713, "epoch": 0.06805035726437564, "grad_norm": 0.06706991046667099, "grad_norm_var": 3.1067771385080194e-05, "learning_rate": 0.009836474315195147, "loss": 3.0274, "step": 800 }, { "crossentropy": 2.967099666595459, "epoch": 0.0681354202109561, "grad_norm": 0.05772222578525543, "grad_norm_var": 3.127081928049839e-05, "learning_rate": 0.009835965713020919, "loss": 2.9671, "step": 801 }, { "crossentropy": 3.0698060989379883, "epoch": 0.06822048315753658, "grad_norm": 0.06265562027692795, "grad_norm_var": 3.0410903254759516e-05, "learning_rate": 0.009835456334329942, "loss": 3.0698, "step": 802 }, { "crossentropy": 3.0357556343078613, "epoch": 0.06830554610411704, "grad_norm": 0.059321079403162, "grad_norm_var": 3.079577120582988e-05, "learning_rate": 0.009834946179204008, "loss": 3.0358, "step": 803 }, { "crossentropy": 3.080796241760254, "epoch": 0.06839060905069752, "grad_norm": 0.06612368673086166, "grad_norm_var": 2.8493856618190074e-05, "learning_rate": 0.009834435247725032, "loss": 3.0808, "step": 804 }, { "crossentropy": 3.0817878246307373, "epoch": 0.06847567199727798, "grad_norm": 0.05271575227379799, "grad_norm_var": 3.1962322856281196e-05, "learning_rate": 0.009833923539975057, "loss": 3.0818, "step": 805 }, { "crossentropy": 3.121781349182129, "epoch": 0.06856073494385846, "grad_norm": 0.055470895022153854, "grad_norm_var": 3.392080444228988e-05, "learning_rate": 0.00983341105603625, "loss": 3.1218, "step": 806 }, { "crossentropy": 3.116285800933838, "epoch": 0.06864579789043893, "grad_norm": 0.0626586526632309, "grad_norm_var": 3.020630300511635e-05, "learning_rate": 0.009832897795990897, "loss": 3.1163, "step": 807 }, { "crossentropy": 3.087080240249634, "epoch": 0.06873086083701939, "grad_norm": 0.06363072246313095, "grad_norm_var": 2.6119607786586652e-05, "learning_rate": 0.009832383759921416, "loss": 3.0871, "step": 808 }, { "crossentropy": 3.047938346862793, "epoch": 0.06881592378359987, "grad_norm": 0.07181815803050995, "grad_norm_var": 3.0992793717298366e-05, "learning_rate": 0.009831868947910343, "loss": 3.0479, "step": 809 }, { "crossentropy": 2.9975485801696777, "epoch": 0.06890098673018033, "grad_norm": 0.06861051917076111, "grad_norm_var": 3.094675889889024e-05, "learning_rate": 0.009831353360040346, "loss": 2.9975, "step": 810 }, { "crossentropy": 3.086026906967163, "epoch": 0.06898604967676081, "grad_norm": 0.05592836067080498, "grad_norm_var": 3.312503077892229e-05, "learning_rate": 0.009830836996394211, "loss": 3.086, "step": 811 }, { "crossentropy": 3.0333356857299805, "epoch": 0.06907111262334127, "grad_norm": 0.05338325351476669, "grad_norm_var": 3.850635003187844e-05, "learning_rate": 0.009830319857054853, "loss": 3.0333, "step": 812 }, { "crossentropy": 3.0282068252563477, "epoch": 0.06915617556992175, "grad_norm": 0.05388249456882477, "grad_norm_var": 4.289398507079633e-05, "learning_rate": 0.009829801942105306, "loss": 3.0282, "step": 813 }, { "crossentropy": 3.0496251583099365, "epoch": 0.06924123851650221, "grad_norm": 0.05366520956158638, "grad_norm_var": 4.494546147234864e-05, "learning_rate": 0.009829283251628736, "loss": 3.0496, "step": 814 }, { "crossentropy": 3.017735481262207, "epoch": 0.06932630146308268, "grad_norm": 0.05547880753874779, "grad_norm_var": 3.774475177611587e-05, "learning_rate": 0.009828763785708428, "loss": 3.0177, "step": 815 }, { "crossentropy": 3.0794458389282227, "epoch": 0.06941136440966315, "grad_norm": 0.060024749487638474, "grad_norm_var": 3.421368703508115e-05, "learning_rate": 0.009828243544427795, "loss": 3.0794, "step": 816 }, { "crossentropy": 3.0274126529693604, "epoch": 0.06949642735624362, "grad_norm": 0.08423422276973724, "grad_norm_var": 7.161888976930369e-05, "learning_rate": 0.00982772252787037, "loss": 3.0274, "step": 817 }, { "crossentropy": 3.1211745738983154, "epoch": 0.0695814903028241, "grad_norm": 0.06853119283914566, "grad_norm_var": 7.489719165121352e-05, "learning_rate": 0.009827200736119814, "loss": 3.1212, "step": 818 }, { "crossentropy": 3.0170414447784424, "epoch": 0.06966655324940456, "grad_norm": 0.05717103183269501, "grad_norm_var": 7.583722547082778e-05, "learning_rate": 0.009826678169259915, "loss": 3.017, "step": 819 }, { "crossentropy": 3.050603151321411, "epoch": 0.06975161619598502, "grad_norm": 0.053324609994888306, "grad_norm_var": 7.811351961275285e-05, "learning_rate": 0.009826154827374579, "loss": 3.0506, "step": 820 }, { "crossentropy": 3.126265048980713, "epoch": 0.0698366791425655, "grad_norm": 0.05515707656741142, "grad_norm_var": 7.590073045676987e-05, "learning_rate": 0.009825630710547838, "loss": 3.1263, "step": 821 }, { "crossentropy": 3.075298309326172, "epoch": 0.06992174208914596, "grad_norm": 0.07993538677692413, "grad_norm_var": 9.588986175095518e-05, "learning_rate": 0.009825105818863854, "loss": 3.0753, "step": 822 }, { "crossentropy": 2.9748716354370117, "epoch": 0.07000680503572644, "grad_norm": 0.05668641999363899, "grad_norm_var": 9.786506576991335e-05, "learning_rate": 0.009824580152406908, "loss": 2.9749, "step": 823 }, { "crossentropy": 3.026420831680298, "epoch": 0.0700918679823069, "grad_norm": 0.054868947714567184, "grad_norm_var": 0.00010071877339496247, "learning_rate": 0.009824053711261405, "loss": 3.0264, "step": 824 }, { "crossentropy": 2.978156805038452, "epoch": 0.07017693092888738, "grad_norm": 0.05324913188815117, "grad_norm_var": 9.652180343893147e-05, "learning_rate": 0.009823526495511879, "loss": 2.9782, "step": 825 }, { "crossentropy": 2.9932377338409424, "epoch": 0.07026199387546785, "grad_norm": 0.049181897193193436, "grad_norm_var": 9.847725748685683e-05, "learning_rate": 0.009822998505242984, "loss": 2.9932, "step": 826 }, { "crossentropy": 2.9676475524902344, "epoch": 0.07034705682204831, "grad_norm": 0.05303162708878517, "grad_norm_var": 0.00010020502688200856, "learning_rate": 0.009822469740539503, "loss": 2.9676, "step": 827 }, { "crossentropy": 3.1239192485809326, "epoch": 0.07043211976862879, "grad_norm": 0.056653089821338654, "grad_norm_var": 9.84842692368222e-05, "learning_rate": 0.009821940201486334, "loss": 3.1239, "step": 828 }, { "crossentropy": 2.9943532943725586, "epoch": 0.07051718271520925, "grad_norm": 0.04768233001232147, "grad_norm_var": 0.0001051730696253888, "learning_rate": 0.009821409888168512, "loss": 2.9944, "step": 829 }, { "crossentropy": 2.9889419078826904, "epoch": 0.07060224566178973, "grad_norm": 0.06302942335605621, "grad_norm_var": 0.00010439265802697286, "learning_rate": 0.009820878800671189, "loss": 2.9889, "step": 830 }, { "crossentropy": 3.0236105918884277, "epoch": 0.07068730860837019, "grad_norm": 0.060270652174949646, "grad_norm_var": 0.00010340872502853319, "learning_rate": 0.00982034693907964, "loss": 3.0236, "step": 831 }, { "crossentropy": 2.999070882797241, "epoch": 0.07077237155495067, "grad_norm": 0.05752190202474594, "grad_norm_var": 0.00010364664474227583, "learning_rate": 0.009819814303479266, "loss": 2.9991, "step": 832 }, { "crossentropy": 3.0535056591033936, "epoch": 0.07085743450153113, "grad_norm": 0.05620862543582916, "grad_norm_var": 5.996720001915992e-05, "learning_rate": 0.009819280893955597, "loss": 3.0535, "step": 833 }, { "crossentropy": 2.9653520584106445, "epoch": 0.0709424974481116, "grad_norm": 0.055735599249601364, "grad_norm_var": 5.1646994438804956e-05, "learning_rate": 0.00981874671059428, "loss": 2.9654, "step": 834 }, { "crossentropy": 3.064237356185913, "epoch": 0.07102756039469207, "grad_norm": 0.057998403906822205, "grad_norm_var": 5.172445059062516e-05, "learning_rate": 0.00981821175348109, "loss": 3.0642, "step": 835 }, { "crossentropy": 2.9671552181243896, "epoch": 0.07111262334127254, "grad_norm": 0.0667550191283226, "grad_norm_var": 5.658029394106593e-05, "learning_rate": 0.009817676022701926, "loss": 2.9672, "step": 836 }, { "crossentropy": 3.065833806991577, "epoch": 0.07119768628785302, "grad_norm": 0.07103994488716125, "grad_norm_var": 6.686037525608716e-05, "learning_rate": 0.00981713951834281, "loss": 3.0658, "step": 837 }, { "crossentropy": 3.0262317657470703, "epoch": 0.07128274923443348, "grad_norm": 0.0615493506193161, "grad_norm_var": 3.6029671373880715e-05, "learning_rate": 0.00981660224048989, "loss": 3.0262, "step": 838 }, { "crossentropy": 3.0548248291015625, "epoch": 0.07136781218101396, "grad_norm": 0.05721230432391167, "grad_norm_var": 3.598350086775522e-05, "learning_rate": 0.00981606418922944, "loss": 3.0548, "step": 839 }, { "crossentropy": 2.967855215072632, "epoch": 0.07145287512759442, "grad_norm": 0.05389031022787094, "grad_norm_var": 3.640288671897662e-05, "learning_rate": 0.009815525364647852, "loss": 2.9679, "step": 840 }, { "crossentropy": 3.002519130706787, "epoch": 0.07153793807417488, "grad_norm": 0.06352663785219193, "grad_norm_var": 3.7093003567352745e-05, "learning_rate": 0.009814985766831645, "loss": 3.0025, "step": 841 }, { "crossentropy": 3.104769706726074, "epoch": 0.07162300102075536, "grad_norm": 0.0637776330113411, "grad_norm_var": 3.284701357595855e-05, "learning_rate": 0.009814445395867467, "loss": 3.1048, "step": 842 }, { "crossentropy": 3.0621423721313477, "epoch": 0.07170806396733582, "grad_norm": 0.05986468121409416, "grad_norm_var": 3.022033553874975e-05, "learning_rate": 0.009813904251842084, "loss": 3.0621, "step": 843 }, { "crossentropy": 3.0643162727355957, "epoch": 0.0717931269139163, "grad_norm": 0.05959230288863182, "grad_norm_var": 2.962704626711272e-05, "learning_rate": 0.009813362334842384, "loss": 3.0643, "step": 844 }, { "crossentropy": 3.0708696842193604, "epoch": 0.07187818986049677, "grad_norm": 0.05816878378391266, "grad_norm_var": 1.9657099557653332e-05, "learning_rate": 0.009812819644955388, "loss": 3.0709, "step": 845 }, { "crossentropy": 2.942948579788208, "epoch": 0.07196325280707724, "grad_norm": 0.05268415808677673, "grad_norm_var": 2.2696908329022706e-05, "learning_rate": 0.009812276182268235, "loss": 2.9429, "step": 846 }, { "crossentropy": 3.0653371810913086, "epoch": 0.07204831575365771, "grad_norm": 0.05454040318727493, "grad_norm_var": 2.434162069716213e-05, "learning_rate": 0.009811731946868192, "loss": 3.0653, "step": 847 }, { "crossentropy": 3.0611414909362793, "epoch": 0.07213337870023817, "grad_norm": 0.0522419773042202, "grad_norm_var": 2.739143997859215e-05, "learning_rate": 0.009811186938842644, "loss": 3.0611, "step": 848 }, { "crossentropy": 2.9783785343170166, "epoch": 0.07221844164681865, "grad_norm": 0.05252374708652496, "grad_norm_var": 2.96356757014772e-05, "learning_rate": 0.009810641158279105, "loss": 2.9784, "step": 849 }, { "crossentropy": 3.0206727981567383, "epoch": 0.07230350459339911, "grad_norm": 0.06096252426505089, "grad_norm_var": 2.91944478253124e-05, "learning_rate": 0.009810094605265211, "loss": 3.0207, "step": 850 }, { "crossentropy": 3.0707337856292725, "epoch": 0.07238856753997959, "grad_norm": 0.057545579969882965, "grad_norm_var": 2.927652178196834e-05, "learning_rate": 0.009809547279888723, "loss": 3.0707, "step": 851 }, { "crossentropy": 2.9922940731048584, "epoch": 0.07247363048656005, "grad_norm": 0.06050410866737366, "grad_norm_var": 2.5352871138165493e-05, "learning_rate": 0.009808999182237528, "loss": 2.9923, "step": 852 }, { "crossentropy": 3.10455060005188, "epoch": 0.07255869343314053, "grad_norm": 0.07736887782812119, "grad_norm_var": 3.8247105148855356e-05, "learning_rate": 0.009808450312399629, "loss": 3.1046, "step": 853 }, { "crossentropy": 3.0169851779937744, "epoch": 0.072643756379721, "grad_norm": 0.05994206666946411, "grad_norm_var": 3.7888391517347e-05, "learning_rate": 0.009807900670463164, "loss": 3.017, "step": 854 }, { "crossentropy": 3.0949885845184326, "epoch": 0.07272881932630146, "grad_norm": 0.05506741255521774, "grad_norm_var": 3.869336788686964e-05, "learning_rate": 0.009807350256516387, "loss": 3.095, "step": 855 }, { "crossentropy": 3.096492052078247, "epoch": 0.07281388227288194, "grad_norm": 0.05499151721596718, "grad_norm_var": 3.803542251146526e-05, "learning_rate": 0.00980679907064768, "loss": 3.0965, "step": 856 }, { "crossentropy": 3.040985584259033, "epoch": 0.0728989452194624, "grad_norm": 0.06710875034332275, "grad_norm_var": 4.102020693067572e-05, "learning_rate": 0.009806247112945548, "loss": 3.041, "step": 857 }, { "crossentropy": 2.999753952026367, "epoch": 0.07298400816604288, "grad_norm": 0.05882876738905907, "grad_norm_var": 3.951735554647932e-05, "learning_rate": 0.009805694383498618, "loss": 2.9998, "step": 858 }, { "crossentropy": 3.0755343437194824, "epoch": 0.07306907111262334, "grad_norm": 0.06649443507194519, "grad_norm_var": 4.314285826130844e-05, "learning_rate": 0.009805140882395643, "loss": 3.0755, "step": 859 }, { "crossentropy": 3.051525354385376, "epoch": 0.0731541340592038, "grad_norm": 0.08160477876663208, "grad_norm_var": 7.432811682351488e-05, "learning_rate": 0.009804586609725498, "loss": 3.0515, "step": 860 }, { "crossentropy": 3.000039577484131, "epoch": 0.07323919700578428, "grad_norm": 0.07458248734474182, "grad_norm_var": 8.57117628341987e-05, "learning_rate": 0.009804031565577185, "loss": 3.0, "step": 861 }, { "crossentropy": 3.0632100105285645, "epoch": 0.07332425995236475, "grad_norm": 0.05780196562409401, "grad_norm_var": 8.12054690762171e-05, "learning_rate": 0.009803475750039828, "loss": 3.0632, "step": 862 }, { "crossentropy": 3.051729917526245, "epoch": 0.07340932289894522, "grad_norm": 0.09283655136823654, "grad_norm_var": 0.0001347428980036535, "learning_rate": 0.009802919163202676, "loss": 3.0517, "step": 863 }, { "crossentropy": 3.012857675552368, "epoch": 0.07349438584552569, "grad_norm": 0.05071191489696503, "grad_norm_var": 0.0001373696247188592, "learning_rate": 0.009802361805155096, "loss": 3.0129, "step": 864 }, { "crossentropy": 3.0048511028289795, "epoch": 0.07357944879210616, "grad_norm": 0.05890634283423424, "grad_norm_var": 0.00012988996375751417, "learning_rate": 0.00980180367598659, "loss": 3.0049, "step": 865 }, { "crossentropy": 3.049130916595459, "epoch": 0.07366451173868663, "grad_norm": 0.052887603640556335, "grad_norm_var": 0.00013799311950295768, "learning_rate": 0.009801244775786772, "loss": 3.0491, "step": 866 }, { "crossentropy": 3.0358283519744873, "epoch": 0.07374957468526709, "grad_norm": 0.05672507733106613, "grad_norm_var": 0.00013876307678922783, "learning_rate": 0.00980068510464539, "loss": 3.0358, "step": 867 }, { "crossentropy": 3.1132137775421143, "epoch": 0.07383463763184757, "grad_norm": 0.07267581671476364, "grad_norm_var": 0.00014210937261752501, "learning_rate": 0.009800124662652308, "loss": 3.1132, "step": 868 }, { "crossentropy": 2.991874933242798, "epoch": 0.07391970057842803, "grad_norm": 0.057078346610069275, "grad_norm_var": 0.00013413034178759998, "learning_rate": 0.009799563449897518, "loss": 2.9919, "step": 869 }, { "crossentropy": 2.9893674850463867, "epoch": 0.07400476352500851, "grad_norm": 0.06244185194373131, "grad_norm_var": 0.00013328828124139163, "learning_rate": 0.009799001466471132, "loss": 2.9894, "step": 870 }, { "crossentropy": 2.994790554046631, "epoch": 0.07408982647158897, "grad_norm": 0.05922921001911163, "grad_norm_var": 0.00012952700345038217, "learning_rate": 0.009798438712463393, "loss": 2.9948, "step": 871 }, { "crossentropy": 2.9667234420776367, "epoch": 0.07417488941816945, "grad_norm": 0.05333367735147476, "grad_norm_var": 0.00013170257206918275, "learning_rate": 0.00979787518796466, "loss": 2.9667, "step": 872 }, { "crossentropy": 3.2066235542297363, "epoch": 0.07425995236474991, "grad_norm": 0.05768999829888344, "grad_norm_var": 0.000133283997184314, "learning_rate": 0.009797310893065419, "loss": 3.2066, "step": 873 }, { "crossentropy": 2.9919819831848145, "epoch": 0.07434501531133038, "grad_norm": 0.051934633404016495, "grad_norm_var": 0.00014042370894691518, "learning_rate": 0.009796745827856279, "loss": 2.992, "step": 874 }, { "crossentropy": 3.121819019317627, "epoch": 0.07443007825791086, "grad_norm": 0.05233709141612053, "grad_norm_var": 0.00014622866904511386, "learning_rate": 0.009796179992427975, "loss": 3.1218, "step": 875 }, { "crossentropy": 3.06063175201416, "epoch": 0.07451514120449132, "grad_norm": 0.05658901482820511, "grad_norm_var": 0.00012011202738504214, "learning_rate": 0.009795613386871365, "loss": 3.0606, "step": 876 }, { "crossentropy": 2.9831349849700928, "epoch": 0.0746002041510718, "grad_norm": 0.0579221248626709, "grad_norm_var": 0.00010614432722709025, "learning_rate": 0.009795046011277427, "loss": 2.9831, "step": 877 }, { "crossentropy": 2.981163740158081, "epoch": 0.07468526709765226, "grad_norm": 0.06128733977675438, "grad_norm_var": 0.00010614056694365215, "learning_rate": 0.009794477865737264, "loss": 2.9812, "step": 878 }, { "crossentropy": 2.8985438346862793, "epoch": 0.07477033004423274, "grad_norm": 0.0698743686079979, "grad_norm_var": 3.752538269762111e-05, "learning_rate": 0.009793908950342106, "loss": 2.8985, "step": 879 }, { "crossentropy": 3.0627195835113525, "epoch": 0.0748553929908132, "grad_norm": 0.05713225528597832, "grad_norm_var": 3.366883289600732e-05, "learning_rate": 0.009793339265183302, "loss": 3.0627, "step": 880 }, { "crossentropy": 3.0310025215148926, "epoch": 0.07494045593739367, "grad_norm": 0.05367850139737129, "grad_norm_var": 3.5182819189657524e-05, "learning_rate": 0.009792768810352332, "loss": 3.031, "step": 881 }, { "crossentropy": 3.171278476715088, "epoch": 0.07502551888397414, "grad_norm": 0.05335028097033501, "grad_norm_var": 3.4862240966732996e-05, "learning_rate": 0.009792197585940791, "loss": 3.1713, "step": 882 }, { "crossentropy": 3.0352015495300293, "epoch": 0.0751105818305546, "grad_norm": 0.05093933641910553, "grad_norm_var": 3.819248491008393e-05, "learning_rate": 0.009791625592040401, "loss": 3.0352, "step": 883 }, { "crossentropy": 3.080305337905884, "epoch": 0.07519564477713508, "grad_norm": 0.06391490995883942, "grad_norm_var": 2.5809496691076018e-05, "learning_rate": 0.00979105282874301, "loss": 3.0803, "step": 884 }, { "crossentropy": 3.0644497871398926, "epoch": 0.07528070772371555, "grad_norm": 0.05691353976726532, "grad_norm_var": 2.5818719620829268e-05, "learning_rate": 0.009790479296140584, "loss": 3.0644, "step": 885 }, { "crossentropy": 3.04431414604187, "epoch": 0.07536577067029603, "grad_norm": 0.05581436678767204, "grad_norm_var": 2.411792146395364e-05, "learning_rate": 0.009789904994325218, "loss": 3.0443, "step": 886 }, { "crossentropy": 3.0957190990448, "epoch": 0.07545083361687649, "grad_norm": 0.05953856557607651, "grad_norm_var": 2.421600490630574e-05, "learning_rate": 0.009789329923389127, "loss": 3.0957, "step": 887 }, { "crossentropy": 3.0344629287719727, "epoch": 0.07553589656345695, "grad_norm": 0.06439389288425446, "grad_norm_var": 2.6431776172863854e-05, "learning_rate": 0.009788754083424653, "loss": 3.0345, "step": 888 }, { "crossentropy": 2.9830257892608643, "epoch": 0.07562095951003743, "grad_norm": 0.05389329791069031, "grad_norm_var": 2.734125994300427e-05, "learning_rate": 0.009788177474524259, "loss": 2.983, "step": 889 }, { "crossentropy": 3.092524766921997, "epoch": 0.0757060224566179, "grad_norm": 0.053541772067546844, "grad_norm_var": 2.63166307584319e-05, "learning_rate": 0.009787600096780529, "loss": 3.0925, "step": 890 }, { "crossentropy": 2.996852397918701, "epoch": 0.07579108540319837, "grad_norm": 0.05369093641638756, "grad_norm_var": 2.5486573063728442e-05, "learning_rate": 0.009787021950286174, "loss": 2.9969, "step": 891 }, { "crossentropy": 2.9981844425201416, "epoch": 0.07587614834977884, "grad_norm": 0.06253205984830856, "grad_norm_var": 2.6849638737141043e-05, "learning_rate": 0.00978644303513403, "loss": 2.9982, "step": 892 }, { "crossentropy": 2.9841833114624023, "epoch": 0.07596121129635931, "grad_norm": 0.056677818298339844, "grad_norm_var": 2.6963657139593886e-05, "learning_rate": 0.009785863351417052, "loss": 2.9842, "step": 893 }, { "crossentropy": 2.9936816692352295, "epoch": 0.07604627424293978, "grad_norm": 0.053862959146499634, "grad_norm_var": 2.7103400086027294e-05, "learning_rate": 0.009785282899228323, "loss": 2.9937, "step": 894 }, { "crossentropy": 3.039564847946167, "epoch": 0.07613133718952024, "grad_norm": 0.05880796164274216, "grad_norm_var": 1.6475685398953793e-05, "learning_rate": 0.009784701678661044, "loss": 3.0396, "step": 895 }, { "crossentropy": 2.945770263671875, "epoch": 0.07621640013610072, "grad_norm": 0.0509801022708416, "grad_norm_var": 1.8562676032330267e-05, "learning_rate": 0.009784119689808544, "loss": 2.9458, "step": 896 }, { "crossentropy": 3.0557477474212646, "epoch": 0.07630146308268118, "grad_norm": 0.05629444867372513, "grad_norm_var": 1.8038294698470802e-05, "learning_rate": 0.009783536932764273, "loss": 3.0557, "step": 897 }, { "crossentropy": 3.0440216064453125, "epoch": 0.07638652602926166, "grad_norm": 0.05635048821568489, "grad_norm_var": 1.7312239575424196e-05, "learning_rate": 0.009782953407621805, "loss": 3.044, "step": 898 }, { "crossentropy": 3.0900657176971436, "epoch": 0.07647158897584212, "grad_norm": 0.05941100791096687, "grad_norm_var": 1.5224005239966066e-05, "learning_rate": 0.009782369114474837, "loss": 3.0901, "step": 899 }, { "crossentropy": 3.0690112113952637, "epoch": 0.07655665192242259, "grad_norm": 0.06153253838419914, "grad_norm_var": 1.3473902272895244e-05, "learning_rate": 0.00978178405341719, "loss": 3.069, "step": 900 }, { "crossentropy": 3.0373575687408447, "epoch": 0.07664171486900306, "grad_norm": 0.058572959154844284, "grad_norm_var": 1.359595984100666e-05, "learning_rate": 0.00978119822454281, "loss": 3.0374, "step": 901 }, { "crossentropy": 2.9906647205352783, "epoch": 0.07672677781558353, "grad_norm": 0.05811012163758278, "grad_norm_var": 1.3487922665940805e-05, "learning_rate": 0.00978061162794576, "loss": 2.9907, "step": 902 }, { "crossentropy": 3.0571482181549072, "epoch": 0.076811840762164, "grad_norm": 0.057784054428339005, "grad_norm_var": 1.3176975252477778e-05, "learning_rate": 0.009780024263720233, "loss": 3.0571, "step": 903 }, { "crossentropy": 2.9477837085723877, "epoch": 0.07689690370874447, "grad_norm": 0.05415748432278633, "grad_norm_var": 1.0012832863560914e-05, "learning_rate": 0.009779436131960542, "loss": 2.9478, "step": 904 }, { "crossentropy": 2.946969747543335, "epoch": 0.07698196665532495, "grad_norm": 0.059765636920928955, "grad_norm_var": 1.001945386796954e-05, "learning_rate": 0.009778847232761127, "loss": 2.947, "step": 905 }, { "crossentropy": 2.966862678527832, "epoch": 0.07706702960190541, "grad_norm": 0.0697539746761322, "grad_norm_var": 1.8961500017868466e-05, "learning_rate": 0.009778257566216544, "loss": 2.9669, "step": 906 }, { "crossentropy": 3.0302345752716064, "epoch": 0.07715209254848587, "grad_norm": 0.05686522275209427, "grad_norm_var": 1.775996857825308e-05, "learning_rate": 0.009777667132421479, "loss": 3.0302, "step": 907 }, { "crossentropy": 3.0046470165252686, "epoch": 0.07723715549506635, "grad_norm": 0.05536296218633652, "grad_norm_var": 1.6846751782486023e-05, "learning_rate": 0.009777075931470735, "loss": 3.0046, "step": 908 }, { "crossentropy": 3.028444290161133, "epoch": 0.07732221844164681, "grad_norm": 0.05322221294045448, "grad_norm_var": 1.8095425794926954e-05, "learning_rate": 0.009776483963459249, "loss": 3.0284, "step": 909 }, { "crossentropy": 2.909207820892334, "epoch": 0.07740728138822729, "grad_norm": 0.056741878390312195, "grad_norm_var": 1.7197325474678907e-05, "learning_rate": 0.009775891228482068, "loss": 2.9092, "step": 910 }, { "crossentropy": 3.0053131580352783, "epoch": 0.07749234433480776, "grad_norm": 0.05463126674294472, "grad_norm_var": 1.768846581145488e-05, "learning_rate": 0.00977529772663437, "loss": 3.0053, "step": 911 }, { "crossentropy": 2.993997812271118, "epoch": 0.07757740728138823, "grad_norm": 0.06025095283985138, "grad_norm_var": 1.5036744200464374e-05, "learning_rate": 0.009774703458011453, "loss": 2.994, "step": 912 }, { "crossentropy": 3.0355262756347656, "epoch": 0.0776624702279687, "grad_norm": 0.06640662252902985, "grad_norm_var": 1.9060148337654975e-05, "learning_rate": 0.009774108422708741, "loss": 3.0355, "step": 913 }, { "crossentropy": 2.9432289600372314, "epoch": 0.07774753317454916, "grad_norm": 0.060333702713251114, "grad_norm_var": 1.8813273222391808e-05, "learning_rate": 0.00977351262082178, "loss": 2.9432, "step": 914 }, { "crossentropy": 2.997549295425415, "epoch": 0.07783259612112964, "grad_norm": 0.052743833512067795, "grad_norm_var": 2.1165134545361673e-05, "learning_rate": 0.009772916052446236, "loss": 2.9975, "step": 915 }, { "crossentropy": 2.976132869720459, "epoch": 0.0779176590677101, "grad_norm": 0.050778843462467194, "grad_norm_var": 2.4065721677464744e-05, "learning_rate": 0.009772318717677904, "loss": 2.9761, "step": 916 }, { "crossentropy": 3.057413101196289, "epoch": 0.07800272201429058, "grad_norm": 0.05120636150240898, "grad_norm_var": 2.6740033898816027e-05, "learning_rate": 0.009771720616612698, "loss": 3.0574, "step": 917 }, { "crossentropy": 2.9345006942749023, "epoch": 0.07808778496087104, "grad_norm": 0.059923846274614334, "grad_norm_var": 2.7121668003863426e-05, "learning_rate": 0.009771121749346653, "loss": 2.9345, "step": 918 }, { "crossentropy": 2.9840660095214844, "epoch": 0.07817284790745152, "grad_norm": 0.061992377042770386, "grad_norm_var": 2.8390422362890333e-05, "learning_rate": 0.009770522115975932, "loss": 2.9841, "step": 919 }, { "crossentropy": 2.9970736503601074, "epoch": 0.07825791085403198, "grad_norm": 0.07226977497339249, "grad_norm_var": 4.0197334132387416e-05, "learning_rate": 0.00976992171659682, "loss": 2.9971, "step": 920 }, { "crossentropy": 2.975525379180908, "epoch": 0.07834297380061245, "grad_norm": 0.07131297886371613, "grad_norm_var": 4.9878412554010586e-05, "learning_rate": 0.009769320551305721, "loss": 2.9755, "step": 921 }, { "crossentropy": 2.852834939956665, "epoch": 0.07842803674719293, "grad_norm": 0.050560951232910156, "grad_norm_var": 4.6948420375058966e-05, "learning_rate": 0.009768718620199167, "loss": 2.8528, "step": 922 }, { "crossentropy": 2.9865431785583496, "epoch": 0.07851309969377339, "grad_norm": 0.05446033924818039, "grad_norm_var": 4.780609903402728e-05, "learning_rate": 0.00976811592337381, "loss": 2.9865, "step": 923 }, { "crossentropy": 3.003411293029785, "epoch": 0.07859816264035387, "grad_norm": 0.05575607344508171, "grad_norm_var": 4.766378235718556e-05, "learning_rate": 0.009767512460926426, "loss": 3.0034, "step": 924 }, { "crossentropy": 3.002171039581299, "epoch": 0.07868322558693433, "grad_norm": 0.054068390280008316, "grad_norm_var": 4.7137105513412175e-05, "learning_rate": 0.009766908232953914, "loss": 3.0022, "step": 925 }, { "crossentropy": 3.079634428024292, "epoch": 0.07876828853351481, "grad_norm": 0.0664224848151207, "grad_norm_var": 5.093161357028793e-05, "learning_rate": 0.009766303239553294, "loss": 3.0796, "step": 926 }, { "crossentropy": 3.045903205871582, "epoch": 0.07885335148009527, "grad_norm": 0.06754235178232193, "grad_norm_var": 5.392425312066087e-05, "learning_rate": 0.009765697480821714, "loss": 3.0459, "step": 927 }, { "crossentropy": 3.0558063983917236, "epoch": 0.07893841442667573, "grad_norm": 0.06854565441608429, "grad_norm_var": 5.87763509756957e-05, "learning_rate": 0.009765090956856437, "loss": 3.0558, "step": 928 }, { "crossentropy": 3.0482981204986572, "epoch": 0.07902347737325621, "grad_norm": 0.05582670122385025, "grad_norm_var": 5.711601178151522e-05, "learning_rate": 0.009764483667754856, "loss": 3.0483, "step": 929 }, { "crossentropy": 2.984412908554077, "epoch": 0.07910854031983668, "grad_norm": 0.053367484360933304, "grad_norm_var": 5.94759377781399e-05, "learning_rate": 0.009763875613614482, "loss": 2.9844, "step": 930 }, { "crossentropy": 3.030747413635254, "epoch": 0.07919360326641715, "grad_norm": 0.05106113478541374, "grad_norm_var": 6.109549815266161e-05, "learning_rate": 0.009763266794532953, "loss": 3.0307, "step": 931 }, { "crossentropy": 3.0034420490264893, "epoch": 0.07927866621299762, "grad_norm": 0.06072881445288658, "grad_norm_var": 5.6285560298561095e-05, "learning_rate": 0.009762657210608029, "loss": 3.0034, "step": 932 }, { "crossentropy": 3.060572385787964, "epoch": 0.0793637291595781, "grad_norm": 0.06353352218866348, "grad_norm_var": 5.183851350383702e-05, "learning_rate": 0.009762046861937587, "loss": 3.0606, "step": 933 }, { "crossentropy": 3.0545809268951416, "epoch": 0.07944879210615856, "grad_norm": 0.06099335476756096, "grad_norm_var": 5.1833433113051876e-05, "learning_rate": 0.009761435748619636, "loss": 3.0546, "step": 934 }, { "crossentropy": 2.8590810298919678, "epoch": 0.07953385505273902, "grad_norm": 0.052815742790699005, "grad_norm_var": 5.530442711831248e-05, "learning_rate": 0.009760823870752302, "loss": 2.8591, "step": 935 }, { "crossentropy": 2.9877021312713623, "epoch": 0.0796189179993195, "grad_norm": 0.054262518882751465, "grad_norm_var": 4.600124457624898e-05, "learning_rate": 0.009760211228433832, "loss": 2.9877, "step": 936 }, { "crossentropy": 2.9786627292633057, "epoch": 0.07970398094589996, "grad_norm": 0.062384653836488724, "grad_norm_var": 3.61215524579197e-05, "learning_rate": 0.009759597821762602, "loss": 2.9787, "step": 937 }, { "crossentropy": 3.060579776763916, "epoch": 0.07978904389248044, "grad_norm": 0.05525600165128708, "grad_norm_var": 3.2672956754281555e-05, "learning_rate": 0.009758983650837107, "loss": 3.0606, "step": 938 }, { "crossentropy": 3.0889384746551514, "epoch": 0.0798741068390609, "grad_norm": 0.055842868983745575, "grad_norm_var": 3.203594670898585e-05, "learning_rate": 0.009758368715755965, "loss": 3.0889, "step": 939 }, { "crossentropy": 3.0863733291625977, "epoch": 0.07995916978564137, "grad_norm": 0.05015159770846367, "grad_norm_var": 3.6161968544803856e-05, "learning_rate": 0.009757753016617916, "loss": 3.0864, "step": 940 }, { "crossentropy": 3.0180671215057373, "epoch": 0.08004423273222185, "grad_norm": 0.053215526044368744, "grad_norm_var": 3.66886514009399e-05, "learning_rate": 0.009757136553521825, "loss": 3.0181, "step": 941 }, { "crossentropy": 3.2077126502990723, "epoch": 0.08012929567880231, "grad_norm": 0.06043684110045433, "grad_norm_var": 3.24030793197399e-05, "learning_rate": 0.009756519326566678, "loss": 3.2077, "step": 942 }, { "crossentropy": 3.10659122467041, "epoch": 0.08021435862538279, "grad_norm": 0.07055306434631348, "grad_norm_var": 3.685123662060663e-05, "learning_rate": 0.00975590133585158, "loss": 3.1066, "step": 943 }, { "crossentropy": 2.9407715797424316, "epoch": 0.08029942157196325, "grad_norm": 0.07291015982627869, "grad_norm_var": 4.414318965108201e-05, "learning_rate": 0.009755282581475769, "loss": 2.9408, "step": 944 }, { "crossentropy": 2.985671281814575, "epoch": 0.08038448451854373, "grad_norm": 0.07366187125444412, "grad_norm_var": 5.806220339054548e-05, "learning_rate": 0.009754663063538593, "loss": 2.9857, "step": 945 }, { "crossentropy": 3.033886194229126, "epoch": 0.08046954746512419, "grad_norm": 0.05834905058145523, "grad_norm_var": 5.5574174233232144e-05, "learning_rate": 0.009754042782139533, "loss": 3.0339, "step": 946 }, { "crossentropy": 2.974964141845703, "epoch": 0.08055461041170466, "grad_norm": 0.048773691058158875, "grad_norm_var": 5.85542253187067e-05, "learning_rate": 0.009753421737378186, "loss": 2.975, "step": 947 }, { "crossentropy": 3.037642240524292, "epoch": 0.08063967335828513, "grad_norm": 0.05608108639717102, "grad_norm_var": 5.9215217791434384e-05, "learning_rate": 0.009752799929354275, "loss": 3.0376, "step": 948 }, { "crossentropy": 2.957618474960327, "epoch": 0.0807247363048656, "grad_norm": 0.05859851837158203, "grad_norm_var": 5.7969035769317216e-05, "learning_rate": 0.009752177358167644, "loss": 2.9576, "step": 949 }, { "crossentropy": 3.046475887298584, "epoch": 0.08080979925144607, "grad_norm": 0.057708799839019775, "grad_norm_var": 5.777817666771234e-05, "learning_rate": 0.00975155402391826, "loss": 3.0465, "step": 950 }, { "crossentropy": 3.115068197250366, "epoch": 0.08089486219802654, "grad_norm": 0.058604057878255844, "grad_norm_var": 5.524396815715186e-05, "learning_rate": 0.009750929926706215, "loss": 3.1151, "step": 951 }, { "crossentropy": 2.9084596633911133, "epoch": 0.08097992514460702, "grad_norm": 0.06265252828598022, "grad_norm_var": 5.414872688306316e-05, "learning_rate": 0.009750305066631717, "loss": 2.9085, "step": 952 }, { "crossentropy": 2.9293785095214844, "epoch": 0.08106498809118748, "grad_norm": 0.05867592245340347, "grad_norm_var": 5.368023193479532e-05, "learning_rate": 0.0097496794437951, "loss": 2.9294, "step": 953 }, { "crossentropy": 2.9931833744049072, "epoch": 0.08115005103776794, "grad_norm": 0.05133830010890961, "grad_norm_var": 5.683915046092858e-05, "learning_rate": 0.009749053058296826, "loss": 2.9932, "step": 954 }, { "crossentropy": 2.9774272441864014, "epoch": 0.08123511398434842, "grad_norm": 0.0522121824324131, "grad_norm_var": 5.929888401015581e-05, "learning_rate": 0.009748425910237472, "loss": 2.9774, "step": 955 }, { "crossentropy": 2.9678642749786377, "epoch": 0.08132017693092888, "grad_norm": 0.053754452615976334, "grad_norm_var": 5.586187384191051e-05, "learning_rate": 0.009747797999717738, "loss": 2.9679, "step": 956 }, { "crossentropy": 3.0445735454559326, "epoch": 0.08140523987750936, "grad_norm": 0.054829828441143036, "grad_norm_var": 5.473226079047155e-05, "learning_rate": 0.00974716932683845, "loss": 3.0446, "step": 957 }, { "crossentropy": 2.9263131618499756, "epoch": 0.08149030282408982, "grad_norm": 0.05012720450758934, "grad_norm_var": 5.9841817362089414e-05, "learning_rate": 0.009746539891700557, "loss": 2.9263, "step": 958 }, { "crossentropy": 2.9792842864990234, "epoch": 0.0815753657706703, "grad_norm": 0.05393484979867935, "grad_norm_var": 5.0787424429437945e-05, "learning_rate": 0.009745909694405124, "loss": 2.9793, "step": 959 }, { "crossentropy": 3.064539670944214, "epoch": 0.08166042871725077, "grad_norm": 0.06620114296674728, "grad_norm_var": 3.993936756672423e-05, "learning_rate": 0.009745278735053344, "loss": 3.0645, "step": 960 }, { "crossentropy": 2.9642324447631836, "epoch": 0.08174549166383123, "grad_norm": 0.05578635632991791, "grad_norm_var": 2.0720196192989397e-05, "learning_rate": 0.00974464701374653, "loss": 2.9642, "step": 961 }, { "crossentropy": 3.0193088054656982, "epoch": 0.08183055461041171, "grad_norm": 0.05394682660698891, "grad_norm_var": 2.061233593032326e-05, "learning_rate": 0.009744014530586122, "loss": 3.0193, "step": 962 }, { "crossentropy": 3.0495078563690186, "epoch": 0.08191561755699217, "grad_norm": 0.05163835361599922, "grad_norm_var": 1.843133172891471e-05, "learning_rate": 0.009743381285673676, "loss": 3.0495, "step": 963 }, { "crossentropy": 2.970799684524536, "epoch": 0.08200068050357265, "grad_norm": 0.04963225498795509, "grad_norm_var": 2.0965682912603827e-05, "learning_rate": 0.009742747279110871, "loss": 2.9708, "step": 964 }, { "crossentropy": 2.9446258544921875, "epoch": 0.08208574345015311, "grad_norm": 0.05316834896802902, "grad_norm_var": 2.063949063741081e-05, "learning_rate": 0.009742112510999514, "loss": 2.9446, "step": 965 }, { "crossentropy": 3.0829508304595947, "epoch": 0.08217080639673359, "grad_norm": 0.05647148936986923, "grad_norm_var": 2.0331714204617433e-05, "learning_rate": 0.009741476981441528, "loss": 3.083, "step": 966 }, { "crossentropy": 2.9763195514678955, "epoch": 0.08225586934331405, "grad_norm": 0.05858500301837921, "grad_norm_var": 2.0323052513939362e-05, "learning_rate": 0.009740840690538962, "loss": 2.9763, "step": 967 }, { "crossentropy": 3.0705251693725586, "epoch": 0.08234093228989452, "grad_norm": 0.053255364298820496, "grad_norm_var": 1.6485354254163383e-05, "learning_rate": 0.009740203638393983, "loss": 3.0705, "step": 968 }, { "crossentropy": 2.904892921447754, "epoch": 0.082425995236475, "grad_norm": 0.04951563477516174, "grad_norm_var": 1.6748352132720663e-05, "learning_rate": 0.009739565825108888, "loss": 2.9049, "step": 969 }, { "crossentropy": 2.9926018714904785, "epoch": 0.08251105818305546, "grad_norm": 0.04806813597679138, "grad_norm_var": 1.858811982409017e-05, "learning_rate": 0.009738927250786088, "loss": 2.9926, "step": 970 }, { "crossentropy": 2.9762532711029053, "epoch": 0.08259612112963594, "grad_norm": 0.05201199650764465, "grad_norm_var": 1.863355185994734e-05, "learning_rate": 0.00973828791552812, "loss": 2.9763, "step": 971 }, { "crossentropy": 3.003765821456909, "epoch": 0.0826811840762164, "grad_norm": 0.05265461280941963, "grad_norm_var": 1.8717000367430604e-05, "learning_rate": 0.009737647819437645, "loss": 3.0038, "step": 972 }, { "crossentropy": 3.0501043796539307, "epoch": 0.08276624702279688, "grad_norm": 0.06468400359153748, "grad_norm_var": 2.6218997516983384e-05, "learning_rate": 0.00973700696261744, "loss": 3.0501, "step": 973 }, { "crossentropy": 3.0205163955688477, "epoch": 0.08285130996937734, "grad_norm": 0.05402204021811485, "grad_norm_var": 2.497151285668671e-05, "learning_rate": 0.009736365345170413, "loss": 3.0205, "step": 974 }, { "crossentropy": 2.967601776123047, "epoch": 0.0829363729159578, "grad_norm": 0.05261608213186264, "grad_norm_var": 2.5196907554961234e-05, "learning_rate": 0.009735722967199585, "loss": 2.9676, "step": 975 }, { "crossentropy": 2.990175485610962, "epoch": 0.08302143586253828, "grad_norm": 0.05254777893424034, "grad_norm_var": 1.5575790264823106e-05, "learning_rate": 0.009735079828808106, "loss": 2.9902, "step": 976 }, { "crossentropy": 2.994816541671753, "epoch": 0.08310649880911875, "grad_norm": 0.05905655026435852, "grad_norm_var": 1.717011537098375e-05, "learning_rate": 0.009734435930099246, "loss": 2.9948, "step": 977 }, { "crossentropy": 3.0143752098083496, "epoch": 0.08319156175569922, "grad_norm": 0.050527866929769516, "grad_norm_var": 1.7864376387251533e-05, "learning_rate": 0.009733791271176392, "loss": 3.0144, "step": 978 }, { "crossentropy": 2.969043493270874, "epoch": 0.08327662470227969, "grad_norm": 0.04940124601125717, "grad_norm_var": 1.8778237930578274e-05, "learning_rate": 0.009733145852143063, "loss": 2.969, "step": 979 }, { "crossentropy": 3.00123929977417, "epoch": 0.08336168764886015, "grad_norm": 0.0520930178463459, "grad_norm_var": 1.7883205007670987e-05, "learning_rate": 0.009732499673102895, "loss": 3.0012, "step": 980 }, { "crossentropy": 3.0503549575805664, "epoch": 0.08344675059544063, "grad_norm": 0.056241560727357864, "grad_norm_var": 1.8268982719624875e-05, "learning_rate": 0.00973185273415964, "loss": 3.0504, "step": 981 }, { "crossentropy": 2.9034998416900635, "epoch": 0.08353181354202109, "grad_norm": 0.06105273962020874, "grad_norm_var": 2.1176199208740773e-05, "learning_rate": 0.009731205035417183, "loss": 2.9035, "step": 982 }, { "crossentropy": 2.9620423316955566, "epoch": 0.08361687648860157, "grad_norm": 0.054766684770584106, "grad_norm_var": 1.98274092232655e-05, "learning_rate": 0.009730556576979523, "loss": 2.962, "step": 983 }, { "crossentropy": 2.8910999298095703, "epoch": 0.08370193943518203, "grad_norm": 0.05101349949836731, "grad_norm_var": 2.0336377409467068e-05, "learning_rate": 0.009729907358950785, "loss": 2.8911, "step": 984 }, { "crossentropy": 3.0728096961975098, "epoch": 0.08378700238176251, "grad_norm": 0.05620859935879707, "grad_norm_var": 1.93421341833907e-05, "learning_rate": 0.009729257381435213, "loss": 3.0728, "step": 985 }, { "crossentropy": 3.0012829303741455, "epoch": 0.08387206532834297, "grad_norm": 0.05725979059934616, "grad_norm_var": 1.7125503103299482e-05, "learning_rate": 0.009728606644537176, "loss": 3.0013, "step": 986 }, { "crossentropy": 3.0233657360076904, "epoch": 0.08395712827492344, "grad_norm": 0.05570556968450546, "grad_norm_var": 1.6624890757729823e-05, "learning_rate": 0.009727955148361165, "loss": 3.0234, "step": 987 }, { "crossentropy": 2.966981887817383, "epoch": 0.08404219122150391, "grad_norm": 0.05648618936538696, "grad_norm_var": 1.634898487285023e-05, "learning_rate": 0.00972730289301179, "loss": 2.967, "step": 988 }, { "crossentropy": 3.0159175395965576, "epoch": 0.08412725416808438, "grad_norm": 0.05105433613061905, "grad_norm_var": 1.077918356123415e-05, "learning_rate": 0.009726649878593785, "loss": 3.0159, "step": 989 }, { "crossentropy": 2.97021222114563, "epoch": 0.08421231711466486, "grad_norm": 0.0550532191991806, "grad_norm_var": 1.0796652882565962e-05, "learning_rate": 0.009725996105212003, "loss": 2.9702, "step": 990 }, { "crossentropy": 2.9446113109588623, "epoch": 0.08429738006124532, "grad_norm": 0.050988562405109406, "grad_norm_var": 1.1358605808612678e-05, "learning_rate": 0.009725341572971424, "loss": 2.9446, "step": 991 }, { "crossentropy": 3.04758620262146, "epoch": 0.0843824430078258, "grad_norm": 0.054956842213869095, "grad_norm_var": 1.1145307871805673e-05, "learning_rate": 0.009724686281977146, "loss": 3.0476, "step": 992 }, { "crossentropy": 3.0283777713775635, "epoch": 0.08446750595440626, "grad_norm": 0.06155702471733093, "grad_norm_var": 1.3058005963754946e-05, "learning_rate": 0.009724030232334391, "loss": 3.0284, "step": 993 }, { "crossentropy": 3.0002570152282715, "epoch": 0.08455256890098672, "grad_norm": 0.05499985069036484, "grad_norm_var": 1.1851278412591815e-05, "learning_rate": 0.009723373424148498, "loss": 3.0003, "step": 994 }, { "crossentropy": 2.9437379837036133, "epoch": 0.0846376318475672, "grad_norm": 0.0529063381254673, "grad_norm_var": 1.00364992755295e-05, "learning_rate": 0.009722715857524935, "loss": 2.9437, "step": 995 }, { "crossentropy": 3.0219876766204834, "epoch": 0.08472269479414767, "grad_norm": 0.05692131444811821, "grad_norm_var": 9.527785322552678e-06, "learning_rate": 0.009722057532569288, "loss": 3.022, "step": 996 }, { "crossentropy": 3.0438554286956787, "epoch": 0.08480775774072814, "grad_norm": 0.0675552561879158, "grad_norm_var": 1.872445888650038e-05, "learning_rate": 0.009721398449387264, "loss": 3.0439, "step": 997 }, { "crossentropy": 2.9008736610412598, "epoch": 0.0848928206873086, "grad_norm": 0.0676705539226532, "grad_norm_var": 2.5782998895403317e-05, "learning_rate": 0.009720738608084693, "loss": 2.9009, "step": 998 }, { "crossentropy": 2.98826265335083, "epoch": 0.08497788363388908, "grad_norm": 0.06653183698654175, "grad_norm_var": 3.160694205661836e-05, "learning_rate": 0.009720078008767527, "loss": 2.9883, "step": 999 }, { "crossentropy": 2.9898011684417725, "epoch": 0.08506294658046955, "grad_norm": 0.05318893492221832, "grad_norm_var": 3.007802744292368e-05, "learning_rate": 0.009719416651541838, "loss": 2.9898, "step": 1000 }, { "crossentropy": 2.9244675636291504, "epoch": 0.08514800952705001, "grad_norm": 0.05214806646108627, "grad_norm_var": 3.177535137490467e-05, "learning_rate": 0.009718754536513823, "loss": 2.9245, "step": 1001 }, { "crossentropy": 2.968000888824463, "epoch": 0.08523307247363049, "grad_norm": 0.053118884563446045, "grad_norm_var": 3.280656920178476e-05, "learning_rate": 0.009718091663789794, "loss": 2.968, "step": 1002 }, { "crossentropy": 2.9502902030944824, "epoch": 0.08531813542021095, "grad_norm": 0.060076016932725906, "grad_norm_var": 3.328821759274921e-05, "learning_rate": 0.009717428033476196, "loss": 2.9503, "step": 1003 }, { "crossentropy": 2.986891508102417, "epoch": 0.08540319836679143, "grad_norm": 0.06019783392548561, "grad_norm_var": 3.3795572704241734e-05, "learning_rate": 0.009716763645679584, "loss": 2.9869, "step": 1004 }, { "crossentropy": 3.0194942951202393, "epoch": 0.0854882613133719, "grad_norm": 0.052629370242357254, "grad_norm_var": 3.261111111644381e-05, "learning_rate": 0.00971609850050664, "loss": 3.0195, "step": 1005 }, { "crossentropy": 3.037658452987671, "epoch": 0.08557332425995237, "grad_norm": 0.04894842579960823, "grad_norm_var": 3.6957436544953796e-05, "learning_rate": 0.009715432598064169, "loss": 3.0377, "step": 1006 }, { "crossentropy": 3.0649666786193848, "epoch": 0.08565838720653284, "grad_norm": 0.051316238939762115, "grad_norm_var": 3.6694966166008795e-05, "learning_rate": 0.009714765938459095, "loss": 3.065, "step": 1007 }, { "crossentropy": 2.955587863922119, "epoch": 0.0857434501531133, "grad_norm": 0.053414393216371536, "grad_norm_var": 3.729885655744617e-05, "learning_rate": 0.009714098521798465, "loss": 2.9556, "step": 1008 }, { "crossentropy": 2.9188687801361084, "epoch": 0.08582851309969378, "grad_norm": 0.05226905643939972, "grad_norm_var": 3.7138461602959025e-05, "learning_rate": 0.009713430348189445, "loss": 2.9189, "step": 1009 }, { "crossentropy": 3.0531697273254395, "epoch": 0.08591357604627424, "grad_norm": 0.05347172170877457, "grad_norm_var": 3.75886958763324e-05, "learning_rate": 0.009712761417739326, "loss": 3.0532, "step": 1010 }, { "crossentropy": 3.016639232635498, "epoch": 0.08599863899285472, "grad_norm": 0.056668929755687714, "grad_norm_var": 3.672193913575294e-05, "learning_rate": 0.009712091730555518, "loss": 3.0166, "step": 1011 }, { "crossentropy": 2.9527125358581543, "epoch": 0.08608370193943518, "grad_norm": 0.052001066505908966, "grad_norm_var": 3.804580002484213e-05, "learning_rate": 0.009711421286745554, "loss": 2.9527, "step": 1012 }, { "crossentropy": 2.999486207962036, "epoch": 0.08616876488601564, "grad_norm": 0.05089451000094414, "grad_norm_var": 3.0448233733892708e-05, "learning_rate": 0.009710750086417088, "loss": 2.9995, "step": 1013 }, { "crossentropy": 2.989367961883545, "epoch": 0.08625382783259612, "grad_norm": 0.05298018455505371, "grad_norm_var": 1.9674653226814157e-05, "learning_rate": 0.009710078129677896, "loss": 2.9894, "step": 1014 }, { "crossentropy": 2.9404900074005127, "epoch": 0.08633889077917659, "grad_norm": 0.053655289113521576, "grad_norm_var": 9.150241628621177e-06, "learning_rate": 0.009709405416635872, "loss": 2.9405, "step": 1015 }, { "crossentropy": 2.990171432495117, "epoch": 0.08642395372575706, "grad_norm": 0.054125480353832245, "grad_norm_var": 9.158577869537026e-06, "learning_rate": 0.009708731947399038, "loss": 2.9902, "step": 1016 }, { "crossentropy": 2.98933482170105, "epoch": 0.08650901667233753, "grad_norm": 0.05764266476035118, "grad_norm_var": 9.967340710452464e-06, "learning_rate": 0.009708057722075532, "loss": 2.9893, "step": 1017 }, { "crossentropy": 3.0251986980438232, "epoch": 0.086594079618918, "grad_norm": 0.05784817785024643, "grad_norm_var": 1.0832871763788385e-05, "learning_rate": 0.009707382740773617, "loss": 3.0252, "step": 1018 }, { "crossentropy": 2.9901511669158936, "epoch": 0.08667914256549847, "grad_norm": 0.05910535529255867, "grad_norm_var": 1.0138873374004818e-05, "learning_rate": 0.009706707003601671, "loss": 2.9902, "step": 1019 }, { "crossentropy": 3.0061426162719727, "epoch": 0.08676420551207893, "grad_norm": 0.04979833960533142, "grad_norm_var": 8.578911359104675e-06, "learning_rate": 0.009706030510668203, "loss": 3.0061, "step": 1020 }, { "crossentropy": 2.9350314140319824, "epoch": 0.08684926845865941, "grad_norm": 0.048593491315841675, "grad_norm_var": 1.0091302240484007e-05, "learning_rate": 0.009705353262081835, "loss": 2.935, "step": 1021 }, { "crossentropy": 3.113180637359619, "epoch": 0.08693433140523987, "grad_norm": 0.055973440408706665, "grad_norm_var": 9.103649415496267e-06, "learning_rate": 0.009704675257951316, "loss": 3.1132, "step": 1022 }, { "crossentropy": 3.0421042442321777, "epoch": 0.08701939435182035, "grad_norm": 0.055691104382276535, "grad_norm_var": 8.889024861635727e-06, "learning_rate": 0.009703996498385513, "loss": 3.0421, "step": 1023 }, { "crossentropy": 3.0707476139068604, "epoch": 0.08710445729840081, "grad_norm": 0.05149511620402336, "grad_norm_var": 9.271240744036732e-06, "learning_rate": 0.009703316983493413, "loss": 3.0707, "step": 1024 }, { "crossentropy": 3.026768684387207, "epoch": 0.08718952024498129, "grad_norm": 0.06088887155056, "grad_norm_var": 1.2053974802410326e-05, "learning_rate": 0.00970263671338413, "loss": 3.0268, "step": 1025 }, { "crossentropy": 3.0252246856689453, "epoch": 0.08727458319156176, "grad_norm": 0.0511491484940052, "grad_norm_var": 1.2686982360526899e-05, "learning_rate": 0.009701955688166893, "loss": 3.0252, "step": 1026 }, { "crossentropy": 3.019643545150757, "epoch": 0.08735964613814222, "grad_norm": 0.059234775602817535, "grad_norm_var": 1.391507208486631e-05, "learning_rate": 0.009701273907951056, "loss": 3.0196, "step": 1027 }, { "crossentropy": 3.006037712097168, "epoch": 0.0874447090847227, "grad_norm": 0.052445486187934875, "grad_norm_var": 1.3782757959860531e-05, "learning_rate": 0.009700591372846094, "loss": 3.006, "step": 1028 }, { "crossentropy": 3.0377445220947266, "epoch": 0.08752977203130316, "grad_norm": 0.06015860661864281, "grad_norm_var": 1.473012370065858e-05, "learning_rate": 0.009699908082961603, "loss": 3.0377, "step": 1029 }, { "crossentropy": 3.0433144569396973, "epoch": 0.08761483497788364, "grad_norm": 0.07338738441467285, "grad_norm_var": 3.5129061965997324e-05, "learning_rate": 0.0096992240384073, "loss": 3.0433, "step": 1030 }, { "crossentropy": 3.0415661334991455, "epoch": 0.0876998979244641, "grad_norm": 0.07293418049812317, "grad_norm_var": 5.149741575177502e-05, "learning_rate": 0.009698539239293019, "loss": 3.0416, "step": 1031 }, { "crossentropy": 2.959207773208618, "epoch": 0.08778496087104458, "grad_norm": 0.050436582416296005, "grad_norm_var": 5.402217965229331e-05, "learning_rate": 0.009697853685728721, "loss": 2.9592, "step": 1032 }, { "crossentropy": 2.9389891624450684, "epoch": 0.08787002381762504, "grad_norm": 0.04969841241836548, "grad_norm_var": 5.7602520587092763e-05, "learning_rate": 0.009697167377824488, "loss": 2.939, "step": 1033 }, { "crossentropy": 3.0489864349365234, "epoch": 0.0879550867642055, "grad_norm": 0.048536237329244614, "grad_norm_var": 6.17236115534408e-05, "learning_rate": 0.009696480315690521, "loss": 3.049, "step": 1034 }, { "crossentropy": 2.968843936920166, "epoch": 0.08804014971078598, "grad_norm": 0.04837449640035629, "grad_norm_var": 6.479284195481012e-05, "learning_rate": 0.00969579249943714, "loss": 2.9688, "step": 1035 }, { "crossentropy": 3.084331512451172, "epoch": 0.08812521265736645, "grad_norm": 0.051426444202661514, "grad_norm_var": 6.370999697218228e-05, "learning_rate": 0.00969510392917479, "loss": 3.0843, "step": 1036 }, { "crossentropy": 3.0857248306274414, "epoch": 0.08821027560394692, "grad_norm": 0.061028093099594116, "grad_norm_var": 6.167192387532729e-05, "learning_rate": 0.009694414605014036, "loss": 3.0857, "step": 1037 }, { "crossentropy": 2.970551013946533, "epoch": 0.08829533855052739, "grad_norm": 0.05245349556207657, "grad_norm_var": 6.265994071540793e-05, "learning_rate": 0.009693724527065559, "loss": 2.9706, "step": 1038 }, { "crossentropy": 2.9994475841522217, "epoch": 0.08838040149710787, "grad_norm": 0.048356540501117706, "grad_norm_var": 6.65283116051886e-05, "learning_rate": 0.009693033695440173, "loss": 2.9994, "step": 1039 }, { "crossentropy": 2.989464282989502, "epoch": 0.08846546444368833, "grad_norm": 0.04666374623775482, "grad_norm_var": 7.072827333262962e-05, "learning_rate": 0.009692342110248801, "loss": 2.9895, "step": 1040 }, { "crossentropy": 3.0219714641571045, "epoch": 0.0885505273902688, "grad_norm": 0.06716141104698181, "grad_norm_var": 7.773749590836796e-05, "learning_rate": 0.009691649771602496, "loss": 3.022, "step": 1041 }, { "crossentropy": 2.9543347358703613, "epoch": 0.08863559033684927, "grad_norm": 0.07121240347623825, "grad_norm_var": 9.034654062966367e-05, "learning_rate": 0.00969095667961242, "loss": 2.9543, "step": 1042 }, { "crossentropy": 3.0458552837371826, "epoch": 0.08872065328342973, "grad_norm": 0.0532427616417408, "grad_norm_var": 9.088042862798317e-05, "learning_rate": 0.009690262834389871, "loss": 3.0459, "step": 1043 }, { "crossentropy": 3.085355281829834, "epoch": 0.08880571623001021, "grad_norm": 0.05368639901280403, "grad_norm_var": 9.026946874706639e-05, "learning_rate": 0.009689568236046256, "loss": 3.0854, "step": 1044 }, { "crossentropy": 3.034127712249756, "epoch": 0.08889077917659068, "grad_norm": 0.05957363545894623, "grad_norm_var": 9.002868863482629e-05, "learning_rate": 0.00968887288469311, "loss": 3.0341, "step": 1045 }, { "crossentropy": 3.054795026779175, "epoch": 0.08897584212317115, "grad_norm": 0.061910275369882584, "grad_norm_var": 7.281803586997754e-05, "learning_rate": 0.009688176780442086, "loss": 3.0548, "step": 1046 }, { "crossentropy": 3.092742681503296, "epoch": 0.08906090506975162, "grad_norm": 0.05495171993970871, "grad_norm_var": 5.253032590200231e-05, "learning_rate": 0.009687479923404959, "loss": 3.0927, "step": 1047 }, { "crossentropy": 2.9942452907562256, "epoch": 0.08914596801633208, "grad_norm": 0.05216898024082184, "grad_norm_var": 5.168239870842143e-05, "learning_rate": 0.009686782313693621, "loss": 2.9942, "step": 1048 }, { "crossentropy": 3.054840087890625, "epoch": 0.08923103096291256, "grad_norm": 0.058494213968515396, "grad_norm_var": 5.026759830836017e-05, "learning_rate": 0.009686083951420088, "loss": 3.0548, "step": 1049 }, { "crossentropy": 2.8950886726379395, "epoch": 0.08931609390949302, "grad_norm": 0.05576879158616066, "grad_norm_var": 4.674673669450846e-05, "learning_rate": 0.009685384836696502, "loss": 2.8951, "step": 1050 }, { "crossentropy": 2.9563467502593994, "epoch": 0.0894011568560735, "grad_norm": 0.048279572278261185, "grad_norm_var": 4.6844186902737266e-05, "learning_rate": 0.009684684969635116, "loss": 2.9563, "step": 1051 }, { "crossentropy": 2.979379415512085, "epoch": 0.08948621980265396, "grad_norm": 0.05010608211159706, "grad_norm_var": 4.776247774401585e-05, "learning_rate": 0.00968398435034831, "loss": 2.9794, "step": 1052 }, { "crossentropy": 3.0067715644836426, "epoch": 0.08957128274923443, "grad_norm": 0.05239054560661316, "grad_norm_var": 4.656691435279131e-05, "learning_rate": 0.009683282978948585, "loss": 3.0068, "step": 1053 }, { "crossentropy": 3.052076578140259, "epoch": 0.0896563456958149, "grad_norm": 0.06091201677918434, "grad_norm_var": 4.771404941973344e-05, "learning_rate": 0.009682580855548559, "loss": 3.0521, "step": 1054 }, { "crossentropy": 2.9119813442230225, "epoch": 0.08974140864239537, "grad_norm": 0.060964275151491165, "grad_norm_var": 4.491760792322269e-05, "learning_rate": 0.009681877980260972, "loss": 2.912, "step": 1055 }, { "crossentropy": 2.9402570724487305, "epoch": 0.08982647158897585, "grad_norm": 0.06766663491725922, "grad_norm_var": 4.433211301975524e-05, "learning_rate": 0.009681174353198686, "loss": 2.9403, "step": 1056 }, { "crossentropy": 2.9765055179595947, "epoch": 0.08991153453555631, "grad_norm": 0.05859347805380821, "grad_norm_var": 3.8489256759531114e-05, "learning_rate": 0.009680469974474686, "loss": 2.9765, "step": 1057 }, { "crossentropy": 3.04263973236084, "epoch": 0.08999659748213679, "grad_norm": 0.05356118083000183, "grad_norm_var": 2.5678514163742182e-05, "learning_rate": 0.00967976484420207, "loss": 3.0426, "step": 1058 }, { "crossentropy": 3.0277864933013916, "epoch": 0.09008166042871725, "grad_norm": 0.05081704258918762, "grad_norm_var": 2.7064797837189104e-05, "learning_rate": 0.009679058962494066, "loss": 3.0278, "step": 1059 }, { "crossentropy": 3.0510692596435547, "epoch": 0.09016672337529771, "grad_norm": 0.04939422383904457, "grad_norm_var": 2.9677794444974738e-05, "learning_rate": 0.009678352329464018, "loss": 3.0511, "step": 1060 }, { "crossentropy": 2.9429800510406494, "epoch": 0.09025178632187819, "grad_norm": 0.050595756620168686, "grad_norm_var": 3.040414918389078e-05, "learning_rate": 0.009677644945225387, "loss": 2.943, "step": 1061 }, { "crossentropy": 2.9508588314056396, "epoch": 0.09033684926845865, "grad_norm": 0.05031399428844452, "grad_norm_var": 2.8759650807130775e-05, "learning_rate": 0.009676936809891762, "loss": 2.9509, "step": 1062 }, { "crossentropy": 2.9843568801879883, "epoch": 0.09042191221503913, "grad_norm": 0.05528699979186058, "grad_norm_var": 2.8778548322696182e-05, "learning_rate": 0.009676227923576848, "loss": 2.9844, "step": 1063 }, { "crossentropy": 2.9844677448272705, "epoch": 0.0905069751616196, "grad_norm": 0.055379465222358704, "grad_norm_var": 2.8336264756599153e-05, "learning_rate": 0.009675518286394472, "loss": 2.9845, "step": 1064 }, { "crossentropy": 2.947098731994629, "epoch": 0.09059203810820007, "grad_norm": 0.05318794026970863, "grad_norm_var": 2.755862605144007e-05, "learning_rate": 0.009674807898458582, "loss": 2.9471, "step": 1065 }, { "crossentropy": 2.8759920597076416, "epoch": 0.09067710105478054, "grad_norm": 0.05270915478467941, "grad_norm_var": 2.7657161924075856e-05, "learning_rate": 0.009674096759883245, "loss": 2.876, "step": 1066 }, { "crossentropy": 2.9322731494903564, "epoch": 0.090762164001361, "grad_norm": 0.053977854549884796, "grad_norm_var": 2.5047914215077748e-05, "learning_rate": 0.00967338487078265, "loss": 2.9323, "step": 1067 }, { "crossentropy": 2.961542844772339, "epoch": 0.09084722694794148, "grad_norm": 0.053089067339897156, "grad_norm_var": 2.3760583513241357e-05, "learning_rate": 0.009672672231271103, "loss": 2.9615, "step": 1068 }, { "crossentropy": 2.9718408584594727, "epoch": 0.09093228989452194, "grad_norm": 0.055989377200603485, "grad_norm_var": 2.3352726010131678e-05, "learning_rate": 0.009671958841463036, "loss": 2.9718, "step": 1069 }, { "crossentropy": 2.9643521308898926, "epoch": 0.09101735284110242, "grad_norm": 0.05524219200015068, "grad_norm_var": 2.1007775290038793e-05, "learning_rate": 0.009671244701472998, "loss": 2.9644, "step": 1070 }, { "crossentropy": 2.922567129135132, "epoch": 0.09110241578768288, "grad_norm": 0.06261783838272095, "grad_norm_var": 2.2538168581393013e-05, "learning_rate": 0.009670529811415663, "loss": 2.9226, "step": 1071 }, { "crossentropy": 2.977191925048828, "epoch": 0.09118747873426336, "grad_norm": 0.06117043271660805, "grad_norm_var": 1.4118958359162805e-05, "learning_rate": 0.009669814171405817, "loss": 2.9772, "step": 1072 }, { "crossentropy": 3.044426202774048, "epoch": 0.09127254168084382, "grad_norm": 0.056702855974435806, "grad_norm_var": 1.3309299693362899e-05, "learning_rate": 0.009669097781558372, "loss": 3.0444, "step": 1073 }, { "crossentropy": 2.9435319900512695, "epoch": 0.09135760462742429, "grad_norm": 0.047385796904563904, "grad_norm_var": 1.636466674069196e-05, "learning_rate": 0.00966838064198836, "loss": 2.9435, "step": 1074 }, { "crossentropy": 3.0422134399414062, "epoch": 0.09144266757400477, "grad_norm": 0.05112975463271141, "grad_norm_var": 1.6238430176888224e-05, "learning_rate": 0.009667662752810934, "loss": 3.0422, "step": 1075 }, { "crossentropy": 2.971259832382202, "epoch": 0.09152773052058523, "grad_norm": 0.05600869283080101, "grad_norm_var": 1.4901392127305199e-05, "learning_rate": 0.009666944114141365, "loss": 2.9713, "step": 1076 }, { "crossentropy": 3.0388801097869873, "epoch": 0.0916127934671657, "grad_norm": 0.06296463310718536, "grad_norm_var": 1.814940818102891e-05, "learning_rate": 0.009666224726095048, "loss": 3.0389, "step": 1077 }, { "crossentropy": 2.94305682182312, "epoch": 0.09169785641374617, "grad_norm": 0.05243965983390808, "grad_norm_var": 1.7047788215660867e-05, "learning_rate": 0.009665504588787491, "loss": 2.9431, "step": 1078 }, { "crossentropy": 2.960604429244995, "epoch": 0.09178291936032665, "grad_norm": 0.048801399767398834, "grad_norm_var": 1.971400320161798e-05, "learning_rate": 0.009664783702334333, "loss": 2.9606, "step": 1079 }, { "crossentropy": 2.8316869735717773, "epoch": 0.09186798230690711, "grad_norm": 0.05145774036645889, "grad_norm_var": 2.043748327687292e-05, "learning_rate": 0.009664062066851324, "loss": 2.8317, "step": 1080 }, { "crossentropy": 2.8878376483917236, "epoch": 0.09195304525348758, "grad_norm": 0.05291319265961647, "grad_norm_var": 2.049684697051704e-05, "learning_rate": 0.00966333968245434, "loss": 2.8878, "step": 1081 }, { "crossentropy": 2.9702606201171875, "epoch": 0.09203810820006805, "grad_norm": 0.06277435272932053, "grad_norm_var": 2.4207199181345422e-05, "learning_rate": 0.009662616549259373, "loss": 2.9703, "step": 1082 }, { "crossentropy": 2.952707052230835, "epoch": 0.09212317114664852, "grad_norm": 0.06358634680509567, "grad_norm_var": 2.8294373437977536e-05, "learning_rate": 0.00966189266738254, "loss": 2.9527, "step": 1083 }, { "crossentropy": 3.0477688312530518, "epoch": 0.092208234093229, "grad_norm": 0.05341086536645889, "grad_norm_var": 2.8180578225967905e-05, "learning_rate": 0.009661168036940072, "loss": 3.0478, "step": 1084 }, { "crossentropy": 3.0026822090148926, "epoch": 0.09229329703980946, "grad_norm": 0.05296153947710991, "grad_norm_var": 2.872240673338072e-05, "learning_rate": 0.009660442658048326, "loss": 3.0027, "step": 1085 }, { "crossentropy": 3.000178813934326, "epoch": 0.09237835998638994, "grad_norm": 0.0593535490334034, "grad_norm_var": 2.9515314697812986e-05, "learning_rate": 0.009659716530823776, "loss": 3.0002, "step": 1086 }, { "crossentropy": 3.005659818649292, "epoch": 0.0924634229329704, "grad_norm": 0.06158854439854622, "grad_norm_var": 2.867054688954577e-05, "learning_rate": 0.00965898965538302, "loss": 3.0057, "step": 1087 }, { "crossentropy": 2.939972400665283, "epoch": 0.09254848587955086, "grad_norm": 0.05751581862568855, "grad_norm_var": 2.6944717477605148e-05, "learning_rate": 0.00965826203184277, "loss": 2.94, "step": 1088 }, { "crossentropy": 2.9097635746002197, "epoch": 0.09263354882613134, "grad_norm": 0.051482394337654114, "grad_norm_var": 2.7941064733039088e-05, "learning_rate": 0.009657533660319864, "loss": 2.9098, "step": 1089 }, { "crossentropy": 2.8859992027282715, "epoch": 0.0927186117727118, "grad_norm": 0.047161586582660675, "grad_norm_var": 2.8182619793208573e-05, "learning_rate": 0.009656804540931254, "loss": 2.886, "step": 1090 }, { "crossentropy": 2.953998327255249, "epoch": 0.09280367471929228, "grad_norm": 0.05503828823566437, "grad_norm_var": 2.6939706289591293e-05, "learning_rate": 0.009656074673794018, "loss": 2.954, "step": 1091 }, { "crossentropy": 3.0262515544891357, "epoch": 0.09288873766587274, "grad_norm": 0.05075126886367798, "grad_norm_var": 2.8374553615758725e-05, "learning_rate": 0.00965534405902535, "loss": 3.0263, "step": 1092 }, { "crossentropy": 2.9902210235595703, "epoch": 0.09297380061245321, "grad_norm": 0.05558200553059578, "grad_norm_var": 2.419947822410081e-05, "learning_rate": 0.009654612696742568, "loss": 2.9902, "step": 1093 }, { "crossentropy": 2.962664842605591, "epoch": 0.09305886355903369, "grad_norm": 0.0556865893304348, "grad_norm_var": 2.3836037931804626e-05, "learning_rate": 0.009653880587063108, "loss": 2.9627, "step": 1094 }, { "crossentropy": 2.930424928665161, "epoch": 0.09314392650561415, "grad_norm": 0.049213990569114685, "grad_norm_var": 2.350545417700896e-05, "learning_rate": 0.009653147730104523, "loss": 2.9304, "step": 1095 }, { "crossentropy": 2.875824451446533, "epoch": 0.09322898945219463, "grad_norm": 0.05460016056895256, "grad_norm_var": 2.2625941158123618e-05, "learning_rate": 0.00965241412598449, "loss": 2.8758, "step": 1096 }, { "crossentropy": 2.9430623054504395, "epoch": 0.09331405239877509, "grad_norm": 0.06527741253376007, "grad_norm_var": 2.8367291230151487e-05, "learning_rate": 0.009651679774820803, "loss": 2.9431, "step": 1097 }, { "crossentropy": 3.01989483833313, "epoch": 0.09339911534535557, "grad_norm": 0.06081339344382286, "grad_norm_var": 2.6836145850840884e-05, "learning_rate": 0.00965094467673138, "loss": 3.0199, "step": 1098 }, { "crossentropy": 2.961015462875366, "epoch": 0.09348417829193603, "grad_norm": 0.05171915143728256, "grad_norm_var": 2.3438782423368155e-05, "learning_rate": 0.009650208831834258, "loss": 2.961, "step": 1099 }, { "crossentropy": 2.954627513885498, "epoch": 0.0935692412385165, "grad_norm": 0.060347363352775574, "grad_norm_var": 2.485157494704071e-05, "learning_rate": 0.009649472240247588, "loss": 2.9546, "step": 1100 }, { "crossentropy": 2.963688373565674, "epoch": 0.09365430418509697, "grad_norm": 0.052258796989917755, "grad_norm_var": 2.512669273559461e-05, "learning_rate": 0.009648734902089649, "loss": 2.9637, "step": 1101 }, { "crossentropy": 2.9626715183258057, "epoch": 0.09373936713167744, "grad_norm": 0.05426548048853874, "grad_norm_var": 2.4146986976871358e-05, "learning_rate": 0.009647996817478835, "loss": 2.9627, "step": 1102 }, { "crossentropy": 3.039278030395508, "epoch": 0.09382443007825791, "grad_norm": 0.0568806454539299, "grad_norm_var": 2.1526051652947843e-05, "learning_rate": 0.00964725798653366, "loss": 3.0393, "step": 1103 }, { "crossentropy": 2.962146759033203, "epoch": 0.09390949302483838, "grad_norm": 0.04904414713382721, "grad_norm_var": 2.307063394557379e-05, "learning_rate": 0.00964651840937276, "loss": 2.9621, "step": 1104 }, { "crossentropy": 2.948246717453003, "epoch": 0.09399455597141886, "grad_norm": 0.05280127376317978, "grad_norm_var": 2.2669334438876233e-05, "learning_rate": 0.009645778086114892, "loss": 2.9482, "step": 1105 }, { "crossentropy": 2.8782687187194824, "epoch": 0.09407961891799932, "grad_norm": 0.0492323562502861, "grad_norm_var": 2.0920821459497785e-05, "learning_rate": 0.009645037016878926, "loss": 2.8783, "step": 1106 }, { "crossentropy": 2.8922533988952637, "epoch": 0.09416468186457978, "grad_norm": 0.053457289934158325, "grad_norm_var": 2.0983497494738746e-05, "learning_rate": 0.00964429520178386, "loss": 2.8923, "step": 1107 }, { "crossentropy": 2.9339911937713623, "epoch": 0.09424974481116026, "grad_norm": 0.05627276003360748, "grad_norm_var": 2.013227496568583e-05, "learning_rate": 0.009643552640948806, "loss": 2.934, "step": 1108 }, { "crossentropy": 2.939776659011841, "epoch": 0.09433480775774072, "grad_norm": 0.06083489581942558, "grad_norm_var": 2.237595726497458e-05, "learning_rate": 0.009642809334493, "loss": 2.9398, "step": 1109 }, { "crossentropy": 2.9301445484161377, "epoch": 0.0944198707043212, "grad_norm": 0.059417273849248886, "grad_norm_var": 2.350324129592133e-05, "learning_rate": 0.009642065282535793, "loss": 2.9301, "step": 1110 }, { "crossentropy": 2.969036102294922, "epoch": 0.09450493365090167, "grad_norm": 0.05469949543476105, "grad_norm_var": 2.0857799158431552e-05, "learning_rate": 0.00964132048519666, "loss": 2.969, "step": 1111 }, { "crossentropy": 2.9404397010803223, "epoch": 0.09458999659748214, "grad_norm": 0.059675998985767365, "grad_norm_var": 2.169317475463987e-05, "learning_rate": 0.009640574942595195, "loss": 2.9404, "step": 1112 }, { "crossentropy": 2.841841220855713, "epoch": 0.0946750595440626, "grad_norm": 0.049469802528619766, "grad_norm_var": 1.7888310611035145e-05, "learning_rate": 0.009639828654851109, "loss": 2.8418, "step": 1113 }, { "crossentropy": 2.912229061126709, "epoch": 0.09476012249064307, "grad_norm": 0.05584416165947914, "grad_norm_var": 1.5629176434452974e-05, "learning_rate": 0.009639081622084234, "loss": 2.9122, "step": 1114 }, { "crossentropy": 2.8592727184295654, "epoch": 0.09484518543722355, "grad_norm": 0.05218705162405968, "grad_norm_var": 1.545291368210966e-05, "learning_rate": 0.009638333844414522, "loss": 2.8593, "step": 1115 }, { "crossentropy": 2.952401638031006, "epoch": 0.09493024838380401, "grad_norm": 0.048572517931461334, "grad_norm_var": 1.5398192081596492e-05, "learning_rate": 0.009637585321962046, "loss": 2.9524, "step": 1116 }, { "crossentropy": 2.874199867248535, "epoch": 0.09501531133038449, "grad_norm": 0.044395722448825836, "grad_norm_var": 2.114781995285637e-05, "learning_rate": 0.009636836054846996, "loss": 2.8742, "step": 1117 }, { "crossentropy": 2.927647829055786, "epoch": 0.09510037427696495, "grad_norm": 0.05451389029622078, "grad_norm_var": 2.117485498848252e-05, "learning_rate": 0.009636086043189683, "loss": 2.9276, "step": 1118 }, { "crossentropy": 2.974756956100464, "epoch": 0.09518543722354543, "grad_norm": 0.05853535979986191, "grad_norm_var": 2.207393577835966e-05, "learning_rate": 0.009635335287110539, "loss": 2.9748, "step": 1119 }, { "crossentropy": 2.9164576530456543, "epoch": 0.0952705001701259, "grad_norm": 0.05539437010884285, "grad_norm_var": 2.0665193262453935e-05, "learning_rate": 0.009634583786730109, "loss": 2.9165, "step": 1120 }, { "crossentropy": 2.9716105461120605, "epoch": 0.09535556311670636, "grad_norm": 0.06310346722602844, "grad_norm_var": 2.5540071936668424e-05, "learning_rate": 0.009633831542169067, "loss": 2.9716, "step": 1121 }, { "crossentropy": 3.0213143825531006, "epoch": 0.09544062606328683, "grad_norm": 0.05096476152539253, "grad_norm_var": 2.445882475876233e-05, "learning_rate": 0.0096330785535482, "loss": 3.0213, "step": 1122 }, { "crossentropy": 2.899541139602661, "epoch": 0.0955256890098673, "grad_norm": 0.053441695868968964, "grad_norm_var": 2.446170175147977e-05, "learning_rate": 0.009632324820988416, "loss": 2.8995, "step": 1123 }, { "crossentropy": 2.9750723838806152, "epoch": 0.09561075195644778, "grad_norm": 0.05913250893354416, "grad_norm_var": 2.552193106761039e-05, "learning_rate": 0.009631570344610743, "loss": 2.9751, "step": 1124 }, { "crossentropy": 2.895469903945923, "epoch": 0.09569581490302824, "grad_norm": 0.05491967871785164, "grad_norm_var": 2.3115855840716294e-05, "learning_rate": 0.00963081512453633, "loss": 2.8955, "step": 1125 }, { "crossentropy": 3.0642735958099365, "epoch": 0.09578087784960872, "grad_norm": 0.055178359150886536, "grad_norm_var": 2.153980035357831e-05, "learning_rate": 0.00963005916088644, "loss": 3.0643, "step": 1126 }, { "crossentropy": 2.9789109230041504, "epoch": 0.09586594079618918, "grad_norm": 0.055824603885412216, "grad_norm_var": 2.1667325753060852e-05, "learning_rate": 0.00962930245378246, "loss": 2.9789, "step": 1127 }, { "crossentropy": 2.9767956733703613, "epoch": 0.09595100374276964, "grad_norm": 0.05260435491800308, "grad_norm_var": 1.9862600784092967e-05, "learning_rate": 0.009628545003345899, "loss": 2.9768, "step": 1128 }, { "crossentropy": 2.9469313621520996, "epoch": 0.09603606668935012, "grad_norm": 0.04884450137615204, "grad_norm_var": 2.0265165626653404e-05, "learning_rate": 0.009627786809698377, "loss": 2.9469, "step": 1129 }, { "crossentropy": 2.962696075439453, "epoch": 0.09612112963593059, "grad_norm": 0.052636466920375824, "grad_norm_var": 2.010499819665168e-05, "learning_rate": 0.009627027872961642, "loss": 2.9627, "step": 1130 }, { "crossentropy": 2.9752862453460693, "epoch": 0.09620619258251106, "grad_norm": 0.06494257599115372, "grad_norm_var": 2.758929659231694e-05, "learning_rate": 0.009626268193257554, "loss": 2.9753, "step": 1131 }, { "crossentropy": 2.898660659790039, "epoch": 0.09629125552909153, "grad_norm": 0.05662564933300018, "grad_norm_var": 2.521053158846768e-05, "learning_rate": 0.009625507770708098, "loss": 2.8987, "step": 1132 }, { "crossentropy": 2.9240493774414062, "epoch": 0.09637631847567199, "grad_norm": 0.05171623453497887, "grad_norm_var": 1.8144860761966594e-05, "learning_rate": 0.009624746605435372, "loss": 2.924, "step": 1133 }, { "crossentropy": 3.0071165561676025, "epoch": 0.09646138142225247, "grad_norm": 0.05319375917315483, "grad_norm_var": 1.843151862445658e-05, "learning_rate": 0.009623984697561603, "loss": 3.0071, "step": 1134 }, { "crossentropy": 2.931643009185791, "epoch": 0.09654644436883293, "grad_norm": 0.04944666102528572, "grad_norm_var": 1.9844647041659427e-05, "learning_rate": 0.009623222047209128, "loss": 2.9316, "step": 1135 }, { "crossentropy": 2.97351336479187, "epoch": 0.09663150731541341, "grad_norm": 0.0523349829018116, "grad_norm_var": 2.0217003299486704e-05, "learning_rate": 0.009622458654500407, "loss": 2.9735, "step": 1136 }, { "crossentropy": 3.0406334400177, "epoch": 0.09671657026199387, "grad_norm": 0.05021228268742561, "grad_norm_var": 1.612820638249949e-05, "learning_rate": 0.00962169451955802, "loss": 3.0406, "step": 1137 }, { "crossentropy": 2.90452241897583, "epoch": 0.09680163320857435, "grad_norm": 0.058825623244047165, "grad_norm_var": 1.6938764095077394e-05, "learning_rate": 0.009620929642504664, "loss": 2.9045, "step": 1138 }, { "crossentropy": 3.079136371612549, "epoch": 0.09688669615515481, "grad_norm": 0.052858833223581314, "grad_norm_var": 1.7031945737098982e-05, "learning_rate": 0.009620164023463157, "loss": 3.0791, "step": 1139 }, { "crossentropy": 2.969249725341797, "epoch": 0.09697175910173528, "grad_norm": 0.05168905854225159, "grad_norm_var": 1.572951628235706e-05, "learning_rate": 0.009619397662556433, "loss": 2.9692, "step": 1140 }, { "crossentropy": 3.0456626415252686, "epoch": 0.09705682204831576, "grad_norm": 0.05431050434708595, "grad_norm_var": 1.566711435800564e-05, "learning_rate": 0.00961863055990755, "loss": 3.0457, "step": 1141 }, { "crossentropy": 2.937812328338623, "epoch": 0.09714188499489622, "grad_norm": 0.054415833204984665, "grad_norm_var": 1.5566140986376236e-05, "learning_rate": 0.009617862715639684, "loss": 2.9378, "step": 1142 }, { "crossentropy": 2.9463531970977783, "epoch": 0.0972269479414767, "grad_norm": 0.0644354447722435, "grad_norm_var": 2.2547598628481525e-05, "learning_rate": 0.009617094129876125, "loss": 2.9464, "step": 1143 }, { "crossentropy": 2.975116014480591, "epoch": 0.09731201088805716, "grad_norm": 0.0631163939833641, "grad_norm_var": 2.7051762567365372e-05, "learning_rate": 0.009616324802740286, "loss": 2.9751, "step": 1144 }, { "crossentropy": 2.881518602371216, "epoch": 0.09739707383463764, "grad_norm": 0.049480922520160675, "grad_norm_var": 2.6556841048314534e-05, "learning_rate": 0.009615554734355703, "loss": 2.8815, "step": 1145 }, { "crossentropy": 2.894049644470215, "epoch": 0.0974821367812181, "grad_norm": 0.05129915103316307, "grad_norm_var": 2.7092743920543542e-05, "learning_rate": 0.009614783924846021, "loss": 2.894, "step": 1146 }, { "crossentropy": 2.8838586807250977, "epoch": 0.09756719972779856, "grad_norm": 0.05153469368815422, "grad_norm_var": 2.0431463514398025e-05, "learning_rate": 0.009614012374335014, "loss": 2.8839, "step": 1147 }, { "crossentropy": 2.902920722961426, "epoch": 0.09765226267437904, "grad_norm": 0.05733424052596092, "grad_norm_var": 2.0702079185763723e-05, "learning_rate": 0.009613240082946568, "loss": 2.9029, "step": 1148 }, { "crossentropy": 3.0127370357513428, "epoch": 0.0977373256209595, "grad_norm": 0.0537387989461422, "grad_norm_var": 2.0304718870578496e-05, "learning_rate": 0.009612467050804693, "loss": 3.0127, "step": 1149 }, { "crossentropy": 2.9197447299957275, "epoch": 0.09782238856753998, "grad_norm": 0.0504859983921051, "grad_norm_var": 2.114943289199944e-05, "learning_rate": 0.009611693278033516, "loss": 2.9197, "step": 1150 }, { "crossentropy": 3.0331742763519287, "epoch": 0.09790745151412045, "grad_norm": 0.05109679698944092, "grad_norm_var": 2.029690627754488e-05, "learning_rate": 0.009610918764757281, "loss": 3.0332, "step": 1151 }, { "crossentropy": 2.91591477394104, "epoch": 0.09799251446070092, "grad_norm": 0.05232907831668854, "grad_norm_var": 2.0298375245668815e-05, "learning_rate": 0.009610143511100353, "loss": 2.9159, "step": 1152 }, { "crossentropy": 3.004972219467163, "epoch": 0.09807757740728139, "grad_norm": 0.05619290471076965, "grad_norm_var": 1.9355805977934692e-05, "learning_rate": 0.009609367517187216, "loss": 3.005, "step": 1153 }, { "crossentropy": 3.0557641983032227, "epoch": 0.09816264035386185, "grad_norm": 0.05270072817802429, "grad_norm_var": 1.822632518174155e-05, "learning_rate": 0.00960859078314247, "loss": 3.0558, "step": 1154 }, { "crossentropy": 3.0751547813415527, "epoch": 0.09824770330044233, "grad_norm": 0.050451260060071945, "grad_norm_var": 1.9015504562533366e-05, "learning_rate": 0.009607813309090838, "loss": 3.0752, "step": 1155 }, { "crossentropy": 3.039196491241455, "epoch": 0.0983327662470228, "grad_norm": 0.06013677641749382, "grad_norm_var": 2.0829723255954188e-05, "learning_rate": 0.00960703509515716, "loss": 3.0392, "step": 1156 }, { "crossentropy": 2.965782642364502, "epoch": 0.09841782919360327, "grad_norm": 0.048552852123975754, "grad_norm_var": 2.3097943066096907e-05, "learning_rate": 0.009606256141466395, "loss": 2.9658, "step": 1157 }, { "crossentropy": 2.9714534282684326, "epoch": 0.09850289214018373, "grad_norm": 0.04987538605928421, "grad_norm_var": 2.425961253991974e-05, "learning_rate": 0.00960547644814362, "loss": 2.9715, "step": 1158 }, { "crossentropy": 2.9672467708587646, "epoch": 0.09858795508676421, "grad_norm": 0.05227537825703621, "grad_norm_var": 1.6456377191271614e-05, "learning_rate": 0.009604696015314029, "loss": 2.9672, "step": 1159 }, { "crossentropy": 2.929337978363037, "epoch": 0.09867301803334468, "grad_norm": 0.05453136935830116, "grad_norm_var": 9.668966662639404e-06, "learning_rate": 0.009603914843102941, "loss": 2.9293, "step": 1160 }, { "crossentropy": 3.03539776802063, "epoch": 0.09875808097992514, "grad_norm": 0.06097683310508728, "grad_norm_var": 1.3107944760220626e-05, "learning_rate": 0.009603132931635788, "loss": 3.0354, "step": 1161 }, { "crossentropy": 2.897721290588379, "epoch": 0.09884314392650562, "grad_norm": 0.05589877814054489, "grad_norm_var": 1.3175841913299717e-05, "learning_rate": 0.009602350281038122, "loss": 2.8977, "step": 1162 }, { "crossentropy": 2.864438056945801, "epoch": 0.09892820687308608, "grad_norm": 0.056224625557661057, "grad_norm_var": 1.3239066680143889e-05, "learning_rate": 0.009601566891435615, "loss": 2.8644, "step": 1163 }, { "crossentropy": 3.061221122741699, "epoch": 0.09901326981966656, "grad_norm": 0.05974680930376053, "grad_norm_var": 1.4699481194887082e-05, "learning_rate": 0.009600782762954056, "loss": 3.0612, "step": 1164 }, { "crossentropy": 2.9017467498779297, "epoch": 0.09909833276624702, "grad_norm": 0.05465374141931534, "grad_norm_var": 1.4710677633914675e-05, "learning_rate": 0.009599997895719355, "loss": 2.9017, "step": 1165 }, { "crossentropy": 2.9794719219207764, "epoch": 0.0991833957128275, "grad_norm": 0.06345511227846146, "grad_norm_var": 1.89164544916585e-05, "learning_rate": 0.009599212289857537, "loss": 2.9795, "step": 1166 }, { "crossentropy": 2.9659423828125, "epoch": 0.09926845865940796, "grad_norm": 0.06790514290332794, "grad_norm_var": 2.7952749545965642e-05, "learning_rate": 0.00959842594549475, "loss": 2.9659, "step": 1167 }, { "crossentropy": 2.910001516342163, "epoch": 0.09935352160598843, "grad_norm": 0.052862849086523056, "grad_norm_var": 2.7709713740290257e-05, "learning_rate": 0.009597638862757255, "loss": 2.91, "step": 1168 }, { "crossentropy": 2.9909768104553223, "epoch": 0.0994385845525689, "grad_norm": 0.049683164805173874, "grad_norm_var": 3.0214722461235678e-05, "learning_rate": 0.009596851041771435, "loss": 2.991, "step": 1169 }, { "crossentropy": 3.0259406566619873, "epoch": 0.09952364749914937, "grad_norm": 0.0460578016936779, "grad_norm_var": 3.5559017979219726e-05, "learning_rate": 0.009596062482663795, "loss": 3.0259, "step": 1170 }, { "crossentropy": 2.9158952236175537, "epoch": 0.09960871044572985, "grad_norm": 0.05648508295416832, "grad_norm_var": 3.400963041508779e-05, "learning_rate": 0.009595273185560953, "loss": 2.9159, "step": 1171 }, { "crossentropy": 2.983628749847412, "epoch": 0.09969377339231031, "grad_norm": 0.06136278808116913, "grad_norm_var": 3.4848036516327054e-05, "learning_rate": 0.009594483150589646, "loss": 2.9836, "step": 1172 }, { "crossentropy": 2.9949138164520264, "epoch": 0.09977883633889077, "grad_norm": 0.045880191028118134, "grad_norm_var": 3.78268740895859e-05, "learning_rate": 0.009593692377876733, "loss": 2.9949, "step": 1173 }, { "crossentropy": 2.9782025814056396, "epoch": 0.09986389928547125, "grad_norm": 0.05196414887905121, "grad_norm_var": 3.653526747936919e-05, "learning_rate": 0.009592900867549188, "loss": 2.9782, "step": 1174 }, { "crossentropy": 2.9092602729797363, "epoch": 0.09994896223205171, "grad_norm": 0.05670936405658722, "grad_norm_var": 3.5785078789553566e-05, "learning_rate": 0.009592108619734106, "loss": 2.9093, "step": 1175 }, { "crossentropy": 2.9128684997558594, "epoch": 0.10003402517863219, "grad_norm": 0.052225932478904724, "grad_norm_var": 3.6537932065198025e-05, "learning_rate": 0.009591315634558697, "loss": 2.9129, "step": 1176 }, { "crossentropy": 2.830641508102417, "epoch": 0.10011908812521265, "grad_norm": 0.046922717243433, "grad_norm_var": 3.909916644333095e-05, "learning_rate": 0.009590521912150294, "loss": 2.8306, "step": 1177 }, { "crossentropy": 2.878242254257202, "epoch": 0.10020415107179313, "grad_norm": 0.05188193544745445, "grad_norm_var": 3.956057178614482e-05, "learning_rate": 0.009589727452636343, "loss": 2.8782, "step": 1178 }, { "crossentropy": 3.0262508392333984, "epoch": 0.1002892140183736, "grad_norm": 0.05104488134384155, "grad_norm_var": 4.013360170436649e-05, "learning_rate": 0.009588932256144414, "loss": 3.0263, "step": 1179 }, { "crossentropy": 3.10703706741333, "epoch": 0.10037427696495406, "grad_norm": 0.09490121901035309, "grad_norm_var": 0.00014289151111349628, "learning_rate": 0.009588136322802192, "loss": 3.107, "step": 1180 }, { "crossentropy": 3.013611316680908, "epoch": 0.10045933991153454, "grad_norm": 0.05647249519824982, "grad_norm_var": 0.0001426505935823219, "learning_rate": 0.00958733965273748, "loss": 3.0136, "step": 1181 }, { "crossentropy": 2.9491052627563477, "epoch": 0.100544402858115, "grad_norm": 0.05329218879342079, "grad_norm_var": 0.00013983503660989024, "learning_rate": 0.009586542246078203, "loss": 2.9491, "step": 1182 }, { "crossentropy": 3.0339503288269043, "epoch": 0.10062946580469548, "grad_norm": 0.05367545038461685, "grad_norm_var": 0.00012986148193580427, "learning_rate": 0.009585744102952399, "loss": 3.034, "step": 1183 }, { "crossentropy": 2.9011425971984863, "epoch": 0.10071452875127594, "grad_norm": 0.05303265526890755, "grad_norm_var": 0.00012981288471004682, "learning_rate": 0.009584945223488225, "loss": 2.9011, "step": 1184 }, { "crossentropy": 2.937511682510376, "epoch": 0.10079959169785642, "grad_norm": 0.04689614474773407, "grad_norm_var": 0.00013231107729061906, "learning_rate": 0.009584145607813963, "loss": 2.9375, "step": 1185 }, { "crossentropy": 3.0389060974121094, "epoch": 0.10088465464443688, "grad_norm": 0.04835403710603714, "grad_norm_var": 0.00012992570194457712, "learning_rate": 0.009583345256058003, "loss": 3.0389, "step": 1186 }, { "crossentropy": 2.9778642654418945, "epoch": 0.10096971759101735, "grad_norm": 0.053403276950120926, "grad_norm_var": 0.00012993734731351155, "learning_rate": 0.009582544168348863, "loss": 2.9779, "step": 1187 }, { "crossentropy": 3.0617733001708984, "epoch": 0.10105478053759782, "grad_norm": 0.06025506556034088, "grad_norm_var": 0.00012905599468038907, "learning_rate": 0.009581742344815172, "loss": 3.0618, "step": 1188 }, { "crossentropy": 2.9849729537963867, "epoch": 0.10113984348417829, "grad_norm": 0.05975628271698952, "grad_norm_var": 0.00012457425303004921, "learning_rate": 0.00958093978558568, "loss": 2.985, "step": 1189 }, { "crossentropy": 2.9470434188842773, "epoch": 0.10122490643075877, "grad_norm": 0.05028457194566727, "grad_norm_var": 0.00012558141469900876, "learning_rate": 0.009580136490789256, "loss": 2.947, "step": 1190 }, { "crossentropy": 2.996075391769409, "epoch": 0.10130996937733923, "grad_norm": 0.05710203945636749, "grad_norm_var": 0.0001256507437403166, "learning_rate": 0.009579332460554884, "loss": 2.9961, "step": 1191 }, { "crossentropy": 2.969982385635376, "epoch": 0.1013950323239197, "grad_norm": 0.054682813584804535, "grad_norm_var": 0.000124924748360275, "learning_rate": 0.00957852769501167, "loss": 2.97, "step": 1192 }, { "crossentropy": 3.0703930854797363, "epoch": 0.10148009527050017, "grad_norm": 0.061567436903715134, "grad_norm_var": 0.00012109772902675134, "learning_rate": 0.009577722194288835, "loss": 3.0704, "step": 1193 }, { "crossentropy": 2.9117014408111572, "epoch": 0.10156515821708063, "grad_norm": 0.057843804359436035, "grad_norm_var": 0.0001195189510978337, "learning_rate": 0.00957691595851572, "loss": 2.9117, "step": 1194 }, { "crossentropy": 2.9889020919799805, "epoch": 0.10165022116366111, "grad_norm": 0.06301602721214294, "grad_norm_var": 0.00011891414209127624, "learning_rate": 0.009576108987821783, "loss": 2.9889, "step": 1195 }, { "crossentropy": 2.931178331375122, "epoch": 0.10173528411024158, "grad_norm": 0.059183377772569656, "grad_norm_var": 2.188061812688861e-05, "learning_rate": 0.0095753012823366, "loss": 2.9312, "step": 1196 }, { "crossentropy": 2.9636499881744385, "epoch": 0.10182034705682205, "grad_norm": 0.050048504024744034, "grad_norm_var": 2.367064606128329e-05, "learning_rate": 0.009574492842189865, "loss": 2.9636, "step": 1197 }, { "crossentropy": 2.8896472454071045, "epoch": 0.10190541000340252, "grad_norm": 0.04626237601041794, "grad_norm_var": 2.8500259367332758e-05, "learning_rate": 0.009573683667511391, "loss": 2.8896, "step": 1198 }, { "crossentropy": 2.9208030700683594, "epoch": 0.101990472949983, "grad_norm": 0.04715057834982872, "grad_norm_var": 3.2061382174229136e-05, "learning_rate": 0.009572873758431107, "loss": 2.9208, "step": 1199 }, { "crossentropy": 2.937321901321411, "epoch": 0.10207553589656346, "grad_norm": 0.051359258592128754, "grad_norm_var": 3.2519711338380165e-05, "learning_rate": 0.009572063115079063, "loss": 2.9373, "step": 1200 }, { "crossentropy": 3.028536081314087, "epoch": 0.10216059884314392, "grad_norm": 0.05170324444770813, "grad_norm_var": 2.9283971146951136e-05, "learning_rate": 0.009571251737585423, "loss": 3.0285, "step": 1201 }, { "crossentropy": 3.0483286380767822, "epoch": 0.1022456617897244, "grad_norm": 0.04906030371785164, "grad_norm_var": 2.873654918308186e-05, "learning_rate": 0.009570439626080472, "loss": 3.0483, "step": 1202 }, { "crossentropy": 2.9874024391174316, "epoch": 0.10233072473630486, "grad_norm": 0.049327682703733444, "grad_norm_var": 3.039373631558654e-05, "learning_rate": 0.009569626780694611, "loss": 2.9874, "step": 1203 }, { "crossentropy": 3.0334877967834473, "epoch": 0.10241578768288534, "grad_norm": 0.048824962228536606, "grad_norm_var": 2.9464858812825825e-05, "learning_rate": 0.00956881320155836, "loss": 3.0335, "step": 1204 }, { "crossentropy": 3.0376193523406982, "epoch": 0.1025008506294658, "grad_norm": 0.04790552332997322, "grad_norm_var": 2.8472697036703114e-05, "learning_rate": 0.009567998888802356, "loss": 3.0376, "step": 1205 }, { "crossentropy": 2.8964545726776123, "epoch": 0.10258591357604628, "grad_norm": 0.04771478846669197, "grad_norm_var": 2.9758503824719024e-05, "learning_rate": 0.009567183842557354, "loss": 2.8965, "step": 1206 }, { "crossentropy": 3.082009792327881, "epoch": 0.10267097652262674, "grad_norm": 0.07744867354631424, "grad_norm_var": 6.765066204887134e-05, "learning_rate": 0.009566368062954227, "loss": 3.082, "step": 1207 }, { "crossentropy": 2.9232499599456787, "epoch": 0.10275603946920721, "grad_norm": 0.05255107581615448, "grad_norm_var": 6.772460441392073e-05, "learning_rate": 0.009565551550123966, "loss": 2.9232, "step": 1208 }, { "crossentropy": 3.0262441635131836, "epoch": 0.10284110241578769, "grad_norm": 0.051512423902750015, "grad_norm_var": 6.364404757654979e-05, "learning_rate": 0.00956473430419768, "loss": 3.0262, "step": 1209 }, { "crossentropy": 2.9610543251037598, "epoch": 0.10292616536236815, "grad_norm": 0.06173075735569, "grad_norm_var": 6.70043318110477e-05, "learning_rate": 0.009563916325306595, "loss": 2.9611, "step": 1210 }, { "crossentropy": 3.0581905841827393, "epoch": 0.10301122830894863, "grad_norm": 0.06941024214029312, "grad_norm_var": 7.77366746752966e-05, "learning_rate": 0.009563097613582053, "loss": 3.0582, "step": 1211 }, { "crossentropy": 2.975712776184082, "epoch": 0.10309629125552909, "grad_norm": 0.061981670558452606, "grad_norm_var": 8.022546386979487e-05, "learning_rate": 0.009562278169155518, "loss": 2.9757, "step": 1212 }, { "crossentropy": 2.948543071746826, "epoch": 0.10318135420210955, "grad_norm": 0.053956493735313416, "grad_norm_var": 7.912125883798437e-05, "learning_rate": 0.00956145799215857, "loss": 2.9485, "step": 1213 }, { "crossentropy": 2.840909481048584, "epoch": 0.10326641714869003, "grad_norm": 0.04903189465403557, "grad_norm_var": 7.665337195694363e-05, "learning_rate": 0.0095606370827229, "loss": 2.8409, "step": 1214 }, { "crossentropy": 3.035432815551758, "epoch": 0.1033514800952705, "grad_norm": 0.05101287364959717, "grad_norm_var": 7.384377404097784e-05, "learning_rate": 0.009559815440980328, "loss": 3.0354, "step": 1215 }, { "crossentropy": 2.8731205463409424, "epoch": 0.10343654304185097, "grad_norm": 0.045423541218042374, "grad_norm_var": 7.865673126561668e-05, "learning_rate": 0.009558993067062784, "loss": 2.8731, "step": 1216 }, { "crossentropy": 2.9687514305114746, "epoch": 0.10352160598843144, "grad_norm": 0.047419365495443344, "grad_norm_var": 8.127965511923689e-05, "learning_rate": 0.00955816996110232, "loss": 2.9688, "step": 1217 }, { "crossentropy": 3.0335946083068848, "epoch": 0.10360666893501191, "grad_norm": 0.0474240668118, "grad_norm_var": 8.252891095007157e-05, "learning_rate": 0.009557346123231098, "loss": 3.0336, "step": 1218 }, { "crossentropy": 2.953207015991211, "epoch": 0.10369173188159238, "grad_norm": 0.04673108831048012, "grad_norm_var": 8.453927158455678e-05, "learning_rate": 0.009556521553581408, "loss": 2.9532, "step": 1219 }, { "crossentropy": 2.9376027584075928, "epoch": 0.10377679482817284, "grad_norm": 0.053737539798021317, "grad_norm_var": 8.281840737879754e-05, "learning_rate": 0.009555696252285648, "loss": 2.9376, "step": 1220 }, { "crossentropy": 2.7931671142578125, "epoch": 0.10386185777475332, "grad_norm": 0.05569491162896156, "grad_norm_var": 8.02165418104965e-05, "learning_rate": 0.00955487021947634, "loss": 2.7932, "step": 1221 }, { "crossentropy": 2.8839032649993896, "epoch": 0.10394692072133378, "grad_norm": 0.10140538960695267, "grad_norm_var": 0.00021146085253826997, "learning_rate": 0.00955404345528612, "loss": 2.8839, "step": 1222 }, { "crossentropy": 2.9705750942230225, "epoch": 0.10403198366791426, "grad_norm": 0.05378761515021324, "grad_norm_var": 0.0001847930985594543, "learning_rate": 0.009553215959847742, "loss": 2.9706, "step": 1223 }, { "crossentropy": 2.9089484214782715, "epoch": 0.10411704661449472, "grad_norm": 0.06013691425323486, "grad_norm_var": 0.00018447070368174384, "learning_rate": 0.00955238773329408, "loss": 2.9089, "step": 1224 }, { "crossentropy": 2.9011847972869873, "epoch": 0.1042021095610752, "grad_norm": 0.052181925624608994, "grad_norm_var": 0.00018401780393879346, "learning_rate": 0.00955155877575812, "loss": 2.9012, "step": 1225 }, { "crossentropy": 3.004678249359131, "epoch": 0.10428717250765566, "grad_norm": 0.044909246265888214, "grad_norm_var": 0.00019096165439432122, "learning_rate": 0.009550729087372974, "loss": 3.0047, "step": 1226 }, { "crossentropy": 2.896791934967041, "epoch": 0.10437223545423613, "grad_norm": 0.05183878168463707, "grad_norm_var": 0.0001785835647231224, "learning_rate": 0.00954989866827186, "loss": 2.8968, "step": 1227 }, { "crossentropy": 2.9222803115844727, "epoch": 0.1044572984008166, "grad_norm": 0.05124621465802193, "grad_norm_var": 0.00017549555634844646, "learning_rate": 0.009549067518588125, "loss": 2.9223, "step": 1228 }, { "crossentropy": 3.011718511581421, "epoch": 0.10454236134739707, "grad_norm": 0.05141960084438324, "grad_norm_var": 0.00017595347945680104, "learning_rate": 0.009548235638455223, "loss": 3.0117, "step": 1229 }, { "crossentropy": 3.0303614139556885, "epoch": 0.10462742429397755, "grad_norm": 0.05397005379199982, "grad_norm_var": 0.00017423111285441508, "learning_rate": 0.009547403028006734, "loss": 3.0304, "step": 1230 }, { "crossentropy": 2.952256202697754, "epoch": 0.10471248724055801, "grad_norm": 0.05125201866030693, "grad_norm_var": 0.00017413079239367066, "learning_rate": 0.009546569687376348, "loss": 2.9523, "step": 1231 }, { "crossentropy": 2.997500419616699, "epoch": 0.10479755018713849, "grad_norm": 0.05135631933808327, "grad_norm_var": 0.00016932001245093572, "learning_rate": 0.009545735616697875, "loss": 2.9975, "step": 1232 }, { "crossentropy": 3.03300142288208, "epoch": 0.10488261313371895, "grad_norm": 0.05116446688771248, "grad_norm_var": 0.0001665825635590705, "learning_rate": 0.009544900816105246, "loss": 3.033, "step": 1233 }, { "crossentropy": 3.000562906265259, "epoch": 0.10496767608029942, "grad_norm": 0.050090331584215164, "grad_norm_var": 0.000164372361186676, "learning_rate": 0.009544065285732505, "loss": 3.0006, "step": 1234 }, { "crossentropy": 2.948274850845337, "epoch": 0.1050527390268799, "grad_norm": 0.04768069088459015, "grad_norm_var": 0.00016337446352057485, "learning_rate": 0.009543229025713812, "loss": 2.9483, "step": 1235 }, { "crossentropy": 2.986143112182617, "epoch": 0.10513780197346036, "grad_norm": 0.05110674723982811, "grad_norm_var": 0.00016429090730080698, "learning_rate": 0.009542392036183447, "loss": 2.9861, "step": 1236 }, { "crossentropy": 3.010611057281494, "epoch": 0.10522286492004083, "grad_norm": 0.04939882829785347, "grad_norm_var": 0.0001661452752539233, "learning_rate": 0.009541554317275807, "loss": 3.0106, "step": 1237 }, { "crossentropy": 2.9889070987701416, "epoch": 0.1053079278666213, "grad_norm": 0.10118217766284943, "grad_norm_var": 0.00016475416819677877, "learning_rate": 0.009540715869125407, "loss": 2.9889, "step": 1238 }, { "crossentropy": 2.857292652130127, "epoch": 0.10539299081320178, "grad_norm": 0.05483969673514366, "grad_norm_var": 0.00016471708689274773, "learning_rate": 0.009539876691866876, "loss": 2.8573, "step": 1239 }, { "crossentropy": 2.9323530197143555, "epoch": 0.10547805375978224, "grad_norm": 0.06696662306785583, "grad_norm_var": 0.00017266455892790572, "learning_rate": 0.00953903678563496, "loss": 2.9324, "step": 1240 }, { "crossentropy": 2.9758870601654053, "epoch": 0.1055631167063627, "grad_norm": 0.047681745141744614, "grad_norm_var": 0.00017564383842106878, "learning_rate": 0.009538196150564528, "loss": 2.9759, "step": 1241 }, { "crossentropy": 3.0440454483032227, "epoch": 0.10564817965294318, "grad_norm": 0.046792250126600266, "grad_norm_var": 0.0001733931298275583, "learning_rate": 0.00953735478679056, "loss": 3.044, "step": 1242 }, { "crossentropy": 2.9439640045166016, "epoch": 0.10573324259952364, "grad_norm": 0.05143019184470177, "grad_norm_var": 0.00017356892718420238, "learning_rate": 0.00953651269444815, "loss": 2.944, "step": 1243 }, { "crossentropy": 2.9553730487823486, "epoch": 0.10581830554610412, "grad_norm": 0.051895707845687866, "grad_norm_var": 0.00017328332704225045, "learning_rate": 0.009535669873672522, "loss": 2.9554, "step": 1244 }, { "crossentropy": 3.011190414428711, "epoch": 0.10590336849268459, "grad_norm": 0.05155332386493683, "grad_norm_var": 0.00017322258234187514, "learning_rate": 0.009534826324599002, "loss": 3.0112, "step": 1245 }, { "crossentropy": 2.9755969047546387, "epoch": 0.10598843143926506, "grad_norm": 0.055193349719047546, "grad_norm_var": 0.00017316482653640178, "learning_rate": 0.009533982047363044, "loss": 2.9756, "step": 1246 }, { "crossentropy": 2.7497050762176514, "epoch": 0.10607349438584553, "grad_norm": 0.05784624069929123, "grad_norm_var": 0.0001726100598542774, "learning_rate": 0.009533137042100213, "loss": 2.7497, "step": 1247 }, { "crossentropy": 2.9818389415740967, "epoch": 0.10615855733242599, "grad_norm": 0.05498889461159706, "grad_norm_var": 0.0001714829544794298, "learning_rate": 0.00953229130894619, "loss": 2.9818, "step": 1248 }, { "crossentropy": 2.9733126163482666, "epoch": 0.10624362027900647, "grad_norm": 0.05306623503565788, "grad_norm_var": 0.00017058093736971723, "learning_rate": 0.009531444848036781, "loss": 2.9733, "step": 1249 }, { "crossentropy": 2.9616000652313232, "epoch": 0.10632868322558693, "grad_norm": 0.050384536385536194, "grad_norm_var": 0.00017036503715478466, "learning_rate": 0.009530597659507898, "loss": 2.9616, "step": 1250 }, { "crossentropy": 2.946955442428589, "epoch": 0.10641374617216741, "grad_norm": 0.04887224733829498, "grad_norm_var": 0.0001691716982159676, "learning_rate": 0.009529749743495578, "loss": 2.947, "step": 1251 }, { "crossentropy": 2.9187238216400146, "epoch": 0.10649880911874787, "grad_norm": 0.0536380261182785, "grad_norm_var": 0.000167979755944748, "learning_rate": 0.00952890110013597, "loss": 2.9187, "step": 1252 }, { "crossentropy": 2.9675590991973877, "epoch": 0.10658387206532834, "grad_norm": 0.04462531954050064, "grad_norm_var": 0.00017359460108663238, "learning_rate": 0.009528051729565343, "loss": 2.9676, "step": 1253 }, { "crossentropy": 3.0215446949005127, "epoch": 0.10666893501190881, "grad_norm": 0.04913788661360741, "grad_norm_var": 2.7165345237637786e-05, "learning_rate": 0.009527201631920081, "loss": 3.0215, "step": 1254 }, { "crossentropy": 2.962756872177124, "epoch": 0.10675399795848928, "grad_norm": 0.04931420460343361, "grad_norm_var": 2.72997214265685e-05, "learning_rate": 0.009526350807336686, "loss": 2.9628, "step": 1255 }, { "crossentropy": 2.9853508472442627, "epoch": 0.10683906090506975, "grad_norm": 0.052937425673007965, "grad_norm_var": 1.1767037619551887e-05, "learning_rate": 0.009525499255951775, "loss": 2.9854, "step": 1256 }, { "crossentropy": 2.957204818725586, "epoch": 0.10692412385165022, "grad_norm": 0.05493763089179993, "grad_norm_var": 1.1644260790767229e-05, "learning_rate": 0.009524646977902083, "loss": 2.9572, "step": 1257 }, { "crossentropy": 2.9926633834838867, "epoch": 0.1070091867982307, "grad_norm": 0.06292126327753067, "grad_norm_var": 1.7427873528709447e-05, "learning_rate": 0.00952379397332446, "loss": 2.9927, "step": 1258 }, { "crossentropy": 2.890845537185669, "epoch": 0.10709424974481116, "grad_norm": 0.0540645495057106, "grad_norm_var": 1.742564020611806e-05, "learning_rate": 0.009522940242355877, "loss": 2.8908, "step": 1259 }, { "crossentropy": 2.996473789215088, "epoch": 0.10717931269139162, "grad_norm": 0.05032260715961456, "grad_norm_var": 1.7777539832029148e-05, "learning_rate": 0.009522085785133414, "loss": 2.9965, "step": 1260 }, { "crossentropy": 2.7847468852996826, "epoch": 0.1072643756379721, "grad_norm": 0.05160658434033394, "grad_norm_var": 1.7769306159486017e-05, "learning_rate": 0.009521230601794278, "loss": 2.7847, "step": 1261 }, { "crossentropy": 2.9434826374053955, "epoch": 0.10734943858455256, "grad_norm": 0.05306700989603996, "grad_norm_var": 1.7356635955306555e-05, "learning_rate": 0.009520374692475781, "loss": 2.9435, "step": 1262 }, { "crossentropy": 2.9207942485809326, "epoch": 0.10743450153113304, "grad_norm": 0.04711125046014786, "grad_norm_var": 1.7061713326495485e-05, "learning_rate": 0.00951951805731536, "loss": 2.9208, "step": 1263 }, { "crossentropy": 2.823857307434082, "epoch": 0.1075195644777135, "grad_norm": 0.05282464623451233, "grad_norm_var": 1.647385333776215e-05, "learning_rate": 0.009518660696450567, "loss": 2.8239, "step": 1264 }, { "crossentropy": 2.909872531890869, "epoch": 0.10760462742429398, "grad_norm": 0.05640805885195732, "grad_norm_var": 1.7735169489044705e-05, "learning_rate": 0.009517802610019069, "loss": 2.9099, "step": 1265 }, { "crossentropy": 2.9754178524017334, "epoch": 0.10768969037087445, "grad_norm": 0.05325290933251381, "grad_norm_var": 1.762741744297779e-05, "learning_rate": 0.009516943798158648, "loss": 2.9754, "step": 1266 }, { "crossentropy": 3.0284066200256348, "epoch": 0.10777475331745491, "grad_norm": 0.06278438866138458, "grad_norm_var": 2.3569686889409146e-05, "learning_rate": 0.009516084261007206, "loss": 3.0284, "step": 1267 }, { "crossentropy": 2.9100329875946045, "epoch": 0.10785981626403539, "grad_norm": 0.05204587057232857, "grad_norm_var": 2.3605331402549942e-05, "learning_rate": 0.00951522399870276, "loss": 2.91, "step": 1268 }, { "crossentropy": 2.9716076850891113, "epoch": 0.10794487921061585, "grad_norm": 0.050899602472782135, "grad_norm_var": 1.9093109149460827e-05, "learning_rate": 0.009514363011383444, "loss": 2.9716, "step": 1269 }, { "crossentropy": 2.859919309616089, "epoch": 0.10802994215719633, "grad_norm": 0.06081448867917061, "grad_norm_var": 2.1053300502092177e-05, "learning_rate": 0.009513501299187505, "loss": 2.8599, "step": 1270 }, { "crossentropy": 2.9086132049560547, "epoch": 0.10811500510377679, "grad_norm": 0.05977100133895874, "grad_norm_var": 2.1239846144695727e-05, "learning_rate": 0.009512638862253312, "loss": 2.9086, "step": 1271 }, { "crossentropy": 2.9227302074432373, "epoch": 0.10820006805035727, "grad_norm": 0.05496075749397278, "grad_norm_var": 2.1010611283570763e-05, "learning_rate": 0.009511775700719345, "loss": 2.9227, "step": 1272 }, { "crossentropy": 3.0363872051239014, "epoch": 0.10828513099693773, "grad_norm": 0.05214592069387436, "grad_norm_var": 2.1469576614804865e-05, "learning_rate": 0.009510911814724207, "loss": 3.0364, "step": 1273 }, { "crossentropy": 2.8695292472839355, "epoch": 0.1083701939435182, "grad_norm": 0.05878358706831932, "grad_norm_var": 1.7997144542367984e-05, "learning_rate": 0.009510047204406609, "loss": 2.8695, "step": 1274 }, { "crossentropy": 3.0136821269989014, "epoch": 0.10845525689009868, "grad_norm": 0.05768568441271782, "grad_norm_var": 1.8640743066337256e-05, "learning_rate": 0.009509181869905384, "loss": 3.0137, "step": 1275 }, { "crossentropy": 2.877607583999634, "epoch": 0.10854031983667914, "grad_norm": 0.048947159200906754, "grad_norm_var": 1.955356498193981e-05, "learning_rate": 0.009508315811359481, "loss": 2.8776, "step": 1276 }, { "crossentropy": 2.882564067840576, "epoch": 0.10862538278325962, "grad_norm": 0.045451968908309937, "grad_norm_var": 2.4352276885724936e-05, "learning_rate": 0.00950744902890796, "loss": 2.8826, "step": 1277 }, { "crossentropy": 2.985016107559204, "epoch": 0.10871044572984008, "grad_norm": 0.05380404368042946, "grad_norm_var": 2.4276396846549067e-05, "learning_rate": 0.009506581522690006, "loss": 2.985, "step": 1278 }, { "crossentropy": 2.8141930103302, "epoch": 0.10879550867642056, "grad_norm": 0.090690977871418, "grad_norm_var": 0.00010160739741610728, "learning_rate": 0.009505713292844914, "loss": 2.8142, "step": 1279 }, { "crossentropy": 2.9535090923309326, "epoch": 0.10888057162300102, "grad_norm": 0.062943235039711, "grad_norm_var": 0.00010243481890274778, "learning_rate": 0.009504844339512096, "loss": 2.9535, "step": 1280 }, { "crossentropy": 2.8511810302734375, "epoch": 0.10896563456958148, "grad_norm": 0.05921286717057228, "grad_norm_var": 0.00010248566422818734, "learning_rate": 0.00950397466283108, "loss": 2.8512, "step": 1281 }, { "crossentropy": 2.9129903316497803, "epoch": 0.10905069751616196, "grad_norm": 0.052710119634866714, "grad_norm_var": 0.00010283042088612577, "learning_rate": 0.009503104262941514, "loss": 2.913, "step": 1282 }, { "crossentropy": 2.956981897354126, "epoch": 0.10913576046274243, "grad_norm": 0.05464120954275131, "grad_norm_var": 0.00010148512003826845, "learning_rate": 0.009502233139983156, "loss": 2.957, "step": 1283 }, { "crossentropy": 2.985276460647583, "epoch": 0.1092208234093229, "grad_norm": 0.05537034571170807, "grad_norm_var": 9.98826952872497e-05, "learning_rate": 0.009501361294095886, "loss": 2.9853, "step": 1284 }, { "crossentropy": 2.8819408416748047, "epoch": 0.10930588635590337, "grad_norm": 0.05182476341724396, "grad_norm_var": 9.913099722235608e-05, "learning_rate": 0.009500488725419696, "loss": 2.8819, "step": 1285 }, { "crossentropy": 2.8809099197387695, "epoch": 0.10939094930248384, "grad_norm": 0.050000011920928955, "grad_norm_var": 0.00010163949501505083, "learning_rate": 0.009499615434094694, "loss": 2.8809, "step": 1286 }, { "crossentropy": 2.941106081008911, "epoch": 0.10947601224906431, "grad_norm": 0.062190942466259, "grad_norm_var": 0.00010296122504968933, "learning_rate": 0.009498741420261107, "loss": 2.9411, "step": 1287 }, { "crossentropy": 2.908254861831665, "epoch": 0.10956107519564477, "grad_norm": 0.06086771562695503, "grad_norm_var": 0.00010356721553638799, "learning_rate": 0.009497866684059277, "loss": 2.9083, "step": 1288 }, { "crossentropy": 2.8581016063690186, "epoch": 0.10964613814222525, "grad_norm": 0.05978989228606224, "grad_norm_var": 0.00010193611721173815, "learning_rate": 0.00949699122562966, "loss": 2.8581, "step": 1289 }, { "crossentropy": 2.8948192596435547, "epoch": 0.10973120108880571, "grad_norm": 0.04676622524857521, "grad_norm_var": 0.00010939763152219353, "learning_rate": 0.00949611504511283, "loss": 2.8948, "step": 1290 }, { "crossentropy": 2.8006999492645264, "epoch": 0.10981626403538619, "grad_norm": 0.04674268886446953, "grad_norm_var": 0.00011596330830928568, "learning_rate": 0.009495238142649477, "loss": 2.8007, "step": 1291 }, { "crossentropy": 2.960775375366211, "epoch": 0.10990132698196665, "grad_norm": 0.058037545531988144, "grad_norm_var": 0.000112128549902978, "learning_rate": 0.009494360518380405, "loss": 2.9608, "step": 1292 }, { "crossentropy": 2.9817616939544678, "epoch": 0.10998638992854712, "grad_norm": 0.05184618756175041, "grad_norm_var": 0.00010488941995185346, "learning_rate": 0.009493482172446535, "loss": 2.9818, "step": 1293 }, { "crossentropy": 3.003455400466919, "epoch": 0.1100714528751276, "grad_norm": 0.04478908330202103, "grad_norm_var": 0.00011421887289699469, "learning_rate": 0.009492603104988907, "loss": 3.0035, "step": 1294 }, { "crossentropy": 2.8775129318237305, "epoch": 0.11015651582170806, "grad_norm": 0.04681174084544182, "grad_norm_var": 3.613666389890666e-05, "learning_rate": 0.00949172331614867, "loss": 2.8775, "step": 1295 }, { "crossentropy": 3.0063912868499756, "epoch": 0.11024157876828854, "grad_norm": 0.04641944169998169, "grad_norm_var": 3.3572895959018724e-05, "learning_rate": 0.009490842806067094, "loss": 3.0064, "step": 1296 }, { "crossentropy": 2.9356448650360107, "epoch": 0.110326641714869, "grad_norm": 0.051771730184555054, "grad_norm_var": 3.087073595297677e-05, "learning_rate": 0.009489961574885566, "loss": 2.9356, "step": 1297 }, { "crossentropy": 2.913625955581665, "epoch": 0.11041170466144948, "grad_norm": 0.04987085238099098, "grad_norm_var": 3.130874578675021e-05, "learning_rate": 0.009489079622745585, "loss": 2.9136, "step": 1298 }, { "crossentropy": 2.9364283084869385, "epoch": 0.11049676760802994, "grad_norm": 0.05651581659913063, "grad_norm_var": 3.209886968169312e-05, "learning_rate": 0.009488196949788764, "loss": 2.9364, "step": 1299 }, { "crossentropy": 2.906080484390259, "epoch": 0.1105818305546104, "grad_norm": 0.05043606460094452, "grad_norm_var": 3.171632147053732e-05, "learning_rate": 0.009487313556156838, "loss": 2.9061, "step": 1300 }, { "crossentropy": 2.9234135150909424, "epoch": 0.11066689350119088, "grad_norm": 0.04910071939229965, "grad_norm_var": 3.2304597347724597e-05, "learning_rate": 0.009486429441991654, "loss": 2.9234, "step": 1301 }, { "crossentropy": 2.936462879180908, "epoch": 0.11075195644777135, "grad_norm": 0.049430474638938904, "grad_norm_var": 3.247654064078142e-05, "learning_rate": 0.009485544607435175, "loss": 2.9365, "step": 1302 }, { "crossentropy": 2.959780693054199, "epoch": 0.11083701939435182, "grad_norm": 0.05032894387841225, "grad_norm_var": 2.5092153051932872e-05, "learning_rate": 0.00948465905262948, "loss": 2.9598, "step": 1303 }, { "crossentropy": 2.8517017364501953, "epoch": 0.11092208234093229, "grad_norm": 0.04790536314249039, "grad_norm_var": 1.8919839018272504e-05, "learning_rate": 0.009483772777716766, "loss": 2.8517, "step": 1304 }, { "crossentropy": 3.0885698795318604, "epoch": 0.11100714528751277, "grad_norm": 0.05156107246875763, "grad_norm_var": 1.286072873144135e-05, "learning_rate": 0.009482885782839338, "loss": 3.0886, "step": 1305 }, { "crossentropy": 2.882413625717163, "epoch": 0.11109220823409323, "grad_norm": 0.06008711829781532, "grad_norm_var": 1.839249060477735e-05, "learning_rate": 0.009481998068139626, "loss": 2.8824, "step": 1306 }, { "crossentropy": 2.963371992111206, "epoch": 0.11117727118067369, "grad_norm": 0.05343717336654663, "grad_norm_var": 1.7635836017458084e-05, "learning_rate": 0.009481109633760169, "loss": 2.9634, "step": 1307 }, { "crossentropy": 2.855401039123535, "epoch": 0.11126233412725417, "grad_norm": 0.047072798013687134, "grad_norm_var": 1.5075951765092244e-05, "learning_rate": 0.009480220479843626, "loss": 2.8554, "step": 1308 }, { "crossentropy": 2.83967924118042, "epoch": 0.11134739707383463, "grad_norm": 0.05367295444011688, "grad_norm_var": 1.5621777106731607e-05, "learning_rate": 0.009479330606532768, "loss": 2.8397, "step": 1309 }, { "crossentropy": 2.996121406555176, "epoch": 0.11143246002041511, "grad_norm": 0.05005275085568428, "grad_norm_var": 1.3292231030412328e-05, "learning_rate": 0.009478440013970484, "loss": 2.9961, "step": 1310 }, { "crossentropy": 2.9204766750335693, "epoch": 0.11151752296699557, "grad_norm": 0.05201764777302742, "grad_norm_var": 1.2145072159339827e-05, "learning_rate": 0.009477548702299776, "loss": 2.9205, "step": 1311 }, { "crossentropy": 2.8149948120117188, "epoch": 0.11160258591357605, "grad_norm": 0.050806570798158646, "grad_norm_var": 1.0534031112868249e-05, "learning_rate": 0.009476656671663766, "loss": 2.815, "step": 1312 }, { "crossentropy": 2.9647748470306396, "epoch": 0.11168764886015652, "grad_norm": 0.060602426528930664, "grad_norm_var": 1.5722790414354237e-05, "learning_rate": 0.009475763922205685, "loss": 2.9648, "step": 1313 }, { "crossentropy": 3.0206570625305176, "epoch": 0.11177271180673698, "grad_norm": 0.0645427405834198, "grad_norm_var": 2.4901774616651466e-05, "learning_rate": 0.009474870454068885, "loss": 3.0207, "step": 1314 }, { "crossentropy": 2.9632692337036133, "epoch": 0.11185777475331746, "grad_norm": 0.068884938955307, "grad_norm_var": 4.0306573147070626e-05, "learning_rate": 0.00947397626739683, "loss": 2.9633, "step": 1315 }, { "crossentropy": 2.9198756217956543, "epoch": 0.11194283769989792, "grad_norm": 0.055758606642484665, "grad_norm_var": 3.972802796056718e-05, "learning_rate": 0.009473081362333102, "loss": 2.9199, "step": 1316 }, { "crossentropy": 2.9669289588928223, "epoch": 0.1120279006464784, "grad_norm": 0.048035938292741776, "grad_norm_var": 4.0505643345138155e-05, "learning_rate": 0.009472185739021395, "loss": 2.9669, "step": 1317 }, { "crossentropy": 2.7367911338806152, "epoch": 0.11211296359305886, "grad_norm": 0.04982227087020874, "grad_norm_var": 4.0275882762102515e-05, "learning_rate": 0.009471289397605522, "loss": 2.7368, "step": 1318 }, { "crossentropy": 2.939761161804199, "epoch": 0.11219802653963934, "grad_norm": 0.049718454480171204, "grad_norm_var": 4.060099321022669e-05, "learning_rate": 0.009470392338229409, "loss": 2.9398, "step": 1319 }, { "crossentropy": 2.9477226734161377, "epoch": 0.1122830894862198, "grad_norm": 0.046564582735300064, "grad_norm_var": 4.180265500531155e-05, "learning_rate": 0.009469494561037097, "loss": 2.9477, "step": 1320 }, { "crossentropy": 2.9253666400909424, "epoch": 0.11236815243280027, "grad_norm": 0.048162709921598434, "grad_norm_var": 4.359100378146149e-05, "learning_rate": 0.009468596066172745, "loss": 2.9254, "step": 1321 }, { "crossentropy": 2.9883692264556885, "epoch": 0.11245321537938074, "grad_norm": 0.0513380765914917, "grad_norm_var": 4.0927183241794225e-05, "learning_rate": 0.009467696853780625, "loss": 2.9884, "step": 1322 }, { "crossentropy": 2.8717079162597656, "epoch": 0.11253827832596121, "grad_norm": 0.04801909625530243, "grad_norm_var": 4.255854121576191e-05, "learning_rate": 0.009466796924005124, "loss": 2.8717, "step": 1323 }, { "crossentropy": 2.987630844116211, "epoch": 0.11262334127254169, "grad_norm": 0.05205918103456497, "grad_norm_var": 4.0293479933856066e-05, "learning_rate": 0.009465896276990744, "loss": 2.9876, "step": 1324 }, { "crossentropy": 2.832737684249878, "epoch": 0.11270840421912215, "grad_norm": 0.050912436097860336, "grad_norm_var": 4.056942970443918e-05, "learning_rate": 0.009464994912882104, "loss": 2.8327, "step": 1325 }, { "crossentropy": 2.8909361362457275, "epoch": 0.11279346716570263, "grad_norm": 0.04828663915395737, "grad_norm_var": 4.144807399206884e-05, "learning_rate": 0.009464092831823936, "loss": 2.8909, "step": 1326 }, { "crossentropy": 2.940980911254883, "epoch": 0.11287853011228309, "grad_norm": 0.04916216433048248, "grad_norm_var": 4.227297742722402e-05, "learning_rate": 0.00946319003396109, "loss": 2.941, "step": 1327 }, { "crossentropy": 2.814769983291626, "epoch": 0.11296359305886355, "grad_norm": 0.05258096754550934, "grad_norm_var": 4.202953426134659e-05, "learning_rate": 0.00946228651943853, "loss": 2.8148, "step": 1328 }, { "crossentropy": 2.905529260635376, "epoch": 0.11304865600544403, "grad_norm": 0.058994702994823456, "grad_norm_var": 4.0513857348436064e-05, "learning_rate": 0.00946138228840133, "loss": 2.9055, "step": 1329 }, { "crossentropy": 2.9225199222564697, "epoch": 0.1131337189520245, "grad_norm": 0.05642715096473694, "grad_norm_var": 3.1791396063477785e-05, "learning_rate": 0.00946047734099469, "loss": 2.9225, "step": 1330 }, { "crossentropy": 2.913259506225586, "epoch": 0.11321878189860497, "grad_norm": 0.05061804875731468, "grad_norm_var": 1.1936895436972146e-05, "learning_rate": 0.009459571677363911, "loss": 2.9133, "step": 1331 }, { "crossentropy": 2.9503650665283203, "epoch": 0.11330384484518544, "grad_norm": 0.04638339951634407, "grad_norm_var": 1.1517931011652504e-05, "learning_rate": 0.00945866529765442, "loss": 2.9504, "step": 1332 }, { "crossentropy": 2.964353561401367, "epoch": 0.1133889077917659, "grad_norm": 0.05384209752082825, "grad_norm_var": 1.1761566517831147e-05, "learning_rate": 0.00945775820201176, "loss": 2.9644, "step": 1333 }, { "crossentropy": 2.82548189163208, "epoch": 0.11347397073834638, "grad_norm": 0.05174938216805458, "grad_norm_var": 1.1740973565275287e-05, "learning_rate": 0.009456850390581577, "loss": 2.8255, "step": 1334 }, { "crossentropy": 2.937696695327759, "epoch": 0.11355903368492684, "grad_norm": 0.05184003338217735, "grad_norm_var": 1.1680650418506057e-05, "learning_rate": 0.009455941863509641, "loss": 2.9377, "step": 1335 }, { "crossentropy": 2.937436580657959, "epoch": 0.11364409663150732, "grad_norm": 0.053982995450496674, "grad_norm_var": 1.0674883669410894e-05, "learning_rate": 0.00945503262094184, "loss": 2.9374, "step": 1336 }, { "crossentropy": 2.799705743789673, "epoch": 0.11372915957808778, "grad_norm": 0.05374671891331673, "grad_norm_var": 1.0122268943526155e-05, "learning_rate": 0.009454122663024167, "loss": 2.7997, "step": 1337 }, { "crossentropy": 2.9212002754211426, "epoch": 0.11381422252466826, "grad_norm": 0.05368775501847267, "grad_norm_var": 1.030023207965926e-05, "learning_rate": 0.009453211989902737, "loss": 2.9212, "step": 1338 }, { "crossentropy": 2.9422073364257812, "epoch": 0.11389928547124872, "grad_norm": 0.04871885105967522, "grad_norm_var": 9.95770753775919e-06, "learning_rate": 0.009452300601723779, "loss": 2.9422, "step": 1339 }, { "crossentropy": 2.9013113975524902, "epoch": 0.11398434841782919, "grad_norm": 0.049471672624349594, "grad_norm_var": 1.0377141366620044e-05, "learning_rate": 0.009451388498633634, "loss": 2.9013, "step": 1340 }, { "crossentropy": 3.017146348953247, "epoch": 0.11406941136440966, "grad_norm": 0.05311087146401405, "grad_norm_var": 1.0389639972068337e-05, "learning_rate": 0.009450475680778761, "loss": 3.0171, "step": 1341 }, { "crossentropy": 2.889514923095703, "epoch": 0.11415447431099013, "grad_norm": 0.045663971453905106, "grad_norm_var": 1.2131249387657e-05, "learning_rate": 0.009449562148305729, "loss": 2.8895, "step": 1342 }, { "crossentropy": 2.883260726928711, "epoch": 0.1142395372575706, "grad_norm": 0.050893135368824005, "grad_norm_var": 1.169268088508276e-05, "learning_rate": 0.009448647901361228, "loss": 2.8833, "step": 1343 }, { "crossentropy": 2.9177355766296387, "epoch": 0.11432460020415107, "grad_norm": 0.055720020085573196, "grad_norm_var": 1.2559232572267356e-05, "learning_rate": 0.00944773294009206, "loss": 2.9177, "step": 1344 }, { "crossentropy": 2.864854097366333, "epoch": 0.11440966315073155, "grad_norm": 0.05424287170171738, "grad_norm_var": 9.651677824342766e-06, "learning_rate": 0.009446817264645139, "loss": 2.8649, "step": 1345 }, { "crossentropy": 2.8971781730651855, "epoch": 0.11449472609731201, "grad_norm": 0.04715513810515404, "grad_norm_var": 9.404783937796978e-06, "learning_rate": 0.009445900875167498, "loss": 2.8972, "step": 1346 }, { "crossentropy": 2.929614782333374, "epoch": 0.11457978904389247, "grad_norm": 0.04526808112859726, "grad_norm_var": 1.1681326224138346e-05, "learning_rate": 0.009444983771806281, "loss": 2.9296, "step": 1347 }, { "crossentropy": 2.9164071083068848, "epoch": 0.11466485199047295, "grad_norm": 0.04890865087509155, "grad_norm_var": 1.0536477899673186e-05, "learning_rate": 0.009444065954708751, "loss": 2.9164, "step": 1348 }, { "crossentropy": 2.889873504638672, "epoch": 0.11474991493705342, "grad_norm": 0.050585754215717316, "grad_norm_var": 1.0019567612630971e-05, "learning_rate": 0.009443147424022282, "loss": 2.8899, "step": 1349 }, { "crossentropy": 2.8843376636505127, "epoch": 0.11483497788363389, "grad_norm": 0.05794814974069595, "grad_norm_var": 1.3105260825094164e-05, "learning_rate": 0.009442228179894362, "loss": 2.8843, "step": 1350 }, { "crossentropy": 2.8101449012756348, "epoch": 0.11492004083021436, "grad_norm": 0.05662871524691582, "grad_norm_var": 1.4877511180176549e-05, "learning_rate": 0.009441308222472595, "loss": 2.8101, "step": 1351 }, { "crossentropy": 2.914909839630127, "epoch": 0.11500510377679483, "grad_norm": 0.05663342773914337, "grad_norm_var": 1.6155744368513632e-05, "learning_rate": 0.009440387551904704, "loss": 2.9149, "step": 1352 }, { "crossentropy": 2.9547741413116455, "epoch": 0.1150901667233753, "grad_norm": 0.05236625671386719, "grad_norm_var": 1.5911744782968334e-05, "learning_rate": 0.009439466168338517, "loss": 2.9548, "step": 1353 }, { "crossentropy": 3.005758762359619, "epoch": 0.11517522966995576, "grad_norm": 0.05009794980287552, "grad_norm_var": 1.575985956677363e-05, "learning_rate": 0.009438544071921984, "loss": 3.0058, "step": 1354 }, { "crossentropy": 2.8619797229766846, "epoch": 0.11526029261653624, "grad_norm": 0.04831651225686073, "grad_norm_var": 1.5917205693995128e-05, "learning_rate": 0.009437621262803167, "loss": 2.862, "step": 1355 }, { "crossentropy": 2.8740720748901367, "epoch": 0.1153453555631167, "grad_norm": 0.04648343473672867, "grad_norm_var": 1.7258829895207737e-05, "learning_rate": 0.009436697741130243, "loss": 2.8741, "step": 1356 }, { "crossentropy": 2.9394333362579346, "epoch": 0.11543041850969718, "grad_norm": 0.04433009400963783, "grad_norm_var": 1.9900733734740107e-05, "learning_rate": 0.0094357735070515, "loss": 2.9394, "step": 1357 }, { "crossentropy": 2.871107578277588, "epoch": 0.11551548145627764, "grad_norm": 0.05463431775569916, "grad_norm_var": 1.8903453791084027e-05, "learning_rate": 0.009434848560715348, "loss": 2.8711, "step": 1358 }, { "crossentropy": 2.8287670612335205, "epoch": 0.11560054440285812, "grad_norm": 0.052403081208467484, "grad_norm_var": 1.8971429668062968e-05, "learning_rate": 0.009433922902270303, "loss": 2.8288, "step": 1359 }, { "crossentropy": 2.9154040813446045, "epoch": 0.11568560734943859, "grad_norm": 0.0482940673828125, "grad_norm_var": 1.809868091442035e-05, "learning_rate": 0.009432996531865, "loss": 2.9154, "step": 1360 }, { "crossentropy": 2.90872859954834, "epoch": 0.11577067029601905, "grad_norm": 0.04533359035849571, "grad_norm_var": 1.9080941467419558e-05, "learning_rate": 0.00943206944964819, "loss": 2.9087, "step": 1361 }, { "crossentropy": 2.872162103652954, "epoch": 0.11585573324259953, "grad_norm": 0.051236364990472794, "grad_norm_var": 1.839067642370925e-05, "learning_rate": 0.009431141655768732, "loss": 2.8722, "step": 1362 }, { "crossentropy": 2.8548593521118164, "epoch": 0.11594079618917999, "grad_norm": 0.05029883235692978, "grad_norm_var": 1.6401495760322755e-05, "learning_rate": 0.009430213150375606, "loss": 2.8549, "step": 1363 }, { "crossentropy": 2.911198139190674, "epoch": 0.11602585913576047, "grad_norm": 0.05147809162735939, "grad_norm_var": 1.6129777834429127e-05, "learning_rate": 0.0094292839336179, "loss": 2.9112, "step": 1364 }, { "crossentropy": 2.897557258605957, "epoch": 0.11611092208234093, "grad_norm": 0.0479959137737751, "grad_norm_var": 1.6715089889265954e-05, "learning_rate": 0.009428354005644821, "loss": 2.8976, "step": 1365 }, { "crossentropy": 2.894669771194458, "epoch": 0.11619598502892141, "grad_norm": 0.047171421349048615, "grad_norm_var": 1.385331770152413e-05, "learning_rate": 0.00942742336660569, "loss": 2.8947, "step": 1366 }, { "crossentropy": 2.9025087356567383, "epoch": 0.11628104797550187, "grad_norm": 0.05168592557311058, "grad_norm_var": 1.1164174440403406e-05, "learning_rate": 0.009426492016649938, "loss": 2.9025, "step": 1367 }, { "crossentropy": 2.970682382583618, "epoch": 0.11636611092208234, "grad_norm": 0.06328115612268448, "grad_norm_var": 1.9874555835702514e-05, "learning_rate": 0.009425559955927117, "loss": 2.9707, "step": 1368 }, { "crossentropy": 3.0493948459625244, "epoch": 0.11645117386866281, "grad_norm": 0.053135331720113754, "grad_norm_var": 2.0119513659262675e-05, "learning_rate": 0.009424627184586885, "loss": 3.0494, "step": 1369 }, { "crossentropy": 2.888702154159546, "epoch": 0.11653623681524328, "grad_norm": 0.050228435546159744, "grad_norm_var": 2.0115566199006377e-05, "learning_rate": 0.009423693702779021, "loss": 2.8887, "step": 1370 }, { "crossentropy": 2.9594979286193848, "epoch": 0.11662129976182375, "grad_norm": 0.04992330074310303, "grad_norm_var": 1.983181456933671e-05, "learning_rate": 0.009422759510653413, "loss": 2.9595, "step": 1371 }, { "crossentropy": 2.914823532104492, "epoch": 0.11670636270840422, "grad_norm": 0.052945397794246674, "grad_norm_var": 1.8985637710016746e-05, "learning_rate": 0.009421824608360068, "loss": 2.9148, "step": 1372 }, { "crossentropy": 2.8601629734039307, "epoch": 0.11679142565498468, "grad_norm": 0.05253450199961662, "grad_norm_var": 1.600738576920894e-05, "learning_rate": 0.009420888996049101, "loss": 2.8602, "step": 1373 }, { "crossentropy": 2.8638792037963867, "epoch": 0.11687648860156516, "grad_norm": 0.04961054027080536, "grad_norm_var": 1.542584071218215e-05, "learning_rate": 0.009419952673870748, "loss": 2.8639, "step": 1374 }, { "crossentropy": 2.929502487182617, "epoch": 0.11696155154814562, "grad_norm": 0.05518589913845062, "grad_norm_var": 1.6394365266717005e-05, "learning_rate": 0.009419015641975352, "loss": 2.9295, "step": 1375 }, { "crossentropy": 2.954014778137207, "epoch": 0.1170466144947261, "grad_norm": 0.057043105363845825, "grad_norm_var": 1.7705561150245056e-05, "learning_rate": 0.009418077900513376, "loss": 2.954, "step": 1376 }, { "crossentropy": 2.9160969257354736, "epoch": 0.11713167744130656, "grad_norm": 0.05895354598760605, "grad_norm_var": 1.7523882771769822e-05, "learning_rate": 0.009417139449635393, "loss": 2.9161, "step": 1377 }, { "crossentropy": 2.921733856201172, "epoch": 0.11721674038788704, "grad_norm": 0.04947831854224205, "grad_norm_var": 1.8052926896876668e-05, "learning_rate": 0.009416200289492091, "loss": 2.9217, "step": 1378 }, { "crossentropy": 2.9787302017211914, "epoch": 0.1173018033344675, "grad_norm": 0.050885844975709915, "grad_norm_var": 1.7897535824310897e-05, "learning_rate": 0.009415260420234274, "loss": 2.9787, "step": 1379 }, { "crossentropy": 2.937032461166382, "epoch": 0.11738686628104797, "grad_norm": 0.05050661414861679, "grad_norm_var": 1.8101330310112997e-05, "learning_rate": 0.009414319842012854, "loss": 2.937, "step": 1380 }, { "crossentropy": 2.8895771503448486, "epoch": 0.11747192922762845, "grad_norm": 0.052497755736112595, "grad_norm_var": 1.6643228043485872e-05, "learning_rate": 0.009413378554978865, "loss": 2.8896, "step": 1381 }, { "crossentropy": 2.8933825492858887, "epoch": 0.11755699217420891, "grad_norm": 0.047902654856443405, "grad_norm_var": 1.612624533147052e-05, "learning_rate": 0.009412436559283447, "loss": 2.8934, "step": 1382 }, { "crossentropy": 2.8466553688049316, "epoch": 0.11764205512078939, "grad_norm": 0.0478472001850605, "grad_norm_var": 1.764938629942291e-05, "learning_rate": 0.009411493855077861, "loss": 2.8467, "step": 1383 }, { "crossentropy": 2.9246342182159424, "epoch": 0.11772711806736985, "grad_norm": 0.0538482628762722, "grad_norm_var": 9.804976909205617e-06, "learning_rate": 0.009410550442513474, "loss": 2.9246, "step": 1384 }, { "crossentropy": 2.9568662643432617, "epoch": 0.11781218101395033, "grad_norm": 0.0653439536690712, "grad_norm_var": 2.0915154779699066e-05, "learning_rate": 0.009409606321741775, "loss": 2.9569, "step": 1385 }, { "crossentropy": 2.856811285018921, "epoch": 0.11789724396053079, "grad_norm": 0.056423865258693695, "grad_norm_var": 2.119319326138961e-05, "learning_rate": 0.00940866149291436, "loss": 2.8568, "step": 1386 }, { "crossentropy": 2.900620222091675, "epoch": 0.11798230690711126, "grad_norm": 0.049822647124528885, "grad_norm_var": 2.1237575511408776e-05, "learning_rate": 0.009407715956182942, "loss": 2.9006, "step": 1387 }, { "crossentropy": 2.95534348487854, "epoch": 0.11806736985369173, "grad_norm": 0.053020622581243515, "grad_norm_var": 2.1235607406988543e-05, "learning_rate": 0.009406769711699346, "loss": 2.9553, "step": 1388 }, { "crossentropy": 2.9268343448638916, "epoch": 0.1181524328002722, "grad_norm": 0.05070184916257858, "grad_norm_var": 2.1603637649457903e-05, "learning_rate": 0.009405822759615514, "loss": 2.9268, "step": 1389 }, { "crossentropy": 2.901972770690918, "epoch": 0.11823749574685267, "grad_norm": 0.054657503962516785, "grad_norm_var": 2.0869648977734728e-05, "learning_rate": 0.009404875100083497, "loss": 2.902, "step": 1390 }, { "crossentropy": 2.9799306392669678, "epoch": 0.11832255869343314, "grad_norm": 0.04770958423614502, "grad_norm_var": 2.2565378073069466e-05, "learning_rate": 0.009403926733255462, "loss": 2.9799, "step": 1391 }, { "crossentropy": 2.797567367553711, "epoch": 0.11840762164001362, "grad_norm": 0.048258226364851, "grad_norm_var": 2.255368212950807e-05, "learning_rate": 0.00940297765928369, "loss": 2.7976, "step": 1392 }, { "crossentropy": 2.815124034881592, "epoch": 0.11849268458659408, "grad_norm": 0.05122965946793556, "grad_norm_var": 1.9498296886164695e-05, "learning_rate": 0.009402027878320576, "loss": 2.8151, "step": 1393 }, { "crossentropy": 2.8747870922088623, "epoch": 0.11857774753317454, "grad_norm": 0.057636525481939316, "grad_norm_var": 2.1041903442074045e-05, "learning_rate": 0.009401077390518623, "loss": 2.8748, "step": 1394 }, { "crossentropy": 2.956378221511841, "epoch": 0.11866281047975502, "grad_norm": 0.05623636767268181, "grad_norm_var": 2.1755737620185456e-05, "learning_rate": 0.00940012619603046, "loss": 2.9564, "step": 1395 }, { "crossentropy": 2.868525266647339, "epoch": 0.11874787342633548, "grad_norm": 0.04864215850830078, "grad_norm_var": 2.2525150044727428e-05, "learning_rate": 0.009399174295008814, "loss": 2.8685, "step": 1396 }, { "crossentropy": 2.9555795192718506, "epoch": 0.11883293637291596, "grad_norm": 0.04782145470380783, "grad_norm_var": 2.3962606134032318e-05, "learning_rate": 0.009398221687606536, "loss": 2.9556, "step": 1397 }, { "crossentropy": 2.8900563716888428, "epoch": 0.11891799931949643, "grad_norm": 0.047338955104351044, "grad_norm_var": 2.431439145058889e-05, "learning_rate": 0.009397268373976586, "loss": 2.8901, "step": 1398 }, { "crossentropy": 2.845309019088745, "epoch": 0.1190030622660769, "grad_norm": 0.050288788974285126, "grad_norm_var": 2.324270249302266e-05, "learning_rate": 0.00939631435427204, "loss": 2.8453, "step": 1399 }, { "crossentropy": 2.9255480766296387, "epoch": 0.11908812521265737, "grad_norm": 0.055940721184015274, "grad_norm_var": 2.3910287683379973e-05, "learning_rate": 0.009395359628646086, "loss": 2.9255, "step": 1400 }, { "crossentropy": 2.853520154953003, "epoch": 0.11917318815923783, "grad_norm": 0.052720390260219574, "grad_norm_var": 1.2364603049120198e-05, "learning_rate": 0.009394404197252024, "loss": 2.8535, "step": 1401 }, { "crossentropy": 2.906799077987671, "epoch": 0.11925825110581831, "grad_norm": 0.051038406789302826, "grad_norm_var": 1.084134472015685e-05, "learning_rate": 0.00939344806024327, "loss": 2.9068, "step": 1402 }, { "crossentropy": 2.948704481124878, "epoch": 0.11934331405239877, "grad_norm": 0.061819497495889664, "grad_norm_var": 1.7247149973001417e-05, "learning_rate": 0.009392491217773354, "loss": 2.9487, "step": 1403 }, { "crossentropy": 2.9064300060272217, "epoch": 0.11942837699897925, "grad_norm": 0.05378606542944908, "grad_norm_var": 1.7368409328583947e-05, "learning_rate": 0.009391533669995913, "loss": 2.9064, "step": 1404 }, { "crossentropy": 2.9188780784606934, "epoch": 0.11951343994555971, "grad_norm": 0.049658339470624924, "grad_norm_var": 1.7650356002595532e-05, "learning_rate": 0.009390575417064706, "loss": 2.9189, "step": 1405 }, { "crossentropy": 2.9339051246643066, "epoch": 0.11959850289214019, "grad_norm": 0.05153069645166397, "grad_norm_var": 1.7225986678353945e-05, "learning_rate": 0.009389616459133596, "loss": 2.9339, "step": 1406 }, { "crossentropy": 2.8695995807647705, "epoch": 0.11968356583872065, "grad_norm": 0.051703743636608124, "grad_norm_var": 1.5949643245805056e-05, "learning_rate": 0.009388656796356568, "loss": 2.8696, "step": 1407 }, { "crossentropy": 2.915646553039551, "epoch": 0.11976862878530112, "grad_norm": 0.06282702833414078, "grad_norm_var": 2.150371242964164e-05, "learning_rate": 0.009387696428887716, "loss": 2.9156, "step": 1408 }, { "crossentropy": 2.907503604888916, "epoch": 0.1198536917318816, "grad_norm": 0.05443368852138519, "grad_norm_var": 2.1329786262800492e-05, "learning_rate": 0.009386735356881244, "loss": 2.9075, "step": 1409 }, { "crossentropy": 2.7817766666412354, "epoch": 0.11993875467846206, "grad_norm": 0.05015527829527855, "grad_norm_var": 2.0540999357680085e-05, "learning_rate": 0.009385773580491474, "loss": 2.7818, "step": 1410 }, { "crossentropy": 2.936711311340332, "epoch": 0.12002381762504254, "grad_norm": 0.04500145837664604, "grad_norm_var": 2.3389190966107096e-05, "learning_rate": 0.00938481109987284, "loss": 2.9367, "step": 1411 }, { "crossentropy": 2.936143398284912, "epoch": 0.120108880571623, "grad_norm": 0.04787357524037361, "grad_norm_var": 2.3787550939792917e-05, "learning_rate": 0.009383847915179891, "loss": 2.9361, "step": 1412 }, { "crossentropy": 2.9093103408813477, "epoch": 0.12019394351820346, "grad_norm": 0.05108577385544777, "grad_norm_var": 2.2582135279756305e-05, "learning_rate": 0.009382884026567284, "loss": 2.9093, "step": 1413 }, { "crossentropy": 2.884242057800293, "epoch": 0.12027900646478394, "grad_norm": 0.044955749064683914, "grad_norm_var": 2.4521532171699738e-05, "learning_rate": 0.009381919434189792, "loss": 2.8842, "step": 1414 }, { "crossentropy": 2.9542129039764404, "epoch": 0.1203640694113644, "grad_norm": 0.05234837904572487, "grad_norm_var": 2.426834586734076e-05, "learning_rate": 0.0093809541382023, "loss": 2.9542, "step": 1415 }, { "crossentropy": 2.9582505226135254, "epoch": 0.12044913235794488, "grad_norm": 0.04755346477031708, "grad_norm_var": 2.4599060672399602e-05, "learning_rate": 0.00937998813875981, "loss": 2.9583, "step": 1416 }, { "crossentropy": 2.8045647144317627, "epoch": 0.12053419530452535, "grad_norm": 0.04374702647328377, "grad_norm_var": 2.850737332222459e-05, "learning_rate": 0.00937902143601743, "loss": 2.8046, "step": 1417 }, { "crossentropy": 2.9143643379211426, "epoch": 0.12061925825110582, "grad_norm": 0.05357157438993454, "grad_norm_var": 2.8847136419053964e-05, "learning_rate": 0.009378054030130385, "loss": 2.9144, "step": 1418 }, { "crossentropy": 2.8143792152404785, "epoch": 0.12070432119768629, "grad_norm": 0.049708254635334015, "grad_norm_var": 2.1153842859356933e-05, "learning_rate": 0.009377085921254016, "loss": 2.8144, "step": 1419 }, { "crossentropy": 3.014829397201538, "epoch": 0.12078938414426675, "grad_norm": 0.052795227617025375, "grad_norm_var": 2.0797094452198034e-05, "learning_rate": 0.00937611710954377, "loss": 3.0148, "step": 1420 }, { "crossentropy": 2.9837450981140137, "epoch": 0.12087444709084723, "grad_norm": 0.048838939517736435, "grad_norm_var": 2.093749403384437e-05, "learning_rate": 0.009375147595155208, "loss": 2.9837, "step": 1421 }, { "crossentropy": 2.9665374755859375, "epoch": 0.12095951003742769, "grad_norm": 0.05784808471798897, "grad_norm_var": 2.429316943962622e-05, "learning_rate": 0.009374177378244013, "loss": 2.9665, "step": 1422 }, { "crossentropy": 2.8911285400390625, "epoch": 0.12104457298400817, "grad_norm": 0.05427870899438858, "grad_norm_var": 2.4982506803844275e-05, "learning_rate": 0.00937320645896597, "loss": 2.8911, "step": 1423 }, { "crossentropy": 2.945739269256592, "epoch": 0.12112963593058863, "grad_norm": 0.04997511953115463, "grad_norm_var": 1.5148555467179443e-05, "learning_rate": 0.009372234837476979, "loss": 2.9457, "step": 1424 }, { "crossentropy": 2.861807107925415, "epoch": 0.12121469887716911, "grad_norm": 0.048477720469236374, "grad_norm_var": 1.405171665481101e-05, "learning_rate": 0.009371262513933056, "loss": 2.8618, "step": 1425 }, { "crossentropy": 2.818547487258911, "epoch": 0.12129976182374957, "grad_norm": 0.05438608676195145, "grad_norm_var": 1.532100084112679e-05, "learning_rate": 0.00937028948849033, "loss": 2.8185, "step": 1426 }, { "crossentropy": 2.9327919483184814, "epoch": 0.12138482477033004, "grad_norm": 0.056309495121240616, "grad_norm_var": 1.5546074699314635e-05, "learning_rate": 0.009369315761305038, "loss": 2.9328, "step": 1427 }, { "crossentropy": 2.863231658935547, "epoch": 0.12146988771691052, "grad_norm": 0.05271695554256439, "grad_norm_var": 1.5083910306132725e-05, "learning_rate": 0.009368341332533534, "loss": 2.8632, "step": 1428 }, { "crossentropy": 2.849494218826294, "epoch": 0.12155495066349098, "grad_norm": 0.0565103255212307, "grad_norm_var": 1.686768184403908e-05, "learning_rate": 0.009367366202332283, "loss": 2.8495, "step": 1429 }, { "crossentropy": 2.9212865829467773, "epoch": 0.12164001361007146, "grad_norm": 0.04721897095441818, "grad_norm_var": 1.5212607234729724e-05, "learning_rate": 0.009366390370857862, "loss": 2.9213, "step": 1430 }, { "crossentropy": 2.8815417289733887, "epoch": 0.12172507655665192, "grad_norm": 0.04905502870678902, "grad_norm_var": 1.5580650027147146e-05, "learning_rate": 0.009365413838266964, "loss": 2.8815, "step": 1431 }, { "crossentropy": 2.8836817741394043, "epoch": 0.1218101395032324, "grad_norm": 0.04999997466802597, "grad_norm_var": 1.4687944597017304e-05, "learning_rate": 0.009364436604716388, "loss": 2.8837, "step": 1432 }, { "crossentropy": 2.851404905319214, "epoch": 0.12189520244981286, "grad_norm": 0.04853775352239609, "grad_norm_var": 1.1112680206151227e-05, "learning_rate": 0.009363458670363055, "loss": 2.8514, "step": 1433 }, { "crossentropy": 2.8885250091552734, "epoch": 0.12198026539639333, "grad_norm": 0.050467003136873245, "grad_norm_var": 1.1018697456577122e-05, "learning_rate": 0.009362480035363985, "loss": 2.8885, "step": 1434 }, { "crossentropy": 2.8605926036834717, "epoch": 0.1220653283429738, "grad_norm": 0.05364133045077324, "grad_norm_var": 1.0943526327515605e-05, "learning_rate": 0.009361500699876327, "loss": 2.8606, "step": 1435 }, { "crossentropy": 2.960439682006836, "epoch": 0.12215039128955427, "grad_norm": 0.05404505506157875, "grad_norm_var": 1.1183499680066043e-05, "learning_rate": 0.009360520664057329, "loss": 2.9604, "step": 1436 }, { "crossentropy": 2.9231808185577393, "epoch": 0.12223545423613474, "grad_norm": 0.05786121264100075, "grad_norm_var": 1.2445379462215112e-05, "learning_rate": 0.009359539928064358, "loss": 2.9232, "step": 1437 }, { "crossentropy": 2.9895570278167725, "epoch": 0.12232051718271521, "grad_norm": 0.052003901451826096, "grad_norm_var": 1.0477391890872412e-05, "learning_rate": 0.009358558492054891, "loss": 2.9896, "step": 1438 }, { "crossentropy": 2.96684193611145, "epoch": 0.12240558012929569, "grad_norm": 0.051317960023880005, "grad_norm_var": 1.021168726878559e-05, "learning_rate": 0.00935757635618652, "loss": 2.9668, "step": 1439 }, { "crossentropy": 2.904853582382202, "epoch": 0.12249064307587615, "grad_norm": 0.0475861094892025, "grad_norm_var": 1.1223822358142714e-05, "learning_rate": 0.009356593520616947, "loss": 2.9049, "step": 1440 }, { "crossentropy": 2.9224557876586914, "epoch": 0.12257570602245661, "grad_norm": 0.05077569559216499, "grad_norm_var": 1.0510367291096066e-05, "learning_rate": 0.009355609985503987, "loss": 2.9225, "step": 1441 }, { "crossentropy": 2.966682195663452, "epoch": 0.12266076896903709, "grad_norm": 0.04913708195090294, "grad_norm_var": 1.0581360367129383e-05, "learning_rate": 0.009354625751005567, "loss": 2.9667, "step": 1442 }, { "crossentropy": 2.9122419357299805, "epoch": 0.12274583191561755, "grad_norm": 0.053215596824884415, "grad_norm_var": 9.27769911526251e-06, "learning_rate": 0.009353640817279726, "loss": 2.9122, "step": 1443 }, { "crossentropy": 2.873551607131958, "epoch": 0.12283089486219803, "grad_norm": 0.049562033265829086, "grad_norm_var": 9.39024002865954e-06, "learning_rate": 0.00935265518448462, "loss": 2.8736, "step": 1444 }, { "crossentropy": 2.948695182800293, "epoch": 0.1229159578087785, "grad_norm": 0.05028219148516655, "grad_norm_var": 7.494854324938821e-06, "learning_rate": 0.009351668852778508, "loss": 2.9487, "step": 1445 }, { "crossentropy": 2.889227867126465, "epoch": 0.12300102075535897, "grad_norm": 0.04933306574821472, "grad_norm_var": 6.731178928823245e-06, "learning_rate": 0.00935068182231977, "loss": 2.8892, "step": 1446 }, { "crossentropy": 2.8698811531066895, "epoch": 0.12308608370193944, "grad_norm": 0.04919953644275665, "grad_norm_var": 6.694020297877354e-06, "learning_rate": 0.009349694093266891, "loss": 2.8699, "step": 1447 }, { "crossentropy": 2.972734212875366, "epoch": 0.1231711466485199, "grad_norm": 0.05446409434080124, "grad_norm_var": 7.308394413643206e-06, "learning_rate": 0.009348705665778478, "loss": 2.9727, "step": 1448 }, { "crossentropy": 2.8712048530578613, "epoch": 0.12325620959510038, "grad_norm": 0.06865503638982773, "grad_norm_var": 2.5087723184960032e-05, "learning_rate": 0.009347716540013238, "loss": 2.8712, "step": 1449 }, { "crossentropy": 2.969957113265991, "epoch": 0.12334127254168084, "grad_norm": 0.07335023581981659, "grad_norm_var": 5.1317512518498504e-05, "learning_rate": 0.009346726716129999, "loss": 2.97, "step": 1450 }, { "crossentropy": 2.9607722759246826, "epoch": 0.12342633548826132, "grad_norm": 0.05938323214650154, "grad_norm_var": 5.30829278034319e-05, "learning_rate": 0.009345736194287696, "loss": 2.9608, "step": 1451 }, { "crossentropy": 2.949924945831299, "epoch": 0.12351139843484178, "grad_norm": 0.05121294781565666, "grad_norm_var": 5.3712881962456745e-05, "learning_rate": 0.00934474497464538, "loss": 2.9499, "step": 1452 }, { "crossentropy": 2.9100427627563477, "epoch": 0.12359646138142225, "grad_norm": 0.04949405789375305, "grad_norm_var": 5.401369440805089e-05, "learning_rate": 0.009343753057362214, "loss": 2.91, "step": 1453 }, { "crossentropy": 2.9146130084991455, "epoch": 0.12368152432800272, "grad_norm": 0.05116049200296402, "grad_norm_var": 5.424729017981615e-05, "learning_rate": 0.009342760442597466, "loss": 2.9146, "step": 1454 }, { "crossentropy": 2.7581183910369873, "epoch": 0.12376658727458319, "grad_norm": 0.05642182379961014, "grad_norm_var": 5.429990129536428e-05, "learning_rate": 0.009341767130510529, "loss": 2.7581, "step": 1455 }, { "crossentropy": 2.9315593242645264, "epoch": 0.12385165022116366, "grad_norm": 0.06156153976917267, "grad_norm_var": 5.464465744872179e-05, "learning_rate": 0.009340773121260892, "loss": 2.9316, "step": 1456 }, { "crossentropy": 2.8936614990234375, "epoch": 0.12393671316774413, "grad_norm": 0.04917661100625992, "grad_norm_var": 5.5667947204339026e-05, "learning_rate": 0.009339778415008172, "loss": 2.8937, "step": 1457 }, { "crossentropy": 2.9507951736450195, "epoch": 0.1240217761143246, "grad_norm": 0.04866506904363632, "grad_norm_var": 5.603358556112616e-05, "learning_rate": 0.009338783011912082, "loss": 2.9508, "step": 1458 }, { "crossentropy": 2.8050131797790527, "epoch": 0.12410683906090507, "grad_norm": 0.05219919979572296, "grad_norm_var": 5.629878889087184e-05, "learning_rate": 0.009337786912132462, "loss": 2.805, "step": 1459 }, { "crossentropy": 2.9339141845703125, "epoch": 0.12419190200748553, "grad_norm": 0.046618543565273285, "grad_norm_var": 5.8830307951180694e-05, "learning_rate": 0.009336790115829255, "loss": 2.9339, "step": 1460 }, { "crossentropy": 2.9085633754730225, "epoch": 0.12427696495406601, "grad_norm": 0.047711387276649475, "grad_norm_var": 6.067151031716155e-05, "learning_rate": 0.009335792623162517, "loss": 2.9086, "step": 1461 }, { "crossentropy": 2.956822156906128, "epoch": 0.12436202790064647, "grad_norm": 0.04968205466866493, "grad_norm_var": 6.044856338693749e-05, "learning_rate": 0.009334794434292415, "loss": 2.9568, "step": 1462 }, { "crossentropy": 2.876981258392334, "epoch": 0.12444709084722695, "grad_norm": 0.05341281369328499, "grad_norm_var": 5.868728361092072e-05, "learning_rate": 0.009333795549379232, "loss": 2.877, "step": 1463 }, { "crossentropy": 2.9020297527313232, "epoch": 0.12453215379380742, "grad_norm": 0.045936938375234604, "grad_norm_var": 6.335570959249177e-05, "learning_rate": 0.00933279596858336, "loss": 2.902, "step": 1464 }, { "crossentropy": 2.8038313388824463, "epoch": 0.12461721674038789, "grad_norm": 0.048961788415908813, "grad_norm_var": 4.921936458237199e-05, "learning_rate": 0.009331795692065303, "loss": 2.8038, "step": 1465 }, { "crossentropy": 2.906263828277588, "epoch": 0.12470227968696836, "grad_norm": 0.04595300182700157, "grad_norm_var": 2.1097066654298066e-05, "learning_rate": 0.009330794719985673, "loss": 2.9063, "step": 1466 }, { "crossentropy": 2.8549721240997314, "epoch": 0.12478734263354882, "grad_norm": 0.04426489770412445, "grad_norm_var": 1.8679052220971157e-05, "learning_rate": 0.009329793052505203, "loss": 2.855, "step": 1467 }, { "crossentropy": 2.7737061977386475, "epoch": 0.1248724055801293, "grad_norm": 0.04969353601336479, "grad_norm_var": 1.8608419705885317e-05, "learning_rate": 0.009328790689784726, "loss": 2.7737, "step": 1468 }, { "crossentropy": 2.95794939994812, "epoch": 0.12495746852670976, "grad_norm": 0.050999294966459274, "grad_norm_var": 1.86370248485677e-05, "learning_rate": 0.009327787631985197, "loss": 2.9579, "step": 1469 }, { "crossentropy": 2.8785126209259033, "epoch": 0.12504253147329022, "grad_norm": 0.04769732803106308, "grad_norm_var": 1.8920567060148886e-05, "learning_rate": 0.009326783879267676, "loss": 2.8785, "step": 1470 }, { "crossentropy": 2.934846878051758, "epoch": 0.1251275944198707, "grad_norm": 0.04707736521959305, "grad_norm_var": 1.629555964045015e-05, "learning_rate": 0.009325779431793338, "loss": 2.9348, "step": 1471 }, { "crossentropy": 3.005762815475464, "epoch": 0.12521265736645118, "grad_norm": 0.05889848619699478, "grad_norm_var": 1.2403054899132876e-05, "learning_rate": 0.009324774289723468, "loss": 3.0058, "step": 1472 }, { "crossentropy": 2.8586273193359375, "epoch": 0.12529772031303163, "grad_norm": 0.05098405107855797, "grad_norm_var": 1.260538668204584e-05, "learning_rate": 0.009323768453219462, "loss": 2.8586, "step": 1473 }, { "crossentropy": 3.007476806640625, "epoch": 0.1253827832596121, "grad_norm": 0.04575688764452934, "grad_norm_var": 1.337910862601784e-05, "learning_rate": 0.00932276192244283, "loss": 3.0075, "step": 1474 }, { "crossentropy": 2.91286039352417, "epoch": 0.12546784620619258, "grad_norm": 0.05396522581577301, "grad_norm_var": 1.4300162092728553e-05, "learning_rate": 0.009321754697555189, "loss": 2.9129, "step": 1475 }, { "crossentropy": 2.9468531608581543, "epoch": 0.12555290915277306, "grad_norm": 0.057771917432546616, "grad_norm_var": 1.8197652875496553e-05, "learning_rate": 0.009320746778718273, "loss": 2.9469, "step": 1476 }, { "crossentropy": 2.8551995754241943, "epoch": 0.1256379720993535, "grad_norm": 0.04856756329536438, "grad_norm_var": 1.7991004392318006e-05, "learning_rate": 0.009319738166093924, "loss": 2.8552, "step": 1477 }, { "crossentropy": 2.8149619102478027, "epoch": 0.125723035045934, "grad_norm": 0.051776450127363205, "grad_norm_var": 1.8182950330131117e-05, "learning_rate": 0.009318728859844097, "loss": 2.815, "step": 1478 }, { "crossentropy": 2.781432867050171, "epoch": 0.12580809799251447, "grad_norm": 0.0581122450530529, "grad_norm_var": 2.1634416693831702e-05, "learning_rate": 0.009317718860130855, "loss": 2.7814, "step": 1479 }, { "crossentropy": 2.900559186935425, "epoch": 0.12589316093909492, "grad_norm": 0.060433849692344666, "grad_norm_var": 2.6140645267752385e-05, "learning_rate": 0.009316708167116376, "loss": 2.9006, "step": 1480 }, { "crossentropy": 2.770901679992676, "epoch": 0.1259782238856754, "grad_norm": 0.045282989740371704, "grad_norm_var": 2.8136892146810728e-05, "learning_rate": 0.00931569678096295, "loss": 2.7709, "step": 1481 }, { "crossentropy": 2.9568960666656494, "epoch": 0.12606328683225587, "grad_norm": 0.04708501324057579, "grad_norm_var": 2.7443563692073485e-05, "learning_rate": 0.009314684701832974, "loss": 2.9569, "step": 1482 }, { "crossentropy": 2.8612964153289795, "epoch": 0.12614834977883635, "grad_norm": 0.04429792985320091, "grad_norm_var": 2.7413316979830032e-05, "learning_rate": 0.00931367192988896, "loss": 2.8613, "step": 1483 }, { "crossentropy": 2.8233695030212402, "epoch": 0.1262334127254168, "grad_norm": 0.046072542667388916, "grad_norm_var": 2.8935975076495542e-05, "learning_rate": 0.009312658465293529, "loss": 2.8234, "step": 1484 }, { "crossentropy": 2.952012777328491, "epoch": 0.12631847567199728, "grad_norm": 0.04392097145318985, "grad_norm_var": 3.1996043275018575e-05, "learning_rate": 0.009311644308209415, "loss": 2.952, "step": 1485 }, { "crossentropy": 2.850823402404785, "epoch": 0.12640353861857775, "grad_norm": 0.05069383233785629, "grad_norm_var": 3.1444941541420614e-05, "learning_rate": 0.009310629458799463, "loss": 2.8508, "step": 1486 }, { "crossentropy": 2.8410704135894775, "epoch": 0.1264886015651582, "grad_norm": 0.04900695011019707, "grad_norm_var": 3.075370649631495e-05, "learning_rate": 0.009309613917226628, "loss": 2.8411, "step": 1487 }, { "crossentropy": 2.794081687927246, "epoch": 0.12657366451173868, "grad_norm": 0.04758186265826225, "grad_norm_var": 2.652183655794249e-05, "learning_rate": 0.009308597683653975, "loss": 2.7941, "step": 1488 }, { "crossentropy": 2.882148504257202, "epoch": 0.12665872745831916, "grad_norm": 0.04322453588247299, "grad_norm_var": 2.935159136184968e-05, "learning_rate": 0.009307580758244685, "loss": 2.8821, "step": 1489 }, { "crossentropy": 2.9253129959106445, "epoch": 0.12674379040489964, "grad_norm": 0.04683474823832512, "grad_norm_var": 2.887233313189422e-05, "learning_rate": 0.009306563141162046, "loss": 2.9253, "step": 1490 }, { "crossentropy": 2.891301155090332, "epoch": 0.1268288533514801, "grad_norm": 0.06547577679157257, "grad_norm_var": 4.375395210036691e-05, "learning_rate": 0.009305544832569455, "loss": 2.8913, "step": 1491 }, { "crossentropy": 2.8653764724731445, "epoch": 0.12691391629806056, "grad_norm": 0.05554939806461334, "grad_norm_var": 4.187328204639512e-05, "learning_rate": 0.009304525832630426, "loss": 2.8654, "step": 1492 }, { "crossentropy": 2.889861583709717, "epoch": 0.12699897924464104, "grad_norm": 0.06132255494594574, "grad_norm_var": 4.918899157350798e-05, "learning_rate": 0.009303506141508582, "loss": 2.8899, "step": 1493 }, { "crossentropy": 2.921264171600342, "epoch": 0.1270840421912215, "grad_norm": 0.051585663110017776, "grad_norm_var": 4.9172582858890367e-05, "learning_rate": 0.009302485759367653, "loss": 2.9213, "step": 1494 }, { "crossentropy": 2.813857078552246, "epoch": 0.12716910513780197, "grad_norm": 0.04586915671825409, "grad_norm_var": 4.6979856501999416e-05, "learning_rate": 0.009301464686371485, "loss": 2.8139, "step": 1495 }, { "crossentropy": 2.8525588512420654, "epoch": 0.12725416808438245, "grad_norm": 0.04819757863879204, "grad_norm_var": 3.974701843514472e-05, "learning_rate": 0.00930044292268403, "loss": 2.8526, "step": 1496 }, { "crossentropy": 2.996642589569092, "epoch": 0.12733923103096292, "grad_norm": 0.04488573968410492, "grad_norm_var": 3.9980247399344114e-05, "learning_rate": 0.00929942046846936, "loss": 2.9966, "step": 1497 }, { "crossentropy": 2.8872249126434326, "epoch": 0.12742429397754337, "grad_norm": 0.04910539463162422, "grad_norm_var": 3.9591472444029144e-05, "learning_rate": 0.009298397323891646, "loss": 2.8872, "step": 1498 }, { "crossentropy": 2.9852654933929443, "epoch": 0.12750935692412385, "grad_norm": 0.048375196754932404, "grad_norm_var": 3.774724795913171e-05, "learning_rate": 0.009297373489115177, "loss": 2.9853, "step": 1499 }, { "crossentropy": 2.87644100189209, "epoch": 0.12759441987070433, "grad_norm": 0.04499009996652603, "grad_norm_var": 3.8366581444176e-05, "learning_rate": 0.009296348964304351, "loss": 2.8764, "step": 1500 }, { "crossentropy": 2.7231428623199463, "epoch": 0.12767948281728478, "grad_norm": 0.047015268355607986, "grad_norm_var": 3.654412592051231e-05, "learning_rate": 0.009295323749623679, "loss": 2.7231, "step": 1501 }, { "crossentropy": 2.9324560165405273, "epoch": 0.12776454576386526, "grad_norm": 0.048867885023355484, "grad_norm_var": 3.6579230432353994e-05, "learning_rate": 0.009294297845237777, "loss": 2.9325, "step": 1502 }, { "crossentropy": 2.849114418029785, "epoch": 0.12784960871044573, "grad_norm": 0.04735590144991875, "grad_norm_var": 3.693915178618589e-05, "learning_rate": 0.009293271251311382, "loss": 2.8491, "step": 1503 }, { "crossentropy": 2.8363301753997803, "epoch": 0.1279346716570262, "grad_norm": 0.05077114701271057, "grad_norm_var": 3.664660602189316e-05, "learning_rate": 0.009292243968009331, "loss": 2.8363, "step": 1504 }, { "crossentropy": 2.8709592819213867, "epoch": 0.12801973460360666, "grad_norm": 0.056804753839969635, "grad_norm_var": 3.596965066338242e-05, "learning_rate": 0.009291215995496576, "loss": 2.871, "step": 1505 }, { "crossentropy": 2.8691022396087646, "epoch": 0.12810479755018714, "grad_norm": 0.05156637728214264, "grad_norm_var": 3.4859173855588305e-05, "learning_rate": 0.009290187333938181, "loss": 2.8691, "step": 1506 }, { "crossentropy": 2.799976348876953, "epoch": 0.12818986049676762, "grad_norm": 0.05203492194414139, "grad_norm_var": 2.040262440488952e-05, "learning_rate": 0.00928915798349932, "loss": 2.8, "step": 1507 }, { "crossentropy": 2.7405104637145996, "epoch": 0.12827492344334807, "grad_norm": 0.052457187324762344, "grad_norm_var": 1.882297496316922e-05, "learning_rate": 0.009288127944345274, "loss": 2.7405, "step": 1508 }, { "crossentropy": 2.8526687622070312, "epoch": 0.12835998638992854, "grad_norm": 0.05274369567632675, "grad_norm_var": 1.0557629538625728e-05, "learning_rate": 0.009287097216641442, "loss": 2.8527, "step": 1509 }, { "crossentropy": 2.9407033920288086, "epoch": 0.12844504933650902, "grad_norm": 0.048894088715314865, "grad_norm_var": 1.0275959992888234e-05, "learning_rate": 0.009286065800553326, "loss": 2.9407, "step": 1510 }, { "crossentropy": 2.861304521560669, "epoch": 0.1285301122830895, "grad_norm": 0.04874955117702484, "grad_norm_var": 9.449648582712741e-06, "learning_rate": 0.009285033696246543, "loss": 2.8613, "step": 1511 }, { "crossentropy": 2.868190288543701, "epoch": 0.12861517522966995, "grad_norm": 0.051462866365909576, "grad_norm_var": 9.526821717930946e-06, "learning_rate": 0.009284000903886818, "loss": 2.8682, "step": 1512 }, { "crossentropy": 2.972404956817627, "epoch": 0.12870023817625043, "grad_norm": 0.048571985214948654, "grad_norm_var": 7.982856246122454e-06, "learning_rate": 0.009282967423639989, "loss": 2.9724, "step": 1513 }, { "crossentropy": 2.888139009475708, "epoch": 0.1287853011228309, "grad_norm": 0.047452785074710846, "grad_norm_var": 8.347457422154768e-06, "learning_rate": 0.009281933255672002, "loss": 2.8881, "step": 1514 }, { "crossentropy": 2.826382875442505, "epoch": 0.12887036406941135, "grad_norm": 0.04434165358543396, "grad_norm_var": 1.0174724304270096e-05, "learning_rate": 0.009280898400148917, "loss": 2.8264, "step": 1515 }, { "crossentropy": 2.9150919914245605, "epoch": 0.12895542701599183, "grad_norm": 0.04251595214009285, "grad_norm_var": 1.2087955545973747e-05, "learning_rate": 0.009279862857236898, "loss": 2.9151, "step": 1516 }, { "crossentropy": 2.84213924407959, "epoch": 0.1290404899625723, "grad_norm": 0.04475012049078941, "grad_norm_var": 1.3151637563403537e-05, "learning_rate": 0.009278826627102226, "loss": 2.8421, "step": 1517 }, { "crossentropy": 2.875241756439209, "epoch": 0.12912555290915279, "grad_norm": 0.045282650738954544, "grad_norm_var": 1.4177730691710261e-05, "learning_rate": 0.009277789709911291, "loss": 2.8752, "step": 1518 }, { "crossentropy": 2.860300064086914, "epoch": 0.12921061585573324, "grad_norm": 0.050760459154844284, "grad_norm_var": 1.4106035471477391e-05, "learning_rate": 0.00927675210583059, "loss": 2.8603, "step": 1519 }, { "crossentropy": 2.906069278717041, "epoch": 0.1292956788023137, "grad_norm": 0.051336050033569336, "grad_norm_var": 1.4235091940456521e-05, "learning_rate": 0.009275713815026732, "loss": 2.9061, "step": 1520 }, { "crossentropy": 2.898005962371826, "epoch": 0.1293807417488942, "grad_norm": 0.04550383239984512, "grad_norm_var": 1.099605462129462e-05, "learning_rate": 0.009274674837666436, "loss": 2.898, "step": 1521 }, { "crossentropy": 2.8529372215270996, "epoch": 0.12946580469547464, "grad_norm": 0.04977938160300255, "grad_norm_var": 1.0501125444734952e-05, "learning_rate": 0.009273635173916535, "loss": 2.8529, "step": 1522 }, { "crossentropy": 2.882462978363037, "epoch": 0.12955086764205512, "grad_norm": 0.07411891967058182, "grad_norm_var": 5.127399194323386e-05, "learning_rate": 0.009272594823943965, "loss": 2.8825, "step": 1523 }, { "crossentropy": 2.79805588722229, "epoch": 0.1296359305886356, "grad_norm": 0.04881895333528519, "grad_norm_var": 5.087054020510336e-05, "learning_rate": 0.00927155378791578, "loss": 2.7981, "step": 1524 }, { "crossentropy": 2.8837058544158936, "epoch": 0.12972099353521607, "grad_norm": 0.04739717021584511, "grad_norm_var": 5.048214879163441e-05, "learning_rate": 0.009270512065999137, "loss": 2.8837, "step": 1525 }, { "crossentropy": 2.849277973175049, "epoch": 0.12980605648179652, "grad_norm": 0.043093305081129074, "grad_norm_var": 5.2944430514119864e-05, "learning_rate": 0.009269469658361309, "loss": 2.8493, "step": 1526 }, { "crossentropy": 2.8896682262420654, "epoch": 0.129891119428377, "grad_norm": 0.04811056703329086, "grad_norm_var": 5.299094429204893e-05, "learning_rate": 0.009268426565169676, "loss": 2.8897, "step": 1527 }, { "crossentropy": 2.934138059616089, "epoch": 0.12997618237495748, "grad_norm": 0.053380660712718964, "grad_norm_var": 5.386182536974773e-05, "learning_rate": 0.009267382786591729, "loss": 2.9341, "step": 1528 }, { "crossentropy": 2.831573247909546, "epoch": 0.13006124532153793, "grad_norm": 0.05053997039794922, "grad_norm_var": 5.3971658718367174e-05, "learning_rate": 0.00926633832279507, "loss": 2.8316, "step": 1529 }, { "crossentropy": 2.7883448600769043, "epoch": 0.1301463082681184, "grad_norm": 0.047142788767814636, "grad_norm_var": 5.4049836797502523e-05, "learning_rate": 0.009265293173947404, "loss": 2.7883, "step": 1530 }, { "crossentropy": 2.925487518310547, "epoch": 0.13023137121469888, "grad_norm": 0.04952319338917732, "grad_norm_var": 5.238550766840817e-05, "learning_rate": 0.009264247340216558, "loss": 2.9255, "step": 1531 }, { "crossentropy": 2.8990869522094727, "epoch": 0.13031643416127936, "grad_norm": 0.05015749856829643, "grad_norm_var": 4.89157907453001e-05, "learning_rate": 0.009263200821770462, "loss": 2.8991, "step": 1532 }, { "crossentropy": 2.9149465560913086, "epoch": 0.1304014971078598, "grad_norm": 0.050774045288562775, "grad_norm_var": 4.698240416214402e-05, "learning_rate": 0.009262153618777153, "loss": 2.9149, "step": 1533 }, { "crossentropy": 2.734449863433838, "epoch": 0.1304865600544403, "grad_norm": 0.04184161499142647, "grad_norm_var": 5.005079875513244e-05, "learning_rate": 0.009261105731404786, "loss": 2.7344, "step": 1534 }, { "crossentropy": 2.9802181720733643, "epoch": 0.13057162300102076, "grad_norm": 0.06872299313545227, "grad_norm_var": 7.16968408258578e-05, "learning_rate": 0.009260057159821618, "loss": 2.9802, "step": 1535 }, { "crossentropy": 2.7688345909118652, "epoch": 0.13065668594760121, "grad_norm": 0.061302121728658676, "grad_norm_var": 7.799883590255217e-05, "learning_rate": 0.009259007904196022, "loss": 2.7688, "step": 1536 }, { "crossentropy": 2.8906989097595215, "epoch": 0.1307417488941817, "grad_norm": 0.0525086410343647, "grad_norm_var": 7.51029526797784e-05, "learning_rate": 0.009257957964696474, "loss": 2.8907, "step": 1537 }, { "crossentropy": 2.9120917320251465, "epoch": 0.13082681184076217, "grad_norm": 0.050543490797281265, "grad_norm_var": 7.488001808931e-05, "learning_rate": 0.009256907341491571, "loss": 2.9121, "step": 1538 }, { "crossentropy": 2.9104368686676025, "epoch": 0.13091187478734265, "grad_norm": 0.05128707364201546, "grad_norm_var": 4.126242029422831e-05, "learning_rate": 0.009255856034750008, "loss": 2.9104, "step": 1539 }, { "crossentropy": 2.86472487449646, "epoch": 0.1309969377339231, "grad_norm": 0.05857587605714798, "grad_norm_var": 4.444448498934211e-05, "learning_rate": 0.009254804044640595, "loss": 2.8647, "step": 1540 }, { "crossentropy": 2.8881947994232178, "epoch": 0.13108200068050357, "grad_norm": 0.0539376363158226, "grad_norm_var": 4.349105990016829e-05, "learning_rate": 0.009253751371332252, "loss": 2.8882, "step": 1541 }, { "crossentropy": 2.8211452960968018, "epoch": 0.13116706362708405, "grad_norm": 0.059399183839559555, "grad_norm_var": 4.0820360109750097e-05, "learning_rate": 0.009252698014994008, "loss": 2.8211, "step": 1542 }, { "crossentropy": 2.9270856380462646, "epoch": 0.1312521265736645, "grad_norm": 0.05462120473384857, "grad_norm_var": 3.923889906223624e-05, "learning_rate": 0.009251643975795, "loss": 2.9271, "step": 1543 }, { "crossentropy": 2.8507659435272217, "epoch": 0.13133718952024498, "grad_norm": 0.05429691821336746, "grad_norm_var": 3.929009120303672e-05, "learning_rate": 0.00925058925390448, "loss": 2.8508, "step": 1544 }, { "crossentropy": 2.881539821624756, "epoch": 0.13142225246682546, "grad_norm": 0.04964454472064972, "grad_norm_var": 3.968743945978806e-05, "learning_rate": 0.009249533849491802, "loss": 2.8815, "step": 1545 }, { "crossentropy": 2.916851282119751, "epoch": 0.13150731541340593, "grad_norm": 0.04568463936448097, "grad_norm_var": 4.103538102550545e-05, "learning_rate": 0.009248477762726437, "loss": 2.9169, "step": 1546 }, { "crossentropy": 2.848252534866333, "epoch": 0.13159237835998638, "grad_norm": 0.04450102895498276, "grad_norm_var": 4.514166232359174e-05, "learning_rate": 0.00924742099377796, "loss": 2.8483, "step": 1547 }, { "crossentropy": 2.846944808959961, "epoch": 0.13167744130656686, "grad_norm": 0.047885410487651825, "grad_norm_var": 4.632161815628453e-05, "learning_rate": 0.009246363542816056, "loss": 2.8469, "step": 1548 }, { "crossentropy": 2.9062321186065674, "epoch": 0.13176250425314734, "grad_norm": 0.04369811341166496, "grad_norm_var": 5.1405155586714546e-05, "learning_rate": 0.009245305410010524, "loss": 2.9062, "step": 1549 }, { "crossentropy": 2.956315040588379, "epoch": 0.1318475671997278, "grad_norm": 0.05927007272839546, "grad_norm_var": 4.584675296307461e-05, "learning_rate": 0.009244246595531271, "loss": 2.9563, "step": 1550 }, { "crossentropy": 2.8136091232299805, "epoch": 0.13193263014630827, "grad_norm": 0.05355982482433319, "grad_norm_var": 2.9424387306948667e-05, "learning_rate": 0.009243187099548309, "loss": 2.8136, "step": 1551 }, { "crossentropy": 2.9469988346099854, "epoch": 0.13201769309288874, "grad_norm": 0.05399616062641144, "grad_norm_var": 2.4229638397082974e-05, "learning_rate": 0.009242126922231763, "loss": 2.947, "step": 1552 }, { "crossentropy": 2.875734806060791, "epoch": 0.1321027560394692, "grad_norm": 0.055581752210855484, "grad_norm_var": 2.4992199452056735e-05, "learning_rate": 0.009241066063751868, "loss": 2.8757, "step": 1553 }, { "crossentropy": 2.829397201538086, "epoch": 0.13218781898604967, "grad_norm": 0.052677053958177567, "grad_norm_var": 2.478265946768045e-05, "learning_rate": 0.009240004524278965, "loss": 2.8294, "step": 1554 }, { "crossentropy": 2.8896803855895996, "epoch": 0.13227288193263015, "grad_norm": 0.042247116565704346, "grad_norm_var": 3.1247960975356684e-05, "learning_rate": 0.00923894230398351, "loss": 2.8897, "step": 1555 }, { "crossentropy": 2.9889068603515625, "epoch": 0.13235794487921063, "grad_norm": 0.048011843115091324, "grad_norm_var": 2.874716977976499e-05, "learning_rate": 0.009237879403036062, "loss": 2.9889, "step": 1556 }, { "crossentropy": 2.875342845916748, "epoch": 0.13244300782579108, "grad_norm": 0.05354492366313934, "grad_norm_var": 2.8612847875335706e-05, "learning_rate": 0.009236815821607295, "loss": 2.8753, "step": 1557 }, { "crossentropy": 2.8500890731811523, "epoch": 0.13252807077237155, "grad_norm": 0.051447171717882156, "grad_norm_var": 2.383322080633145e-05, "learning_rate": 0.009235751559867987, "loss": 2.8501, "step": 1558 }, { "crossentropy": 2.86160945892334, "epoch": 0.13261313371895203, "grad_norm": 0.04734190180897713, "grad_norm_var": 2.3306883255562e-05, "learning_rate": 0.00923468661798903, "loss": 2.8616, "step": 1559 }, { "crossentropy": 2.7908520698547363, "epoch": 0.13269819666553248, "grad_norm": 0.05095568299293518, "grad_norm_var": 2.2184702930584897e-05, "learning_rate": 0.00923362099614142, "loss": 2.7909, "step": 1560 }, { "crossentropy": 2.921064615249634, "epoch": 0.13278325961211296, "grad_norm": 0.04606715589761734, "grad_norm_var": 2.3155515087213328e-05, "learning_rate": 0.009232554694496268, "loss": 2.9211, "step": 1561 }, { "crossentropy": 2.857104539871216, "epoch": 0.13286832255869344, "grad_norm": 0.048155996948480606, "grad_norm_var": 2.2187969518734178e-05, "learning_rate": 0.009231487713224792, "loss": 2.8571, "step": 1562 }, { "crossentropy": 2.909672975540161, "epoch": 0.1329533855052739, "grad_norm": 0.047834768891334534, "grad_norm_var": 2.046771256388353e-05, "learning_rate": 0.009230420052498316, "loss": 2.9097, "step": 1563 }, { "crossentropy": 2.9207677841186523, "epoch": 0.13303844845185436, "grad_norm": 0.052940208464860916, "grad_norm_var": 2.0543644171387188e-05, "learning_rate": 0.009229351712488275, "loss": 2.9208, "step": 1564 }, { "crossentropy": 2.83799147605896, "epoch": 0.13312351139843484, "grad_norm": 0.04914852976799011, "grad_norm_var": 1.748769516475082e-05, "learning_rate": 0.009228282693366216, "loss": 2.838, "step": 1565 }, { "crossentropy": 2.7971408367156982, "epoch": 0.13320857434501532, "grad_norm": 0.0474369116127491, "grad_norm_var": 1.2873522501307087e-05, "learning_rate": 0.009227212995303791, "loss": 2.7971, "step": 1566 }, { "crossentropy": 2.862630844116211, "epoch": 0.13329363729159577, "grad_norm": 0.04519575089216232, "grad_norm_var": 1.3341935759414447e-05, "learning_rate": 0.009226142618472764, "loss": 2.8626, "step": 1567 }, { "crossentropy": 2.7716965675354004, "epoch": 0.13337870023817625, "grad_norm": 0.04981393367052078, "grad_norm_var": 1.1948245614012743e-05, "learning_rate": 0.009225071563045006, "loss": 2.7717, "step": 1568 }, { "crossentropy": 2.80894136428833, "epoch": 0.13346376318475672, "grad_norm": 0.04925250634551048, "grad_norm_var": 9.1297275121055e-06, "learning_rate": 0.009223999829192498, "loss": 2.8089, "step": 1569 }, { "crossentropy": 2.864107131958008, "epoch": 0.1335488261313372, "grad_norm": 0.047615181654691696, "grad_norm_var": 8.168082935623942e-06, "learning_rate": 0.009222927417087327, "loss": 2.8641, "step": 1570 }, { "crossentropy": 2.8990211486816406, "epoch": 0.13363388907791765, "grad_norm": 0.04842451959848404, "grad_norm_var": 5.35091966018438e-06, "learning_rate": 0.009221854326901696, "loss": 2.899, "step": 1571 }, { "crossentropy": 2.927493095397949, "epoch": 0.13371895202449813, "grad_norm": 0.05071238800883293, "grad_norm_var": 5.469216793004226e-06, "learning_rate": 0.00922078055880791, "loss": 2.9275, "step": 1572 }, { "crossentropy": 2.9378273487091064, "epoch": 0.1338040149710786, "grad_norm": 0.050385139882564545, "grad_norm_var": 4.228136185931616e-06, "learning_rate": 0.009219706112978385, "loss": 2.9378, "step": 1573 }, { "crossentropy": 2.891566753387451, "epoch": 0.13388907791765906, "grad_norm": 0.05703580379486084, "grad_norm_var": 8.062950260292502e-06, "learning_rate": 0.009218630989585646, "loss": 2.8916, "step": 1574 }, { "crossentropy": 2.896285057067871, "epoch": 0.13397414086423953, "grad_norm": 0.051526959985494614, "grad_norm_var": 8.081852251142809e-06, "learning_rate": 0.009217555188802327, "loss": 2.8963, "step": 1575 }, { "crossentropy": 2.8144428730010986, "epoch": 0.13405920381082, "grad_norm": 0.05190293490886688, "grad_norm_var": 8.317827534582746e-06, "learning_rate": 0.00921647871080117, "loss": 2.8144, "step": 1576 }, { "crossentropy": 2.849421262741089, "epoch": 0.1341442667574005, "grad_norm": 0.046781715005636215, "grad_norm_var": 8.014050577988495e-06, "learning_rate": 0.009215401555755029, "loss": 2.8494, "step": 1577 }, { "crossentropy": 2.8838794231414795, "epoch": 0.13422932970398094, "grad_norm": 0.04888622462749481, "grad_norm_var": 7.903356637980046e-06, "learning_rate": 0.00921432372383686, "loss": 2.8839, "step": 1578 }, { "crossentropy": 2.8587889671325684, "epoch": 0.13431439265056141, "grad_norm": 0.05506545677781105, "grad_norm_var": 9.391250491676504e-06, "learning_rate": 0.009213245215219736, "loss": 2.8588, "step": 1579 }, { "crossentropy": 2.919635772705078, "epoch": 0.1343994555971419, "grad_norm": 0.05509587377309799, "grad_norm_var": 1.04886037775031e-05, "learning_rate": 0.009212166030076832, "loss": 2.9196, "step": 1580 }, { "crossentropy": 2.930546998977661, "epoch": 0.13448451854372234, "grad_norm": 0.046414438635110855, "grad_norm_var": 1.1363718851616541e-05, "learning_rate": 0.009211086168581432, "loss": 2.9305, "step": 1581 }, { "crossentropy": 2.9304537773132324, "epoch": 0.13456958149030282, "grad_norm": 0.04865865781903267, "grad_norm_var": 1.1023747028756267e-05, "learning_rate": 0.009210005630906936, "loss": 2.9305, "step": 1582 }, { "crossentropy": 2.9948997497558594, "epoch": 0.1346546444368833, "grad_norm": 0.051592156291007996, "grad_norm_var": 9.336032511077462e-06, "learning_rate": 0.009208924417226842, "loss": 2.9949, "step": 1583 }, { "crossentropy": 2.862055540084839, "epoch": 0.13473970738346377, "grad_norm": 0.05107823386788368, "grad_norm_var": 9.308020880611072e-06, "learning_rate": 0.009207842527714767, "loss": 2.8621, "step": 1584 }, { "crossentropy": 2.96530818939209, "epoch": 0.13482477033004422, "grad_norm": 0.06164531409740448, "grad_norm_var": 1.6594782058541174e-05, "learning_rate": 0.009206759962544426, "loss": 2.9653, "step": 1585 }, { "crossentropy": 2.9310638904571533, "epoch": 0.1349098332766247, "grad_norm": 0.06199732422828674, "grad_norm_var": 2.2214361816063192e-05, "learning_rate": 0.00920567672188965, "loss": 2.9311, "step": 1586 }, { "crossentropy": 2.8720755577087402, "epoch": 0.13499489622320518, "grad_norm": 0.053503502160310745, "grad_norm_var": 2.1185086593437236e-05, "learning_rate": 0.009204592805924377, "loss": 2.8721, "step": 1587 }, { "crossentropy": 2.842008590698242, "epoch": 0.13507995916978563, "grad_norm": 0.04827895015478134, "grad_norm_var": 2.2181471910284223e-05, "learning_rate": 0.009203508214822652, "loss": 2.842, "step": 1588 }, { "crossentropy": 2.8526034355163574, "epoch": 0.1351650221163661, "grad_norm": 0.04855674132704735, "grad_norm_var": 2.290368076231485e-05, "learning_rate": 0.009202422948758627, "loss": 2.8526, "step": 1589 }, { "crossentropy": 2.899157762527466, "epoch": 0.13525008506294658, "grad_norm": 0.04925616458058357, "grad_norm_var": 2.1853087218942113e-05, "learning_rate": 0.00920133700790657, "loss": 2.8992, "step": 1590 }, { "crossentropy": 2.8394789695739746, "epoch": 0.13533514800952706, "grad_norm": 0.06428395956754684, "grad_norm_var": 3.140682635990114e-05, "learning_rate": 0.009200250392440846, "loss": 2.8395, "step": 1591 }, { "crossentropy": 2.8323497772216797, "epoch": 0.1354202109561075, "grad_norm": 0.04893798008561134, "grad_norm_var": 3.226636306819791e-05, "learning_rate": 0.009199163102535936, "loss": 2.8323, "step": 1592 }, { "crossentropy": 2.860635280609131, "epoch": 0.135505273902688, "grad_norm": 0.05314652621746063, "grad_norm_var": 2.994378146269257e-05, "learning_rate": 0.009198075138366428, "loss": 2.8606, "step": 1593 }, { "crossentropy": 2.8676681518554688, "epoch": 0.13559033684926847, "grad_norm": 0.04826277121901512, "grad_norm_var": 3.030171547207059e-05, "learning_rate": 0.009196986500107019, "loss": 2.8677, "step": 1594 }, { "crossentropy": 2.9725213050842285, "epoch": 0.13567539979584892, "grad_norm": 0.05626749247312546, "grad_norm_var": 3.074535204184529e-05, "learning_rate": 0.009195897187932511, "loss": 2.9725, "step": 1595 }, { "crossentropy": 2.8741185665130615, "epoch": 0.1357604627424294, "grad_norm": 0.052282679826021194, "grad_norm_var": 3.042983032112741e-05, "learning_rate": 0.009194807202017817, "loss": 2.8741, "step": 1596 }, { "crossentropy": 2.9164652824401855, "epoch": 0.13584552568900987, "grad_norm": 0.05119001120328903, "grad_norm_var": 2.7814604204704865e-05, "learning_rate": 0.009193716542537955, "loss": 2.9165, "step": 1597 }, { "crossentropy": 2.8066256046295166, "epoch": 0.13593058863559035, "grad_norm": 0.051562562584877014, "grad_norm_var": 2.6638022957594225e-05, "learning_rate": 0.009192625209668057, "loss": 2.8066, "step": 1598 }, { "crossentropy": 2.9397263526916504, "epoch": 0.1360156515821708, "grad_norm": 0.05719444528222084, "grad_norm_var": 2.7368622049756912e-05, "learning_rate": 0.009191533203583358, "loss": 2.9397, "step": 1599 }, { "crossentropy": 2.8954005241394043, "epoch": 0.13610071452875128, "grad_norm": 0.048163801431655884, "grad_norm_var": 2.887565466784299e-05, "learning_rate": 0.009190440524459203, "loss": 2.8954, "step": 1600 }, { "crossentropy": 2.871318817138672, "epoch": 0.13618577747533175, "grad_norm": 0.04851511865854263, "grad_norm_var": 2.5230019432086596e-05, "learning_rate": 0.009189347172471045, "loss": 2.8713, "step": 1601 }, { "crossentropy": 2.8540096282958984, "epoch": 0.1362708404219122, "grad_norm": 0.05201677605509758, "grad_norm_var": 1.893370367370602e-05, "learning_rate": 0.009188253147794444, "loss": 2.854, "step": 1602 }, { "crossentropy": 2.9472947120666504, "epoch": 0.13635590336849268, "grad_norm": 0.049024492502212524, "grad_norm_var": 1.9267987867074784e-05, "learning_rate": 0.009187158450605068, "loss": 2.9473, "step": 1603 }, { "crossentropy": 2.8437154293060303, "epoch": 0.13644096631507316, "grad_norm": 0.056520771235227585, "grad_norm_var": 1.977186447054413e-05, "learning_rate": 0.009186063081078697, "loss": 2.8437, "step": 1604 }, { "crossentropy": 2.848081111907959, "epoch": 0.13652602926165364, "grad_norm": 0.06740574538707733, "grad_norm_var": 3.282371556124018e-05, "learning_rate": 0.009184967039391213, "loss": 2.8481, "step": 1605 }, { "crossentropy": 2.816833972930908, "epoch": 0.1366110922082341, "grad_norm": 0.06146969273686409, "grad_norm_var": 3.543627026572991e-05, "learning_rate": 0.009183870325718609, "loss": 2.8168, "step": 1606 }, { "crossentropy": 2.900536298751831, "epoch": 0.13669615515481456, "grad_norm": 0.05429602414369583, "grad_norm_var": 2.8162638615869946e-05, "learning_rate": 0.009182772940236986, "loss": 2.9005, "step": 1607 }, { "crossentropy": 2.9581267833709717, "epoch": 0.13678121810139504, "grad_norm": 0.04959390312433243, "grad_norm_var": 2.7789146270480968e-05, "learning_rate": 0.009181674883122553, "loss": 2.9581, "step": 1608 }, { "crossentropy": 2.871730327606201, "epoch": 0.1368662810479755, "grad_norm": 0.04513569548726082, "grad_norm_var": 3.2238470143230016e-05, "learning_rate": 0.009180576154551628, "loss": 2.8717, "step": 1609 }, { "crossentropy": 2.931669235229492, "epoch": 0.13695134399455597, "grad_norm": 0.0470849834382534, "grad_norm_var": 3.3077948711223285e-05, "learning_rate": 0.00917947675470063, "loss": 2.9317, "step": 1610 }, { "crossentropy": 2.9623544216156006, "epoch": 0.13703640694113645, "grad_norm": 0.05088050663471222, "grad_norm_var": 3.253236869236494e-05, "learning_rate": 0.009178376683746095, "loss": 2.9624, "step": 1611 }, { "crossentropy": 2.83005690574646, "epoch": 0.13712146988771692, "grad_norm": 0.07496926188468933, "grad_norm_var": 6.360070377209129e-05, "learning_rate": 0.009177275941864663, "loss": 2.8301, "step": 1612 }, { "crossentropy": 2.8025171756744385, "epoch": 0.13720653283429737, "grad_norm": 0.05080731213092804, "grad_norm_var": 6.375650648218928e-05, "learning_rate": 0.00917617452923308, "loss": 2.8025, "step": 1613 }, { "crossentropy": 2.863929510116577, "epoch": 0.13729159578087785, "grad_norm": 0.050911881029605865, "grad_norm_var": 6.399791044006213e-05, "learning_rate": 0.0091750724460282, "loss": 2.8639, "step": 1614 }, { "crossentropy": 2.9291698932647705, "epoch": 0.13737665872745833, "grad_norm": 0.04402412474155426, "grad_norm_var": 6.922835954381769e-05, "learning_rate": 0.00917396969242699, "loss": 2.9292, "step": 1615 }, { "crossentropy": 2.8466577529907227, "epoch": 0.13746172167403878, "grad_norm": 0.05245201289653778, "grad_norm_var": 6.751172839769322e-05, "learning_rate": 0.009172866268606514, "loss": 2.8467, "step": 1616 }, { "crossentropy": 2.825037956237793, "epoch": 0.13754678462061926, "grad_norm": 0.05242687091231346, "grad_norm_var": 6.58972093216803e-05, "learning_rate": 0.009171762174743954, "loss": 2.825, "step": 1617 }, { "crossentropy": 2.926004648208618, "epoch": 0.13763184756719973, "grad_norm": 0.06296151876449585, "grad_norm_var": 7.094400647393255e-05, "learning_rate": 0.009170657411016596, "loss": 2.926, "step": 1618 }, { "crossentropy": 2.9201900959014893, "epoch": 0.1377169105137802, "grad_norm": 0.0511426217854023, "grad_norm_var": 6.971395677456763e-05, "learning_rate": 0.009169551977601834, "loss": 2.9202, "step": 1619 }, { "crossentropy": 2.895554304122925, "epoch": 0.13780197346036066, "grad_norm": 0.047179851680994034, "grad_norm_var": 7.26569289359232e-05, "learning_rate": 0.009168445874677167, "loss": 2.8956, "step": 1620 }, { "crossentropy": 2.908754825592041, "epoch": 0.13788703640694114, "grad_norm": 0.050104737281799316, "grad_norm_var": 6.025897482290673e-05, "learning_rate": 0.009167339102420203, "loss": 2.9088, "step": 1621 }, { "crossentropy": 2.783501386642456, "epoch": 0.13797209935352162, "grad_norm": 0.05539187043905258, "grad_norm_var": 5.5574472121723505e-05, "learning_rate": 0.009166231661008658, "loss": 2.7835, "step": 1622 }, { "crossentropy": 2.884845733642578, "epoch": 0.13805716230010207, "grad_norm": 0.0514521449804306, "grad_norm_var": 5.538383466801294e-05, "learning_rate": 0.009165123550620356, "loss": 2.8848, "step": 1623 }, { "crossentropy": 2.887180805206299, "epoch": 0.13814222524668254, "grad_norm": 0.048762109130620956, "grad_norm_var": 5.572525353891577e-05, "learning_rate": 0.009164014771433226, "loss": 2.8872, "step": 1624 }, { "crossentropy": 2.938157081604004, "epoch": 0.13822728819326302, "grad_norm": 0.04438989982008934, "grad_norm_var": 5.64655169120148e-05, "learning_rate": 0.009162905323625308, "loss": 2.9382, "step": 1625 }, { "crossentropy": 2.9276633262634277, "epoch": 0.1383123511398435, "grad_norm": 0.048924122005701065, "grad_norm_var": 5.542658089469262e-05, "learning_rate": 0.009161795207374746, "loss": 2.9277, "step": 1626 }, { "crossentropy": 2.7943601608276367, "epoch": 0.13839741408642395, "grad_norm": 0.047578468918800354, "grad_norm_var": 5.6732482405665495e-05, "learning_rate": 0.009160684422859793, "loss": 2.7944, "step": 1627 }, { "crossentropy": 2.8943192958831787, "epoch": 0.13848247703300443, "grad_norm": 0.0464906170964241, "grad_norm_var": 2.0555218425144287e-05, "learning_rate": 0.00915957297025881, "loss": 2.8943, "step": 1628 }, { "crossentropy": 2.8557496070861816, "epoch": 0.1385675399795849, "grad_norm": 0.050817299634218216, "grad_norm_var": 2.0555883570953723e-05, "learning_rate": 0.009158460849750263, "loss": 2.8557, "step": 1629 }, { "crossentropy": 2.7559666633605957, "epoch": 0.13865260292616535, "grad_norm": 0.04771793633699417, "grad_norm_var": 2.0938481942801786e-05, "learning_rate": 0.009157348061512726, "loss": 2.756, "step": 1630 }, { "crossentropy": 2.8373050689697266, "epoch": 0.13873766587274583, "grad_norm": 0.05517984554171562, "grad_norm_var": 1.9659078382544317e-05, "learning_rate": 0.009156234605724883, "loss": 2.8373, "step": 1631 }, { "crossentropy": 2.816026210784912, "epoch": 0.1388227288193263, "grad_norm": 0.0562242828309536, "grad_norm_var": 2.1373961839425635e-05, "learning_rate": 0.00915512048256552, "loss": 2.816, "step": 1632 }, { "crossentropy": 2.897167444229126, "epoch": 0.13890779176590676, "grad_norm": 0.04810373857617378, "grad_norm_var": 2.1746390866816437e-05, "learning_rate": 0.009154005692213536, "loss": 2.8972, "step": 1633 }, { "crossentropy": 2.821815013885498, "epoch": 0.13899285471248723, "grad_norm": 0.05077976733446121, "grad_norm_var": 1.1229468679626632e-05, "learning_rate": 0.009152890234847931, "loss": 2.8218, "step": 1634 }, { "crossentropy": 2.9262747764587402, "epoch": 0.1390779176590677, "grad_norm": 0.057495877146720886, "grad_norm_var": 1.4707455107031605e-05, "learning_rate": 0.009151774110647819, "loss": 2.9263, "step": 1635 }, { "crossentropy": 2.9135217666625977, "epoch": 0.1391629806056482, "grad_norm": 0.05114825442433357, "grad_norm_var": 1.3981504568316704e-05, "learning_rate": 0.009150657319792414, "loss": 2.9135, "step": 1636 }, { "crossentropy": 2.793182373046875, "epoch": 0.13924804355222864, "grad_norm": 0.053704243153333664, "grad_norm_var": 1.4524763562892041e-05, "learning_rate": 0.009149539862461043, "loss": 2.7932, "step": 1637 }, { "crossentropy": 2.8524155616760254, "epoch": 0.13933310649880912, "grad_norm": 0.04997826740145683, "grad_norm_var": 1.3103357714344677e-05, "learning_rate": 0.009148421738833136, "loss": 2.8524, "step": 1638 }, { "crossentropy": 2.8643596172332764, "epoch": 0.1394181694453896, "grad_norm": 0.0590820387005806, "grad_norm_var": 1.7662959836735555e-05, "learning_rate": 0.009147302949088233, "loss": 2.8644, "step": 1639 }, { "crossentropy": 2.9659221172332764, "epoch": 0.13950323239197004, "grad_norm": 0.04663710668683052, "grad_norm_var": 1.858592879915392e-05, "learning_rate": 0.009146183493405975, "loss": 2.9659, "step": 1640 }, { "crossentropy": 2.8504798412323, "epoch": 0.13958829533855052, "grad_norm": 0.045725539326667786, "grad_norm_var": 1.75397215701746e-05, "learning_rate": 0.00914506337196612, "loss": 2.8505, "step": 1641 }, { "crossentropy": 2.989415168762207, "epoch": 0.139673358285131, "grad_norm": 0.04538028687238693, "grad_norm_var": 1.929333571635434e-05, "learning_rate": 0.009143942584948524, "loss": 2.9894, "step": 1642 }, { "crossentropy": 2.9875738620758057, "epoch": 0.13975842123171148, "grad_norm": 0.04697873815894127, "grad_norm_var": 1.956964191272738e-05, "learning_rate": 0.009142821132533155, "loss": 2.9876, "step": 1643 }, { "crossentropy": 2.798178195953369, "epoch": 0.13984348417829193, "grad_norm": 0.04745347425341606, "grad_norm_var": 1.9085224168243163e-05, "learning_rate": 0.009141699014900083, "loss": 2.7982, "step": 1644 }, { "crossentropy": 2.8262710571289062, "epoch": 0.1399285471248724, "grad_norm": 0.04299452155828476, "grad_norm_var": 2.2866281668736052e-05, "learning_rate": 0.00914057623222949, "loss": 2.8263, "step": 1645 }, { "crossentropy": 2.845144271850586, "epoch": 0.14001361007145288, "grad_norm": 0.05007114261388779, "grad_norm_var": 2.2406467322579463e-05, "learning_rate": 0.009139452784701662, "loss": 2.8451, "step": 1646 }, { "crossentropy": 2.8282928466796875, "epoch": 0.14009867301803333, "grad_norm": 0.047642383724451065, "grad_norm_var": 2.118731801970201e-05, "learning_rate": 0.009138328672496993, "loss": 2.8283, "step": 1647 }, { "crossentropy": 2.9168920516967773, "epoch": 0.1401837359646138, "grad_norm": 0.047975361347198486, "grad_norm_var": 1.8553028284388344e-05, "learning_rate": 0.009137203895795983, "loss": 2.9169, "step": 1648 }, { "crossentropy": 2.8693888187408447, "epoch": 0.1402687989111943, "grad_norm": 0.047807034105062485, "grad_norm_var": 1.8611667491312082e-05, "learning_rate": 0.009136078454779237, "loss": 2.8694, "step": 1649 }, { "crossentropy": 2.8176681995391846, "epoch": 0.14035386185777476, "grad_norm": 0.053486909717321396, "grad_norm_var": 1.955749359877397e-05, "learning_rate": 0.00913495234962747, "loss": 2.8177, "step": 1650 }, { "crossentropy": 2.844409227371216, "epoch": 0.14043892480435521, "grad_norm": 0.05177290365099907, "grad_norm_var": 1.5577616028817723e-05, "learning_rate": 0.009133825580521502, "loss": 2.8444, "step": 1651 }, { "crossentropy": 2.935748815536499, "epoch": 0.1405239877509357, "grad_norm": 0.04594861716032028, "grad_norm_var": 1.594433840233263e-05, "learning_rate": 0.009132698147642258, "loss": 2.9357, "step": 1652 }, { "crossentropy": 2.790159225463867, "epoch": 0.14060905069751617, "grad_norm": 0.04386978596448898, "grad_norm_var": 1.570905777447118e-05, "learning_rate": 0.009131570051170775, "loss": 2.7902, "step": 1653 }, { "crossentropy": 2.8401997089385986, "epoch": 0.14069411364409662, "grad_norm": 0.0443873330950737, "grad_norm_var": 1.6411830767649033e-05, "learning_rate": 0.009130441291288188, "loss": 2.8402, "step": 1654 }, { "crossentropy": 2.86191987991333, "epoch": 0.1407791765906771, "grad_norm": 0.04431091994047165, "grad_norm_var": 8.12571916564969e-06, "learning_rate": 0.009129311868175744, "loss": 2.8619, "step": 1655 }, { "crossentropy": 2.848273277282715, "epoch": 0.14086423953725757, "grad_norm": 0.04679219424724579, "grad_norm_var": 8.11914707892746e-06, "learning_rate": 0.009128181782014801, "loss": 2.8483, "step": 1656 }, { "crossentropy": 2.872138738632202, "epoch": 0.14094930248383805, "grad_norm": 0.043692585080862045, "grad_norm_var": 8.733025962200508e-06, "learning_rate": 0.009127051032986814, "loss": 2.8721, "step": 1657 }, { "crossentropy": 2.7562241554260254, "epoch": 0.1410343654304185, "grad_norm": 0.044585373252630234, "grad_norm_var": 8.934678679998337e-06, "learning_rate": 0.009125919621273347, "loss": 2.7562, "step": 1658 }, { "crossentropy": 2.915875196456909, "epoch": 0.14111942837699898, "grad_norm": 0.04704819247126579, "grad_norm_var": 8.936074387564524e-06, "learning_rate": 0.009124787547056076, "loss": 2.9159, "step": 1659 }, { "crossentropy": 2.9485862255096436, "epoch": 0.14120449132357946, "grad_norm": 0.0488547645509243, "grad_norm_var": 9.16876486673282e-06, "learning_rate": 0.009123654810516779, "loss": 2.9486, "step": 1660 }, { "crossentropy": 2.822288990020752, "epoch": 0.1412895542701599, "grad_norm": 0.05105670541524887, "grad_norm_var": 8.976528903818088e-06, "learning_rate": 0.009122521411837338, "loss": 2.8223, "step": 1661 }, { "crossentropy": 3.013378620147705, "epoch": 0.14137461721674038, "grad_norm": 0.050643786787986755, "grad_norm_var": 9.196667195647083e-06, "learning_rate": 0.009121387351199748, "loss": 3.0134, "step": 1662 }, { "crossentropy": 2.894468307495117, "epoch": 0.14145968016332086, "grad_norm": 0.05042397975921631, "grad_norm_var": 9.735955138389224e-06, "learning_rate": 0.009120252628786104, "loss": 2.8945, "step": 1663 }, { "crossentropy": 2.8977365493774414, "epoch": 0.14154474310990134, "grad_norm": 0.049246612936258316, "grad_norm_var": 9.88939225934719e-06, "learning_rate": 0.009119117244778608, "loss": 2.8977, "step": 1664 }, { "crossentropy": 2.882059097290039, "epoch": 0.1416298060564818, "grad_norm": 0.04637899249792099, "grad_norm_var": 1.0005128659064793e-05, "learning_rate": 0.009117981199359575, "loss": 2.8821, "step": 1665 }, { "crossentropy": 2.9806172847747803, "epoch": 0.14171486900306227, "grad_norm": 0.05309871956706047, "grad_norm_var": 9.712758482937657e-06, "learning_rate": 0.009116844492711416, "loss": 2.9806, "step": 1666 }, { "crossentropy": 2.8966281414031982, "epoch": 0.14179993194964274, "grad_norm": 0.04590342193841934, "grad_norm_var": 8.625247255213362e-06, "learning_rate": 0.009115707125016657, "loss": 2.8966, "step": 1667 }, { "crossentropy": 2.849374771118164, "epoch": 0.1418849948962232, "grad_norm": 0.04727168753743172, "grad_norm_var": 8.502410289188723e-06, "learning_rate": 0.009114569096457924, "loss": 2.8494, "step": 1668 }, { "crossentropy": 2.8901894092559814, "epoch": 0.14197005784280367, "grad_norm": 0.04507477954030037, "grad_norm_var": 8.034360380426708e-06, "learning_rate": 0.009113430407217953, "loss": 2.8902, "step": 1669 }, { "crossentropy": 2.885970115661621, "epoch": 0.14205512078938415, "grad_norm": 0.04761439189314842, "grad_norm_var": 7.379004489909262e-06, "learning_rate": 0.009112291057479585, "loss": 2.886, "step": 1670 }, { "crossentropy": 2.8864922523498535, "epoch": 0.14214018373596463, "grad_norm": 0.04919508844614029, "grad_norm_var": 6.71186270300757e-06, "learning_rate": 0.009111151047425768, "loss": 2.8865, "step": 1671 }, { "crossentropy": 2.8283753395080566, "epoch": 0.14222524668254508, "grad_norm": 0.05238717049360275, "grad_norm_var": 7.819488675274838e-06, "learning_rate": 0.00911001037723955, "loss": 2.8284, "step": 1672 }, { "crossentropy": 2.877549648284912, "epoch": 0.14231030962912555, "grad_norm": 0.04675661399960518, "grad_norm_var": 6.532221850140349e-06, "learning_rate": 0.009108869047104094, "loss": 2.8775, "step": 1673 }, { "crossentropy": 2.8192713260650635, "epoch": 0.14239537257570603, "grad_norm": 0.05088634416460991, "grad_norm_var": 5.748957229195715e-06, "learning_rate": 0.009107727057202662, "loss": 2.8193, "step": 1674 }, { "crossentropy": 2.9441373348236084, "epoch": 0.14248043552228648, "grad_norm": 0.048092808574438095, "grad_norm_var": 5.5640989021124e-06, "learning_rate": 0.009106584407718627, "loss": 2.9441, "step": 1675 }, { "crossentropy": 3.0039823055267334, "epoch": 0.14256549846886696, "grad_norm": 0.05311009660363197, "grad_norm_var": 6.6529445836117805e-06, "learning_rate": 0.009105441098835465, "loss": 3.004, "step": 1676 }, { "crossentropy": 2.8443195819854736, "epoch": 0.14265056141544744, "grad_norm": 0.06436919420957565, "grad_norm_var": 2.1031514392597964e-05, "learning_rate": 0.009104297130736759, "loss": 2.8443, "step": 1677 }, { "crossentropy": 2.8307106494903564, "epoch": 0.1427356243620279, "grad_norm": 0.050432153046131134, "grad_norm_var": 2.101694755997126e-05, "learning_rate": 0.009103152503606195, "loss": 2.8307, "step": 1678 }, { "crossentropy": 2.8786749839782715, "epoch": 0.14282068730860836, "grad_norm": 0.04729907959699631, "grad_norm_var": 2.1456910854490637e-05, "learning_rate": 0.009102007217627568, "loss": 2.8787, "step": 1679 }, { "crossentropy": 2.8416945934295654, "epoch": 0.14290575025518884, "grad_norm": 0.046731606125831604, "grad_norm_var": 2.204445624839323e-05, "learning_rate": 0.009100861272984779, "loss": 2.8417, "step": 1680 }, { "crossentropy": 2.767982006072998, "epoch": 0.14299081320176932, "grad_norm": 0.04408399015665054, "grad_norm_var": 2.3378441388225563e-05, "learning_rate": 0.009099714669861834, "loss": 2.768, "step": 1681 }, { "crossentropy": 2.8100736141204834, "epoch": 0.14307587614834977, "grad_norm": 0.043486349284648895, "grad_norm_var": 2.4565601955457647e-05, "learning_rate": 0.009098567408442844, "loss": 2.8101, "step": 1682 }, { "crossentropy": 2.8540852069854736, "epoch": 0.14316093909493025, "grad_norm": 0.04884856566786766, "grad_norm_var": 2.3923770546746317e-05, "learning_rate": 0.009097419488912024, "loss": 2.8541, "step": 1683 }, { "crossentropy": 2.904944658279419, "epoch": 0.14324600204151072, "grad_norm": 0.05159655585885048, "grad_norm_var": 2.403706745236466e-05, "learning_rate": 0.009096270911453699, "loss": 2.9049, "step": 1684 }, { "crossentropy": 2.8941967487335205, "epoch": 0.1433310649880912, "grad_norm": 0.0446261391043663, "grad_norm_var": 2.4306749399310006e-05, "learning_rate": 0.009095121676252298, "loss": 2.8942, "step": 1685 }, { "crossentropy": 2.8857524394989014, "epoch": 0.14341612793467165, "grad_norm": 0.04969913884997368, "grad_norm_var": 2.409740128569467e-05, "learning_rate": 0.009093971783492354, "loss": 2.8858, "step": 1686 }, { "crossentropy": 2.865018606185913, "epoch": 0.14350119088125213, "grad_norm": 0.0470491498708725, "grad_norm_var": 2.4465322782738347e-05, "learning_rate": 0.009092821233358505, "loss": 2.865, "step": 1687 }, { "crossentropy": 2.8263511657714844, "epoch": 0.1435862538278326, "grad_norm": 0.048615653067827225, "grad_norm_var": 2.382248675921276e-05, "learning_rate": 0.0090916700260355, "loss": 2.8264, "step": 1688 }, { "crossentropy": 2.809157371520996, "epoch": 0.14367131677441305, "grad_norm": 0.04882412031292915, "grad_norm_var": 2.3442215201658716e-05, "learning_rate": 0.009090518161708187, "loss": 2.8092, "step": 1689 }, { "crossentropy": 2.887052297592163, "epoch": 0.14375637972099353, "grad_norm": 0.04734644293785095, "grad_norm_var": 2.344571666729091e-05, "learning_rate": 0.009089365640561523, "loss": 2.8871, "step": 1690 }, { "crossentropy": 2.86852765083313, "epoch": 0.143841442667574, "grad_norm": 0.05164468288421631, "grad_norm_var": 2.3798327546323965e-05, "learning_rate": 0.009088212462780569, "loss": 2.8685, "step": 1691 }, { "crossentropy": 2.81691312789917, "epoch": 0.1439265056141545, "grad_norm": 0.048173435032367706, "grad_norm_var": 2.277093972387293e-05, "learning_rate": 0.009087058628550491, "loss": 2.8169, "step": 1692 }, { "crossentropy": 2.8791003227233887, "epoch": 0.14401156856073494, "grad_norm": 0.04575984552502632, "grad_norm_var": 6.098399911337824e-06, "learning_rate": 0.009085904138056565, "loss": 2.8791, "step": 1693 }, { "crossentropy": 2.866842031478882, "epoch": 0.14409663150731541, "grad_norm": 0.049913205206394196, "grad_norm_var": 5.930583282779539e-06, "learning_rate": 0.009084748991484167, "loss": 2.8668, "step": 1694 }, { "crossentropy": 2.8658335208892822, "epoch": 0.1441816944538959, "grad_norm": 0.046222057193517685, "grad_norm_var": 6.065124508175939e-06, "learning_rate": 0.00908359318901878, "loss": 2.8658, "step": 1695 }, { "crossentropy": 2.8471567630767822, "epoch": 0.14426675740047634, "grad_norm": 0.04532802850008011, "grad_norm_var": 6.362707191275423e-06, "learning_rate": 0.009082436730845992, "loss": 2.8472, "step": 1696 }, { "crossentropy": 2.924952983856201, "epoch": 0.14435182034705682, "grad_norm": 0.04399005323648453, "grad_norm_var": 6.406996917843131e-06, "learning_rate": 0.009081279617151501, "loss": 2.925, "step": 1697 }, { "crossentropy": 2.8456695079803467, "epoch": 0.1444368832936373, "grad_norm": 0.0449615940451622, "grad_norm_var": 5.7397252933095885e-06, "learning_rate": 0.009080121848121101, "loss": 2.8457, "step": 1698 }, { "crossentropy": 2.8668642044067383, "epoch": 0.14452194624021777, "grad_norm": 0.04822808504104614, "grad_norm_var": 5.665656562799335e-06, "learning_rate": 0.009078963423940697, "loss": 2.8669, "step": 1699 }, { "crossentropy": 2.844219207763672, "epoch": 0.14460700918679822, "grad_norm": 0.04062939062714577, "grad_norm_var": 7.373521307994097e-06, "learning_rate": 0.009077804344796301, "loss": 2.8442, "step": 1700 }, { "crossentropy": 2.982398748397827, "epoch": 0.1446920721333787, "grad_norm": 0.0654769167304039, "grad_norm_var": 2.8117966298066982e-05, "learning_rate": 0.009076644610874027, "loss": 2.9824, "step": 1701 }, { "crossentropy": 2.862622022628784, "epoch": 0.14477713507995918, "grad_norm": 0.0453457273542881, "grad_norm_var": 2.8456304866170123e-05, "learning_rate": 0.009075484222360094, "loss": 2.8626, "step": 1702 }, { "crossentropy": 2.9480180740356445, "epoch": 0.14486219802653963, "grad_norm": 0.05285491794347763, "grad_norm_var": 2.9850718320152912e-05, "learning_rate": 0.009074323179440827, "loss": 2.948, "step": 1703 }, { "crossentropy": 2.9954795837402344, "epoch": 0.1449472609731201, "grad_norm": 0.051115646958351135, "grad_norm_var": 3.0335847293470364e-05, "learning_rate": 0.009073161482302654, "loss": 2.9955, "step": 1704 }, { "crossentropy": 2.8270010948181152, "epoch": 0.14503232391970058, "grad_norm": 0.0538988821208477, "grad_norm_var": 3.217259343172383e-05, "learning_rate": 0.009071999131132116, "loss": 2.827, "step": 1705 }, { "crossentropy": 2.7539544105529785, "epoch": 0.14511738686628106, "grad_norm": 0.05817237123847008, "grad_norm_var": 3.7391470152135094e-05, "learning_rate": 0.009070836126115847, "loss": 2.754, "step": 1706 }, { "crossentropy": 2.8290205001831055, "epoch": 0.1452024498128615, "grad_norm": 0.04524848982691765, "grad_norm_var": 3.810418522778852e-05, "learning_rate": 0.009069672467440597, "loss": 2.829, "step": 1707 }, { "crossentropy": 2.86161470413208, "epoch": 0.145287512759442, "grad_norm": 0.04665420204401016, "grad_norm_var": 3.843256659599442e-05, "learning_rate": 0.009068508155293212, "loss": 2.8616, "step": 1708 }, { "crossentropy": 2.9625730514526367, "epoch": 0.14537257570602247, "grad_norm": 0.05232379212975502, "grad_norm_var": 3.830061564399767e-05, "learning_rate": 0.00906734318986065, "loss": 2.9626, "step": 1709 }, { "crossentropy": 2.96859073638916, "epoch": 0.14545763865260292, "grad_norm": 0.05697304755449295, "grad_norm_var": 4.1900943491308906e-05, "learning_rate": 0.009066177571329968, "loss": 2.9686, "step": 1710 }, { "crossentropy": 2.8825511932373047, "epoch": 0.1455427015991834, "grad_norm": 0.05346854031085968, "grad_norm_var": 4.1688279497855095e-05, "learning_rate": 0.009065011299888332, "loss": 2.8826, "step": 1711 }, { "crossentropy": 2.882842779159546, "epoch": 0.14562776454576387, "grad_norm": 0.049236904829740524, "grad_norm_var": 4.005617192712052e-05, "learning_rate": 0.009063844375723013, "loss": 2.8828, "step": 1712 }, { "crossentropy": 2.8959853649139404, "epoch": 0.14571282749234432, "grad_norm": 0.05045149847865105, "grad_norm_var": 3.7025922596573925e-05, "learning_rate": 0.009062676799021386, "loss": 2.896, "step": 1713 }, { "crossentropy": 2.8868372440338135, "epoch": 0.1457978904389248, "grad_norm": 0.04805487021803856, "grad_norm_var": 3.515822999379843e-05, "learning_rate": 0.009061508569970926, "loss": 2.8868, "step": 1714 }, { "crossentropy": 2.9889776706695557, "epoch": 0.14588295338550528, "grad_norm": 0.043307628482580185, "grad_norm_var": 3.85774284782601e-05, "learning_rate": 0.009060339688759219, "loss": 2.989, "step": 1715 }, { "crossentropy": 2.8553805351257324, "epoch": 0.14596801633208575, "grad_norm": 0.045471448451280594, "grad_norm_var": 3.3459892138940996e-05, "learning_rate": 0.009059170155573955, "loss": 2.8554, "step": 1716 }, { "crossentropy": 2.8906636238098145, "epoch": 0.1460530792786662, "grad_norm": 0.05066428333520889, "grad_norm_var": 1.8834757976838683e-05, "learning_rate": 0.009057999970602927, "loss": 2.8907, "step": 1717 }, { "crossentropy": 2.980506658554077, "epoch": 0.14613814222524668, "grad_norm": 0.04779096692800522, "grad_norm_var": 1.7624948865201512e-05, "learning_rate": 0.009056829134034031, "loss": 2.9805, "step": 1718 }, { "crossentropy": 2.907015085220337, "epoch": 0.14622320517182716, "grad_norm": 0.04620607569813728, "grad_norm_var": 1.8172099856789136e-05, "learning_rate": 0.009055657646055272, "loss": 2.907, "step": 1719 }, { "crossentropy": 2.8010666370391846, "epoch": 0.1463082681184076, "grad_norm": 0.0426873117685318, "grad_norm_var": 2.129064085224323e-05, "learning_rate": 0.009054485506854755, "loss": 2.8011, "step": 1720 }, { "crossentropy": 2.820309638977051, "epoch": 0.14639333106498809, "grad_norm": 0.045989181846380234, "grad_norm_var": 2.0470072168766543e-05, "learning_rate": 0.009053312716620694, "loss": 2.8203, "step": 1721 }, { "crossentropy": 2.798311233520508, "epoch": 0.14647839401156856, "grad_norm": 0.044860802590847015, "grad_norm_var": 1.512097784495166e-05, "learning_rate": 0.009052139275541403, "loss": 2.7983, "step": 1722 }, { "crossentropy": 2.8152875900268555, "epoch": 0.14656345695814904, "grad_norm": 0.045612044632434845, "grad_norm_var": 1.4991653686537386e-05, "learning_rate": 0.009050965183805307, "loss": 2.8153, "step": 1723 }, { "crossentropy": 2.785245418548584, "epoch": 0.1466485199047295, "grad_norm": 0.046560682356357574, "grad_norm_var": 1.5010347309042891e-05, "learning_rate": 0.009049790441600924, "loss": 2.7852, "step": 1724 }, { "crossentropy": 2.810964345932007, "epoch": 0.14673358285130997, "grad_norm": 0.04439108446240425, "grad_norm_var": 1.4479762408166572e-05, "learning_rate": 0.009048615049116892, "loss": 2.811, "step": 1725 }, { "crossentropy": 2.883629560470581, "epoch": 0.14681864579789045, "grad_norm": 0.05062900111079216, "grad_norm_var": 9.073469616540224e-06, "learning_rate": 0.00904743900654194, "loss": 2.8836, "step": 1726 }, { "crossentropy": 2.8699982166290283, "epoch": 0.1469037087444709, "grad_norm": 0.04781727492809296, "grad_norm_var": 6.3547480768615916e-06, "learning_rate": 0.009046262314064907, "loss": 2.87, "step": 1727 }, { "crossentropy": 2.881697177886963, "epoch": 0.14698877169105137, "grad_norm": 0.04987049102783203, "grad_norm_var": 6.580786888920239e-06, "learning_rate": 0.009045084971874737, "loss": 2.8817, "step": 1728 }, { "crossentropy": 2.8875784873962402, "epoch": 0.14707383463763185, "grad_norm": 0.04466335102915764, "grad_norm_var": 5.932117719023577e-06, "learning_rate": 0.009043906980160478, "loss": 2.8876, "step": 1729 }, { "crossentropy": 2.9220218658447266, "epoch": 0.14715889758421233, "grad_norm": 0.04596896842122078, "grad_norm_var": 5.781634485768396e-06, "learning_rate": 0.009042728339111279, "loss": 2.922, "step": 1730 }, { "crossentropy": 2.8981082439422607, "epoch": 0.14724396053079278, "grad_norm": 0.0440453439950943, "grad_norm_var": 5.510919481146957e-06, "learning_rate": 0.009041549048916398, "loss": 2.8981, "step": 1731 }, { "crossentropy": 2.911975145339966, "epoch": 0.14732902347737326, "grad_norm": 0.04429879039525986, "grad_norm_var": 5.750142435806404e-06, "learning_rate": 0.009040369109765195, "loss": 2.912, "step": 1732 }, { "crossentropy": 2.870838165283203, "epoch": 0.14741408642395373, "grad_norm": 0.04661954566836357, "grad_norm_var": 4.461308810752886e-06, "learning_rate": 0.009039188521847133, "loss": 2.8708, "step": 1733 }, { "crossentropy": 2.8543872833251953, "epoch": 0.14749914937053418, "grad_norm": 0.05027403682470322, "grad_norm_var": 5.397996817091353e-06, "learning_rate": 0.009038007285351781, "loss": 2.8544, "step": 1734 }, { "crossentropy": 2.887429714202881, "epoch": 0.14758421231711466, "grad_norm": 0.052655864506959915, "grad_norm_var": 7.9336582155133e-06, "learning_rate": 0.009036825400468813, "loss": 2.8874, "step": 1735 }, { "crossentropy": 2.7520768642425537, "epoch": 0.14766927526369514, "grad_norm": 0.04866508021950722, "grad_norm_var": 6.981523048260376e-06, "learning_rate": 0.009035642867388002, "loss": 2.7521, "step": 1736 }, { "crossentropy": 2.8069636821746826, "epoch": 0.14775433821027562, "grad_norm": 0.048735518008470535, "grad_norm_var": 7.061690586028975e-06, "learning_rate": 0.009034459686299231, "loss": 2.807, "step": 1737 }, { "crossentropy": 2.7777445316314697, "epoch": 0.14783940115685607, "grad_norm": 0.04937908053398132, "grad_norm_var": 6.910781829070483e-06, "learning_rate": 0.009033275857392487, "loss": 2.7777, "step": 1738 }, { "crossentropy": 2.8009254932403564, "epoch": 0.14792446410343654, "grad_norm": 0.05219172686338425, "grad_norm_var": 7.950052108377653e-06, "learning_rate": 0.009032091380857855, "loss": 2.8009, "step": 1739 }, { "crossentropy": 2.8330976963043213, "epoch": 0.14800952705001702, "grad_norm": 0.046123627573251724, "grad_norm_var": 8.041370456855247e-06, "learning_rate": 0.009030906256885527, "loss": 2.8331, "step": 1740 }, { "crossentropy": 2.843874216079712, "epoch": 0.14809458999659747, "grad_norm": 0.046381253749132156, "grad_norm_var": 7.358988434153814e-06, "learning_rate": 0.009029720485665804, "loss": 2.8439, "step": 1741 }, { "crossentropy": 2.9847395420074463, "epoch": 0.14817965294317795, "grad_norm": 0.04648255556821823, "grad_norm_var": 6.9911046818443845e-06, "learning_rate": 0.009028534067389085, "loss": 2.9847, "step": 1742 }, { "crossentropy": 2.903198719024658, "epoch": 0.14826471588975842, "grad_norm": 0.052277930080890656, "grad_norm_var": 8.268294452356873e-06, "learning_rate": 0.009027347002245875, "loss": 2.9032, "step": 1743 }, { "crossentropy": 2.7952616214752197, "epoch": 0.1483497788363389, "grad_norm": 0.05001680180430412, "grad_norm_var": 8.305350122814597e-06, "learning_rate": 0.00902615929042678, "loss": 2.7953, "step": 1744 }, { "crossentropy": 2.8012802600860596, "epoch": 0.14843484178291935, "grad_norm": 0.04915979132056236, "grad_norm_var": 7.539360663565547e-06, "learning_rate": 0.009024970932122517, "loss": 2.8013, "step": 1745 }, { "crossentropy": 2.7586798667907715, "epoch": 0.14851990472949983, "grad_norm": 0.042332813143730164, "grad_norm_var": 9.510265465465887e-06, "learning_rate": 0.009023781927523898, "loss": 2.7587, "step": 1746 }, { "crossentropy": 2.9000279903411865, "epoch": 0.1486049676760803, "grad_norm": 0.041111599653959274, "grad_norm_var": 1.1635209224826876e-05, "learning_rate": 0.009022592276821843, "loss": 2.9, "step": 1747 }, { "crossentropy": 2.9253454208374023, "epoch": 0.14869003062266076, "grad_norm": 0.048534609377384186, "grad_norm_var": 1.0711916210758877e-05, "learning_rate": 0.009021401980207377, "loss": 2.9253, "step": 1748 }, { "crossentropy": 2.809109926223755, "epoch": 0.14877509356924123, "grad_norm": 0.04748938977718353, "grad_norm_var": 1.0577777018778527e-05, "learning_rate": 0.009020211037871626, "loss": 2.8091, "step": 1749 }, { "crossentropy": 2.842956066131592, "epoch": 0.1488601565158217, "grad_norm": 0.04461110755801201, "grad_norm_var": 1.1044924291132331e-05, "learning_rate": 0.009019019450005826, "loss": 2.843, "step": 1750 }, { "crossentropy": 2.843674898147583, "epoch": 0.1489452194624022, "grad_norm": 0.04792412742972374, "grad_norm_var": 9.433883968919196e-06, "learning_rate": 0.009017827216801303, "loss": 2.8437, "step": 1751 }, { "crossentropy": 2.8036305904388428, "epoch": 0.14903028240898264, "grad_norm": 0.048440463840961456, "grad_norm_var": 9.40479680675103e-06, "learning_rate": 0.009016634338449504, "loss": 2.8036, "step": 1752 }, { "crossentropy": 2.8263261318206787, "epoch": 0.14911534535556312, "grad_norm": 0.04302095249295235, "grad_norm_var": 1.0561203404534555e-05, "learning_rate": 0.009015440815141964, "loss": 2.8263, "step": 1753 }, { "crossentropy": 2.885509729385376, "epoch": 0.1492004083021436, "grad_norm": 0.04442731663584709, "grad_norm_var": 1.0666460221432118e-05, "learning_rate": 0.009014246647070331, "loss": 2.8855, "step": 1754 }, { "crossentropy": 2.8773438930511475, "epoch": 0.14928547124872404, "grad_norm": 0.047937482595443726, "grad_norm_var": 8.800451868305227e-06, "learning_rate": 0.009013051834426355, "loss": 2.8773, "step": 1755 }, { "crossentropy": 2.7693870067596436, "epoch": 0.14937053419530452, "grad_norm": 0.050161611288785934, "grad_norm_var": 9.540449279132369e-06, "learning_rate": 0.00901185637740189, "loss": 2.7694, "step": 1756 }, { "crossentropy": 2.7927119731903076, "epoch": 0.149455597141885, "grad_norm": 0.04855538159608841, "grad_norm_var": 9.68713428827576e-06, "learning_rate": 0.00901066027618889, "loss": 2.7927, "step": 1757 }, { "crossentropy": 2.909482717514038, "epoch": 0.14954066008846548, "grad_norm": 0.04987788572907448, "grad_norm_var": 1.0159705709303763e-05, "learning_rate": 0.009009463530979412, "loss": 2.9095, "step": 1758 }, { "crossentropy": 2.781735897064209, "epoch": 0.14962572303504593, "grad_norm": 0.053151343017816544, "grad_norm_var": 1.0793790502189084e-05, "learning_rate": 0.00900826614196562, "loss": 2.7817, "step": 1759 }, { "crossentropy": 2.8911502361297607, "epoch": 0.1497107859816264, "grad_norm": 0.05187302827835083, "grad_norm_var": 1.1682270986220338e-05, "learning_rate": 0.009007068109339784, "loss": 2.8912, "step": 1760 }, { "crossentropy": 2.932587146759033, "epoch": 0.14979584892820688, "grad_norm": 0.04876481741666794, "grad_norm_var": 1.1600032636578892e-05, "learning_rate": 0.00900586943329427, "loss": 2.9326, "step": 1761 }, { "crossentropy": 2.791945457458496, "epoch": 0.14988091187478733, "grad_norm": 0.045335039496421814, "grad_norm_var": 1.0139644113123468e-05, "learning_rate": 0.009004670114021548, "loss": 2.7919, "step": 1762 }, { "crossentropy": 2.847130537033081, "epoch": 0.1499659748213678, "grad_norm": 0.046293336898088455, "grad_norm_var": 7.35154421855159e-06, "learning_rate": 0.009003470151714203, "loss": 2.8471, "step": 1763 }, { "crossentropy": 2.875391960144043, "epoch": 0.1500510377679483, "grad_norm": 0.045332182198762894, "grad_norm_var": 7.721487197966982e-06, "learning_rate": 0.009002269546564906, "loss": 2.8754, "step": 1764 }, { "crossentropy": 2.795541524887085, "epoch": 0.15013610071452876, "grad_norm": 0.04564152657985687, "grad_norm_var": 7.986720300436936e-06, "learning_rate": 0.009001068298766443, "loss": 2.7955, "step": 1765 }, { "crossentropy": 2.90771746635437, "epoch": 0.1502211636611092, "grad_norm": 0.047409601509571075, "grad_norm_var": 7.366826430225265e-06, "learning_rate": 0.008999866408511699, "loss": 2.9077, "step": 1766 }, { "crossentropy": 2.9432289600372314, "epoch": 0.1503062266076897, "grad_norm": 0.04485559090971947, "grad_norm_var": 7.887814886618078e-06, "learning_rate": 0.008998663875993665, "loss": 2.9432, "step": 1767 }, { "crossentropy": 2.753774642944336, "epoch": 0.15039128955427017, "grad_norm": 0.04450348764657974, "grad_norm_var": 8.398226149800709e-06, "learning_rate": 0.00899746070140543, "loss": 2.7538, "step": 1768 }, { "crossentropy": 2.9525599479675293, "epoch": 0.15047635250085062, "grad_norm": 0.049023211002349854, "grad_norm_var": 7.208358151205609e-06, "learning_rate": 0.00899625688494019, "loss": 2.9526, "step": 1769 }, { "crossentropy": 2.8131062984466553, "epoch": 0.1505614154474311, "grad_norm": 0.045340556651353836, "grad_norm_var": 6.8624192069349195e-06, "learning_rate": 0.008995052426791246, "loss": 2.8131, "step": 1770 }, { "crossentropy": 2.803593635559082, "epoch": 0.15064647839401157, "grad_norm": 0.049012769013643265, "grad_norm_var": 6.9610613847350305e-06, "learning_rate": 0.008993847327151996, "loss": 2.8036, "step": 1771 }, { "crossentropy": 2.8044235706329346, "epoch": 0.15073154134059205, "grad_norm": 0.04695301130414009, "grad_norm_var": 6.603037469387942e-06, "learning_rate": 0.008992641586215945, "loss": 2.8044, "step": 1772 }, { "crossentropy": 2.8915770053863525, "epoch": 0.1508166042871725, "grad_norm": 0.05157271772623062, "grad_norm_var": 7.548302428796403e-06, "learning_rate": 0.0089914352041767, "loss": 2.8916, "step": 1773 }, { "crossentropy": 2.857567071914673, "epoch": 0.15090166723375298, "grad_norm": 0.05182339996099472, "grad_norm_var": 8.321602556487542e-06, "learning_rate": 0.008990228181227973, "loss": 2.8576, "step": 1774 }, { "crossentropy": 2.8352227210998535, "epoch": 0.15098673018033346, "grad_norm": 0.04603561386466026, "grad_norm_var": 6.532714175625821e-06, "learning_rate": 0.008989020517563576, "loss": 2.8352, "step": 1775 }, { "crossentropy": 2.8156096935272217, "epoch": 0.1510717931269139, "grad_norm": 0.04580213129520416, "grad_norm_var": 5.284799076507065e-06, "learning_rate": 0.008987812213377423, "loss": 2.8156, "step": 1776 }, { "crossentropy": 2.9639058113098145, "epoch": 0.15115685607349438, "grad_norm": 0.044500913470983505, "grad_norm_var": 5.478138495442162e-06, "learning_rate": 0.008986603268863537, "loss": 2.9639, "step": 1777 }, { "crossentropy": 2.915134906768799, "epoch": 0.15124191902007486, "grad_norm": 0.04974128678441048, "grad_norm_var": 5.807593497729874e-06, "learning_rate": 0.008985393684216034, "loss": 2.9151, "step": 1778 }, { "crossentropy": 2.8399767875671387, "epoch": 0.15132698196665534, "grad_norm": 0.0468398854136467, "grad_norm_var": 5.766379947610052e-06, "learning_rate": 0.008984183459629145, "loss": 2.84, "step": 1779 }, { "crossentropy": 2.737149715423584, "epoch": 0.1514120449132358, "grad_norm": 0.04318319261074066, "grad_norm_var": 6.575660631110275e-06, "learning_rate": 0.008982972595297194, "loss": 2.7371, "step": 1780 }, { "crossentropy": 2.8556466102600098, "epoch": 0.15149710785981627, "grad_norm": 0.043472785502672195, "grad_norm_var": 7.266766636933782e-06, "learning_rate": 0.00898176109141461, "loss": 2.8556, "step": 1781 }, { "crossentropy": 2.8084664344787598, "epoch": 0.15158217080639674, "grad_norm": 0.045865487307310104, "grad_norm_var": 7.306622631134072e-06, "learning_rate": 0.008980548948175926, "loss": 2.8085, "step": 1782 }, { "crossentropy": 2.781386613845825, "epoch": 0.1516672337529772, "grad_norm": 0.05255405232310295, "grad_norm_var": 9.032481333286155e-06, "learning_rate": 0.00897933616577578, "loss": 2.7814, "step": 1783 }, { "crossentropy": 2.8300600051879883, "epoch": 0.15175229669955767, "grad_norm": 0.04834834486246109, "grad_norm_var": 8.541228078821847e-06, "learning_rate": 0.008978122744408906, "loss": 2.8301, "step": 1784 }, { "crossentropy": 2.824347734451294, "epoch": 0.15183735964613815, "grad_norm": 0.04937048256397247, "grad_norm_var": 8.619093753038132e-06, "learning_rate": 0.008976908684270146, "loss": 2.8243, "step": 1785 }, { "crossentropy": 2.836779832839966, "epoch": 0.15192242259271863, "grad_norm": 0.047708120197057724, "grad_norm_var": 8.279525998036199e-06, "learning_rate": 0.008975693985554443, "loss": 2.8368, "step": 1786 }, { "crossentropy": 2.7956204414367676, "epoch": 0.15200748553929908, "grad_norm": 0.050176963210105896, "grad_norm_var": 8.572044987390642e-06, "learning_rate": 0.008974478648456845, "loss": 2.7956, "step": 1787 }, { "crossentropy": 2.8317503929138184, "epoch": 0.15209254848587955, "grad_norm": 0.04903946816921234, "grad_norm_var": 8.623306084738249e-06, "learning_rate": 0.008973262673172496, "loss": 2.8318, "step": 1788 }, { "crossentropy": 2.824692964553833, "epoch": 0.15217761143246003, "grad_norm": 0.04611041024327278, "grad_norm_var": 7.796616243352372e-06, "learning_rate": 0.008972046059896651, "loss": 2.8247, "step": 1789 }, { "crossentropy": 2.7742812633514404, "epoch": 0.15226267437904048, "grad_norm": 0.04297877848148346, "grad_norm_var": 7.629511824696946e-06, "learning_rate": 0.008970828808824658, "loss": 2.7743, "step": 1790 }, { "crossentropy": 2.7606618404388428, "epoch": 0.15234773732562096, "grad_norm": 0.048924438655376434, "grad_norm_var": 7.786184624391084e-06, "learning_rate": 0.008969610920151977, "loss": 2.7607, "step": 1791 }, { "crossentropy": 2.8814263343811035, "epoch": 0.15243280027220144, "grad_norm": 0.056136250495910645, "grad_norm_var": 1.2584940835571636e-05, "learning_rate": 0.008968392394074163, "loss": 2.8814, "step": 1792 }, { "crossentropy": 2.851975679397583, "epoch": 0.15251786321878189, "grad_norm": 0.048166319727897644, "grad_norm_var": 1.1807700668618054e-05, "learning_rate": 0.008967173230786878, "loss": 2.852, "step": 1793 }, { "crossentropy": 2.9018356800079346, "epoch": 0.15260292616536236, "grad_norm": 0.04672297090291977, "grad_norm_var": 1.1691823676322708e-05, "learning_rate": 0.008965953430485882, "loss": 2.9018, "step": 1794 }, { "crossentropy": 2.8384392261505127, "epoch": 0.15268798911194284, "grad_norm": 0.04911220818758011, "grad_norm_var": 1.1708537306331865e-05, "learning_rate": 0.008964732993367043, "loss": 2.8384, "step": 1795 }, { "crossentropy": 2.9084761142730713, "epoch": 0.15277305205852332, "grad_norm": 0.04855864495038986, "grad_norm_var": 1.0067980648413057e-05, "learning_rate": 0.008963511919626325, "loss": 2.9085, "step": 1796 }, { "crossentropy": 2.827026128768921, "epoch": 0.15285811500510377, "grad_norm": 0.049900878220796585, "grad_norm_var": 8.489323444705633e-06, "learning_rate": 0.0089622902094598, "loss": 2.827, "step": 1797 }, { "crossentropy": 2.921156883239746, "epoch": 0.15294317795168424, "grad_norm": 0.04821876809000969, "grad_norm_var": 7.936764964542633e-06, "learning_rate": 0.008961067863063638, "loss": 2.9212, "step": 1798 }, { "crossentropy": 2.8673040866851807, "epoch": 0.15302824089826472, "grad_norm": 0.046617209911346436, "grad_norm_var": 7.228726264203964e-06, "learning_rate": 0.008959844880634112, "loss": 2.8673, "step": 1799 }, { "crossentropy": 2.8655476570129395, "epoch": 0.15311330384484517, "grad_norm": 0.04330577328801155, "grad_norm_var": 8.923703743559235e-06, "learning_rate": 0.008958621262367598, "loss": 2.8655, "step": 1800 }, { "crossentropy": 2.8201305866241455, "epoch": 0.15319836679142565, "grad_norm": 0.046369150280952454, "grad_norm_var": 9.014492954497316e-06, "learning_rate": 0.008957397008460577, "loss": 2.8201, "step": 1801 }, { "crossentropy": 2.905864715576172, "epoch": 0.15328342973800613, "grad_norm": 0.0426449291408062, "grad_norm_var": 1.0815738479683495e-05, "learning_rate": 0.008956172119109625, "loss": 2.9059, "step": 1802 }, { "crossentropy": 2.7739198207855225, "epoch": 0.1533684926845866, "grad_norm": 0.046558257192373276, "grad_norm_var": 1.0432518891354918e-05, "learning_rate": 0.008954946594511425, "loss": 2.7739, "step": 1803 }, { "crossentropy": 2.8126063346862793, "epoch": 0.15345355563116705, "grad_norm": 0.04661126434803009, "grad_norm_var": 1.0289750500545407e-05, "learning_rate": 0.00895372043486276, "loss": 2.8126, "step": 1804 }, { "crossentropy": 2.8951737880706787, "epoch": 0.15353861857774753, "grad_norm": 0.04722342640161514, "grad_norm_var": 1.0189374371552945e-05, "learning_rate": 0.008952493640360517, "loss": 2.8952, "step": 1805 }, { "crossentropy": 2.8750407695770264, "epoch": 0.153623681524328, "grad_norm": 0.04408333823084831, "grad_norm_var": 9.61772222677811e-06, "learning_rate": 0.008951266211201686, "loss": 2.875, "step": 1806 }, { "crossentropy": 2.9642701148986816, "epoch": 0.15370874447090846, "grad_norm": 0.04358724504709244, "grad_norm_var": 1.0346772270488265e-05, "learning_rate": 0.008950038147583352, "loss": 2.9643, "step": 1807 }, { "crossentropy": 2.915499448776245, "epoch": 0.15379380741748894, "grad_norm": 0.046644844114780426, "grad_norm_var": 4.558768258638759e-06, "learning_rate": 0.008948809449702711, "loss": 2.9155, "step": 1808 }, { "crossentropy": 2.8630475997924805, "epoch": 0.15387887036406941, "grad_norm": 0.04552436247467995, "grad_norm_var": 4.415195314181643e-06, "learning_rate": 0.008947580117757054, "loss": 2.863, "step": 1809 }, { "crossentropy": 2.9072439670562744, "epoch": 0.1539639333106499, "grad_norm": 0.04516135901212692, "grad_norm_var": 4.491035327714756e-06, "learning_rate": 0.00894635015194378, "loss": 2.9072, "step": 1810 }, { "crossentropy": 2.8102407455444336, "epoch": 0.15404899625723034, "grad_norm": 0.04601428657770157, "grad_norm_var": 3.911743089594389e-06, "learning_rate": 0.00894511955246038, "loss": 2.8102, "step": 1811 }, { "crossentropy": 2.7630255222320557, "epoch": 0.15413405920381082, "grad_norm": 0.06949611753225327, "grad_norm_var": 3.8274606844065856e-05, "learning_rate": 0.008943888319504456, "loss": 2.763, "step": 1812 }, { "crossentropy": 2.909851551055908, "epoch": 0.1542191221503913, "grad_norm": 0.06362012773752213, "grad_norm_var": 5.466307503813513e-05, "learning_rate": 0.008942656453273709, "loss": 2.9099, "step": 1813 }, { "crossentropy": 2.926961898803711, "epoch": 0.15430418509697175, "grad_norm": 0.05058283358812332, "grad_norm_var": 5.5008825946938006e-05, "learning_rate": 0.00894142395396594, "loss": 2.927, "step": 1814 }, { "crossentropy": 2.9264070987701416, "epoch": 0.15438924804355222, "grad_norm": 0.04773539677262306, "grad_norm_var": 5.482448571593283e-05, "learning_rate": 0.008940190821779053, "loss": 2.9264, "step": 1815 }, { "crossentropy": 2.8885369300842285, "epoch": 0.1544743109901327, "grad_norm": 0.0443500354886055, "grad_norm_var": 5.4176709225866164e-05, "learning_rate": 0.008938957056911057, "loss": 2.8885, "step": 1816 }, { "crossentropy": 2.865532159805298, "epoch": 0.15455937393671318, "grad_norm": 0.04407496750354767, "grad_norm_var": 5.516142868667208e-05, "learning_rate": 0.008937722659560053, "loss": 2.8655, "step": 1817 }, { "crossentropy": 2.894792318344116, "epoch": 0.15464443688329363, "grad_norm": 0.04496435448527336, "grad_norm_var": 5.3727284800247854e-05, "learning_rate": 0.008936487629924255, "loss": 2.8948, "step": 1818 }, { "crossentropy": 2.8463916778564453, "epoch": 0.1547294998298741, "grad_norm": 0.043767571449279785, "grad_norm_var": 5.4941936456007894e-05, "learning_rate": 0.00893525196820197, "loss": 2.8464, "step": 1819 }, { "crossentropy": 2.9520514011383057, "epoch": 0.15481456277645458, "grad_norm": 0.045137349516153336, "grad_norm_var": 5.5417466365078176e-05, "learning_rate": 0.008934015674591608, "loss": 2.9521, "step": 1820 }, { "crossentropy": 2.873168706893921, "epoch": 0.15489962572303503, "grad_norm": 0.05008510500192642, "grad_norm_var": 5.553836742099114e-05, "learning_rate": 0.00893277874929169, "loss": 2.8732, "step": 1821 }, { "crossentropy": 2.844058036804199, "epoch": 0.1549846886696155, "grad_norm": 0.05614229664206505, "grad_norm_var": 5.764328928615821e-05, "learning_rate": 0.008931541192500822, "loss": 2.8441, "step": 1822 }, { "crossentropy": 2.8572380542755127, "epoch": 0.155069751616196, "grad_norm": 0.05138044059276581, "grad_norm_var": 5.5627230798667376e-05, "learning_rate": 0.008930303004417724, "loss": 2.8572, "step": 1823 }, { "crossentropy": 2.936892509460449, "epoch": 0.15515481456277647, "grad_norm": 0.043258387595415115, "grad_norm_var": 5.770883952434154e-05, "learning_rate": 0.008929064185241212, "loss": 2.9369, "step": 1824 }, { "crossentropy": 2.8059194087982178, "epoch": 0.15523987750935692, "grad_norm": 0.0492415651679039, "grad_norm_var": 5.6623844678125164e-05, "learning_rate": 0.008927824735170205, "loss": 2.8059, "step": 1825 }, { "crossentropy": 2.8545122146606445, "epoch": 0.1553249404559374, "grad_norm": 0.050632696598768234, "grad_norm_var": 5.519238689756396e-05, "learning_rate": 0.008926584654403725, "loss": 2.8545, "step": 1826 }, { "crossentropy": 2.8814289569854736, "epoch": 0.15541000340251787, "grad_norm": 0.04845824092626572, "grad_norm_var": 5.4257059308232315e-05, "learning_rate": 0.00892534394314089, "loss": 2.8814, "step": 1827 }, { "crossentropy": 2.8886215686798096, "epoch": 0.15549506634909832, "grad_norm": 0.04383428767323494, "grad_norm_var": 2.9333719178802006e-05, "learning_rate": 0.008924102601580925, "loss": 2.8886, "step": 1828 }, { "crossentropy": 2.860582113265991, "epoch": 0.1555801292956788, "grad_norm": 0.04466736316680908, "grad_norm_var": 1.377497576332953e-05, "learning_rate": 0.008922860629923151, "loss": 2.8606, "step": 1829 }, { "crossentropy": 2.8807802200317383, "epoch": 0.15566519224225928, "grad_norm": 0.04839874431490898, "grad_norm_var": 1.3144651681650429e-05, "learning_rate": 0.008921618028366995, "loss": 2.8808, "step": 1830 }, { "crossentropy": 2.9317784309387207, "epoch": 0.15575025518883975, "grad_norm": 0.04480855166912079, "grad_norm_var": 1.3493770468300744e-05, "learning_rate": 0.008920374797111984, "loss": 2.9318, "step": 1831 }, { "crossentropy": 2.7995896339416504, "epoch": 0.1558353181354202, "grad_norm": 0.04774608463048935, "grad_norm_var": 1.2980655137453425e-05, "learning_rate": 0.008919130936357742, "loss": 2.7996, "step": 1832 }, { "crossentropy": 2.9485979080200195, "epoch": 0.15592038108200068, "grad_norm": 0.05159765109419823, "grad_norm_var": 1.3295454373414792e-05, "learning_rate": 0.008917886446304, "loss": 2.9486, "step": 1833 }, { "crossentropy": 2.8511576652526855, "epoch": 0.15600544402858116, "grad_norm": 0.04427341744303703, "grad_norm_var": 1.3582613834158217e-05, "learning_rate": 0.008916641327150586, "loss": 2.8512, "step": 1834 }, { "crossentropy": 2.882411003112793, "epoch": 0.1560905069751616, "grad_norm": 0.04463779181241989, "grad_norm_var": 1.3172000663503695e-05, "learning_rate": 0.00891539557909743, "loss": 2.8824, "step": 1835 }, { "crossentropy": 2.7987477779388428, "epoch": 0.15617556992174209, "grad_norm": 0.04486720263957977, "grad_norm_var": 1.3271343762620048e-05, "learning_rate": 0.008914149202344564, "loss": 2.7987, "step": 1836 }, { "crossentropy": 2.8797342777252197, "epoch": 0.15626063286832256, "grad_norm": 0.04274880886077881, "grad_norm_var": 1.4352858583200195e-05, "learning_rate": 0.008912902197092119, "loss": 2.8797, "step": 1837 }, { "crossentropy": 2.8969857692718506, "epoch": 0.15634569581490304, "grad_norm": 0.04434841126203537, "grad_norm_var": 9.131206733049383e-06, "learning_rate": 0.008911654563540329, "loss": 2.897, "step": 1838 }, { "crossentropy": 2.8640499114990234, "epoch": 0.1564307587614835, "grad_norm": 0.04085636883974075, "grad_norm_var": 9.284080636891522e-06, "learning_rate": 0.008910406301889526, "loss": 2.864, "step": 1839 }, { "crossentropy": 2.8299968242645264, "epoch": 0.15651582170806397, "grad_norm": 0.04662255942821503, "grad_norm_var": 8.807206940874129e-06, "learning_rate": 0.008909157412340149, "loss": 2.83, "step": 1840 }, { "crossentropy": 2.862657308578491, "epoch": 0.15660088465464445, "grad_norm": 0.04324056953191757, "grad_norm_var": 8.551272983957764e-06, "learning_rate": 0.00890790789509273, "loss": 2.8627, "step": 1841 }, { "crossentropy": 2.8304576873779297, "epoch": 0.1566859476012249, "grad_norm": 0.04605449363589287, "grad_norm_var": 6.870772110491712e-06, "learning_rate": 0.008906657750347909, "loss": 2.8305, "step": 1842 }, { "crossentropy": 2.9261786937713623, "epoch": 0.15677101054780537, "grad_norm": 0.04667041450738907, "grad_norm_var": 6.352859509656655e-06, "learning_rate": 0.008905406978306421, "loss": 2.9262, "step": 1843 }, { "crossentropy": 2.8512232303619385, "epoch": 0.15685607349438585, "grad_norm": 0.04244666174054146, "grad_norm_var": 6.751007690918888e-06, "learning_rate": 0.008904155579169102, "loss": 2.8512, "step": 1844 }, { "crossentropy": 2.812316417694092, "epoch": 0.15694113644096633, "grad_norm": 0.0417080819606781, "grad_norm_var": 7.5278656890576456e-06, "learning_rate": 0.008902903553136892, "loss": 2.8123, "step": 1845 }, { "crossentropy": 2.7461366653442383, "epoch": 0.15702619938754678, "grad_norm": 0.04129441827535629, "grad_norm_var": 7.523623792423291e-06, "learning_rate": 0.008901650900410833, "loss": 2.7461, "step": 1846 }, { "crossentropy": 2.7706685066223145, "epoch": 0.15711126233412726, "grad_norm": 0.05450662598013878, "grad_norm_var": 1.3645605670693726e-05, "learning_rate": 0.008900397621192064, "loss": 2.7707, "step": 1847 }, { "crossentropy": 2.823423385620117, "epoch": 0.15719632528070773, "grad_norm": 0.04631008952856064, "grad_norm_var": 1.3292017854223068e-05, "learning_rate": 0.008899143715681821, "loss": 2.8234, "step": 1848 }, { "crossentropy": 2.7860288619995117, "epoch": 0.15728138822728818, "grad_norm": 0.05058390274643898, "grad_norm_var": 1.2482913716586827e-05, "learning_rate": 0.00889788918408145, "loss": 2.786, "step": 1849 }, { "crossentropy": 2.854778289794922, "epoch": 0.15736645117386866, "grad_norm": 0.046046897768974304, "grad_norm_var": 1.2490391350168892e-05, "learning_rate": 0.008896634026592394, "loss": 2.8548, "step": 1850 }, { "crossentropy": 2.8965327739715576, "epoch": 0.15745151412044914, "grad_norm": 0.04405920207500458, "grad_norm_var": 1.2553448241761633e-05, "learning_rate": 0.008895378243416189, "loss": 2.8965, "step": 1851 }, { "crossentropy": 2.8042097091674805, "epoch": 0.15753657706702962, "grad_norm": 0.0466623492538929, "grad_norm_var": 1.2687697279911183e-05, "learning_rate": 0.00889412183475448, "loss": 2.8042, "step": 1852 }, { "crossentropy": 2.743201494216919, "epoch": 0.15762164001361006, "grad_norm": 0.04888336732983589, "grad_norm_var": 1.2985748631853117e-05, "learning_rate": 0.008892864800809015, "loss": 2.7432, "step": 1853 }, { "crossentropy": 2.8698599338531494, "epoch": 0.15770670296019054, "grad_norm": 0.04831811040639877, "grad_norm_var": 1.3285226371239957e-05, "learning_rate": 0.00889160714178163, "loss": 2.8699, "step": 1854 }, { "crossentropy": 2.7749366760253906, "epoch": 0.15779176590677102, "grad_norm": 0.04308050125837326, "grad_norm_var": 1.210122396407808e-05, "learning_rate": 0.008890348857874273, "loss": 2.7749, "step": 1855 }, { "crossentropy": 2.92008638381958, "epoch": 0.15787682885335147, "grad_norm": 0.04087124019861221, "grad_norm_var": 1.3714573911957628e-05, "learning_rate": 0.008889089949288987, "loss": 2.9201, "step": 1856 }, { "crossentropy": 2.7798752784729004, "epoch": 0.15796189179993195, "grad_norm": 0.0489613339304924, "grad_norm_var": 1.390612032611723e-05, "learning_rate": 0.008887830416227918, "loss": 2.7799, "step": 1857 }, { "crossentropy": 2.894354820251465, "epoch": 0.15804695474651242, "grad_norm": 0.05466175451874733, "grad_norm_var": 1.856613900922006e-05, "learning_rate": 0.008886570258893308, "loss": 2.8944, "step": 1858 }, { "crossentropy": 2.942812442779541, "epoch": 0.1581320176930929, "grad_norm": 0.055864185094833374, "grad_norm_var": 2.3976286864985097e-05, "learning_rate": 0.008885309477487504, "loss": 2.9428, "step": 1859 }, { "crossentropy": 2.86342716217041, "epoch": 0.15821708063967335, "grad_norm": 0.044826749712228775, "grad_norm_var": 2.2840558977207936e-05, "learning_rate": 0.008884048072212951, "loss": 2.8634, "step": 1860 }, { "crossentropy": 2.86354660987854, "epoch": 0.15830214358625383, "grad_norm": 0.05334116145968437, "grad_norm_var": 2.264072182477922e-05, "learning_rate": 0.008882786043272193, "loss": 2.8635, "step": 1861 }, { "crossentropy": 2.822580099105835, "epoch": 0.1583872065328343, "grad_norm": 0.04947967082262039, "grad_norm_var": 1.949132292193145e-05, "learning_rate": 0.008881523390867877, "loss": 2.8226, "step": 1862 }, { "crossentropy": 2.7801358699798584, "epoch": 0.15847226947941476, "grad_norm": 0.0444168858230114, "grad_norm_var": 1.781173244166558e-05, "learning_rate": 0.008880260115202748, "loss": 2.7801, "step": 1863 }, { "crossentropy": 2.952705144882202, "epoch": 0.15855733242599523, "grad_norm": 0.04552362114191055, "grad_norm_var": 1.8016898989540025e-05, "learning_rate": 0.008878996216479652, "loss": 2.9527, "step": 1864 }, { "crossentropy": 2.8186089992523193, "epoch": 0.1586423953725757, "grad_norm": 0.056332286447286606, "grad_norm_var": 2.2178459963793395e-05, "learning_rate": 0.008877731694901532, "loss": 2.8186, "step": 1865 }, { "crossentropy": 2.7977817058563232, "epoch": 0.1587274583191562, "grad_norm": 0.043834906071424484, "grad_norm_var": 2.3121669641582945e-05, "learning_rate": 0.008876466550671439, "loss": 2.7978, "step": 1866 }, { "crossentropy": 2.873372793197632, "epoch": 0.15881252126573664, "grad_norm": 0.044822413474321365, "grad_norm_var": 2.2749947485982818e-05, "learning_rate": 0.008875200783992515, "loss": 2.8734, "step": 1867 }, { "crossentropy": 2.8578743934631348, "epoch": 0.15889758421231712, "grad_norm": 0.04481982812285423, "grad_norm_var": 2.3319622120935152e-05, "learning_rate": 0.008873934395068004, "loss": 2.8579, "step": 1868 }, { "crossentropy": 2.957287311553955, "epoch": 0.1589826471588976, "grad_norm": 0.045211970806121826, "grad_norm_var": 2.3730806674911407e-05, "learning_rate": 0.008872667384101258, "loss": 2.9573, "step": 1869 }, { "crossentropy": 2.877208948135376, "epoch": 0.15906771010547804, "grad_norm": 0.05319979414343834, "grad_norm_var": 2.55750976474224e-05, "learning_rate": 0.008871399751295715, "loss": 2.8772, "step": 1870 }, { "crossentropy": 2.831186056137085, "epoch": 0.15915277305205852, "grad_norm": 0.04882882162928581, "grad_norm_var": 2.3809985815238335e-05, "learning_rate": 0.008870131496854927, "loss": 2.8312, "step": 1871 }, { "crossentropy": 2.8457987308502197, "epoch": 0.159237835998639, "grad_norm": 0.043622810393571854, "grad_norm_var": 2.150738007094527e-05, "learning_rate": 0.008868862620982534, "loss": 2.8458, "step": 1872 }, { "crossentropy": 2.868612289428711, "epoch": 0.15932289894521945, "grad_norm": 0.04593285173177719, "grad_norm_var": 2.193844582797638e-05, "learning_rate": 0.008867593123882283, "loss": 2.8686, "step": 1873 }, { "crossentropy": 2.8891868591308594, "epoch": 0.15940796189179993, "grad_norm": 0.046447668224573135, "grad_norm_var": 1.9319335559764226e-05, "learning_rate": 0.00886632300575802, "loss": 2.8892, "step": 1874 }, { "crossentropy": 2.81693696975708, "epoch": 0.1594930248383804, "grad_norm": 0.0472245067358017, "grad_norm_var": 1.4817793236509593e-05, "learning_rate": 0.008865052266813684, "loss": 2.8169, "step": 1875 }, { "crossentropy": 2.882561445236206, "epoch": 0.15957808778496088, "grad_norm": 0.04548387601971626, "grad_norm_var": 1.4622246115419899e-05, "learning_rate": 0.008863780907253328, "loss": 2.8826, "step": 1876 }, { "crossentropy": 2.8421401977539062, "epoch": 0.15966315073154133, "grad_norm": 0.048892222344875336, "grad_norm_var": 1.2339626441777949e-05, "learning_rate": 0.008862508927281085, "loss": 2.8421, "step": 1877 }, { "crossentropy": 2.852590322494507, "epoch": 0.1597482136781218, "grad_norm": 0.05357665941119194, "grad_norm_var": 1.4672452209925299e-05, "learning_rate": 0.008861236327101207, "loss": 2.8526, "step": 1878 }, { "crossentropy": 2.8386635780334473, "epoch": 0.1598332766247023, "grad_norm": 0.04954518377780914, "grad_norm_var": 1.4286175359350579e-05, "learning_rate": 0.008859963106918034, "loss": 2.8387, "step": 1879 }, { "crossentropy": 2.9424824714660645, "epoch": 0.15991833957128274, "grad_norm": 0.04670551419258118, "grad_norm_var": 1.4029534332154805e-05, "learning_rate": 0.00885868926693601, "loss": 2.9425, "step": 1880 }, { "crossentropy": 2.843454122543335, "epoch": 0.1600034025178632, "grad_norm": 0.04927482455968857, "grad_norm_var": 9.094939014814227e-06, "learning_rate": 0.008857414807359671, "loss": 2.8435, "step": 1881 }, { "crossentropy": 2.8159384727478027, "epoch": 0.1600884654644437, "grad_norm": 0.052467960864305496, "grad_norm_var": 9.719580561300065e-06, "learning_rate": 0.008856139728393666, "loss": 2.8159, "step": 1882 }, { "crossentropy": 2.912564516067505, "epoch": 0.16017352841102417, "grad_norm": 0.048003919422626495, "grad_norm_var": 9.055785957675274e-06, "learning_rate": 0.008854864030242732, "loss": 2.9126, "step": 1883 }, { "crossentropy": 2.87591552734375, "epoch": 0.16025859135760462, "grad_norm": 0.04799693450331688, "grad_norm_var": 8.306707286687143e-06, "learning_rate": 0.00885358771311171, "loss": 2.8759, "step": 1884 }, { "crossentropy": 2.8398096561431885, "epoch": 0.1603436543041851, "grad_norm": 0.045356471091508865, "grad_norm_var": 8.248979141233827e-06, "learning_rate": 0.008852310777205542, "loss": 2.8398, "step": 1885 }, { "crossentropy": 2.8057661056518555, "epoch": 0.16042871725076557, "grad_norm": 0.043260641396045685, "grad_norm_var": 7.90996775709892e-06, "learning_rate": 0.008851033222729262, "loss": 2.8058, "step": 1886 }, { "crossentropy": 2.793627977371216, "epoch": 0.16051378019734602, "grad_norm": 0.05048379674553871, "grad_norm_var": 8.338228331638877e-06, "learning_rate": 0.008849755049888013, "loss": 2.7936, "step": 1887 }, { "crossentropy": 2.765538215637207, "epoch": 0.1605988431439265, "grad_norm": 0.047738995403051376, "grad_norm_var": 7.122599270111459e-06, "learning_rate": 0.00884847625888703, "loss": 2.7655, "step": 1888 }, { "crossentropy": 2.820615530014038, "epoch": 0.16068390609050698, "grad_norm": 0.04934924840927124, "grad_norm_var": 6.899297216394808e-06, "learning_rate": 0.008847196849931651, "loss": 2.8206, "step": 1889 }, { "crossentropy": 2.8808159828186035, "epoch": 0.16076896903708746, "grad_norm": 0.04947863891720772, "grad_norm_var": 6.749934736186708e-06, "learning_rate": 0.008845916823227313, "loss": 2.8808, "step": 1890 }, { "crossentropy": 2.822633743286133, "epoch": 0.1608540319836679, "grad_norm": 0.047722652554512024, "grad_norm_var": 6.685544439286734e-06, "learning_rate": 0.008844636178979553, "loss": 2.8226, "step": 1891 }, { "crossentropy": 2.748244285583496, "epoch": 0.16093909493024838, "grad_norm": 0.050310343503952026, "grad_norm_var": 6.227149999544525e-06, "learning_rate": 0.008843354917394, "loss": 2.7482, "step": 1892 }, { "crossentropy": 2.6799771785736084, "epoch": 0.16102415787682886, "grad_norm": 0.04681120812892914, "grad_norm_var": 6.461195706290203e-06, "learning_rate": 0.008842073038676393, "loss": 2.68, "step": 1893 }, { "crossentropy": 2.843062400817871, "epoch": 0.1611092208234093, "grad_norm": 0.04647137224674225, "grad_norm_var": 4.93036727306125e-06, "learning_rate": 0.008840790543032563, "loss": 2.8431, "step": 1894 }, { "crossentropy": 2.795124053955078, "epoch": 0.1611942837699898, "grad_norm": 0.04481618478894234, "grad_norm_var": 5.47113846251588e-06, "learning_rate": 0.008839507430668438, "loss": 2.7951, "step": 1895 }, { "crossentropy": 2.8725457191467285, "epoch": 0.16127934671657027, "grad_norm": 0.044957950711250305, "grad_norm_var": 5.938134110258266e-06, "learning_rate": 0.008838223701790056, "loss": 2.8725, "step": 1896 }, { "crossentropy": 2.840273857116699, "epoch": 0.16136440966315074, "grad_norm": 0.04804975166916847, "grad_norm_var": 5.7879809787183175e-06, "learning_rate": 0.008836939356603542, "loss": 2.8403, "step": 1897 }, { "crossentropy": 2.8072760105133057, "epoch": 0.1614494726097312, "grad_norm": 0.05262954905629158, "grad_norm_var": 5.89223662301371e-06, "learning_rate": 0.008835654395315126, "loss": 2.8073, "step": 1898 }, { "crossentropy": 2.859997510910034, "epoch": 0.16153453555631167, "grad_norm": 0.05393878370523453, "grad_norm_var": 8.322392144558913e-06, "learning_rate": 0.008834368818131138, "loss": 2.86, "step": 1899 }, { "crossentropy": 2.8154594898223877, "epoch": 0.16161959850289215, "grad_norm": 0.055108919739723206, "grad_norm_var": 1.1399411433197076e-05, "learning_rate": 0.008833082625258002, "loss": 2.8155, "step": 1900 }, { "crossentropy": 2.85029673576355, "epoch": 0.1617046614494726, "grad_norm": 0.052991170436143875, "grad_norm_var": 1.1811639038512786e-05, "learning_rate": 0.008831795816902242, "loss": 2.8503, "step": 1901 }, { "crossentropy": 2.7557835578918457, "epoch": 0.16178972439605308, "grad_norm": 0.04921978339552879, "grad_norm_var": 9.464959704086108e-06, "learning_rate": 0.008830508393270486, "loss": 2.7558, "step": 1902 }, { "crossentropy": 2.8682825565338135, "epoch": 0.16187478734263355, "grad_norm": 0.04997042194008827, "grad_norm_var": 9.405869889744558e-06, "learning_rate": 0.008829220354569456, "loss": 2.8683, "step": 1903 }, { "crossentropy": 2.8031082153320312, "epoch": 0.16195985028921403, "grad_norm": 0.0466340109705925, "grad_norm_var": 9.719210614400562e-06, "learning_rate": 0.008827931701005973, "loss": 2.8031, "step": 1904 }, { "crossentropy": 2.875941038131714, "epoch": 0.16204491323579448, "grad_norm": 0.042878832668066025, "grad_norm_var": 1.2275032040450403e-05, "learning_rate": 0.00882664243278696, "loss": 2.8759, "step": 1905 }, { "crossentropy": 2.774843692779541, "epoch": 0.16212997618237496, "grad_norm": 0.04567592963576317, "grad_norm_var": 1.2872427254462017e-05, "learning_rate": 0.008825352550119432, "loss": 2.7748, "step": 1906 }, { "crossentropy": 2.8060717582702637, "epoch": 0.16221503912895543, "grad_norm": 0.06385147571563721, "grad_norm_var": 2.716548784776502e-05, "learning_rate": 0.008824062053210511, "loss": 2.8061, "step": 1907 }, { "crossentropy": 2.912308692932129, "epoch": 0.16230010207553588, "grad_norm": 0.05727363005280495, "grad_norm_var": 3.0813928345868546e-05, "learning_rate": 0.00882277094226741, "loss": 2.9123, "step": 1908 }, { "crossentropy": 2.7811639308929443, "epoch": 0.16238516502211636, "grad_norm": 0.04953612759709358, "grad_norm_var": 3.0090399808548513e-05, "learning_rate": 0.008821479217497449, "loss": 2.7812, "step": 1909 }, { "crossentropy": 2.83750057220459, "epoch": 0.16247022796869684, "grad_norm": 0.047538116574287415, "grad_norm_var": 2.9624042693149332e-05, "learning_rate": 0.008820186879108037, "loss": 2.8375, "step": 1910 }, { "crossentropy": 2.822249412536621, "epoch": 0.16255529091527732, "grad_norm": 0.05298249423503876, "grad_norm_var": 2.7802658896592705e-05, "learning_rate": 0.008818893927306691, "loss": 2.8222, "step": 1911 }, { "crossentropy": 2.7747507095336914, "epoch": 0.16264035386185777, "grad_norm": 0.05097566172480583, "grad_norm_var": 2.5356614620456603e-05, "learning_rate": 0.008817600362301017, "loss": 2.7748, "step": 1912 }, { "crossentropy": 2.9155335426330566, "epoch": 0.16272541680843824, "grad_norm": 0.044974181801080704, "grad_norm_var": 2.7241052324831343e-05, "learning_rate": 0.008816306184298725, "loss": 2.9155, "step": 1913 }, { "crossentropy": 2.878279685974121, "epoch": 0.16281047975501872, "grad_norm": 0.04405967891216278, "grad_norm_var": 2.9982006057004997e-05, "learning_rate": 0.008815011393507627, "loss": 2.8783, "step": 1914 }, { "crossentropy": 2.820927381515503, "epoch": 0.16289554270159917, "grad_norm": 0.04332300275564194, "grad_norm_var": 3.2123477737151275e-05, "learning_rate": 0.008813715990135624, "loss": 2.8209, "step": 1915 }, { "crossentropy": 2.869889259338379, "epoch": 0.16298060564817965, "grad_norm": 0.04272298514842987, "grad_norm_var": 3.29641969682418e-05, "learning_rate": 0.008812419974390725, "loss": 2.8699, "step": 1916 }, { "crossentropy": 2.7946126461029053, "epoch": 0.16306566859476013, "grad_norm": 0.04534660652279854, "grad_norm_var": 3.258725664069904e-05, "learning_rate": 0.008811123346481028, "loss": 2.7946, "step": 1917 }, { "crossentropy": 2.863163948059082, "epoch": 0.1631507315413406, "grad_norm": 0.049663227051496506, "grad_norm_var": 3.2638546154682645e-05, "learning_rate": 0.00880982610661474, "loss": 2.8632, "step": 1918 }, { "crossentropy": 2.855419874191284, "epoch": 0.16323579448792105, "grad_norm": 0.04152122884988785, "grad_norm_var": 3.5542856328363064e-05, "learning_rate": 0.008808528255000154, "loss": 2.8554, "step": 1919 }, { "crossentropy": 2.854283332824707, "epoch": 0.16332085743450153, "grad_norm": 0.04017801955342293, "grad_norm_var": 3.937518396046746e-05, "learning_rate": 0.008807229791845672, "loss": 2.8543, "step": 1920 }, { "crossentropy": 2.9373133182525635, "epoch": 0.163405920381082, "grad_norm": 0.044549115002155304, "grad_norm_var": 3.8485581030311966e-05, "learning_rate": 0.00880593071735979, "loss": 2.9373, "step": 1921 }, { "crossentropy": 2.8708174228668213, "epoch": 0.16349098332766246, "grad_norm": 0.0489087775349617, "grad_norm_var": 3.824014732038827e-05, "learning_rate": 0.008804631031751095, "loss": 2.8708, "step": 1922 }, { "crossentropy": 2.820901393890381, "epoch": 0.16357604627424294, "grad_norm": 0.04295656457543373, "grad_norm_var": 2.1261734990528822e-05, "learning_rate": 0.008803330735228288, "loss": 2.8209, "step": 1923 }, { "crossentropy": 2.83845853805542, "epoch": 0.16366110922082341, "grad_norm": 0.04816284403204918, "grad_norm_var": 1.3552661028611255e-05, "learning_rate": 0.008802029828000156, "loss": 2.8385, "step": 1924 }, { "crossentropy": 2.8652384281158447, "epoch": 0.1637461721674039, "grad_norm": 0.04554082080721855, "grad_norm_var": 1.2713160215464351e-05, "learning_rate": 0.008800728310275584, "loss": 2.8652, "step": 1925 }, { "crossentropy": 2.8940138816833496, "epoch": 0.16383123511398434, "grad_norm": 0.043728914111852646, "grad_norm_var": 1.2756409848180677e-05, "learning_rate": 0.008799426182263561, "loss": 2.894, "step": 1926 }, { "crossentropy": 2.755105972290039, "epoch": 0.16391629806056482, "grad_norm": 0.04128693789243698, "grad_norm_var": 9.792647436329162e-06, "learning_rate": 0.008798123444173174, "loss": 2.7551, "step": 1927 }, { "crossentropy": 2.8102097511291504, "epoch": 0.1640013610071453, "grad_norm": 0.042250122874975204, "grad_norm_var": 7.446169715636698e-06, "learning_rate": 0.0087968200962136, "loss": 2.8102, "step": 1928 }, { "crossentropy": 2.722815990447998, "epoch": 0.16408642395372575, "grad_norm": 0.0466514527797699, "grad_norm_var": 7.767554595243068e-06, "learning_rate": 0.008795516138594122, "loss": 2.7228, "step": 1929 }, { "crossentropy": 2.8433048725128174, "epoch": 0.16417148690030622, "grad_norm": 0.04496495798230171, "grad_norm_var": 7.774300110351533e-06, "learning_rate": 0.008794211571524118, "loss": 2.8433, "step": 1930 }, { "crossentropy": 2.8691163063049316, "epoch": 0.1642565498468867, "grad_norm": 0.04437577351927757, "grad_norm_var": 7.680500405271342e-06, "learning_rate": 0.008792906395213063, "loss": 2.8691, "step": 1931 }, { "crossentropy": 2.8078484535217285, "epoch": 0.16434161279346718, "grad_norm": 0.04625183716416359, "grad_norm_var": 7.598919353197543e-06, "learning_rate": 0.00879160060987053, "loss": 2.8078, "step": 1932 }, { "crossentropy": 2.7387590408325195, "epoch": 0.16442667574004763, "grad_norm": 0.04363833740353584, "grad_norm_var": 7.650217454630882e-06, "learning_rate": 0.008790294215706191, "loss": 2.7388, "step": 1933 }, { "crossentropy": 2.7495715618133545, "epoch": 0.1645117386866281, "grad_norm": 0.0428406186401844, "grad_norm_var": 6.012044439655285e-06, "learning_rate": 0.008788987212929818, "loss": 2.7496, "step": 1934 }, { "crossentropy": 2.9076504707336426, "epoch": 0.16459680163320858, "grad_norm": 0.04447241127490997, "grad_norm_var": 5.487403076367449e-06, "learning_rate": 0.008787679601751275, "loss": 2.9077, "step": 1935 }, { "crossentropy": 2.863339424133301, "epoch": 0.16468186457978903, "grad_norm": 0.046086959540843964, "grad_norm_var": 4.325698463478189e-06, "learning_rate": 0.008786371382380528, "loss": 2.8633, "step": 1936 }, { "crossentropy": 2.9227070808410645, "epoch": 0.1647669275263695, "grad_norm": 0.046290136873722076, "grad_norm_var": 4.458843966168024e-06, "learning_rate": 0.008785062555027636, "loss": 2.9227, "step": 1937 }, { "crossentropy": 2.864820718765259, "epoch": 0.16485199047295, "grad_norm": 0.04525240883231163, "grad_norm_var": 3.340293494254592e-06, "learning_rate": 0.008783753119902765, "loss": 2.8648, "step": 1938 }, { "crossentropy": 2.8566479682922363, "epoch": 0.16493705341953047, "grad_norm": 0.04202725365757942, "grad_norm_var": 3.6068190676969215e-06, "learning_rate": 0.008782443077216168, "loss": 2.8566, "step": 1939 }, { "crossentropy": 2.8829309940338135, "epoch": 0.16502211636611092, "grad_norm": 0.044978898018598557, "grad_norm_var": 2.7337778331396405e-06, "learning_rate": 0.008781132427178203, "loss": 2.8829, "step": 1940 }, { "crossentropy": 2.935016632080078, "epoch": 0.1651071793126914, "grad_norm": 0.04629914462566376, "grad_norm_var": 2.8835639770942446e-06, "learning_rate": 0.008779821169999319, "loss": 2.935, "step": 1941 }, { "crossentropy": 2.857041835784912, "epoch": 0.16519224225927187, "grad_norm": 0.04419274628162384, "grad_norm_var": 2.851656969366824e-06, "learning_rate": 0.008778509305890069, "loss": 2.857, "step": 1942 }, { "crossentropy": 2.833242654800415, "epoch": 0.16527730520585232, "grad_norm": 0.04456810653209686, "grad_norm_var": 2.1226844825828275e-06, "learning_rate": 0.0087771968350611, "loss": 2.8332, "step": 1943 }, { "crossentropy": 2.8727080821990967, "epoch": 0.1653623681524328, "grad_norm": 0.04723146930336952, "grad_norm_var": 2.0488317963963825e-06, "learning_rate": 0.008775883757723155, "loss": 2.8727, "step": 1944 }, { "crossentropy": 2.8280177116394043, "epoch": 0.16544743109901328, "grad_norm": 0.04484722018241882, "grad_norm_var": 1.856846599711653e-06, "learning_rate": 0.00877457007408708, "loss": 2.828, "step": 1945 }, { "crossentropy": 2.8128669261932373, "epoch": 0.16553249404559375, "grad_norm": 0.04347138851881027, "grad_norm_var": 1.982315434545523e-06, "learning_rate": 0.008773255784363812, "loss": 2.8129, "step": 1946 }, { "crossentropy": 2.8369460105895996, "epoch": 0.1656175569921742, "grad_norm": 0.04457997530698776, "grad_norm_var": 1.9733291593964616e-06, "learning_rate": 0.008771940888764389, "loss": 2.8369, "step": 1947 }, { "crossentropy": 2.897679328918457, "epoch": 0.16570261993875468, "grad_norm": 0.04704436659812927, "grad_norm_var": 2.1644902521505508e-06, "learning_rate": 0.008770625387499946, "loss": 2.8977, "step": 1948 }, { "crossentropy": 2.777080535888672, "epoch": 0.16578768288533516, "grad_norm": 0.05485799163579941, "grad_norm_var": 8.19873478589596e-06, "learning_rate": 0.008769309280781715, "loss": 2.7771, "step": 1949 }, { "crossentropy": 2.8043394088745117, "epoch": 0.1658727458319156, "grad_norm": 0.05100273713469505, "grad_norm_var": 9.397524931078055e-06, "learning_rate": 0.008767992568821022, "loss": 2.8043, "step": 1950 }, { "crossentropy": 2.782975673675537, "epoch": 0.16595780877849609, "grad_norm": 0.05366077646613121, "grad_norm_var": 1.2710551181172962e-05, "learning_rate": 0.008766675251829293, "loss": 2.783, "step": 1951 }, { "crossentropy": 2.815948009490967, "epoch": 0.16604287172507656, "grad_norm": 0.04904896765947342, "grad_norm_var": 1.3036738228642635e-05, "learning_rate": 0.008765357330018056, "loss": 2.8159, "step": 1952 }, { "crossentropy": 2.888807535171509, "epoch": 0.166127934671657, "grad_norm": 0.04368046298623085, "grad_norm_var": 1.3651837315793203e-05, "learning_rate": 0.008764038803598925, "loss": 2.8888, "step": 1953 }, { "crossentropy": 2.943899631500244, "epoch": 0.1662129976182375, "grad_norm": 0.042788296937942505, "grad_norm_var": 1.4497565932153344e-05, "learning_rate": 0.008762719672783623, "loss": 2.9439, "step": 1954 }, { "crossentropy": 2.8306639194488525, "epoch": 0.16629806056481797, "grad_norm": 0.041552525013685226, "grad_norm_var": 1.4795870411158513e-05, "learning_rate": 0.008761399937783961, "loss": 2.8307, "step": 1955 }, { "crossentropy": 2.8475069999694824, "epoch": 0.16638312351139845, "grad_norm": 0.04967606067657471, "grad_norm_var": 1.5229810543059865e-05, "learning_rate": 0.008760079598811851, "loss": 2.8475, "step": 1956 }, { "crossentropy": 2.841557502746582, "epoch": 0.1664681864579789, "grad_norm": 0.04193888604640961, "grad_norm_var": 1.6698413259924086e-05, "learning_rate": 0.008758758656079302, "loss": 2.8416, "step": 1957 }, { "crossentropy": 2.8088302612304688, "epoch": 0.16655324940455937, "grad_norm": 0.04326798394322395, "grad_norm_var": 1.7037444652531764e-05, "learning_rate": 0.008757437109798418, "loss": 2.8088, "step": 1958 }, { "crossentropy": 2.903521776199341, "epoch": 0.16663831235113985, "grad_norm": 0.04385989531874657, "grad_norm_var": 1.72465976869993e-05, "learning_rate": 0.008756114960181404, "loss": 2.9035, "step": 1959 }, { "crossentropy": 2.8079328536987305, "epoch": 0.1667233752977203, "grad_norm": 0.042839039117097855, "grad_norm_var": 1.7969471682451923e-05, "learning_rate": 0.008754792207440557, "loss": 2.8079, "step": 1960 }, { "crossentropy": 2.749465227127075, "epoch": 0.16680843824430078, "grad_norm": 0.046928856521844864, "grad_norm_var": 1.788362538009495e-05, "learning_rate": 0.008753468851788274, "loss": 2.7495, "step": 1961 }, { "crossentropy": 2.7857284545898438, "epoch": 0.16689350119088125, "grad_norm": 0.0471363440155983, "grad_norm_var": 1.735926714519683e-05, "learning_rate": 0.008752144893437045, "loss": 2.7857, "step": 1962 }, { "crossentropy": 2.8917508125305176, "epoch": 0.16697856413746173, "grad_norm": 0.04618792608380318, "grad_norm_var": 1.7111054086402176e-05, "learning_rate": 0.008750820332599463, "loss": 2.8918, "step": 1963 }, { "crossentropy": 2.818063735961914, "epoch": 0.16706362708404218, "grad_norm": 0.040483858436346054, "grad_norm_var": 1.9405321840069296e-05, "learning_rate": 0.008749495169488211, "loss": 2.8181, "step": 1964 }, { "crossentropy": 2.800747871398926, "epoch": 0.16714869003062266, "grad_norm": 0.04289635643362999, "grad_norm_var": 1.451052161689035e-05, "learning_rate": 0.008748169404316077, "loss": 2.8007, "step": 1965 }, { "crossentropy": 2.8315203189849854, "epoch": 0.16723375297720314, "grad_norm": 0.0422341413795948, "grad_norm_var": 1.280573464990427e-05, "learning_rate": 0.008746843037295936, "loss": 2.8315, "step": 1966 }, { "crossentropy": 2.739661931991577, "epoch": 0.1673188159237836, "grad_norm": 0.046392783522605896, "grad_norm_var": 7.6041472164295e-06, "learning_rate": 0.008745516068640767, "loss": 2.7397, "step": 1967 }, { "crossentropy": 2.754446268081665, "epoch": 0.16740387887036406, "grad_norm": 0.04695137217640877, "grad_norm_var": 6.587877371622547e-06, "learning_rate": 0.008744188498563642, "loss": 2.7544, "step": 1968 }, { "crossentropy": 2.850693941116333, "epoch": 0.16748894181694454, "grad_norm": 0.04580407217144966, "grad_norm_var": 6.694052310545333e-06, "learning_rate": 0.008742860327277729, "loss": 2.8507, "step": 1969 }, { "crossentropy": 2.828153133392334, "epoch": 0.16757400476352502, "grad_norm": 0.04584459587931633, "grad_norm_var": 6.607370458328969e-06, "learning_rate": 0.008741531554996297, "loss": 2.8282, "step": 1970 }, { "crossentropy": 2.901718854904175, "epoch": 0.16765906771010547, "grad_norm": 0.049256984144449234, "grad_norm_var": 7.1613947092668644e-06, "learning_rate": 0.008740202181932706, "loss": 2.9017, "step": 1971 }, { "crossentropy": 2.7750861644744873, "epoch": 0.16774413065668595, "grad_norm": 0.04932602122426033, "grad_norm_var": 6.955768352883267e-06, "learning_rate": 0.008738872208300417, "loss": 2.7751, "step": 1972 }, { "crossentropy": 2.8913397789001465, "epoch": 0.16782919360326642, "grad_norm": 0.044770073145627975, "grad_norm_var": 6.269369774447355e-06, "learning_rate": 0.008737541634312984, "loss": 2.8913, "step": 1973 }, { "crossentropy": 2.8081326484680176, "epoch": 0.16791425654984687, "grad_norm": 0.046422507613897324, "grad_norm_var": 6.052926525658882e-06, "learning_rate": 0.00873621046018406, "loss": 2.8081, "step": 1974 }, { "crossentropy": 2.8386106491088867, "epoch": 0.16799931949642735, "grad_norm": 0.0685345008969307, "grad_norm_var": 3.884610237800719e-05, "learning_rate": 0.008734878686127392, "loss": 2.8386, "step": 1975 }, { "crossentropy": 2.720789670944214, "epoch": 0.16808438244300783, "grad_norm": 0.05408928170800209, "grad_norm_var": 4.051413952203536e-05, "learning_rate": 0.008733546312356824, "loss": 2.7208, "step": 1976 }, { "crossentropy": 2.753394365310669, "epoch": 0.1681694453895883, "grad_norm": 0.04525291547179222, "grad_norm_var": 4.0862840410100096e-05, "learning_rate": 0.0087322133390863, "loss": 2.7534, "step": 1977 }, { "crossentropy": 2.820249319076538, "epoch": 0.16825450833616876, "grad_norm": 0.047821786254644394, "grad_norm_var": 4.084992317535651e-05, "learning_rate": 0.008730879766529854, "loss": 2.8202, "step": 1978 }, { "crossentropy": 2.8362836837768555, "epoch": 0.16833957128274923, "grad_norm": 0.042398400604724884, "grad_norm_var": 4.2482065492635496e-05, "learning_rate": 0.00872954559490162, "loss": 2.8363, "step": 1979 }, { "crossentropy": 2.680691719055176, "epoch": 0.1684246342293297, "grad_norm": 0.043011099100112915, "grad_norm_var": 4.0549071540559495e-05, "learning_rate": 0.008728210824415827, "loss": 2.6807, "step": 1980 }, { "crossentropy": 2.927229642868042, "epoch": 0.16850969717591016, "grad_norm": 0.04168860241770744, "grad_norm_var": 4.139171491416201e-05, "learning_rate": 0.008726875455286805, "loss": 2.9272, "step": 1981 }, { "crossentropy": 2.8038878440856934, "epoch": 0.16859476012249064, "grad_norm": 0.040649209171533585, "grad_norm_var": 4.2658866460495065e-05, "learning_rate": 0.00872553948772897, "loss": 2.8039, "step": 1982 }, { "crossentropy": 2.785505771636963, "epoch": 0.16867982306907112, "grad_norm": 0.04179297760128975, "grad_norm_var": 4.459186642129699e-05, "learning_rate": 0.008724202921956843, "loss": 2.7855, "step": 1983 }, { "crossentropy": 2.857898235321045, "epoch": 0.1687648860156516, "grad_norm": 0.04167795926332474, "grad_norm_var": 4.643505789819919e-05, "learning_rate": 0.008722865758185035, "loss": 2.8579, "step": 1984 }, { "crossentropy": 2.7470781803131104, "epoch": 0.16884994896223204, "grad_norm": 0.04392140731215477, "grad_norm_var": 4.689938295843558e-05, "learning_rate": 0.00872152799662826, "loss": 2.7471, "step": 1985 }, { "crossentropy": 2.8447561264038086, "epoch": 0.16893501190881252, "grad_norm": 0.05160447210073471, "grad_norm_var": 4.835155737965598e-05, "learning_rate": 0.008720189637501322, "loss": 2.8448, "step": 1986 }, { "crossentropy": 2.9411840438842773, "epoch": 0.169020074855393, "grad_norm": 0.050489392131567, "grad_norm_var": 4.881511334996104e-05, "learning_rate": 0.008718850681019123, "loss": 2.9412, "step": 1987 }, { "crossentropy": 2.8699159622192383, "epoch": 0.16910513780197345, "grad_norm": 0.04656178504228592, "grad_norm_var": 4.84688014975807e-05, "learning_rate": 0.008717511127396662, "loss": 2.8699, "step": 1988 }, { "crossentropy": 2.8970654010772705, "epoch": 0.16919020074855393, "grad_norm": 0.0468381866812706, "grad_norm_var": 4.814386039523327e-05, "learning_rate": 0.00871617097684903, "loss": 2.8971, "step": 1989 }, { "crossentropy": 2.7717862129211426, "epoch": 0.1692752636951344, "grad_norm": 0.05127614364027977, "grad_norm_var": 4.921198020187139e-05, "learning_rate": 0.008714830229591417, "loss": 2.7718, "step": 1990 }, { "crossentropy": 2.886564254760742, "epoch": 0.16936032664171488, "grad_norm": 0.04847392439842224, "grad_norm_var": 1.7701901347671965e-05, "learning_rate": 0.008713488885839112, "loss": 2.8866, "step": 1991 }, { "crossentropy": 2.8638041019439697, "epoch": 0.16944538958829533, "grad_norm": 0.04426511377096176, "grad_norm_var": 1.3264676322010301e-05, "learning_rate": 0.008712146945807494, "loss": 2.8638, "step": 1992 }, { "crossentropy": 2.773684501647949, "epoch": 0.1695304525348758, "grad_norm": 0.04331739619374275, "grad_norm_var": 1.3558119141659821e-05, "learning_rate": 0.008710804409712036, "loss": 2.7737, "step": 1993 }, { "crossentropy": 2.7986299991607666, "epoch": 0.16961551548145629, "grad_norm": 0.042670391499996185, "grad_norm_var": 1.3526984810219243e-05, "learning_rate": 0.008709461277768319, "loss": 2.7986, "step": 1994 }, { "crossentropy": 2.9436545372009277, "epoch": 0.16970057842803674, "grad_norm": 0.04321112111210823, "grad_norm_var": 1.3282040009675773e-05, "learning_rate": 0.008708117550192003, "loss": 2.9437, "step": 1995 }, { "crossentropy": 2.9082183837890625, "epoch": 0.1697856413746172, "grad_norm": 0.04394170641899109, "grad_norm_var": 1.3078143628585578e-05, "learning_rate": 0.008706773227198856, "loss": 2.9082, "step": 1996 }, { "crossentropy": 2.915245771408081, "epoch": 0.1698707043211977, "grad_norm": 0.04461895301938057, "grad_norm_var": 1.226290737932692e-05, "learning_rate": 0.008705428309004738, "loss": 2.9152, "step": 1997 }, { "crossentropy": 2.7974181175231934, "epoch": 0.16995576726777817, "grad_norm": 0.04318023473024368, "grad_norm_var": 1.1083025502715317e-05, "learning_rate": 0.008704082795825602, "loss": 2.7974, "step": 1998 }, { "crossentropy": 2.9664907455444336, "epoch": 0.17004083021435862, "grad_norm": 0.04793526977300644, "grad_norm_var": 1.04131915472927e-05, "learning_rate": 0.008702736687877499, "loss": 2.9665, "step": 1999 }, { "crossentropy": 2.8568315505981445, "epoch": 0.1701258931609391, "grad_norm": 0.04552880674600601, "grad_norm_var": 9.185581394394182e-06, "learning_rate": 0.008701389985376578, "loss": 2.8568, "step": 2000 }, { "crossentropy": 2.8588902950286865, "epoch": 0.17021095610751957, "grad_norm": 0.047009747475385666, "grad_norm_var": 8.8785685631948e-06, "learning_rate": 0.008700042688539078, "loss": 2.8589, "step": 2001 }, { "crossentropy": 2.8669023513793945, "epoch": 0.17029601905410002, "grad_norm": 0.045979924499988556, "grad_norm_var": 6.883503944493663e-06, "learning_rate": 0.008698694797581334, "loss": 2.8669, "step": 2002 }, { "crossentropy": 2.814601182937622, "epoch": 0.1703810820006805, "grad_norm": 0.044971566647291183, "grad_norm_var": 5.4512380364848035e-06, "learning_rate": 0.008697346312719786, "loss": 2.8146, "step": 2003 }, { "crossentropy": 2.9306061267852783, "epoch": 0.17046614494726098, "grad_norm": 0.054368533194065094, "grad_norm_var": 1.0249714349357581e-05, "learning_rate": 0.008695997234170952, "loss": 2.9306, "step": 2004 }, { "crossentropy": 2.766295909881592, "epoch": 0.17055120789384146, "grad_norm": 0.05791513994336128, "grad_norm_var": 1.9009841288601612e-05, "learning_rate": 0.008694647562151462, "loss": 2.7663, "step": 2005 }, { "crossentropy": 2.896851062774658, "epoch": 0.1706362708404219, "grad_norm": 0.057097312062978745, "grad_norm_var": 2.4608499834250357e-05, "learning_rate": 0.008693297296878032, "loss": 2.8969, "step": 2006 }, { "crossentropy": 2.8797459602355957, "epoch": 0.17072133378700238, "grad_norm": 0.048957210034132004, "grad_norm_var": 2.4708065902565717e-05, "learning_rate": 0.008691946438567476, "loss": 2.8797, "step": 2007 }, { "crossentropy": 2.813190221786499, "epoch": 0.17080639673358286, "grad_norm": 0.05651325732469559, "grad_norm_var": 2.9314844787146217e-05, "learning_rate": 0.008690594987436705, "loss": 2.8132, "step": 2008 }, { "crossentropy": 2.9266018867492676, "epoch": 0.1708914596801633, "grad_norm": 0.06729648262262344, "grad_norm_var": 5.043740588793749e-05, "learning_rate": 0.008689242943702718, "loss": 2.9266, "step": 2009 }, { "crossentropy": 2.8646466732025146, "epoch": 0.1709765226267438, "grad_norm": 0.046356044709682465, "grad_norm_var": 4.7954903638086574e-05, "learning_rate": 0.008687890307582618, "loss": 2.8646, "step": 2010 }, { "crossentropy": 2.8682782649993896, "epoch": 0.17106158557332427, "grad_norm": 0.0411226823925972, "grad_norm_var": 5.002883924566844e-05, "learning_rate": 0.008686537079293599, "loss": 2.8683, "step": 2011 }, { "crossentropy": 2.8408639430999756, "epoch": 0.17114664851990474, "grad_norm": 0.05009419471025467, "grad_norm_var": 4.779436282300019e-05, "learning_rate": 0.008685183259052952, "loss": 2.8409, "step": 2012 }, { "crossentropy": 2.793510913848877, "epoch": 0.1712317114664852, "grad_norm": 0.05159042403101921, "grad_norm_var": 4.589137903031707e-05, "learning_rate": 0.008683828847078056, "loss": 2.7935, "step": 2013 }, { "crossentropy": 2.8103010654449463, "epoch": 0.17131677441306567, "grad_norm": 0.04427601397037506, "grad_norm_var": 4.4916000991366414e-05, "learning_rate": 0.008682473843586398, "loss": 2.8103, "step": 2014 }, { "crossentropy": 2.8321948051452637, "epoch": 0.17140183735964615, "grad_norm": 0.04838211461901665, "grad_norm_var": 4.477935227228894e-05, "learning_rate": 0.008681118248795548, "loss": 2.8322, "step": 2015 }, { "crossentropy": 2.7576398849487305, "epoch": 0.1714869003062266, "grad_norm": 0.047576338052749634, "grad_norm_var": 4.3693442981655206e-05, "learning_rate": 0.008679762062923176, "loss": 2.7576, "step": 2016 }, { "crossentropy": 2.9141783714294434, "epoch": 0.17157196325280707, "grad_norm": 0.03993938863277435, "grad_norm_var": 5.019691918893079e-05, "learning_rate": 0.008678405286187045, "loss": 2.9142, "step": 2017 }, { "crossentropy": 2.87225341796875, "epoch": 0.17165702619938755, "grad_norm": 0.04738980531692505, "grad_norm_var": 4.953681617065331e-05, "learning_rate": 0.008677047918805017, "loss": 2.8723, "step": 2018 }, { "crossentropy": 2.7737877368927, "epoch": 0.17174208914596803, "grad_norm": 0.04531486704945564, "grad_norm_var": 4.9303009460952704e-05, "learning_rate": 0.008675689960995046, "loss": 2.7738, "step": 2019 }, { "crossentropy": 2.820589780807495, "epoch": 0.17182715209254848, "grad_norm": 0.0452587716281414, "grad_norm_var": 4.9501639408605116e-05, "learning_rate": 0.008674331412975179, "loss": 2.8206, "step": 2020 }, { "crossentropy": 2.8027541637420654, "epoch": 0.17191221503912896, "grad_norm": 0.04680658504366875, "grad_norm_var": 4.5035256968755226e-05, "learning_rate": 0.00867297227496356, "loss": 2.8028, "step": 2021 }, { "crossentropy": 2.6958580017089844, "epoch": 0.17199727798570943, "grad_norm": 0.044412802904844284, "grad_norm_var": 4.139356799066022e-05, "learning_rate": 0.008671612547178428, "loss": 2.6959, "step": 2022 }, { "crossentropy": 2.8163256645202637, "epoch": 0.17208234093228988, "grad_norm": 0.044588252902030945, "grad_norm_var": 4.214862576137693e-05, "learning_rate": 0.008670252229838115, "loss": 2.8163, "step": 2023 }, { "crossentropy": 2.94043231010437, "epoch": 0.17216740387887036, "grad_norm": 0.042685896158218384, "grad_norm_var": 3.82782456780482e-05, "learning_rate": 0.008668891323161051, "loss": 2.9404, "step": 2024 }, { "crossentropy": 2.789105176925659, "epoch": 0.17225246682545084, "grad_norm": 0.04337030276656151, "grad_norm_var": 9.52561535164925e-06, "learning_rate": 0.00866752982736576, "loss": 2.7891, "step": 2025 }, { "crossentropy": 2.8227455615997314, "epoch": 0.1723375297720313, "grad_norm": 0.04626996070146561, "grad_norm_var": 9.517088299924812e-06, "learning_rate": 0.008666167742670853, "loss": 2.8227, "step": 2026 }, { "crossentropy": 2.8450241088867188, "epoch": 0.17242259271861177, "grad_norm": 0.047538645565509796, "grad_norm_var": 8.287588786550816e-06, "learning_rate": 0.008664805069295045, "loss": 2.845, "step": 2027 }, { "crossentropy": 2.855072498321533, "epoch": 0.17250765566519224, "grad_norm": 0.0503101646900177, "grad_norm_var": 8.409310414100347e-06, "learning_rate": 0.008663441807457142, "loss": 2.8551, "step": 2028 }, { "crossentropy": 2.8724043369293213, "epoch": 0.17259271861177272, "grad_norm": 0.051710695028305054, "grad_norm_var": 8.50015358690078e-06, "learning_rate": 0.008662077957376044, "loss": 2.8724, "step": 2029 }, { "crossentropy": 2.8458263874053955, "epoch": 0.17267778155835317, "grad_norm": 0.04913235828280449, "grad_norm_var": 8.86471129448816e-06, "learning_rate": 0.008660713519270748, "loss": 2.8458, "step": 2030 }, { "crossentropy": 2.909777879714966, "epoch": 0.17276284450493365, "grad_norm": 0.05161174759268761, "grad_norm_var": 1.0416257437592148e-05, "learning_rate": 0.008659348493360341, "loss": 2.9098, "step": 2031 }, { "crossentropy": 2.782005786895752, "epoch": 0.17284790745151413, "grad_norm": 0.051695566624403, "grad_norm_var": 1.2070781321362143e-05, "learning_rate": 0.008657982879864008, "loss": 2.782, "step": 2032 }, { "crossentropy": 2.935347080230713, "epoch": 0.17293297039809458, "grad_norm": 0.05044260993599892, "grad_norm_var": 9.424719396365575e-06, "learning_rate": 0.008656616679001026, "loss": 2.9353, "step": 2033 }, { "crossentropy": 2.8658459186553955, "epoch": 0.17301803334467505, "grad_norm": 0.043444763869047165, "grad_norm_var": 1.0407362088896837e-05, "learning_rate": 0.00865524989099077, "loss": 2.8658, "step": 2034 }, { "crossentropy": 2.742875576019287, "epoch": 0.17310309629125553, "grad_norm": 0.045847922563552856, "grad_norm_var": 1.0293829253136578e-05, "learning_rate": 0.008653882516052702, "loss": 2.7429, "step": 2035 }, { "crossentropy": 2.8237547874450684, "epoch": 0.173188159237836, "grad_norm": 0.048576436936855316, "grad_norm_var": 1.0125064861024602e-05, "learning_rate": 0.008652514554406387, "loss": 2.8238, "step": 2036 }, { "crossentropy": 2.8378217220306396, "epoch": 0.17327322218441646, "grad_norm": 0.08020977675914764, "grad_norm_var": 7.720551786100042e-05, "learning_rate": 0.008651146006271482, "loss": 2.8378, "step": 2037 }, { "crossentropy": 2.8285257816314697, "epoch": 0.17335828513099694, "grad_norm": 0.06230032444000244, "grad_norm_var": 8.509292429323389e-05, "learning_rate": 0.008649776871867732, "loss": 2.8285, "step": 2038 }, { "crossentropy": 2.825031042098999, "epoch": 0.17344334807757741, "grad_norm": 0.04690627381205559, "grad_norm_var": 8.356808722511927e-05, "learning_rate": 0.008648407151414983, "loss": 2.825, "step": 2039 }, { "crossentropy": 2.8781065940856934, "epoch": 0.17352841102415786, "grad_norm": 0.04342649132013321, "grad_norm_var": 8.280573928062662e-05, "learning_rate": 0.008647036845133172, "loss": 2.8781, "step": 2040 }, { "crossentropy": 2.8115134239196777, "epoch": 0.17361347397073834, "grad_norm": 0.0411212295293808, "grad_norm_var": 8.534976433935785e-05, "learning_rate": 0.008645665953242329, "loss": 2.8115, "step": 2041 }, { "crossentropy": 2.7703070640563965, "epoch": 0.17369853691731882, "grad_norm": 0.040532663464546204, "grad_norm_var": 9.07645932329631e-05, "learning_rate": 0.008644294475962584, "loss": 2.7703, "step": 2042 }, { "crossentropy": 2.866875648498535, "epoch": 0.1737835998638993, "grad_norm": 0.039076145738363266, "grad_norm_var": 9.835673130469119e-05, "learning_rate": 0.00864292241351415, "loss": 2.8669, "step": 2043 }, { "crossentropy": 2.801812171936035, "epoch": 0.17386866281047975, "grad_norm": 0.04088061675429344, "grad_norm_var": 0.00010323684807883384, "learning_rate": 0.008641549766117348, "loss": 2.8018, "step": 2044 }, { "crossentropy": 2.7691433429718018, "epoch": 0.17395372575706022, "grad_norm": 0.043539926409721375, "grad_norm_var": 0.00010465483505178521, "learning_rate": 0.008640176533992584, "loss": 2.7691, "step": 2045 }, { "crossentropy": 2.8401811122894287, "epoch": 0.1740387887036407, "grad_norm": 0.05481164902448654, "grad_norm_var": 0.00010701967084780721, "learning_rate": 0.008638802717360354, "loss": 2.8402, "step": 2046 }, { "crossentropy": 2.798293113708496, "epoch": 0.17412385165022115, "grad_norm": 0.0441378690302372, "grad_norm_var": 0.00010793461512105246, "learning_rate": 0.00863742831644126, "loss": 2.7983, "step": 2047 }, { "crossentropy": 2.8772706985473633, "epoch": 0.17420891459680163, "grad_norm": 0.04989396408200264, "grad_norm_var": 0.00010738412374314271, "learning_rate": 0.008636053331455987, "loss": 2.8773, "step": 2048 }, { "crossentropy": 2.7432875633239746, "epoch": 0.1742939775433821, "grad_norm": 0.05053111910820007, "grad_norm_var": 0.00010740816645883519, "learning_rate": 0.00863467776262532, "loss": 2.7433, "step": 2049 }, { "crossentropy": 2.807210683822632, "epoch": 0.17437904048996258, "grad_norm": 0.04770435765385628, "grad_norm_var": 0.00010569815262084758, "learning_rate": 0.008633301610170136, "loss": 2.8072, "step": 2050 }, { "crossentropy": 2.7424581050872803, "epoch": 0.17446410343654303, "grad_norm": 0.044166333973407745, "grad_norm_var": 0.00010651851449399637, "learning_rate": 0.008631924874311404, "loss": 2.7425, "step": 2051 }, { "crossentropy": 2.89223575592041, "epoch": 0.1745491663831235, "grad_norm": 0.04536917060613632, "grad_norm_var": 0.00010717725183940115, "learning_rate": 0.008630547555270188, "loss": 2.8922, "step": 2052 }, { "crossentropy": 2.8193299770355225, "epoch": 0.174634229329704, "grad_norm": 0.04518192261457443, "grad_norm_var": 3.535858862822756e-05, "learning_rate": 0.008629169653267647, "loss": 2.8193, "step": 2053 }, { "crossentropy": 2.812126398086548, "epoch": 0.17471929227628444, "grad_norm": 0.045882698148489, "grad_norm_var": 1.701285765840632e-05, "learning_rate": 0.008627791168525032, "loss": 2.8121, "step": 2054 }, { "crossentropy": 2.728347063064575, "epoch": 0.17480435522286492, "grad_norm": 0.0438971184194088, "grad_norm_var": 1.6893261679200704e-05, "learning_rate": 0.008626412101263688, "loss": 2.7283, "step": 2055 }, { "crossentropy": 2.880110025405884, "epoch": 0.1748894181694454, "grad_norm": 0.043605685234069824, "grad_norm_var": 1.6857444608144438e-05, "learning_rate": 0.008625032451705053, "loss": 2.8801, "step": 2056 }, { "crossentropy": 2.7351293563842773, "epoch": 0.17497448111602587, "grad_norm": 0.04756540805101395, "grad_norm_var": 1.61023233726358e-05, "learning_rate": 0.008623652220070659, "loss": 2.7351, "step": 2057 }, { "crossentropy": 2.8771512508392334, "epoch": 0.17505954406260632, "grad_norm": 0.052716728299856186, "grad_norm_var": 1.7435103031782086e-05, "learning_rate": 0.008622271406582132, "loss": 2.8772, "step": 2058 }, { "crossentropy": 2.8465499877929688, "epoch": 0.1751446070091868, "grad_norm": 0.04566914960741997, "grad_norm_var": 1.390263461442942e-05, "learning_rate": 0.008620890011461189, "loss": 2.8465, "step": 2059 }, { "crossentropy": 2.8198800086975098, "epoch": 0.17522966995576728, "grad_norm": 0.04822368919849396, "grad_norm_var": 1.1675798598193877e-05, "learning_rate": 0.008619508034929646, "loss": 2.8199, "step": 2060 }, { "crossentropy": 2.874781608581543, "epoch": 0.17531473290234773, "grad_norm": 0.04327312856912613, "grad_norm_var": 1.1805326616900977e-05, "learning_rate": 0.008618125477209405, "loss": 2.8748, "step": 2061 }, { "crossentropy": 2.830082654953003, "epoch": 0.1753997958489282, "grad_norm": 0.04211584851145744, "grad_norm_var": 8.722586407393835e-06, "learning_rate": 0.008616742338522467, "loss": 2.8301, "step": 2062 }, { "crossentropy": 2.933169364929199, "epoch": 0.17548485879550868, "grad_norm": 0.05040745437145233, "grad_norm_var": 9.417131250968822e-06, "learning_rate": 0.008615358619090921, "loss": 2.9332, "step": 2063 }, { "crossentropy": 2.8385097980499268, "epoch": 0.17556992174208916, "grad_norm": 0.04690759629011154, "grad_norm_var": 8.67795817663386e-06, "learning_rate": 0.008613974319136958, "loss": 2.8385, "step": 2064 }, { "crossentropy": 2.891724109649658, "epoch": 0.1756549846886696, "grad_norm": 0.05912268906831741, "grad_norm_var": 1.796524997238827e-05, "learning_rate": 0.008612589438882852, "loss": 2.8917, "step": 2065 }, { "crossentropy": 2.785278558731079, "epoch": 0.17574004763525009, "grad_norm": 0.057628169655799866, "grad_norm_var": 2.5068163323969418e-05, "learning_rate": 0.008611203978550978, "loss": 2.7853, "step": 2066 }, { "crossentropy": 2.786041736602783, "epoch": 0.17582511058183056, "grad_norm": 0.0498129203915596, "grad_norm_var": 2.4469528752489068e-05, "learning_rate": 0.008609817938363797, "loss": 2.786, "step": 2067 }, { "crossentropy": 2.7467193603515625, "epoch": 0.175910173528411, "grad_norm": 0.0420745313167572, "grad_norm_var": 2.6286589366750616e-05, "learning_rate": 0.008608431318543872, "loss": 2.7467, "step": 2068 }, { "crossentropy": 2.8309786319732666, "epoch": 0.1759952364749915, "grad_norm": 0.04201509803533554, "grad_norm_var": 2.799997764441105e-05, "learning_rate": 0.008607044119313852, "loss": 2.831, "step": 2069 }, { "crossentropy": 2.8470520973205566, "epoch": 0.17608029942157197, "grad_norm": 0.04437695071101189, "grad_norm_var": 2.8477899980987726e-05, "learning_rate": 0.00860565634089648, "loss": 2.8471, "step": 2070 }, { "crossentropy": 2.916290044784546, "epoch": 0.17616536236815244, "grad_norm": 0.04956592619419098, "grad_norm_var": 2.77909249103897e-05, "learning_rate": 0.008604267983514595, "loss": 2.9163, "step": 2071 }, { "crossentropy": 2.8686182498931885, "epoch": 0.1762504253147329, "grad_norm": 0.04546581953763962, "grad_norm_var": 2.6962560563146415e-05, "learning_rate": 0.008602879047391125, "loss": 2.8686, "step": 2072 }, { "crossentropy": 2.8467824459075928, "epoch": 0.17633548826131337, "grad_norm": 0.046150561422109604, "grad_norm_var": 2.715717187641662e-05, "learning_rate": 0.008601489532749097, "loss": 2.8468, "step": 2073 }, { "crossentropy": 2.8745498657226562, "epoch": 0.17642055120789385, "grad_norm": 0.0439712293446064, "grad_norm_var": 2.6257103383619563e-05, "learning_rate": 0.008600099439811621, "loss": 2.8745, "step": 2074 }, { "crossentropy": 2.725261688232422, "epoch": 0.1765056141544743, "grad_norm": 0.0431293360888958, "grad_norm_var": 2.7212136124759623e-05, "learning_rate": 0.008598708768801913, "loss": 2.7253, "step": 2075 }, { "crossentropy": 2.81010103225708, "epoch": 0.17659067710105478, "grad_norm": 0.04263995215296745, "grad_norm_var": 2.8354007994810248e-05, "learning_rate": 0.00859731751994327, "loss": 2.8101, "step": 2076 }, { "crossentropy": 2.8184773921966553, "epoch": 0.17667574004763525, "grad_norm": 0.04390154778957367, "grad_norm_var": 2.8083923833042284e-05, "learning_rate": 0.008595925693459086, "loss": 2.8185, "step": 2077 }, { "crossentropy": 2.646711587905884, "epoch": 0.17676080299421573, "grad_norm": 0.04351826384663582, "grad_norm_var": 2.732528796237088e-05, "learning_rate": 0.008594533289572851, "loss": 2.6467, "step": 2078 }, { "crossentropy": 2.957334518432617, "epoch": 0.17684586594079618, "grad_norm": 0.04536684602499008, "grad_norm_var": 2.656807680787368e-05, "learning_rate": 0.008593140308508145, "loss": 2.9573, "step": 2079 }, { "crossentropy": 2.745858907699585, "epoch": 0.17693092888737666, "grad_norm": 0.042966894805431366, "grad_norm_var": 2.7378585786273087e-05, "learning_rate": 0.008591746750488639, "loss": 2.7459, "step": 2080 }, { "crossentropy": 2.8994643688201904, "epoch": 0.17701599183395714, "grad_norm": 0.04802368953824043, "grad_norm_var": 1.6185818927004434e-05, "learning_rate": 0.008590352615738097, "loss": 2.8995, "step": 2081 }, { "crossentropy": 2.8932578563690186, "epoch": 0.1771010547805376, "grad_norm": 0.04994341358542442, "grad_norm_var": 7.616847963935284e-06, "learning_rate": 0.00858895790448038, "loss": 2.8933, "step": 2082 }, { "crossentropy": 2.725205183029175, "epoch": 0.17718611772711806, "grad_norm": 0.054617851972579956, "grad_norm_var": 1.202620283503759e-05, "learning_rate": 0.008587562616939437, "loss": 2.7252, "step": 2083 }, { "crossentropy": 2.8271002769470215, "epoch": 0.17727118067369854, "grad_norm": 0.05072357878088951, "grad_norm_var": 1.277091787238212e-05, "learning_rate": 0.00858616675333931, "loss": 2.8271, "step": 2084 }, { "crossentropy": 2.8854596614837646, "epoch": 0.17735624362027902, "grad_norm": 0.051761556416749954, "grad_norm_var": 1.3498900877492725e-05, "learning_rate": 0.008584770313904137, "loss": 2.8855, "step": 2085 }, { "crossentropy": 2.8003201484680176, "epoch": 0.17744130656685947, "grad_norm": 0.04810864478349686, "grad_norm_var": 1.324687154889096e-05, "learning_rate": 0.008583373298858143, "loss": 2.8003, "step": 2086 }, { "crossentropy": 2.8240866661071777, "epoch": 0.17752636951343995, "grad_norm": 0.05211631581187248, "grad_norm_var": 1.4571536030408977e-05, "learning_rate": 0.008581975708425652, "loss": 2.8241, "step": 2087 }, { "crossentropy": 2.843079090118408, "epoch": 0.17761143246002042, "grad_norm": 0.06461431086063385, "grad_norm_var": 3.350640927711201e-05, "learning_rate": 0.008580577542831072, "loss": 2.8431, "step": 2088 }, { "crossentropy": 2.818702459335327, "epoch": 0.17769649540660087, "grad_norm": 0.053747791796922684, "grad_norm_var": 3.501535960989099e-05, "learning_rate": 0.008579178802298911, "loss": 2.8187, "step": 2089 }, { "crossentropy": 2.825481653213501, "epoch": 0.17778155835318135, "grad_norm": 0.04596521705389023, "grad_norm_var": 3.4007454550374847e-05, "learning_rate": 0.008577779487053768, "loss": 2.8255, "step": 2090 }, { "crossentropy": 2.762460708618164, "epoch": 0.17786662129976183, "grad_norm": 0.044229622930288315, "grad_norm_var": 3.3248039482920654e-05, "learning_rate": 0.008576379597320327, "loss": 2.7625, "step": 2091 }, { "crossentropy": 2.8331215381622314, "epoch": 0.1779516842463423, "grad_norm": 0.04247131571173668, "grad_norm_var": 3.339035604359991e-05, "learning_rate": 0.008574979133323377, "loss": 2.8331, "step": 2092 }, { "crossentropy": 2.885605812072754, "epoch": 0.17803674719292276, "grad_norm": 0.04308084025979042, "grad_norm_var": 3.39772125619688e-05, "learning_rate": 0.008573578095287788, "loss": 2.8856, "step": 2093 }, { "crossentropy": 2.767713785171509, "epoch": 0.17812181013950323, "grad_norm": 0.054331738501787186, "grad_norm_var": 3.362912025987504e-05, "learning_rate": 0.008572176483438528, "loss": 2.7677, "step": 2094 }, { "crossentropy": 2.8882391452789307, "epoch": 0.1782068730860837, "grad_norm": 0.041777074337005615, "grad_norm_var": 3.641488420448845e-05, "learning_rate": 0.008570774298000652, "loss": 2.8882, "step": 2095 }, { "crossentropy": 2.761387348175049, "epoch": 0.17829193603266416, "grad_norm": 0.06553904712200165, "grad_norm_var": 4.9258740232376495e-05, "learning_rate": 0.008569371539199315, "loss": 2.7614, "step": 2096 }, { "crossentropy": 2.795988082885742, "epoch": 0.17837699897924464, "grad_norm": 0.04206143319606781, "grad_norm_var": 5.3600748416087786e-05, "learning_rate": 0.008567968207259758, "loss": 2.796, "step": 2097 }, { "crossentropy": 2.7527334690093994, "epoch": 0.17846206192582512, "grad_norm": 0.04532751068472862, "grad_norm_var": 5.5163016428033156e-05, "learning_rate": 0.008566564302407317, "loss": 2.7527, "step": 2098 }, { "crossentropy": 2.9129533767700195, "epoch": 0.1785471248724056, "grad_norm": 0.042768314480781555, "grad_norm_var": 5.6689607139979485e-05, "learning_rate": 0.008565159824867415, "loss": 2.913, "step": 2099 }, { "crossentropy": 2.736341714859009, "epoch": 0.17863218781898604, "grad_norm": 0.042403824627399445, "grad_norm_var": 5.9424394109156706e-05, "learning_rate": 0.008563754774865573, "loss": 2.7363, "step": 2100 }, { "crossentropy": 2.7988264560699463, "epoch": 0.17871725076556652, "grad_norm": 0.04426063597202301, "grad_norm_var": 5.994799331315113e-05, "learning_rate": 0.008562349152627402, "loss": 2.7988, "step": 2101 }, { "crossentropy": 2.9033854007720947, "epoch": 0.178802313712147, "grad_norm": 0.045683037489652634, "grad_norm_var": 6.037767703812753e-05, "learning_rate": 0.008560942958378604, "loss": 2.9034, "step": 2102 }, { "crossentropy": 2.785773277282715, "epoch": 0.17888737665872745, "grad_norm": 0.04401511698961258, "grad_norm_var": 6.019377773296876e-05, "learning_rate": 0.008559536192344973, "loss": 2.7858, "step": 2103 }, { "crossentropy": 2.909388303756714, "epoch": 0.17897243960530793, "grad_norm": 0.048573754727840424, "grad_norm_var": 3.9976264691762335e-05, "learning_rate": 0.008558128854752396, "loss": 2.9094, "step": 2104 }, { "crossentropy": 2.8520286083221436, "epoch": 0.1790575025518884, "grad_norm": 0.05158594250679016, "grad_norm_var": 3.821950050223802e-05, "learning_rate": 0.00855672094582685, "loss": 2.852, "step": 2105 }, { "crossentropy": 2.839977264404297, "epoch": 0.17914256549846885, "grad_norm": 0.05699625238776207, "grad_norm_var": 4.5031331322881114e-05, "learning_rate": 0.008555312465794402, "loss": 2.84, "step": 2106 }, { "crossentropy": 2.75610613822937, "epoch": 0.17922762844504933, "grad_norm": 0.049600545316934586, "grad_norm_var": 4.4711332790209275e-05, "learning_rate": 0.00855390341488122, "loss": 2.7561, "step": 2107 }, { "crossentropy": 2.7790005207061768, "epoch": 0.1793126913916298, "grad_norm": 0.044434305280447006, "grad_norm_var": 4.3628205674220374e-05, "learning_rate": 0.00855249379331355, "loss": 2.779, "step": 2108 }, { "crossentropy": 2.8780319690704346, "epoch": 0.17939775433821029, "grad_norm": 0.04641305282711983, "grad_norm_var": 4.229103499237685e-05, "learning_rate": 0.00855108360131774, "loss": 2.878, "step": 2109 }, { "crossentropy": 2.8172287940979004, "epoch": 0.17948281728479074, "grad_norm": 0.044919490814208984, "grad_norm_var": 3.9707030088555655e-05, "learning_rate": 0.008549672839120226, "loss": 2.8172, "step": 2110 }, { "crossentropy": 2.78167724609375, "epoch": 0.1795678802313712, "grad_norm": 0.04573073610663414, "grad_norm_var": 3.778707638111088e-05, "learning_rate": 0.008548261506947537, "loss": 2.7817, "step": 2111 }, { "crossentropy": 2.891409397125244, "epoch": 0.1796529431779517, "grad_norm": 0.05004218593239784, "grad_norm_var": 1.5563893700790143e-05, "learning_rate": 0.008546849605026288, "loss": 2.8914, "step": 2112 }, { "crossentropy": 2.8310301303863525, "epoch": 0.17973800612453214, "grad_norm": 0.042503662407398224, "grad_norm_var": 1.531139375957405e-05, "learning_rate": 0.008545437133583194, "loss": 2.831, "step": 2113 }, { "crossentropy": 2.8359272480010986, "epoch": 0.17982306907111262, "grad_norm": 0.043069351464509964, "grad_norm_var": 1.6006801263346304e-05, "learning_rate": 0.008544024092845057, "loss": 2.8359, "step": 2114 }, { "crossentropy": 2.879727840423584, "epoch": 0.1799081320176931, "grad_norm": 0.0436895415186882, "grad_norm_var": 1.5609153805471593e-05, "learning_rate": 0.008542610483038768, "loss": 2.8797, "step": 2115 }, { "crossentropy": 2.7427010536193848, "epoch": 0.17999319496427357, "grad_norm": 0.041376736015081406, "grad_norm_var": 1.623536465621683e-05, "learning_rate": 0.008541196304391315, "loss": 2.7427, "step": 2116 }, { "crossentropy": 2.7822647094726562, "epoch": 0.18007825791085402, "grad_norm": 0.042162131518125534, "grad_norm_var": 1.7117837465402177e-05, "learning_rate": 0.008539781557129771, "loss": 2.7823, "step": 2117 }, { "crossentropy": 2.7839622497558594, "epoch": 0.1801633208574345, "grad_norm": 0.04683024808764458, "grad_norm_var": 1.710576150615849e-05, "learning_rate": 0.008538366241481306, "loss": 2.784, "step": 2118 }, { "crossentropy": 2.866269826889038, "epoch": 0.18024838380401498, "grad_norm": 0.05224713683128357, "grad_norm_var": 1.8754838599652214e-05, "learning_rate": 0.008536950357673177, "loss": 2.8663, "step": 2119 }, { "crossentropy": 2.6911025047302246, "epoch": 0.18033344675059543, "grad_norm": 0.053916435688734055, "grad_norm_var": 2.174117950288577e-05, "learning_rate": 0.008535533905932738, "loss": 2.6911, "step": 2120 }, { "crossentropy": 2.8334107398986816, "epoch": 0.1804185096971759, "grad_norm": 0.05819237604737282, "grad_norm_var": 2.8314887907521003e-05, "learning_rate": 0.008534116886487426, "loss": 2.8334, "step": 2121 }, { "crossentropy": 2.8444857597351074, "epoch": 0.18050357264375638, "grad_norm": 0.04589094594120979, "grad_norm_var": 2.215828632572044e-05, "learning_rate": 0.008532699299564777, "loss": 2.8445, "step": 2122 }, { "crossentropy": 2.846036911010742, "epoch": 0.18058863559033686, "grad_norm": 0.05150943249464035, "grad_norm_var": 2.3063520347834688e-05, "learning_rate": 0.008531281145392412, "loss": 2.846, "step": 2123 }, { "crossentropy": 2.8021492958068848, "epoch": 0.1806736985369173, "grad_norm": 0.04661063477396965, "grad_norm_var": 2.259821360671519e-05, "learning_rate": 0.008529862424198046, "loss": 2.8021, "step": 2124 }, { "crossentropy": 2.859752655029297, "epoch": 0.1807587614834978, "grad_norm": 0.04084793105721474, "grad_norm_var": 2.5113354924658343e-05, "learning_rate": 0.008528443136209485, "loss": 2.8598, "step": 2125 }, { "crossentropy": 2.7770986557006836, "epoch": 0.18084382443007826, "grad_norm": 0.043634116649627686, "grad_norm_var": 2.5546819832227875e-05, "learning_rate": 0.008527023281654627, "loss": 2.7771, "step": 2126 }, { "crossentropy": 2.9020676612854004, "epoch": 0.18092888737665871, "grad_norm": 0.04756214842200279, "grad_norm_var": 2.5503686524402414e-05, "learning_rate": 0.008525602860761459, "loss": 2.9021, "step": 2127 }, { "crossentropy": 2.8239591121673584, "epoch": 0.1810139503232392, "grad_norm": 0.044658418744802475, "grad_norm_var": 2.504554111972205e-05, "learning_rate": 0.00852418187375806, "loss": 2.824, "step": 2128 }, { "crossentropy": 2.8286900520324707, "epoch": 0.18109901326981967, "grad_norm": 0.04137831926345825, "grad_norm_var": 2.5730900627699872e-05, "learning_rate": 0.008522760320872597, "loss": 2.8287, "step": 2129 }, { "crossentropy": 2.8760032653808594, "epoch": 0.18118407621640015, "grad_norm": 0.04225859418511391, "grad_norm_var": 2.6139974703683008e-05, "learning_rate": 0.008521338202333334, "loss": 2.876, "step": 2130 }, { "crossentropy": 2.798856019973755, "epoch": 0.1812691391629806, "grad_norm": 0.040326185524463654, "grad_norm_var": 2.8072717497162834e-05, "learning_rate": 0.00851991551836862, "loss": 2.7989, "step": 2131 }, { "crossentropy": 2.975250244140625, "epoch": 0.18135420210956107, "grad_norm": 0.043492671102285385, "grad_norm_var": 2.698822138479471e-05, "learning_rate": 0.008518492269206899, "loss": 2.9753, "step": 2132 }, { "crossentropy": 2.831782579421997, "epoch": 0.18143926505614155, "grad_norm": 0.04413214698433876, "grad_norm_var": 2.6132110022717736e-05, "learning_rate": 0.008517068455076701, "loss": 2.8318, "step": 2133 }, { "crossentropy": 2.8920390605926514, "epoch": 0.181524328002722, "grad_norm": 0.052575740963220596, "grad_norm_var": 2.847279625004754e-05, "learning_rate": 0.008515644076206653, "loss": 2.892, "step": 2134 }, { "crossentropy": 2.814732074737549, "epoch": 0.18160939094930248, "grad_norm": 0.049492042511701584, "grad_norm_var": 2.6956168548734432e-05, "learning_rate": 0.008514219132825466, "loss": 2.8147, "step": 2135 }, { "crossentropy": 2.7906973361968994, "epoch": 0.18169445389588296, "grad_norm": 0.05036507546901703, "grad_norm_var": 2.4305976963868472e-05, "learning_rate": 0.008512793625161946, "loss": 2.7907, "step": 2136 }, { "crossentropy": 2.89955997467041, "epoch": 0.18177951684246343, "grad_norm": 0.0493619441986084, "grad_norm_var": 1.5334037609724775e-05, "learning_rate": 0.00851136755344499, "loss": 2.8996, "step": 2137 }, { "crossentropy": 2.901519298553467, "epoch": 0.18186457978904388, "grad_norm": 0.042429983615875244, "grad_norm_var": 1.6078099247965733e-05, "learning_rate": 0.008509940917903582, "loss": 2.9015, "step": 2138 }, { "crossentropy": 2.7792115211486816, "epoch": 0.18194964273562436, "grad_norm": 0.04053308442234993, "grad_norm_var": 1.5054287387019037e-05, "learning_rate": 0.0085085137187668, "loss": 2.7792, "step": 2139 }, { "crossentropy": 2.817753553390503, "epoch": 0.18203470568220484, "grad_norm": 0.042750660330057144, "grad_norm_var": 1.5145597972127111e-05, "learning_rate": 0.008507085956263809, "loss": 2.8178, "step": 2140 }, { "crossentropy": 2.8368308544158936, "epoch": 0.1821197686287853, "grad_norm": 0.0407182015478611, "grad_norm_var": 1.5213927734808452e-05, "learning_rate": 0.008505657630623867, "loss": 2.8368, "step": 2141 }, { "crossentropy": 2.835177421569824, "epoch": 0.18220483157536577, "grad_norm": 0.04369518160820007, "grad_norm_var": 1.5205243544386126e-05, "learning_rate": 0.008504228742076325, "loss": 2.8352, "step": 2142 }, { "crossentropy": 2.8593204021453857, "epoch": 0.18228989452194624, "grad_norm": 0.047046106308698654, "grad_norm_var": 1.502723627948288e-05, "learning_rate": 0.008502799290850618, "loss": 2.8593, "step": 2143 }, { "crossentropy": 2.8386762142181396, "epoch": 0.18237495746852672, "grad_norm": 0.0444764718413353, "grad_norm_var": 1.5030335833893643e-05, "learning_rate": 0.008501369277176275, "loss": 2.8387, "step": 2144 }, { "crossentropy": 2.7662339210510254, "epoch": 0.18246002041510717, "grad_norm": 0.04395708441734314, "grad_norm_var": 1.4307452882981036e-05, "learning_rate": 0.008499938701282917, "loss": 2.7662, "step": 2145 }, { "crossentropy": 2.8592631816864014, "epoch": 0.18254508336168765, "grad_norm": 0.042072053998708725, "grad_norm_var": 1.4374098591989475e-05, "learning_rate": 0.00849850756340025, "loss": 2.8593, "step": 2146 }, { "crossentropy": 2.800856828689575, "epoch": 0.18263014630826813, "grad_norm": 0.04241114482283592, "grad_norm_var": 1.3391240582851192e-05, "learning_rate": 0.008497075863758079, "loss": 2.8009, "step": 2147 }, { "crossentropy": 2.8771097660064697, "epoch": 0.18271520925484858, "grad_norm": 0.0466184988617897, "grad_norm_var": 1.3386469832266044e-05, "learning_rate": 0.008495643602586287, "loss": 2.8771, "step": 2148 }, { "crossentropy": 2.7931618690490723, "epoch": 0.18280027220142905, "grad_norm": 0.04338211938738823, "grad_norm_var": 1.352488915740731e-05, "learning_rate": 0.00849421078011486, "loss": 2.7932, "step": 2149 }, { "crossentropy": 2.827228546142578, "epoch": 0.18288533514800953, "grad_norm": 0.04375462234020233, "grad_norm_var": 9.616540281777673e-06, "learning_rate": 0.008492777396573862, "loss": 2.8272, "step": 2150 }, { "crossentropy": 2.8542354106903076, "epoch": 0.18297039809459, "grad_norm": 0.04351511970162392, "grad_norm_var": 7.924001086518732e-06, "learning_rate": 0.008491343452193458, "loss": 2.8542, "step": 2151 }, { "crossentropy": 2.7695677280426025, "epoch": 0.18305546104117046, "grad_norm": 0.03959881141781807, "grad_norm_var": 6.308444729061629e-06, "learning_rate": 0.008489908947203898, "loss": 2.7696, "step": 2152 }, { "crossentropy": 2.837661027908325, "epoch": 0.18314052398775094, "grad_norm": 0.04522578418254852, "grad_norm_var": 4.155958853725196e-06, "learning_rate": 0.008488473881835516, "loss": 2.8377, "step": 2153 }, { "crossentropy": 2.8020927906036377, "epoch": 0.1832255869343314, "grad_norm": 0.045924149453639984, "grad_norm_var": 4.531612337715871e-06, "learning_rate": 0.00848703825631875, "loss": 2.8021, "step": 2154 }, { "crossentropy": 2.840646505355835, "epoch": 0.18331064988091186, "grad_norm": 0.04525226354598999, "grad_norm_var": 4.069294004227277e-06, "learning_rate": 0.008485602070884116, "loss": 2.8406, "step": 2155 }, { "crossentropy": 2.837921380996704, "epoch": 0.18339571282749234, "grad_norm": 0.04256085678935051, "grad_norm_var": 4.097465966519202e-06, "learning_rate": 0.008484165325762225, "loss": 2.8379, "step": 2156 }, { "crossentropy": 2.813594341278076, "epoch": 0.18348077577407282, "grad_norm": 0.043037332594394684, "grad_norm_var": 3.4921000468159913e-06, "learning_rate": 0.008482728021183777, "loss": 2.8136, "step": 2157 }, { "crossentropy": 2.835207939147949, "epoch": 0.1835658387206533, "grad_norm": 0.04476504400372505, "grad_norm_var": 3.5332832623411376e-06, "learning_rate": 0.00848129015737956, "loss": 2.8352, "step": 2158 }, { "crossentropy": 2.9499351978302, "epoch": 0.18365090166723375, "grad_norm": 0.043409526348114014, "grad_norm_var": 2.8706411911077457e-06, "learning_rate": 0.008479851734580456, "loss": 2.9499, "step": 2159 }, { "crossentropy": 2.763596534729004, "epoch": 0.18373596461381422, "grad_norm": 0.04668714851140976, "grad_norm_var": 3.3909375922874216e-06, "learning_rate": 0.008478412753017433, "loss": 2.7636, "step": 2160 }, { "crossentropy": 2.8057668209075928, "epoch": 0.1838210275603947, "grad_norm": 0.042799144983291626, "grad_norm_var": 3.463721375474883e-06, "learning_rate": 0.008476973212921549, "loss": 2.8058, "step": 2161 }, { "crossentropy": 2.882044792175293, "epoch": 0.18390609050697515, "grad_norm": 0.05218905210494995, "grad_norm_var": 7.511930997763002e-06, "learning_rate": 0.008475533114523955, "loss": 2.882, "step": 2162 }, { "crossentropy": 2.7613887786865234, "epoch": 0.18399115345355563, "grad_norm": 0.05893820524215698, "grad_norm_var": 2.010013171242399e-05, "learning_rate": 0.008474092458055886, "loss": 2.7614, "step": 2163 }, { "crossentropy": 2.7490715980529785, "epoch": 0.1840762164001361, "grad_norm": 0.04856712743639946, "grad_norm_var": 2.0633617770482218e-05, "learning_rate": 0.008472651243748673, "loss": 2.7491, "step": 2164 }, { "crossentropy": 2.7903902530670166, "epoch": 0.18416127934671658, "grad_norm": 0.04399382323026657, "grad_norm_var": 2.0476080485322344e-05, "learning_rate": 0.008471209471833734, "loss": 2.7904, "step": 2165 }, { "crossentropy": 2.7957630157470703, "epoch": 0.18424634229329703, "grad_norm": 0.04385415464639664, "grad_norm_var": 2.0451697092348175e-05, "learning_rate": 0.008469767142542573, "loss": 2.7958, "step": 2166 }, { "crossentropy": 2.9270782470703125, "epoch": 0.1843314052398775, "grad_norm": 0.04418163374066353, "grad_norm_var": 2.029019644348793e-05, "learning_rate": 0.008468324256106788, "loss": 2.9271, "step": 2167 }, { "crossentropy": 2.739959716796875, "epoch": 0.184416468186458, "grad_norm": 0.046367861330509186, "grad_norm_var": 1.7659563071680446e-05, "learning_rate": 0.008466880812758064, "loss": 2.74, "step": 2168 }, { "crossentropy": 2.790372848510742, "epoch": 0.18450153113303844, "grad_norm": 0.04214346036314964, "grad_norm_var": 1.8616572985302667e-05, "learning_rate": 0.00846543681272818, "loss": 2.7904, "step": 2169 }, { "crossentropy": 2.796415090560913, "epoch": 0.18458659407961892, "grad_norm": 0.044806573539972305, "grad_norm_var": 1.869355731777244e-05, "learning_rate": 0.008463992256248996, "loss": 2.7964, "step": 2170 }, { "crossentropy": 2.8251752853393555, "epoch": 0.1846716570261994, "grad_norm": 0.04219280555844307, "grad_norm_var": 1.952121531801348e-05, "learning_rate": 0.00846254714355247, "loss": 2.8252, "step": 2171 }, { "crossentropy": 2.7989120483398438, "epoch": 0.18475671997277987, "grad_norm": 0.04547688364982605, "grad_norm_var": 1.884931800868554e-05, "learning_rate": 0.008461101474870641, "loss": 2.7989, "step": 2172 }, { "crossentropy": 2.7588412761688232, "epoch": 0.18484178291936032, "grad_norm": 0.04798298329114914, "grad_norm_var": 1.853114470025674e-05, "learning_rate": 0.008459655250435648, "loss": 2.7588, "step": 2173 }, { "crossentropy": 2.807203769683838, "epoch": 0.1849268458659408, "grad_norm": 0.048024438321590424, "grad_norm_var": 1.8594451195309317e-05, "learning_rate": 0.008458208470479707, "loss": 2.8072, "step": 2174 }, { "crossentropy": 2.7413318157196045, "epoch": 0.18501190881252128, "grad_norm": 0.04325760155916214, "grad_norm_var": 1.8655476644138282e-05, "learning_rate": 0.008456761135235132, "loss": 2.7413, "step": 2175 }, { "crossentropy": 2.803636074066162, "epoch": 0.18509697175910172, "grad_norm": 0.04148104041814804, "grad_norm_var": 2.0109470270822324e-05, "learning_rate": 0.008455313244934324, "loss": 2.8036, "step": 2176 }, { "crossentropy": 2.7469658851623535, "epoch": 0.1851820347056822, "grad_norm": 0.042640719562768936, "grad_norm_var": 2.0178990857556515e-05, "learning_rate": 0.00845386479980977, "loss": 2.747, "step": 2177 }, { "crossentropy": 2.7785377502441406, "epoch": 0.18526709765226268, "grad_norm": 0.042255815118551254, "grad_norm_var": 1.815698177517518e-05, "learning_rate": 0.008452415800094051, "loss": 2.7785, "step": 2178 }, { "crossentropy": 2.761470079421997, "epoch": 0.18535216059884316, "grad_norm": 0.04433433711528778, "grad_norm_var": 5.096603052861098e-06, "learning_rate": 0.008450966246019834, "loss": 2.7615, "step": 2179 }, { "crossentropy": 2.799142360687256, "epoch": 0.1854372235454236, "grad_norm": 0.043295085430145264, "grad_norm_var": 3.95553699647407e-06, "learning_rate": 0.008449516137819874, "loss": 2.7991, "step": 2180 }, { "crossentropy": 2.8727684020996094, "epoch": 0.18552228649200408, "grad_norm": 0.09849853813648224, "grad_norm_var": 0.0001885436193314621, "learning_rate": 0.00844806547572702, "loss": 2.8728, "step": 2181 }, { "crossentropy": 2.8717992305755615, "epoch": 0.18560734943858456, "grad_norm": 0.05086119845509529, "grad_norm_var": 0.0001881597133935398, "learning_rate": 0.008446614259974202, "loss": 2.8718, "step": 2182 }, { "crossentropy": 2.8735125064849854, "epoch": 0.185692412385165, "grad_norm": 0.0498003214597702, "grad_norm_var": 0.00018728157440941565, "learning_rate": 0.008445162490794446, "loss": 2.8735, "step": 2183 }, { "crossentropy": 3.01387619972229, "epoch": 0.1857774753317455, "grad_norm": 0.04775076359510422, "grad_norm_var": 0.0001870376982693415, "learning_rate": 0.008443710168420866, "loss": 3.0139, "step": 2184 }, { "crossentropy": 2.880605459213257, "epoch": 0.18586253827832597, "grad_norm": 0.046172793954610825, "grad_norm_var": 0.00018467761020884803, "learning_rate": 0.008442257293086659, "loss": 2.8806, "step": 2185 }, { "crossentropy": 2.779627799987793, "epoch": 0.18594760122490642, "grad_norm": 0.04693583771586418, "grad_norm_var": 0.00018386215098113422, "learning_rate": 0.008440803865025118, "loss": 2.7796, "step": 2186 }, { "crossentropy": 2.842921495437622, "epoch": 0.1860326641714869, "grad_norm": 0.04424227774143219, "grad_norm_var": 0.00018231641800180807, "learning_rate": 0.00843934988446962, "loss": 2.8429, "step": 2187 }, { "crossentropy": 2.74983811378479, "epoch": 0.18611772711806737, "grad_norm": 0.0436314232647419, "grad_norm_var": 0.00018338096336682834, "learning_rate": 0.008437895351653636, "loss": 2.7498, "step": 2188 }, { "crossentropy": 2.8697292804718018, "epoch": 0.18620279006464785, "grad_norm": 0.04362824559211731, "grad_norm_var": 0.00018505383495481253, "learning_rate": 0.008436440266810714, "loss": 2.8697, "step": 2189 }, { "crossentropy": 2.8601999282836914, "epoch": 0.1862878530112283, "grad_norm": 0.050676990300416946, "grad_norm_var": 0.00018530747895004098, "learning_rate": 0.008434984630174508, "loss": 2.8602, "step": 2190 }, { "crossentropy": 2.7227580547332764, "epoch": 0.18637291595780878, "grad_norm": 0.056598104536533356, "grad_norm_var": 0.00018672072824060982, "learning_rate": 0.008433528441978748, "loss": 2.7228, "step": 2191 }, { "crossentropy": 2.8903112411499023, "epoch": 0.18645797890438925, "grad_norm": 0.0567963682115078, "grad_norm_var": 0.000184903068476198, "learning_rate": 0.008432071702457253, "loss": 2.8903, "step": 2192 }, { "crossentropy": 2.851041555404663, "epoch": 0.1865430418509697, "grad_norm": 0.05535000190138817, "grad_norm_var": 0.00018166774170528418, "learning_rate": 0.008430614411843935, "loss": 2.851, "step": 2193 }, { "crossentropy": 2.768852710723877, "epoch": 0.18662810479755018, "grad_norm": 0.042583588510751724, "grad_norm_var": 0.00018127912056929243, "learning_rate": 0.008429156570372794, "loss": 2.7689, "step": 2194 }, { "crossentropy": 2.862107515335083, "epoch": 0.18671316774413066, "grad_norm": 0.04809422791004181, "grad_norm_var": 0.0001786595012206238, "learning_rate": 0.008427698178277918, "loss": 2.8621, "step": 2195 }, { "crossentropy": 2.7484824657440186, "epoch": 0.18679823069071114, "grad_norm": 0.05579115450382233, "grad_norm_var": 0.00017465306551838107, "learning_rate": 0.00842623923579348, "loss": 2.7485, "step": 2196 }, { "crossentropy": 2.8148369789123535, "epoch": 0.1868832936372916, "grad_norm": 0.044634006917476654, "grad_norm_var": 2.446942033687167e-05, "learning_rate": 0.008424779743153745, "loss": 2.8148, "step": 2197 }, { "crossentropy": 2.871375322341919, "epoch": 0.18696835658387206, "grad_norm": 0.04351343587040901, "grad_norm_var": 2.599263287047677e-05, "learning_rate": 0.008423319700593068, "loss": 2.8714, "step": 2198 }, { "crossentropy": 2.6965715885162354, "epoch": 0.18705341953045254, "grad_norm": 0.041207779198884964, "grad_norm_var": 2.9131665105499155e-05, "learning_rate": 0.008421859108345886, "loss": 2.6966, "step": 2199 }, { "crossentropy": 2.808776378631592, "epoch": 0.187138482477033, "grad_norm": 0.05043498054146767, "grad_norm_var": 2.9501569099704462e-05, "learning_rate": 0.008420397966646731, "loss": 2.8088, "step": 2200 }, { "crossentropy": 2.910222291946411, "epoch": 0.18722354542361347, "grad_norm": 0.048220664262771606, "grad_norm_var": 2.9225661477086788e-05, "learning_rate": 0.008418936275730218, "loss": 2.9102, "step": 2201 }, { "crossentropy": 2.8126423358917236, "epoch": 0.18730860837019395, "grad_norm": 0.045716430991888046, "grad_norm_var": 2.953570817577257e-05, "learning_rate": 0.008417474035831055, "loss": 2.8126, "step": 2202 }, { "crossentropy": 2.900674819946289, "epoch": 0.18739367131677442, "grad_norm": 0.046038828790187836, "grad_norm_var": 2.8790602049629372e-05, "learning_rate": 0.008416011247184033, "loss": 2.9007, "step": 2203 }, { "crossentropy": 2.8170416355133057, "epoch": 0.18747873426335487, "grad_norm": 0.03987432271242142, "grad_norm_var": 3.201518723787317e-05, "learning_rate": 0.008414547910024035, "loss": 2.817, "step": 2204 }, { "crossentropy": 2.7562060356140137, "epoch": 0.18756379720993535, "grad_norm": 0.043566152453422546, "grad_norm_var": 3.2052222121166604e-05, "learning_rate": 0.00841308402458603, "loss": 2.7562, "step": 2205 }, { "crossentropy": 2.804490089416504, "epoch": 0.18764886015651583, "grad_norm": 0.04536208137869835, "grad_norm_var": 3.196926568157047e-05, "learning_rate": 0.008411619591105077, "loss": 2.8045, "step": 2206 }, { "crossentropy": 2.8405332565307617, "epoch": 0.18773392310309628, "grad_norm": 0.04261338338255882, "grad_norm_var": 2.766871508748375e-05, "learning_rate": 0.008410154609816324, "loss": 2.8405, "step": 2207 }, { "crossentropy": 2.853102922439575, "epoch": 0.18781898604967676, "grad_norm": 0.04249973222613335, "grad_norm_var": 2.1506899056664545e-05, "learning_rate": 0.008408689080954998, "loss": 2.8531, "step": 2208 }, { "crossentropy": 2.915060520172119, "epoch": 0.18790404899625723, "grad_norm": 0.042567335069179535, "grad_norm_var": 1.5730277949824178e-05, "learning_rate": 0.008407223004756426, "loss": 2.9151, "step": 2209 }, { "crossentropy": 2.7238640785217285, "epoch": 0.1879891119428377, "grad_norm": 0.04226971045136452, "grad_norm_var": 1.5844672833064308e-05, "learning_rate": 0.008405756381456016, "loss": 2.7239, "step": 2210 }, { "crossentropy": 2.856372356414795, "epoch": 0.18807417488941816, "grad_norm": 0.04222892224788666, "grad_norm_var": 1.5692486361254532e-05, "learning_rate": 0.008404289211289266, "loss": 2.8564, "step": 2211 }, { "crossentropy": 2.7312541007995605, "epoch": 0.18815923783599864, "grad_norm": 0.04408089071512222, "grad_norm_var": 7.076408725560877e-06, "learning_rate": 0.008402821494491763, "loss": 2.7313, "step": 2212 }, { "crossentropy": 2.809844493865967, "epoch": 0.18824430078257912, "grad_norm": 0.045322876423597336, "grad_norm_var": 7.159543648931471e-06, "learning_rate": 0.008401353231299176, "loss": 2.8098, "step": 2213 }, { "crossentropy": 2.8314850330352783, "epoch": 0.18832936372915957, "grad_norm": 0.04820278286933899, "grad_norm_var": 8.170392953658207e-06, "learning_rate": 0.008399884421947269, "loss": 2.8315, "step": 2214 }, { "crossentropy": 2.817852020263672, "epoch": 0.18841442667574004, "grad_norm": 0.04988986253738403, "grad_norm_var": 9.200176197357984e-06, "learning_rate": 0.008398415066671888, "loss": 2.8179, "step": 2215 }, { "crossentropy": 2.8920693397521973, "epoch": 0.18849948962232052, "grad_norm": 0.045245978981256485, "grad_norm_var": 7.074708407403945e-06, "learning_rate": 0.008396945165708972, "loss": 2.8921, "step": 2216 }, { "crossentropy": 2.9060137271881104, "epoch": 0.188584552568901, "grad_norm": 0.0448504239320755, "grad_norm_var": 6.1604220209666875e-06, "learning_rate": 0.00839547471929454, "loss": 2.906, "step": 2217 }, { "crossentropy": 2.845996141433716, "epoch": 0.18866961551548145, "grad_norm": 0.042567212134599686, "grad_norm_var": 6.225662307500895e-06, "learning_rate": 0.00839400372766471, "loss": 2.846, "step": 2218 }, { "crossentropy": 2.8591501712799072, "epoch": 0.18875467846206193, "grad_norm": 0.05247369408607483, "grad_norm_var": 1.0392358457860367e-05, "learning_rate": 0.008392532191055676, "loss": 2.8592, "step": 2219 }, { "crossentropy": 2.788679838180542, "epoch": 0.1888397414086424, "grad_norm": 0.04904814437031746, "grad_norm_var": 9.870785737930061e-06, "learning_rate": 0.008391060109703726, "loss": 2.7887, "step": 2220 }, { "crossentropy": 2.8489933013916016, "epoch": 0.18892480435522285, "grad_norm": 0.0441424697637558, "grad_norm_var": 9.767968980755234e-06, "learning_rate": 0.008389587483845232, "loss": 2.849, "step": 2221 }, { "crossentropy": 2.8070218563079834, "epoch": 0.18900986730180333, "grad_norm": 0.04524349048733711, "grad_norm_var": 9.76644867536879e-06, "learning_rate": 0.008388114313716657, "loss": 2.807, "step": 2222 }, { "crossentropy": 2.7459771633148193, "epoch": 0.1890949302483838, "grad_norm": 0.047167759388685226, "grad_norm_var": 9.490341318490794e-06, "learning_rate": 0.00838664059955455, "loss": 2.746, "step": 2223 }, { "crossentropy": 2.828500986099243, "epoch": 0.18917999319496429, "grad_norm": 0.04571119695901871, "grad_norm_var": 8.855552907464868e-06, "learning_rate": 0.008385166341595548, "loss": 2.8285, "step": 2224 }, { "crossentropy": 2.7505550384521484, "epoch": 0.18926505614154474, "grad_norm": 0.045028701424598694, "grad_norm_var": 8.209954118500116e-06, "learning_rate": 0.008383691540076371, "loss": 2.7506, "step": 2225 }, { "crossentropy": 2.8064348697662354, "epoch": 0.1893501190881252, "grad_norm": 0.049930017441511154, "grad_norm_var": 8.228693328036068e-06, "learning_rate": 0.008382216195233832, "loss": 2.8064, "step": 2226 }, { "crossentropy": 2.8176519870758057, "epoch": 0.1894351820347057, "grad_norm": 0.04456369951367378, "grad_norm_var": 7.29554437038718e-06, "learning_rate": 0.00838074030730483, "loss": 2.8177, "step": 2227 }, { "crossentropy": 2.7674753665924072, "epoch": 0.18952024498128614, "grad_norm": 0.04093513637781143, "grad_norm_var": 8.914771794289965e-06, "learning_rate": 0.008379263876526346, "loss": 2.7675, "step": 2228 }, { "crossentropy": 2.773345470428467, "epoch": 0.18960530792786662, "grad_norm": 0.04021550714969635, "grad_norm_var": 1.1190219443599156e-05, "learning_rate": 0.008377786903135457, "loss": 2.7733, "step": 2229 }, { "crossentropy": 2.878592014312744, "epoch": 0.1896903708744471, "grad_norm": 0.04915570095181465, "grad_norm_var": 1.1533074081858848e-05, "learning_rate": 0.008376309387369319, "loss": 2.8786, "step": 2230 }, { "crossentropy": 2.8601980209350586, "epoch": 0.18977543382102757, "grad_norm": 0.05182632431387901, "grad_norm_var": 1.2769057416648205e-05, "learning_rate": 0.00837483132946518, "loss": 2.8602, "step": 2231 }, { "crossentropy": 2.7248663902282715, "epoch": 0.18986049676760802, "grad_norm": 0.04273340478539467, "grad_norm_var": 1.3460310548919716e-05, "learning_rate": 0.008373352729660373, "loss": 2.7249, "step": 2232 }, { "crossentropy": 2.90927791595459, "epoch": 0.1899455597141885, "grad_norm": 0.04396218806505203, "grad_norm_var": 1.3642753227605052e-05, "learning_rate": 0.008371873588192316, "loss": 2.9093, "step": 2233 }, { "crossentropy": 2.8207125663757324, "epoch": 0.19003062266076898, "grad_norm": 0.03983654826879501, "grad_norm_var": 1.5329148245022545e-05, "learning_rate": 0.008370393905298524, "loss": 2.8207, "step": 2234 }, { "crossentropy": 2.7614011764526367, "epoch": 0.19011568560734943, "grad_norm": 0.04038362205028534, "grad_norm_var": 1.3623482719767275e-05, "learning_rate": 0.008368913681216582, "loss": 2.7614, "step": 2235 }, { "crossentropy": 2.790304183959961, "epoch": 0.1902007485539299, "grad_norm": 0.04503954201936722, "grad_norm_var": 1.246025710338183e-05, "learning_rate": 0.008367432916184178, "loss": 2.7903, "step": 2236 }, { "crossentropy": 2.8223257064819336, "epoch": 0.19028581150051038, "grad_norm": 0.04167969897389412, "grad_norm_var": 1.3036269918319705e-05, "learning_rate": 0.008365951610439078, "loss": 2.8223, "step": 2237 }, { "crossentropy": 2.7946794033050537, "epoch": 0.19037087444709086, "grad_norm": 0.04231075942516327, "grad_norm_var": 1.3317620821207537e-05, "learning_rate": 0.008364469764219135, "loss": 2.7947, "step": 2238 }, { "crossentropy": 2.8377442359924316, "epoch": 0.1904559373936713, "grad_norm": 0.0430721752345562, "grad_norm_var": 1.285729552571024e-05, "learning_rate": 0.008362987377762296, "loss": 2.8377, "step": 2239 }, { "crossentropy": 2.8803064823150635, "epoch": 0.1905410003402518, "grad_norm": 0.04103871062397957, "grad_norm_var": 1.3248566382267669e-05, "learning_rate": 0.008361504451306585, "loss": 2.8803, "step": 2240 }, { "crossentropy": 2.730024814605713, "epoch": 0.19062606328683226, "grad_norm": 0.04183598607778549, "grad_norm_var": 1.3386860947443383e-05, "learning_rate": 0.008360020985090118, "loss": 2.73, "step": 2241 }, { "crossentropy": 2.7350826263427734, "epoch": 0.19071112623341271, "grad_norm": 0.04083557426929474, "grad_norm_var": 1.0950085688426053e-05, "learning_rate": 0.008358536979351098, "loss": 2.7351, "step": 2242 }, { "crossentropy": 2.745723247528076, "epoch": 0.1907961891799932, "grad_norm": 0.04486527293920517, "grad_norm_var": 1.1015065751644044e-05, "learning_rate": 0.008357052434327814, "loss": 2.7457, "step": 2243 }, { "crossentropy": 2.8184621334075928, "epoch": 0.19088125212657367, "grad_norm": 0.04248287156224251, "grad_norm_var": 1.071640502491259e-05, "learning_rate": 0.00835556735025864, "loss": 2.8185, "step": 2244 }, { "crossentropy": 2.6705949306488037, "epoch": 0.19096631507315415, "grad_norm": 0.040374238044023514, "grad_norm_var": 1.065471784670105e-05, "learning_rate": 0.008354081727382038, "loss": 2.6706, "step": 2245 }, { "crossentropy": 2.795269012451172, "epoch": 0.1910513780197346, "grad_norm": 0.040954750031232834, "grad_norm_var": 8.361768482677741e-06, "learning_rate": 0.008352595565936554, "loss": 2.7953, "step": 2246 }, { "crossentropy": 2.828735113143921, "epoch": 0.19113644096631507, "grad_norm": 0.043069396167993546, "grad_norm_var": 2.501007415813226e-06, "learning_rate": 0.008351108866160827, "loss": 2.8287, "step": 2247 }, { "crossentropy": 2.8014278411865234, "epoch": 0.19122150391289555, "grad_norm": 0.04704497382044792, "grad_norm_var": 3.995559135268797e-06, "learning_rate": 0.008349621628293577, "loss": 2.8014, "step": 2248 }, { "crossentropy": 2.799065113067627, "epoch": 0.191306566859476, "grad_norm": 0.05091257765889168, "grad_norm_var": 8.440137709384415e-06, "learning_rate": 0.00834813385257361, "loss": 2.7991, "step": 2249 }, { "crossentropy": 2.7255325317382812, "epoch": 0.19139162980605648, "grad_norm": 0.04507570341229439, "grad_norm_var": 8.044657407470466e-06, "learning_rate": 0.00834664553923982, "loss": 2.7255, "step": 2250 }, { "crossentropy": 2.828172206878662, "epoch": 0.19147669275263696, "grad_norm": 0.039852507412433624, "grad_norm_var": 8.260738119530987e-06, "learning_rate": 0.008345156688531186, "loss": 2.8282, "step": 2251 }, { "crossentropy": 2.861650228500366, "epoch": 0.19156175569921743, "grad_norm": 0.04057341441512108, "grad_norm_var": 8.383855203970778e-06, "learning_rate": 0.008343667300686778, "loss": 2.8617, "step": 2252 }, { "crossentropy": 2.8770439624786377, "epoch": 0.19164681864579788, "grad_norm": 0.04592999070882797, "grad_norm_var": 8.83629031261171e-06, "learning_rate": 0.00834217737594575, "loss": 2.877, "step": 2253 }, { "crossentropy": 2.7738938331604004, "epoch": 0.19173188159237836, "grad_norm": 0.046110473573207855, "grad_norm_var": 9.318889001913578e-06, "learning_rate": 0.008340686914547334, "loss": 2.7739, "step": 2254 }, { "crossentropy": 2.7779977321624756, "epoch": 0.19181694453895884, "grad_norm": 0.04857503995299339, "grad_norm_var": 1.0987984598817557e-05, "learning_rate": 0.00833919591673086, "loss": 2.778, "step": 2255 }, { "crossentropy": 2.7929773330688477, "epoch": 0.1919020074855393, "grad_norm": 0.048852548003196716, "grad_norm_var": 1.2009752556439655e-05, "learning_rate": 0.00833770438273574, "loss": 2.793, "step": 2256 }, { "crossentropy": 2.7406766414642334, "epoch": 0.19198707043211977, "grad_norm": 0.04093572497367859, "grad_norm_var": 1.2345261107639336e-05, "learning_rate": 0.00833621231280147, "loss": 2.7407, "step": 2257 }, { "crossentropy": 2.8677256107330322, "epoch": 0.19207213337870024, "grad_norm": 0.04298783838748932, "grad_norm_var": 1.1682832096393015e-05, "learning_rate": 0.00833471970716763, "loss": 2.8677, "step": 2258 }, { "crossentropy": 2.7762084007263184, "epoch": 0.19215719632528072, "grad_norm": 0.0463474839925766, "grad_norm_var": 1.1934358727971362e-05, "learning_rate": 0.008333226566073897, "loss": 2.7762, "step": 2259 }, { "crossentropy": 2.8217296600341797, "epoch": 0.19224225927186117, "grad_norm": 0.04100377485156059, "grad_norm_var": 1.2445224094340991e-05, "learning_rate": 0.00833173288976002, "loss": 2.8217, "step": 2260 }, { "crossentropy": 2.8539345264434814, "epoch": 0.19232732221844165, "grad_norm": 0.04217294231057167, "grad_norm_var": 1.1708919336014292e-05, "learning_rate": 0.008330238678465843, "loss": 2.8539, "step": 2261 }, { "crossentropy": 2.7762017250061035, "epoch": 0.19241238516502213, "grad_norm": 0.04163862392306328, "grad_norm_var": 1.1424005590459676e-05, "learning_rate": 0.008328743932431293, "loss": 2.7762, "step": 2262 }, { "crossentropy": 2.8919830322265625, "epoch": 0.19249744811160258, "grad_norm": 0.044454384595155716, "grad_norm_var": 1.1290293479273144e-05, "learning_rate": 0.008327248651896383, "loss": 2.892, "step": 2263 }, { "crossentropy": 2.8524818420410156, "epoch": 0.19258251105818305, "grad_norm": 0.04631505534052849, "grad_norm_var": 1.1078755862172345e-05, "learning_rate": 0.008325752837101212, "loss": 2.8525, "step": 2264 }, { "crossentropy": 2.850233793258667, "epoch": 0.19266757400476353, "grad_norm": 0.04722980037331581, "grad_norm_var": 8.769583015613182e-06, "learning_rate": 0.008324256488285964, "loss": 2.8502, "step": 2265 }, { "crossentropy": 2.801032781600952, "epoch": 0.19275263695134398, "grad_norm": 0.04725433513522148, "grad_norm_var": 9.305084997668761e-06, "learning_rate": 0.008322759605690911, "loss": 2.801, "step": 2266 }, { "crossentropy": 2.8335657119750977, "epoch": 0.19283769989792446, "grad_norm": 0.041717980057001114, "grad_norm_var": 8.394069391378055e-06, "learning_rate": 0.008321262189556408, "loss": 2.8336, "step": 2267 }, { "crossentropy": 2.844372034072876, "epoch": 0.19292276284450494, "grad_norm": 0.043555717915296555, "grad_norm_var": 7.386112847680935e-06, "learning_rate": 0.008319764240122898, "loss": 2.8444, "step": 2268 }, { "crossentropy": 2.8733201026916504, "epoch": 0.1930078257910854, "grad_norm": 0.04801752045750618, "grad_norm_var": 8.002884129846138e-06, "learning_rate": 0.008318265757630908, "loss": 2.8733, "step": 2269 }, { "crossentropy": 2.989469528198242, "epoch": 0.19309288873766586, "grad_norm": 0.04786399379372597, "grad_norm_var": 8.49605781925907e-06, "learning_rate": 0.008316766742321052, "loss": 2.9895, "step": 2270 }, { "crossentropy": 2.84159779548645, "epoch": 0.19317795168424634, "grad_norm": 0.04573456943035126, "grad_norm_var": 7.620853313756946e-06, "learning_rate": 0.008315267194434026, "loss": 2.8416, "step": 2271 }, { "crossentropy": 2.8836026191711426, "epoch": 0.19326301463082682, "grad_norm": 0.042063165456056595, "grad_norm_var": 6.7926558356243744e-06, "learning_rate": 0.008313767114210615, "loss": 2.8836, "step": 2272 }, { "crossentropy": 2.7861342430114746, "epoch": 0.19334807757740727, "grad_norm": 0.04127087444067001, "grad_norm_var": 6.647961503727929e-06, "learning_rate": 0.00831226650189169, "loss": 2.7861, "step": 2273 }, { "crossentropy": 2.7286744117736816, "epoch": 0.19343314052398775, "grad_norm": 0.041355326771736145, "grad_norm_var": 7.1114109189205726e-06, "learning_rate": 0.008310765357718207, "loss": 2.7287, "step": 2274 }, { "crossentropy": 2.7027499675750732, "epoch": 0.19351820347056822, "grad_norm": 0.04113531857728958, "grad_norm_var": 7.351476550663399e-06, "learning_rate": 0.008309263681931203, "loss": 2.7027, "step": 2275 }, { "crossentropy": 2.82379150390625, "epoch": 0.1936032664171487, "grad_norm": 0.04492684081196785, "grad_norm_var": 6.785901498965185e-06, "learning_rate": 0.008307761474771809, "loss": 2.8238, "step": 2276 }, { "crossentropy": 2.866328001022339, "epoch": 0.19368832936372915, "grad_norm": 0.039166033267974854, "grad_norm_var": 8.151318604580846e-06, "learning_rate": 0.00830625873648123, "loss": 2.8663, "step": 2277 }, { "crossentropy": 2.8149967193603516, "epoch": 0.19377339231030963, "grad_norm": 0.04358860105276108, "grad_norm_var": 7.779901112791911e-06, "learning_rate": 0.008304755467300768, "loss": 2.815, "step": 2278 }, { "crossentropy": 2.7248294353485107, "epoch": 0.1938584552568901, "grad_norm": 0.04734424874186516, "grad_norm_var": 8.437215575201237e-06, "learning_rate": 0.008303251667471801, "loss": 2.7248, "step": 2279 }, { "crossentropy": 2.816080331802368, "epoch": 0.19394351820347056, "grad_norm": 0.042618971318006516, "grad_norm_var": 8.289961319079183e-06, "learning_rate": 0.008301747337235797, "loss": 2.8161, "step": 2280 }, { "crossentropy": 2.753626585006714, "epoch": 0.19402858115005103, "grad_norm": 0.03998883068561554, "grad_norm_var": 8.499573168188555e-06, "learning_rate": 0.00830024247683431, "loss": 2.7536, "step": 2281 }, { "crossentropy": 2.803854465484619, "epoch": 0.1941136440966315, "grad_norm": 0.043607134371995926, "grad_norm_var": 7.553944323293536e-06, "learning_rate": 0.008298737086508972, "loss": 2.8039, "step": 2282 }, { "crossentropy": 2.8593921661376953, "epoch": 0.194198707043212, "grad_norm": 0.04291252791881561, "grad_norm_var": 7.379656432836742e-06, "learning_rate": 0.008297231166501513, "loss": 2.8594, "step": 2283 }, { "crossentropy": 2.764911651611328, "epoch": 0.19428376998979244, "grad_norm": 0.04130130261182785, "grad_norm_var": 7.664582648690168e-06, "learning_rate": 0.008295724717053735, "loss": 2.7649, "step": 2284 }, { "crossentropy": 2.84480881690979, "epoch": 0.19436883293637292, "grad_norm": 0.04432497173547745, "grad_norm_var": 6.197072943804153e-06, "learning_rate": 0.00829421773840753, "loss": 2.8448, "step": 2285 }, { "crossentropy": 2.7763030529022217, "epoch": 0.1944538958829534, "grad_norm": 0.044889986515045166, "grad_norm_var": 4.850934767081499e-06, "learning_rate": 0.008292710230804877, "loss": 2.7763, "step": 2286 }, { "crossentropy": 2.8605806827545166, "epoch": 0.19453895882953384, "grad_norm": 0.04400148615241051, "grad_norm_var": 4.381178463276856e-06, "learning_rate": 0.008291202194487838, "loss": 2.8606, "step": 2287 }, { "crossentropy": 2.8754196166992188, "epoch": 0.19462402177611432, "grad_norm": 0.041893042623996735, "grad_norm_var": 4.39926945902987e-06, "learning_rate": 0.008289693629698563, "loss": 2.8754, "step": 2288 }, { "crossentropy": 2.7713067531585693, "epoch": 0.1947090847226948, "grad_norm": 0.04347078502178192, "grad_norm_var": 4.261918460093881e-06, "learning_rate": 0.00828818453667928, "loss": 2.7713, "step": 2289 }, { "crossentropy": 2.7974109649658203, "epoch": 0.19479414766927527, "grad_norm": 0.044780343770980835, "grad_norm_var": 4.286106148873997e-06, "learning_rate": 0.008286674915672308, "loss": 2.7974, "step": 2290 }, { "crossentropy": 2.835360527038574, "epoch": 0.19487921061585572, "grad_norm": 0.043858982622623444, "grad_norm_var": 4.0283148060098075e-06, "learning_rate": 0.008285164766920046, "loss": 2.8354, "step": 2291 }, { "crossentropy": 2.8327622413635254, "epoch": 0.1949642735624362, "grad_norm": 0.04583593085408211, "grad_norm_var": 4.278114106299041e-06, "learning_rate": 0.008283654090664986, "loss": 2.8328, "step": 2292 }, { "crossentropy": 2.9271340370178223, "epoch": 0.19504933650901668, "grad_norm": 0.044555529952049255, "grad_norm_var": 3.08769008000766e-06, "learning_rate": 0.00828214288714969, "loss": 2.9271, "step": 2293 }, { "crossentropy": 2.8616793155670166, "epoch": 0.19513439945559713, "grad_norm": 0.0447094589471817, "grad_norm_var": 3.151685228939584e-06, "learning_rate": 0.008280631156616822, "loss": 2.8617, "step": 2294 }, { "crossentropy": 2.7678909301757812, "epoch": 0.1952194624021776, "grad_norm": 0.045750897377729416, "grad_norm_var": 2.548013382066133e-06, "learning_rate": 0.008279118899309122, "loss": 2.7679, "step": 2295 }, { "crossentropy": 2.800473213195801, "epoch": 0.19530452534875808, "grad_norm": 0.04399662837386131, "grad_norm_var": 2.476097232983511e-06, "learning_rate": 0.00827760611546941, "loss": 2.8005, "step": 2296 }, { "crossentropy": 2.786365509033203, "epoch": 0.19538958829533856, "grad_norm": 0.052466168999671936, "grad_norm_var": 5.961796492131612e-06, "learning_rate": 0.008276092805340595, "loss": 2.7864, "step": 2297 }, { "crossentropy": 2.903991460800171, "epoch": 0.195474651241919, "grad_norm": 0.06330693513154984, "grad_norm_var": 2.781338642753317e-05, "learning_rate": 0.008274578969165678, "loss": 2.904, "step": 2298 }, { "crossentropy": 2.7664411067962646, "epoch": 0.1955597141884995, "grad_norm": 0.04519551992416382, "grad_norm_var": 2.727437026728933e-05, "learning_rate": 0.008273064607187732, "loss": 2.7664, "step": 2299 }, { "crossentropy": 2.9185967445373535, "epoch": 0.19564477713507997, "grad_norm": 0.04180729016661644, "grad_norm_var": 2.69803821280616e-05, "learning_rate": 0.008271549719649923, "loss": 2.9186, "step": 2300 }, { "crossentropy": 2.889439344406128, "epoch": 0.19572984008166042, "grad_norm": 0.06836569309234619, "grad_norm_var": 5.796506546442744e-05, "learning_rate": 0.008270034306795493, "loss": 2.8894, "step": 2301 }, { "crossentropy": 2.7385237216949463, "epoch": 0.1958149030282409, "grad_norm": 0.0518244132399559, "grad_norm_var": 5.862171552214092e-05, "learning_rate": 0.00826851836886778, "loss": 2.7385, "step": 2302 }, { "crossentropy": 2.7508788108825684, "epoch": 0.19589996597482137, "grad_norm": 0.043564923107624054, "grad_norm_var": 5.88584401984236e-05, "learning_rate": 0.008267001906110197, "loss": 2.7509, "step": 2303 }, { "crossentropy": 2.867455244064331, "epoch": 0.19598502892140185, "grad_norm": 0.043684326112270355, "grad_norm_var": 5.7639483203186795e-05, "learning_rate": 0.008265484918766244, "loss": 2.8675, "step": 2304 }, { "crossentropy": 2.829596757888794, "epoch": 0.1960700918679823, "grad_norm": 0.043591760098934174, "grad_norm_var": 5.7568174491071794e-05, "learning_rate": 0.008263967407079504, "loss": 2.8296, "step": 2305 }, { "crossentropy": 2.8679137229919434, "epoch": 0.19615515481456278, "grad_norm": 0.042051415890455246, "grad_norm_var": 5.9189072612796044e-05, "learning_rate": 0.00826244937129365, "loss": 2.8679, "step": 2306 }, { "crossentropy": 2.847626209259033, "epoch": 0.19624021776114325, "grad_norm": 0.043991658836603165, "grad_norm_var": 5.9120714421638984e-05, "learning_rate": 0.008260930811652432, "loss": 2.8476, "step": 2307 }, { "crossentropy": 2.833446741104126, "epoch": 0.1963252807077237, "grad_norm": 0.04418773204088211, "grad_norm_var": 5.9720729491038035e-05, "learning_rate": 0.008259411728399686, "loss": 2.8334, "step": 2308 }, { "crossentropy": 2.8067283630371094, "epoch": 0.19641034365430418, "grad_norm": 0.063929982483387, "grad_norm_var": 7.508249426250256e-05, "learning_rate": 0.008257892121779336, "loss": 2.8067, "step": 2309 }, { "crossentropy": 2.7351791858673096, "epoch": 0.19649540660088466, "grad_norm": 0.04282103851437569, "grad_norm_var": 7.636090160923482e-05, "learning_rate": 0.008256371992035385, "loss": 2.7352, "step": 2310 }, { "crossentropy": 2.891923427581787, "epoch": 0.19658046954746514, "grad_norm": 0.042784303426742554, "grad_norm_var": 7.811048697930612e-05, "learning_rate": 0.00825485133941192, "loss": 2.8919, "step": 2311 }, { "crossentropy": 2.7874927520751953, "epoch": 0.1966655324940456, "grad_norm": 0.041173290461301804, "grad_norm_var": 8.03408949863526e-05, "learning_rate": 0.008253330164153118, "loss": 2.7875, "step": 2312 }, { "crossentropy": 2.6984660625457764, "epoch": 0.19675059544062606, "grad_norm": 0.04441367834806442, "grad_norm_var": 8.005110110855847e-05, "learning_rate": 0.008251808466503235, "loss": 2.6985, "step": 2313 }, { "crossentropy": 2.8400156497955322, "epoch": 0.19683565838720654, "grad_norm": 0.04645229130983353, "grad_norm_var": 6.322354093440992e-05, "learning_rate": 0.008250286246706609, "loss": 2.84, "step": 2314 }, { "crossentropy": 2.7879114151000977, "epoch": 0.196920721333787, "grad_norm": 0.04815860092639923, "grad_norm_var": 6.311272440256045e-05, "learning_rate": 0.008248763505007669, "loss": 2.7879, "step": 2315 }, { "crossentropy": 2.8737902641296387, "epoch": 0.19700578428036747, "grad_norm": 0.04927698150277138, "grad_norm_var": 6.137831994145834e-05, "learning_rate": 0.008247240241650917, "loss": 2.8738, "step": 2316 }, { "crossentropy": 2.853588581085205, "epoch": 0.19709084722694795, "grad_norm": 0.045781459659338, "grad_norm_var": 3.047607820722553e-05, "learning_rate": 0.008245716456880953, "loss": 2.8536, "step": 2317 }, { "crossentropy": 2.844501256942749, "epoch": 0.19717591017352842, "grad_norm": 0.045254237949848175, "grad_norm_var": 2.816411886896464e-05, "learning_rate": 0.008244192150942449, "loss": 2.8445, "step": 2318 }, { "crossentropy": 2.767737627029419, "epoch": 0.19726097312010887, "grad_norm": 0.045027051120996475, "grad_norm_var": 2.7882501428094265e-05, "learning_rate": 0.008242667324080164, "loss": 2.7677, "step": 2319 }, { "crossentropy": 2.8409836292266846, "epoch": 0.19734603606668935, "grad_norm": 0.04354013130068779, "grad_norm_var": 2.7924212243043052e-05, "learning_rate": 0.008241141976538942, "loss": 2.841, "step": 2320 }, { "crossentropy": 2.8576338291168213, "epoch": 0.19743109901326983, "grad_norm": 0.04195326194167137, "grad_norm_var": 2.856945539073206e-05, "learning_rate": 0.00823961610856371, "loss": 2.8576, "step": 2321 }, { "crossentropy": 2.810636043548584, "epoch": 0.19751616195985028, "grad_norm": 0.04492655023932457, "grad_norm_var": 2.769706885666582e-05, "learning_rate": 0.00823808972039948, "loss": 2.8106, "step": 2322 }, { "crossentropy": 2.9724559783935547, "epoch": 0.19760122490643076, "grad_norm": 0.050136130303144455, "grad_norm_var": 2.853055756933485e-05, "learning_rate": 0.008236562812291341, "loss": 2.9725, "step": 2323 }, { "crossentropy": 2.7289552688598633, "epoch": 0.19768628785301123, "grad_norm": 0.041671060025691986, "grad_norm_var": 2.9614573122745993e-05, "learning_rate": 0.008235035384484477, "loss": 2.729, "step": 2324 }, { "crossentropy": 2.862760543823242, "epoch": 0.1977713507995917, "grad_norm": 0.04925405606627464, "grad_norm_var": 8.149780582102547e-06, "learning_rate": 0.008233507437224143, "loss": 2.8628, "step": 2325 }, { "crossentropy": 2.747507333755493, "epoch": 0.19785641374617216, "grad_norm": 0.04613497853279114, "grad_norm_var": 7.800906741661264e-06, "learning_rate": 0.008231978970755689, "loss": 2.7475, "step": 2326 }, { "crossentropy": 2.8044493198394775, "epoch": 0.19794147669275264, "grad_norm": 0.04473890736699104, "grad_norm_var": 7.365523946850638e-06, "learning_rate": 0.008230449985324538, "loss": 2.8044, "step": 2327 }, { "crossentropy": 2.7729952335357666, "epoch": 0.19802653963933312, "grad_norm": 0.040185555815696716, "grad_norm_var": 7.995435497793781e-06, "learning_rate": 0.008228920481176202, "loss": 2.773, "step": 2328 }, { "crossentropy": 2.8576855659484863, "epoch": 0.19811160258591357, "grad_norm": 0.03916627913713455, "grad_norm_var": 1.0428548379862584e-05, "learning_rate": 0.008227390458556276, "loss": 2.8577, "step": 2329 }, { "crossentropy": 2.8338093757629395, "epoch": 0.19819666553249404, "grad_norm": 0.0410580039024353, "grad_norm_var": 1.1277160943366724e-05, "learning_rate": 0.00822585991771044, "loss": 2.8338, "step": 2330 }, { "crossentropy": 2.7857401371002197, "epoch": 0.19828172847907452, "grad_norm": 0.042593568563461304, "grad_norm_var": 1.0695771517262437e-05, "learning_rate": 0.00822432885888445, "loss": 2.7857, "step": 2331 }, { "crossentropy": 2.7409675121307373, "epoch": 0.198366791425655, "grad_norm": 0.04433564096689224, "grad_norm_var": 9.02092734234398e-06, "learning_rate": 0.00822279728232415, "loss": 2.741, "step": 2332 }, { "crossentropy": 2.8001346588134766, "epoch": 0.19845185437223545, "grad_norm": 0.04376490041613579, "grad_norm_var": 8.825618751106175e-06, "learning_rate": 0.008221265188275474, "loss": 2.8001, "step": 2333 }, { "crossentropy": 2.8770275115966797, "epoch": 0.19853691731881593, "grad_norm": 0.04303436353802681, "grad_norm_var": 8.757571544834527e-06, "learning_rate": 0.008219732576984424, "loss": 2.877, "step": 2334 }, { "crossentropy": 2.793645143508911, "epoch": 0.1986219802653964, "grad_norm": 0.04773496091365814, "grad_norm_var": 9.642645139162678e-06, "learning_rate": 0.0082181994486971, "loss": 2.7936, "step": 2335 }, { "crossentropy": 2.764072895050049, "epoch": 0.19870704321197685, "grad_norm": 0.048609551042318344, "grad_norm_var": 1.092835136741165e-05, "learning_rate": 0.008216665803659672, "loss": 2.7641, "step": 2336 }, { "crossentropy": 2.8045237064361572, "epoch": 0.19879210615855733, "grad_norm": 0.04290414974093437, "grad_norm_var": 1.0683387462373286e-05, "learning_rate": 0.0082151316421184, "loss": 2.8045, "step": 2337 }, { "crossentropy": 2.8804869651794434, "epoch": 0.1988771691051378, "grad_norm": 0.04336876794695854, "grad_norm_var": 1.0723723897309902e-05, "learning_rate": 0.008213596964319631, "loss": 2.8805, "step": 2338 }, { "crossentropy": 2.6609063148498535, "epoch": 0.19896223205171829, "grad_norm": 0.050351984798908234, "grad_norm_var": 1.089479959250483e-05, "learning_rate": 0.00821206177050979, "loss": 2.6609, "step": 2339 }, { "crossentropy": 2.7115440368652344, "epoch": 0.19904729499829873, "grad_norm": 0.049786604940891266, "grad_norm_var": 1.2159256845123364e-05, "learning_rate": 0.008210526060935378, "loss": 2.7115, "step": 2340 }, { "crossentropy": 2.7777271270751953, "epoch": 0.1991323579448792, "grad_norm": 0.05374457687139511, "grad_norm_var": 1.607804189976044e-05, "learning_rate": 0.00820898983584299, "loss": 2.7777, "step": 2341 }, { "crossentropy": 2.8241982460021973, "epoch": 0.1992174208914597, "grad_norm": 0.05210011452436447, "grad_norm_var": 1.9129476494150522e-05, "learning_rate": 0.0082074530954793, "loss": 2.8242, "step": 2342 }, { "crossentropy": 2.8216497898101807, "epoch": 0.19930248383804014, "grad_norm": 0.04342721775174141, "grad_norm_var": 1.936441196113669e-05, "learning_rate": 0.008205915840091066, "loss": 2.8216, "step": 2343 }, { "crossentropy": 2.8985044956207275, "epoch": 0.19938754678462062, "grad_norm": 0.04431266710162163, "grad_norm_var": 1.7567604824486084e-05, "learning_rate": 0.00820437806992512, "loss": 2.8985, "step": 2344 }, { "crossentropy": 2.791482448577881, "epoch": 0.1994726097312011, "grad_norm": 0.03856492415070534, "grad_norm_var": 1.810954116931533e-05, "learning_rate": 0.008202839785228391, "loss": 2.7915, "step": 2345 }, { "crossentropy": 2.8257758617401123, "epoch": 0.19955767267778154, "grad_norm": 0.044281814247369766, "grad_norm_var": 1.6804291361192877e-05, "learning_rate": 0.00820130098624788, "loss": 2.8258, "step": 2346 }, { "crossentropy": 2.8529770374298096, "epoch": 0.19964273562436202, "grad_norm": 0.045898616313934326, "grad_norm_var": 1.6070822705164096e-05, "learning_rate": 0.008199761673230674, "loss": 2.853, "step": 2347 }, { "crossentropy": 2.862757921218872, "epoch": 0.1997277985709425, "grad_norm": 0.04929688945412636, "grad_norm_var": 1.6499092717859692e-05, "learning_rate": 0.008198221846423942, "loss": 2.8628, "step": 2348 }, { "crossentropy": 2.754242420196533, "epoch": 0.19981286151752298, "grad_norm": 0.04353749752044678, "grad_norm_var": 1.6579914016666825e-05, "learning_rate": 0.008196681506074936, "loss": 2.7542, "step": 2349 }, { "crossentropy": 2.8342537879943848, "epoch": 0.19989792446410343, "grad_norm": 0.04144991934299469, "grad_norm_var": 1.7428756434945124e-05, "learning_rate": 0.00819514065243099, "loss": 2.8343, "step": 2350 }, { "crossentropy": 2.7861361503601074, "epoch": 0.1999829874106839, "grad_norm": 0.09176730364561081, "grad_norm_var": 0.00014755595684541855, "learning_rate": 0.008193599285739522, "loss": 2.7861, "step": 2351 }, { "crossentropy": 2.779658079147339, "epoch": 0.20006805035726438, "grad_norm": 0.043988604098558426, "grad_norm_var": 0.00014910808997965115, "learning_rate": 0.008192057406248027, "loss": 2.7797, "step": 2352 }, { "crossentropy": 2.8158321380615234, "epoch": 0.20015311330384483, "grad_norm": 0.04402879998087883, "grad_norm_var": 0.0001483219559024492, "learning_rate": 0.008190515014204093, "loss": 2.8158, "step": 2353 }, { "crossentropy": 2.822786569595337, "epoch": 0.2002381762504253, "grad_norm": 0.04138500243425369, "grad_norm_var": 0.00014998971181563451, "learning_rate": 0.008188972109855381, "loss": 2.8228, "step": 2354 }, { "crossentropy": 2.8208167552948, "epoch": 0.2003232391970058, "grad_norm": 0.04055183380842209, "grad_norm_var": 0.00015372944231411412, "learning_rate": 0.008187428693449635, "loss": 2.8208, "step": 2355 }, { "crossentropy": 2.9327127933502197, "epoch": 0.20040830214358626, "grad_norm": 0.04212285205721855, "grad_norm_var": 0.00015558246455725695, "learning_rate": 0.008185884765234684, "loss": 2.9327, "step": 2356 }, { "crossentropy": 2.796574592590332, "epoch": 0.20049336509016671, "grad_norm": 0.05142032727599144, "grad_norm_var": 0.00015399378718529462, "learning_rate": 0.00818434032545844, "loss": 2.7966, "step": 2357 }, { "crossentropy": 2.6868700981140137, "epoch": 0.2005784280367472, "grad_norm": 0.0472792349755764, "grad_norm_var": 0.0001524145131622206, "learning_rate": 0.008182795374368891, "loss": 2.6869, "step": 2358 }, { "crossentropy": 2.8241641521453857, "epoch": 0.20066349098332767, "grad_norm": 0.04335489124059677, "grad_norm_var": 0.00015245008603394038, "learning_rate": 0.00818124991221412, "loss": 2.8242, "step": 2359 }, { "crossentropy": 2.726355791091919, "epoch": 0.20074855392990812, "grad_norm": 0.044432900846004486, "grad_norm_var": 0.00015240666486865002, "learning_rate": 0.008179703939242276, "loss": 2.7264, "step": 2360 }, { "crossentropy": 2.7657814025878906, "epoch": 0.2008336168764886, "grad_norm": 0.04503186047077179, "grad_norm_var": 0.0001476739128313119, "learning_rate": 0.0081781574557016, "loss": 2.7658, "step": 2361 }, { "crossentropy": 2.7571778297424316, "epoch": 0.20091867982306907, "grad_norm": 0.03842239826917648, "grad_norm_var": 0.00015232555374133534, "learning_rate": 0.008176610461840414, "loss": 2.7572, "step": 2362 }, { "crossentropy": 2.819653034210205, "epoch": 0.20100374276964955, "grad_norm": 0.041463904082775116, "grad_norm_var": 0.0001542787271180457, "learning_rate": 0.00817506295790712, "loss": 2.8197, "step": 2363 }, { "crossentropy": 2.8480582237243652, "epoch": 0.20108880571623, "grad_norm": 0.047217220067977905, "grad_norm_var": 0.00015386940500602958, "learning_rate": 0.008173514944150204, "loss": 2.8481, "step": 2364 }, { "crossentropy": 2.7538208961486816, "epoch": 0.20117386866281048, "grad_norm": 0.048937033861875534, "grad_norm_var": 0.00015340333200242914, "learning_rate": 0.008171966420818227, "loss": 2.7538, "step": 2365 }, { "crossentropy": 2.885451316833496, "epoch": 0.20125893160939096, "grad_norm": 0.04754650965332985, "grad_norm_var": 0.00015117142365310508, "learning_rate": 0.008170417388159844, "loss": 2.8855, "step": 2366 }, { "crossentropy": 2.861577272415161, "epoch": 0.2013439945559714, "grad_norm": 0.04563705250620842, "grad_norm_var": 1.1493137246701424e-05, "learning_rate": 0.00816886784642378, "loss": 2.8616, "step": 2367 }, { "crossentropy": 2.7864818572998047, "epoch": 0.20142905750255188, "grad_norm": 0.04446548968553543, "grad_norm_var": 1.1471573600541941e-05, "learning_rate": 0.008167317795858851, "loss": 2.7865, "step": 2368 }, { "crossentropy": 2.7709286212921143, "epoch": 0.20151412044913236, "grad_norm": 0.04269804060459137, "grad_norm_var": 1.1680250054242845e-05, "learning_rate": 0.008165767236713946, "loss": 2.7709, "step": 2369 }, { "crossentropy": 2.9079840183258057, "epoch": 0.20159918339571284, "grad_norm": 0.04895386844873428, "grad_norm_var": 1.2119243161399369e-05, "learning_rate": 0.008164216169238042, "loss": 2.908, "step": 2370 }, { "crossentropy": 2.7539122104644775, "epoch": 0.2016842463422933, "grad_norm": 0.04976610094308853, "grad_norm_var": 1.1996457414999393e-05, "learning_rate": 0.008162664593680199, "loss": 2.7539, "step": 2371 }, { "crossentropy": 2.856266736984253, "epoch": 0.20176930928887377, "grad_norm": 0.0452062226831913, "grad_norm_var": 1.1182993008263901e-05, "learning_rate": 0.008161112510289548, "loss": 2.8563, "step": 2372 }, { "crossentropy": 2.868680715560913, "epoch": 0.20185437223545424, "grad_norm": 0.03996788710355759, "grad_norm_var": 1.0705914937553646e-05, "learning_rate": 0.008159559919315315, "loss": 2.8687, "step": 2373 }, { "crossentropy": 2.825171947479248, "epoch": 0.2019394351820347, "grad_norm": 0.04692757502198219, "grad_norm_var": 1.060789062071506e-05, "learning_rate": 0.008158006821006801, "loss": 2.8252, "step": 2374 }, { "crossentropy": 2.7532358169555664, "epoch": 0.20202449812861517, "grad_norm": 0.04839090257883072, "grad_norm_var": 1.1087125408036116e-05, "learning_rate": 0.008156453215613386, "loss": 2.7532, "step": 2375 }, { "crossentropy": 2.7984564304351807, "epoch": 0.20210956107519565, "grad_norm": 0.04610013961791992, "grad_norm_var": 1.1064419526309911e-05, "learning_rate": 0.008154899103384537, "loss": 2.7985, "step": 2376 }, { "crossentropy": 2.773463487625122, "epoch": 0.20219462402177613, "grad_norm": 0.040678586810827255, "grad_norm_var": 1.2474589629862577e-05, "learning_rate": 0.008153344484569797, "loss": 2.7735, "step": 2377 }, { "crossentropy": 2.8696236610412598, "epoch": 0.20227968696835658, "grad_norm": 0.04212893545627594, "grad_norm_var": 1.0009077363360801e-05, "learning_rate": 0.008151789359418796, "loss": 2.8696, "step": 2378 }, { "crossentropy": 2.8426907062530518, "epoch": 0.20236474991493705, "grad_norm": 0.04341845214366913, "grad_norm_var": 9.227194724577513e-06, "learning_rate": 0.00815023372818124, "loss": 2.8427, "step": 2379 }, { "crossentropy": 2.7253589630126953, "epoch": 0.20244981286151753, "grad_norm": 0.047620903700590134, "grad_norm_var": 9.329673619920947e-06, "learning_rate": 0.008148677591106919, "loss": 2.7254, "step": 2380 }, { "crossentropy": 2.8241629600524902, "epoch": 0.20253487580809798, "grad_norm": 0.04302383214235306, "grad_norm_var": 8.827059945734627e-06, "learning_rate": 0.008147120948445703, "loss": 2.8242, "step": 2381 }, { "crossentropy": 2.7956621646881104, "epoch": 0.20261993875467846, "grad_norm": 0.04034784063696861, "grad_norm_var": 9.77346669794113e-06, "learning_rate": 0.008145563800447547, "loss": 2.7957, "step": 2382 }, { "crossentropy": 2.7872769832611084, "epoch": 0.20270500170125894, "grad_norm": 0.04490095004439354, "grad_norm_var": 9.716171907941708e-06, "learning_rate": 0.008144006147362479, "loss": 2.7873, "step": 2383 }, { "crossentropy": 2.7529759407043457, "epoch": 0.2027900646478394, "grad_norm": 0.04094313085079193, "grad_norm_var": 1.0584010205488826e-05, "learning_rate": 0.008142447989440619, "loss": 2.753, "step": 2384 }, { "crossentropy": 2.7607386112213135, "epoch": 0.20287512759441986, "grad_norm": 0.0404333621263504, "grad_norm_var": 1.143118505043117e-05, "learning_rate": 0.008140889326932157, "loss": 2.7607, "step": 2385 }, { "crossentropy": 2.8752048015594482, "epoch": 0.20296019054100034, "grad_norm": 0.04727742820978165, "grad_norm_var": 1.0566702054576327e-05, "learning_rate": 0.008139330160087374, "loss": 2.8752, "step": 2386 }, { "crossentropy": 2.803109884262085, "epoch": 0.20304525348758082, "grad_norm": 0.045216985046863556, "grad_norm_var": 8.481425396372756e-06, "learning_rate": 0.008137770489156624, "loss": 2.8031, "step": 2387 }, { "crossentropy": 2.791097402572632, "epoch": 0.20313031643416127, "grad_norm": 0.04299785941839218, "grad_norm_var": 8.404984663670268e-06, "learning_rate": 0.008136210314390347, "loss": 2.7911, "step": 2388 }, { "crossentropy": 2.6955385208129883, "epoch": 0.20321537938074175, "grad_norm": 0.03973235934972763, "grad_norm_var": 8.527959661631887e-06, "learning_rate": 0.008134649636039059, "loss": 2.6955, "step": 2389 }, { "crossentropy": 2.841850757598877, "epoch": 0.20330044232732222, "grad_norm": 0.04066845774650574, "grad_norm_var": 8.33191484350959e-06, "learning_rate": 0.008133088454353366, "loss": 2.8419, "step": 2390 }, { "crossentropy": 2.7405436038970947, "epoch": 0.2033855052739027, "grad_norm": 0.04294225573539734, "grad_norm_var": 6.537972205087478e-06, "learning_rate": 0.008131526769583942, "loss": 2.7405, "step": 2391 }, { "crossentropy": 2.8740789890289307, "epoch": 0.20347056822048315, "grad_norm": 0.04158322140574455, "grad_norm_var": 5.962295937821972e-06, "learning_rate": 0.008129964581981553, "loss": 2.8741, "step": 2392 }, { "crossentropy": 2.7525336742401123, "epoch": 0.20355563116706363, "grad_norm": 0.039251428097486496, "grad_norm_var": 6.482743402772863e-06, "learning_rate": 0.00812840189179704, "loss": 2.7525, "step": 2393 }, { "crossentropy": 2.807429075241089, "epoch": 0.2036406941136441, "grad_norm": 0.04551853984594345, "grad_norm_var": 6.962869512547726e-06, "learning_rate": 0.008126838699281326, "loss": 2.8074, "step": 2394 }, { "crossentropy": 2.7597250938415527, "epoch": 0.20372575706022455, "grad_norm": 0.044529229402542114, "grad_norm_var": 7.12160936915205e-06, "learning_rate": 0.008125275004685415, "loss": 2.7597, "step": 2395 }, { "crossentropy": 2.8316848278045654, "epoch": 0.20381082000680503, "grad_norm": 0.03836487606167793, "grad_norm_var": 6.695333185703124e-06, "learning_rate": 0.00812371080826039, "loss": 2.8317, "step": 2396 }, { "crossentropy": 2.812689781188965, "epoch": 0.2038958829533855, "grad_norm": 0.03815570846199989, "grad_norm_var": 7.744469394086833e-06, "learning_rate": 0.008122146110257418, "loss": 2.8127, "step": 2397 }, { "crossentropy": 2.844705581665039, "epoch": 0.203980945899966, "grad_norm": 0.05363474413752556, "grad_norm_var": 1.5755762906289697e-05, "learning_rate": 0.008120580910927742, "loss": 2.8447, "step": 2398 }, { "crossentropy": 2.8058385848999023, "epoch": 0.20406600884654644, "grad_norm": 0.04253782704472542, "grad_norm_var": 1.546940670582378e-05, "learning_rate": 0.00811901521052269, "loss": 2.8058, "step": 2399 }, { "crossentropy": 2.7745635509490967, "epoch": 0.20415107179312691, "grad_norm": 0.04262111708521843, "grad_norm_var": 1.5244103178517893e-05, "learning_rate": 0.008117449009293669, "loss": 2.7746, "step": 2400 }, { "crossentropy": 2.881659746170044, "epoch": 0.2042361347397074, "grad_norm": 0.04519437253475189, "grad_norm_var": 1.5132059658725232e-05, "learning_rate": 0.008115882307492162, "loss": 2.8817, "step": 2401 }, { "crossentropy": 2.8034420013427734, "epoch": 0.20432119768628784, "grad_norm": 0.0416935570538044, "grad_norm_var": 1.399977119514543e-05, "learning_rate": 0.008114315105369738, "loss": 2.8034, "step": 2402 }, { "crossentropy": 2.846672534942627, "epoch": 0.20440626063286832, "grad_norm": 0.04040220379829407, "grad_norm_var": 1.3890701836867756e-05, "learning_rate": 0.008112747403178048, "loss": 2.8467, "step": 2403 }, { "crossentropy": 2.7602221965789795, "epoch": 0.2044913235794488, "grad_norm": 0.04057823866605759, "grad_norm_var": 1.4092521640187104e-05, "learning_rate": 0.008111179201168814, "loss": 2.7602, "step": 2404 }, { "crossentropy": 2.709120035171509, "epoch": 0.20457638652602927, "grad_norm": 0.039459072053432465, "grad_norm_var": 1.4192134954321468e-05, "learning_rate": 0.008109610499593849, "loss": 2.7091, "step": 2405 }, { "crossentropy": 2.7771449089050293, "epoch": 0.20466144947260972, "grad_norm": 0.04278718680143356, "grad_norm_var": 1.400588000306513e-05, "learning_rate": 0.008108041298705038, "loss": 2.7771, "step": 2406 }, { "crossentropy": 2.816720962524414, "epoch": 0.2047465124191902, "grad_norm": 0.0461759977042675, "grad_norm_var": 1.4870247868698355e-05, "learning_rate": 0.008106471598754352, "loss": 2.8167, "step": 2407 }, { "crossentropy": 2.7766194343566895, "epoch": 0.20483157536577068, "grad_norm": 0.041742146015167236, "grad_norm_var": 1.4849105805962253e-05, "learning_rate": 0.008104901399993836, "loss": 2.7766, "step": 2408 }, { "crossentropy": 2.88572359085083, "epoch": 0.20491663831235113, "grad_norm": 0.04949796944856644, "grad_norm_var": 1.674690742884301e-05, "learning_rate": 0.008103330702675621, "loss": 2.8857, "step": 2409 }, { "crossentropy": 2.7581052780151367, "epoch": 0.2050017012589316, "grad_norm": 0.04664625972509384, "grad_norm_var": 1.7159105495411346e-05, "learning_rate": 0.008101759507051918, "loss": 2.7581, "step": 2410 }, { "crossentropy": 2.8386969566345215, "epoch": 0.20508676420551208, "grad_norm": 0.03893101215362549, "grad_norm_var": 1.8257264149991863e-05, "learning_rate": 0.008100187813375013, "loss": 2.8387, "step": 2411 }, { "crossentropy": 2.783905506134033, "epoch": 0.20517182715209256, "grad_norm": 0.03930732607841492, "grad_norm_var": 1.7727011167391567e-05, "learning_rate": 0.008098615621897272, "loss": 2.7839, "step": 2412 }, { "crossentropy": 2.8018038272857666, "epoch": 0.205256890098673, "grad_norm": 0.03790932521224022, "grad_norm_var": 1.7892747590185037e-05, "learning_rate": 0.008097042932871149, "loss": 2.8018, "step": 2413 }, { "crossentropy": 2.820146322250366, "epoch": 0.2053419530452535, "grad_norm": 0.04132014140486717, "grad_norm_var": 1.0023918588953721e-05, "learning_rate": 0.00809546974654917, "loss": 2.8201, "step": 2414 }, { "crossentropy": 2.86120867729187, "epoch": 0.20542701599183397, "grad_norm": 0.048009566962718964, "grad_norm_var": 1.206850394301669e-05, "learning_rate": 0.008093896063183944, "loss": 2.8612, "step": 2415 }, { "crossentropy": 2.748525381088257, "epoch": 0.20551207893841442, "grad_norm": 0.04341345280408859, "grad_norm_var": 1.2105511954078423e-05, "learning_rate": 0.008092321883028157, "loss": 2.7485, "step": 2416 }, { "crossentropy": 2.8716797828674316, "epoch": 0.2055971418849949, "grad_norm": 0.04204306751489639, "grad_norm_var": 1.1674640728622186e-05, "learning_rate": 0.00809074720633458, "loss": 2.8717, "step": 2417 }, { "crossentropy": 2.7768969535827637, "epoch": 0.20568220483157537, "grad_norm": 0.04431209713220596, "grad_norm_var": 1.1823448871023784e-05, "learning_rate": 0.008089172033356061, "loss": 2.7769, "step": 2418 }, { "crossentropy": 2.8014705181121826, "epoch": 0.20576726777815585, "grad_norm": 0.043209731578826904, "grad_norm_var": 1.1471493811265366e-05, "learning_rate": 0.008087596364345525, "loss": 2.8015, "step": 2419 }, { "crossentropy": 2.750105142593384, "epoch": 0.2058523307247363, "grad_norm": 0.04206116870045662, "grad_norm_var": 1.1162935648767237e-05, "learning_rate": 0.008086020199555979, "loss": 2.7501, "step": 2420 }, { "crossentropy": 2.794989824295044, "epoch": 0.20593739367131678, "grad_norm": 0.038425613194704056, "grad_norm_var": 1.170749362660219e-05, "learning_rate": 0.008084443539240512, "loss": 2.795, "step": 2421 }, { "crossentropy": 2.963050365447998, "epoch": 0.20602245661789725, "grad_norm": 0.0448916032910347, "grad_norm_var": 1.1963286328864748e-05, "learning_rate": 0.008082866383652288, "loss": 2.9631, "step": 2422 }, { "crossentropy": 2.798001289367676, "epoch": 0.2061075195644777, "grad_norm": 0.04869585111737251, "grad_norm_var": 1.342938712554914e-05, "learning_rate": 0.008081288733044551, "loss": 2.798, "step": 2423 }, { "crossentropy": 2.791794538497925, "epoch": 0.20619258251105818, "grad_norm": 0.04053884744644165, "grad_norm_var": 1.3745922187836414e-05, "learning_rate": 0.008079710587670633, "loss": 2.7918, "step": 2424 }, { "crossentropy": 2.787442922592163, "epoch": 0.20627764545763866, "grad_norm": 0.04158114641904831, "grad_norm_var": 1.0884102508234607e-05, "learning_rate": 0.008078131947783933, "loss": 2.7874, "step": 2425 }, { "crossentropy": 2.813991069793701, "epoch": 0.2063627084042191, "grad_norm": 0.042231179773807526, "grad_norm_var": 9.709292219849524e-06, "learning_rate": 0.008076552813637936, "loss": 2.814, "step": 2426 }, { "crossentropy": 2.8682806491851807, "epoch": 0.20644777135079959, "grad_norm": 0.046650052070617676, "grad_norm_var": 9.960666810503577e-06, "learning_rate": 0.008074973185486205, "loss": 2.8683, "step": 2427 }, { "crossentropy": 2.7652978897094727, "epoch": 0.20653283429738006, "grad_norm": 0.044466160237789154, "grad_norm_var": 9.230188908767625e-06, "learning_rate": 0.008073393063582386, "loss": 2.7653, "step": 2428 }, { "crossentropy": 2.6894116401672363, "epoch": 0.20661789724396054, "grad_norm": 0.03877084702253342, "grad_norm_var": 8.679185491514141e-06, "learning_rate": 0.008071812448180197, "loss": 2.6894, "step": 2429 }, { "crossentropy": 2.988046884536743, "epoch": 0.206702960190541, "grad_norm": 0.046243879944086075, "grad_norm_var": 8.984037731261247e-06, "learning_rate": 0.008070231339533442, "loss": 2.988, "step": 2430 }, { "crossentropy": 2.7621634006500244, "epoch": 0.20678802313712147, "grad_norm": 0.04478410631418228, "grad_norm_var": 7.682622113362041e-06, "learning_rate": 0.008068649737896002, "loss": 2.7622, "step": 2431 }, { "crossentropy": 2.8231565952301025, "epoch": 0.20687308608370195, "grad_norm": 0.04330345243215561, "grad_norm_var": 7.681273291080399e-06, "learning_rate": 0.008067067643521834, "loss": 2.8232, "step": 2432 }, { "crossentropy": 2.867403984069824, "epoch": 0.2069581490302824, "grad_norm": 0.0376911424100399, "grad_norm_var": 9.572879687687736e-06, "learning_rate": 0.008065485056664977, "loss": 2.8674, "step": 2433 }, { "crossentropy": 2.872633218765259, "epoch": 0.20704321197686287, "grad_norm": 0.038952119648456573, "grad_norm_var": 1.0424363749814089e-05, "learning_rate": 0.00806390197757955, "loss": 2.8726, "step": 2434 }, { "crossentropy": 2.8039071559906006, "epoch": 0.20712827492344335, "grad_norm": 0.04120740666985512, "grad_norm_var": 1.0527126954619954e-05, "learning_rate": 0.00806231840651975, "loss": 2.8039, "step": 2435 }, { "crossentropy": 2.887885093688965, "epoch": 0.20721333787002383, "grad_norm": 0.04464048892259598, "grad_norm_var": 1.0781383965266915e-05, "learning_rate": 0.008060734343739854, "loss": 2.8879, "step": 2436 }, { "crossentropy": 2.8448574542999268, "epoch": 0.20729840081660428, "grad_norm": 0.04650823399424553, "grad_norm_var": 1.0266492070887661e-05, "learning_rate": 0.008059149789494215, "loss": 2.8449, "step": 2437 }, { "crossentropy": 2.8351824283599854, "epoch": 0.20738346376318476, "grad_norm": 0.07581646740436554, "grad_norm_var": 7.702441263363931e-05, "learning_rate": 0.008057564744037264, "loss": 2.8352, "step": 2438 }, { "crossentropy": 2.7564518451690674, "epoch": 0.20746852670976523, "grad_norm": 0.04540400207042694, "grad_norm_var": 7.613661827731063e-05, "learning_rate": 0.008055979207623522, "loss": 2.7565, "step": 2439 }, { "crossentropy": 2.766623020172119, "epoch": 0.20755358965634568, "grad_norm": 0.045897841453552246, "grad_norm_var": 7.479796307353533e-05, "learning_rate": 0.008054393180507572, "loss": 2.7666, "step": 2440 }, { "crossentropy": 2.7941412925720215, "epoch": 0.20763865260292616, "grad_norm": 0.045516237616539, "grad_norm_var": 7.383593164969036e-05, "learning_rate": 0.008052806662944088, "loss": 2.7941, "step": 2441 }, { "crossentropy": 2.763847827911377, "epoch": 0.20772371554950664, "grad_norm": 0.04466312378644943, "grad_norm_var": 7.31439389739091e-05, "learning_rate": 0.008051219655187817, "loss": 2.7638, "step": 2442 }, { "crossentropy": 2.7863855361938477, "epoch": 0.20780877849608712, "grad_norm": 0.04104965925216675, "grad_norm_var": 7.43628493034126e-05, "learning_rate": 0.008049632157493592, "loss": 2.7864, "step": 2443 }, { "crossentropy": 2.8466744422912598, "epoch": 0.20789384144266757, "grad_norm": 0.0412728413939476, "grad_norm_var": 7.53582732267941e-05, "learning_rate": 0.008048044170116312, "loss": 2.8467, "step": 2444 }, { "crossentropy": 2.791822671890259, "epoch": 0.20797890438924804, "grad_norm": 0.040128786116838455, "grad_norm_var": 7.432619702557268e-05, "learning_rate": 0.008046455693310969, "loss": 2.7918, "step": 2445 }, { "crossentropy": 2.7853946685791016, "epoch": 0.20806396733582852, "grad_norm": 0.04280168563127518, "grad_norm_var": 7.458419429894128e-05, "learning_rate": 0.00804486672733262, "loss": 2.7854, "step": 2446 }, { "crossentropy": 2.8268702030181885, "epoch": 0.20814903028240897, "grad_norm": 0.04300592467188835, "grad_norm_var": 7.482763116526212e-05, "learning_rate": 0.008043277272436413, "loss": 2.8269, "step": 2447 }, { "crossentropy": 2.7733073234558105, "epoch": 0.20823409322898945, "grad_norm": 0.04649205133318901, "grad_norm_var": 7.479867648703953e-05, "learning_rate": 0.008041687328877566, "loss": 2.7733, "step": 2448 }, { "crossentropy": 2.7460057735443115, "epoch": 0.20831915617556993, "grad_norm": 0.04237492009997368, "grad_norm_var": 7.15644733035444e-05, "learning_rate": 0.008040096896911377, "loss": 2.746, "step": 2449 }, { "crossentropy": 2.8071348667144775, "epoch": 0.2084042191221504, "grad_norm": 0.043141674250364304, "grad_norm_var": 6.908299256548617e-05, "learning_rate": 0.008038505976793226, "loss": 2.8071, "step": 2450 }, { "crossentropy": 2.8424746990203857, "epoch": 0.20848928206873085, "grad_norm": 0.04398580640554428, "grad_norm_var": 6.793077074786468e-05, "learning_rate": 0.008036914568778567, "loss": 2.8425, "step": 2451 }, { "crossentropy": 2.7932519912719727, "epoch": 0.20857434501531133, "grad_norm": 0.03986876830458641, "grad_norm_var": 7.008758155922079e-05, "learning_rate": 0.008035322673122933, "loss": 2.7933, "step": 2452 }, { "crossentropy": 2.801382064819336, "epoch": 0.2086594079618918, "grad_norm": 0.04344147443771362, "grad_norm_var": 7.026128733250888e-05, "learning_rate": 0.00803373029008194, "loss": 2.8014, "step": 2453 }, { "crossentropy": 2.858794927597046, "epoch": 0.20874447090847226, "grad_norm": 0.049473270773887634, "grad_norm_var": 6.460646833792808e-06, "learning_rate": 0.008032137419911278, "loss": 2.8588, "step": 2454 }, { "crossentropy": 2.7768502235412598, "epoch": 0.20882953385505273, "grad_norm": 0.045792367309331894, "grad_norm_var": 6.560517236041339e-06, "learning_rate": 0.008030544062866716, "loss": 2.7769, "step": 2455 }, { "crossentropy": 2.9067165851593018, "epoch": 0.2089145968016332, "grad_norm": 0.04871203750371933, "grad_norm_var": 7.8870706436695e-06, "learning_rate": 0.0080289502192041, "loss": 2.9067, "step": 2456 }, { "crossentropy": 2.8997225761413574, "epoch": 0.2089996597482137, "grad_norm": 0.05782218649983406, "grad_norm_var": 2.007342514741527e-05, "learning_rate": 0.008027355889179355, "loss": 2.8997, "step": 2457 }, { "crossentropy": 2.7713167667388916, "epoch": 0.20908472269479414, "grad_norm": 0.040430616587400436, "grad_norm_var": 2.1172480333425612e-05, "learning_rate": 0.008025761073048487, "loss": 2.7713, "step": 2458 }, { "crossentropy": 2.7617881298065186, "epoch": 0.20916978564137462, "grad_norm": 0.04446728900074959, "grad_norm_var": 2.0393052861382897e-05, "learning_rate": 0.008024165771067576, "loss": 2.7618, "step": 2459 }, { "crossentropy": 2.671389579772949, "epoch": 0.2092548485879551, "grad_norm": 0.04503827914595604, "grad_norm_var": 1.9620966955555667e-05, "learning_rate": 0.00802256998349278, "loss": 2.6714, "step": 2460 }, { "crossentropy": 2.7254183292388916, "epoch": 0.20933991153453554, "grad_norm": 0.04064290598034859, "grad_norm_var": 1.931651947679972e-05, "learning_rate": 0.008020973710580338, "loss": 2.7254, "step": 2461 }, { "crossentropy": 2.8490025997161865, "epoch": 0.20942497448111602, "grad_norm": 0.04247989505529404, "grad_norm_var": 1.9410583466862064e-05, "learning_rate": 0.008019376952586567, "loss": 2.849, "step": 2462 }, { "crossentropy": 2.8208374977111816, "epoch": 0.2095100374276965, "grad_norm": 0.0396738164126873, "grad_norm_var": 2.091185053035103e-05, "learning_rate": 0.008017779709767857, "loss": 2.8208, "step": 2463 }, { "crossentropy": 2.7875967025756836, "epoch": 0.20959510037427698, "grad_norm": 0.03943026438355446, "grad_norm_var": 2.226111908626791e-05, "learning_rate": 0.008016181982380681, "loss": 2.7876, "step": 2464 }, { "crossentropy": 2.834164619445801, "epoch": 0.20968016332085743, "grad_norm": 0.04006422311067581, "grad_norm_var": 2.314894813837745e-05, "learning_rate": 0.00801458377068159, "loss": 2.8342, "step": 2465 }, { "crossentropy": 2.789720296859741, "epoch": 0.2097652262674379, "grad_norm": 0.039715029299259186, "grad_norm_var": 2.4288248311917816e-05, "learning_rate": 0.008012985074927209, "loss": 2.7897, "step": 2466 }, { "crossentropy": 2.8601646423339844, "epoch": 0.20985028921401838, "grad_norm": 0.04295145720243454, "grad_norm_var": 2.433154398661045e-05, "learning_rate": 0.00801138589537424, "loss": 2.8602, "step": 2467 }, { "crossentropy": 2.7730953693389893, "epoch": 0.20993535216059883, "grad_norm": 0.03858407586812973, "grad_norm_var": 2.509956290223381e-05, "learning_rate": 0.008009786232279468, "loss": 2.7731, "step": 2468 }, { "crossentropy": 2.7497944831848145, "epoch": 0.2100204151071793, "grad_norm": 0.0439489483833313, "grad_norm_var": 2.5100199177399695e-05, "learning_rate": 0.008008186085899751, "loss": 2.7498, "step": 2469 }, { "crossentropy": 2.7288005352020264, "epoch": 0.2101054780537598, "grad_norm": 0.046222392469644547, "grad_norm_var": 2.3259007831237925e-05, "learning_rate": 0.00800658545649203, "loss": 2.7288, "step": 2470 }, { "crossentropy": 2.885150909423828, "epoch": 0.21019054100034026, "grad_norm": 0.04804306849837303, "grad_norm_var": 2.426399001941298e-05, "learning_rate": 0.008004984344313315, "loss": 2.8852, "step": 2471 }, { "crossentropy": 2.7860517501831055, "epoch": 0.21027560394692071, "grad_norm": 0.047055814415216446, "grad_norm_var": 2.3315188926003225e-05, "learning_rate": 0.008003382749620702, "loss": 2.7861, "step": 2472 }, { "crossentropy": 2.7966573238372803, "epoch": 0.2103606668935012, "grad_norm": 0.04243515431880951, "grad_norm_var": 8.802399146002188e-06, "learning_rate": 0.00800178067267136, "loss": 2.7967, "step": 2473 }, { "crossentropy": 2.81604266166687, "epoch": 0.21044572984008167, "grad_norm": 0.04147595167160034, "grad_norm_var": 8.571960674967165e-06, "learning_rate": 0.008000178113722537, "loss": 2.816, "step": 2474 }, { "crossentropy": 2.887965679168701, "epoch": 0.21053079278666212, "grad_norm": 0.039567891508340836, "grad_norm_var": 8.878067779067671e-06, "learning_rate": 0.007998575073031557, "loss": 2.888, "step": 2475 }, { "crossentropy": 2.8845531940460205, "epoch": 0.2106158557332426, "grad_norm": 0.041617363691329956, "grad_norm_var": 8.375579978734244e-06, "learning_rate": 0.007996971550855821, "loss": 2.8846, "step": 2476 }, { "crossentropy": 2.891840696334839, "epoch": 0.21070091867982307, "grad_norm": 0.04451524093747139, "grad_norm_var": 8.550504823604382e-06, "learning_rate": 0.00799536754745281, "loss": 2.8918, "step": 2477 }, { "crossentropy": 2.91715931892395, "epoch": 0.21078598162640355, "grad_norm": 0.04500088095664978, "grad_norm_var": 8.98758344159602e-06, "learning_rate": 0.007993763063080082, "loss": 2.9172, "step": 2478 }, { "crossentropy": 2.740079879760742, "epoch": 0.210871044572984, "grad_norm": 0.043616827577352524, "grad_norm_var": 8.463559663393107e-06, "learning_rate": 0.007992158097995267, "loss": 2.7401, "step": 2479 }, { "crossentropy": 2.8897156715393066, "epoch": 0.21095610751956448, "grad_norm": 0.0428621731698513, "grad_norm_var": 7.673618964355345e-06, "learning_rate": 0.00799055265245608, "loss": 2.8897, "step": 2480 }, { "crossentropy": 2.759075164794922, "epoch": 0.21104117046614496, "grad_norm": 0.04300384223461151, "grad_norm_var": 7.070953442583385e-06, "learning_rate": 0.007988946726720307, "loss": 2.7591, "step": 2481 }, { "crossentropy": 2.901796340942383, "epoch": 0.2111262334127254, "grad_norm": 0.042577337473630905, "grad_norm_var": 6.266923154810145e-06, "learning_rate": 0.007987340321045816, "loss": 2.9018, "step": 2482 }, { "crossentropy": 2.754805088043213, "epoch": 0.21121129635930588, "grad_norm": 0.045029327273368835, "grad_norm_var": 6.428458858820143e-06, "learning_rate": 0.007985733435690544, "loss": 2.7548, "step": 2483 }, { "crossentropy": 2.700566291809082, "epoch": 0.21129635930588636, "grad_norm": 0.04459765553474426, "grad_norm_var": 4.769250636814844e-06, "learning_rate": 0.007984126070912518, "loss": 2.7006, "step": 2484 }, { "crossentropy": 2.7772257328033447, "epoch": 0.21138142225246684, "grad_norm": 0.042744532227516174, "grad_norm_var": 4.843721867216673e-06, "learning_rate": 0.007982518226969831, "loss": 2.7772, "step": 2485 }, { "crossentropy": 2.817887306213379, "epoch": 0.2114664851990473, "grad_norm": 0.04416825249791145, "grad_norm_var": 4.436543809689201e-06, "learning_rate": 0.007980909904120653, "loss": 2.8179, "step": 2486 }, { "crossentropy": 2.723628044128418, "epoch": 0.21155154814562777, "grad_norm": 0.040182773023843765, "grad_norm_var": 3.68814110835979e-06, "learning_rate": 0.00797930110262324, "loss": 2.7236, "step": 2487 }, { "crossentropy": 2.758979082107544, "epoch": 0.21163661109220824, "grad_norm": 0.04380868747830391, "grad_norm_var": 2.65748786250052e-06, "learning_rate": 0.007977691822735914, "loss": 2.759, "step": 2488 }, { "crossentropy": 2.811511754989624, "epoch": 0.2117216740387887, "grad_norm": 0.055125799030065536, "grad_norm_var": 1.185169210722725e-05, "learning_rate": 0.007976082064717083, "loss": 2.8115, "step": 2489 }, { "crossentropy": 2.8930394649505615, "epoch": 0.21180673698536917, "grad_norm": 0.06467140465974808, "grad_norm_var": 3.846588190755945e-05, "learning_rate": 0.007974471828825226, "loss": 2.893, "step": 2490 }, { "crossentropy": 2.79358172416687, "epoch": 0.21189179993194965, "grad_norm": 0.04015522077679634, "grad_norm_var": 3.804692645319596e-05, "learning_rate": 0.007972861115318901, "loss": 2.7936, "step": 2491 }, { "crossentropy": 2.682142496109009, "epoch": 0.21197686287853013, "grad_norm": 0.04193178936839104, "grad_norm_var": 3.790165836602573e-05, "learning_rate": 0.007971249924456742, "loss": 2.6821, "step": 2492 }, { "crossentropy": 2.8277108669281006, "epoch": 0.21206192582511058, "grad_norm": 0.04602934047579765, "grad_norm_var": 3.789671050722828e-05, "learning_rate": 0.00796963825649746, "loss": 2.8277, "step": 2493 }, { "crossentropy": 2.7570056915283203, "epoch": 0.21214698877169105, "grad_norm": 0.04675848037004471, "grad_norm_var": 3.8009346956200533e-05, "learning_rate": 0.007968026111699842, "loss": 2.757, "step": 2494 }, { "crossentropy": 2.8705759048461914, "epoch": 0.21223205171827153, "grad_norm": 0.046627841889858246, "grad_norm_var": 3.783843188284123e-05, "learning_rate": 0.00796641349032275, "loss": 2.8706, "step": 2495 }, { "crossentropy": 2.805630922317505, "epoch": 0.21231711466485198, "grad_norm": 0.045024625957012177, "grad_norm_var": 3.7329150989542925e-05, "learning_rate": 0.00796480039262513, "loss": 2.8056, "step": 2496 }, { "crossentropy": 2.815871000289917, "epoch": 0.21240217761143246, "grad_norm": 0.044897280633449554, "grad_norm_var": 3.6853035702836406e-05, "learning_rate": 0.007963186818865993, "loss": 2.8159, "step": 2497 }, { "crossentropy": 2.776184320449829, "epoch": 0.21248724055801294, "grad_norm": 0.04144064337015152, "grad_norm_var": 3.743671064705545e-05, "learning_rate": 0.007961572769304436, "loss": 2.7762, "step": 2498 }, { "crossentropy": 2.828169822692871, "epoch": 0.2125723035045934, "grad_norm": 0.04730663821101189, "grad_norm_var": 3.751936591017228e-05, "learning_rate": 0.007959958244199629, "loss": 2.8282, "step": 2499 }, { "crossentropy": 2.7505056858062744, "epoch": 0.21265736645117386, "grad_norm": 0.0435042530298233, "grad_norm_var": 3.779370967148902e-05, "learning_rate": 0.007958343243810817, "loss": 2.7505, "step": 2500 }, { "crossentropy": 2.753622531890869, "epoch": 0.21274242939775434, "grad_norm": 0.039797089993953705, "grad_norm_var": 3.9576196586550934e-05, "learning_rate": 0.007956727768397323, "loss": 2.7536, "step": 2501 }, { "crossentropy": 2.7602341175079346, "epoch": 0.21282749234433482, "grad_norm": 0.04528253152966499, "grad_norm_var": 3.9424088323915484e-05, "learning_rate": 0.007955111818218544, "loss": 2.7602, "step": 2502 }, { "crossentropy": 2.7713780403137207, "epoch": 0.21291255529091527, "grad_norm": 0.04127062112092972, "grad_norm_var": 3.86856101612519e-05, "learning_rate": 0.007953495393533959, "loss": 2.7714, "step": 2503 }, { "crossentropy": 2.8249475955963135, "epoch": 0.21299761823749574, "grad_norm": 0.04394970461726189, "grad_norm_var": 3.864843378975815e-05, "learning_rate": 0.007951878494603114, "loss": 2.8249, "step": 2504 }, { "crossentropy": 2.8401668071746826, "epoch": 0.21308268118407622, "grad_norm": 0.04510318860411644, "grad_norm_var": 3.2545504164587365e-05, "learning_rate": 0.00795026112168564, "loss": 2.8402, "step": 2505 }, { "crossentropy": 2.786639451980591, "epoch": 0.21316774413065667, "grad_norm": 0.04146530479192734, "grad_norm_var": 6.062302508561395e-06, "learning_rate": 0.007948643275041242, "loss": 2.7866, "step": 2506 }, { "crossentropy": 2.834852457046509, "epoch": 0.21325280707723715, "grad_norm": 0.04290114343166351, "grad_norm_var": 5.2049659058038085e-06, "learning_rate": 0.007947024954929695, "loss": 2.8349, "step": 2507 }, { "crossentropy": 2.720452070236206, "epoch": 0.21333787002381763, "grad_norm": 0.03744953125715256, "grad_norm_var": 7.670162421933614e-06, "learning_rate": 0.007945406161610857, "loss": 2.7205, "step": 2508 }, { "crossentropy": 2.83671236038208, "epoch": 0.2134229329703981, "grad_norm": 0.04179052636027336, "grad_norm_var": 7.462809438984047e-06, "learning_rate": 0.00794378689534466, "loss": 2.8367, "step": 2509 }, { "crossentropy": 2.800814151763916, "epoch": 0.21350799591697855, "grad_norm": 0.04534337297081947, "grad_norm_var": 6.956283829761868e-06, "learning_rate": 0.007942167156391112, "loss": 2.8008, "step": 2510 }, { "crossentropy": 2.8253402709960938, "epoch": 0.21359305886355903, "grad_norm": 0.042483724653720856, "grad_norm_var": 6.203080396715901e-06, "learning_rate": 0.007940546945010294, "loss": 2.8253, "step": 2511 }, { "crossentropy": 2.781005382537842, "epoch": 0.2136781218101395, "grad_norm": 0.0394403301179409, "grad_norm_var": 6.6916310814248954e-06, "learning_rate": 0.007938926261462366, "loss": 2.781, "step": 2512 }, { "crossentropy": 2.8537209033966064, "epoch": 0.21376318475671996, "grad_norm": 0.04269731417298317, "grad_norm_var": 6.353737188000836e-06, "learning_rate": 0.007937305106007564, "loss": 2.8537, "step": 2513 }, { "crossentropy": 2.7707407474517822, "epoch": 0.21384824770330044, "grad_norm": 0.05015627667307854, "grad_norm_var": 9.781278042059135e-06, "learning_rate": 0.0079356834789062, "loss": 2.7707, "step": 2514 }, { "crossentropy": 2.7838551998138428, "epoch": 0.21393331064988091, "grad_norm": 0.04566957801580429, "grad_norm_var": 9.035232762984411e-06, "learning_rate": 0.007934061380418658, "loss": 2.7839, "step": 2515 }, { "crossentropy": 2.7297956943511963, "epoch": 0.2140183735964614, "grad_norm": 0.040850721299648285, "grad_norm_var": 9.30363596817844e-06, "learning_rate": 0.0079324388108054, "loss": 2.7298, "step": 2516 }, { "crossentropy": 2.815078020095825, "epoch": 0.21410343654304184, "grad_norm": 0.043003637343645096, "grad_norm_var": 8.639655818748887e-06, "learning_rate": 0.007930815770326966, "loss": 2.8151, "step": 2517 }, { "crossentropy": 2.8142528533935547, "epoch": 0.21418849948962232, "grad_norm": 0.043084681034088135, "grad_norm_var": 8.288382187472919e-06, "learning_rate": 0.00792919225924397, "loss": 2.8143, "step": 2518 }, { "crossentropy": 2.80962872505188, "epoch": 0.2142735624362028, "grad_norm": 0.04430308938026428, "grad_norm_var": 8.197756707131742e-06, "learning_rate": 0.007927568277817099, "loss": 2.8096, "step": 2519 }, { "crossentropy": 2.7910327911376953, "epoch": 0.21435862538278325, "grad_norm": 0.0443028025329113, "grad_norm_var": 8.245281872594027e-06, "learning_rate": 0.007925943826307118, "loss": 2.791, "step": 2520 }, { "crossentropy": 2.9589929580688477, "epoch": 0.21444368832936372, "grad_norm": 0.04352763295173645, "grad_norm_var": 7.98545794011233e-06, "learning_rate": 0.007924318904974869, "loss": 2.959, "step": 2521 }, { "crossentropy": 2.7525558471679688, "epoch": 0.2145287512759442, "grad_norm": 0.037509720772504807, "grad_norm_var": 9.78827042075354e-06, "learning_rate": 0.007922693514081267, "loss": 2.7526, "step": 2522 }, { "crossentropy": 2.6468279361724854, "epoch": 0.21461381422252468, "grad_norm": 0.04003671929240227, "grad_norm_var": 1.0255624351831043e-05, "learning_rate": 0.007921067653887299, "loss": 2.6468, "step": 2523 }, { "crossentropy": 2.796356439590454, "epoch": 0.21469887716910513, "grad_norm": 0.041730258613824844, "grad_norm_var": 8.459441728526454e-06, "learning_rate": 0.007919441324654036, "loss": 2.7964, "step": 2524 }, { "crossentropy": 2.805617094039917, "epoch": 0.2147839401156856, "grad_norm": 0.04161682352423668, "grad_norm_var": 8.486343572664904e-06, "learning_rate": 0.007917814526642619, "loss": 2.8056, "step": 2525 }, { "crossentropy": 2.7741668224334717, "epoch": 0.21486900306226608, "grad_norm": 0.03906618803739548, "grad_norm_var": 8.870381894258e-06, "learning_rate": 0.007916187260114263, "loss": 2.7742, "step": 2526 }, { "crossentropy": 2.870577335357666, "epoch": 0.21495406600884653, "grad_norm": 0.03985961526632309, "grad_norm_var": 9.295066103497523e-06, "learning_rate": 0.007914559525330262, "loss": 2.8706, "step": 2527 }, { "crossentropy": 2.809516191482544, "epoch": 0.215039128955427, "grad_norm": 0.050618235021829605, "grad_norm_var": 1.2836988303794167e-05, "learning_rate": 0.00791293132255198, "loss": 2.8095, "step": 2528 }, { "crossentropy": 2.8232991695404053, "epoch": 0.2151241919020075, "grad_norm": 0.04038511589169502, "grad_norm_var": 1.32650870808594e-05, "learning_rate": 0.007911302652040864, "loss": 2.8233, "step": 2529 }, { "crossentropy": 2.7611334323883057, "epoch": 0.21520925484858797, "grad_norm": 0.03942495957016945, "grad_norm_var": 1.0019359726129845e-05, "learning_rate": 0.007909673514058429, "loss": 2.7611, "step": 2530 }, { "crossentropy": 2.7758193016052246, "epoch": 0.21529431779516842, "grad_norm": 0.04440348595380783, "grad_norm_var": 9.5316211899736e-06, "learning_rate": 0.007908043908866267, "loss": 2.7758, "step": 2531 }, { "crossentropy": 2.821171760559082, "epoch": 0.2153793807417489, "grad_norm": 0.04213407635688782, "grad_norm_var": 9.419466836941594e-06, "learning_rate": 0.007906413836726047, "loss": 2.8212, "step": 2532 }, { "crossentropy": 2.881598472595215, "epoch": 0.21546444368832937, "grad_norm": 0.04335542395710945, "grad_norm_var": 9.465461633738753e-06, "learning_rate": 0.007904783297899514, "loss": 2.8816, "step": 2533 }, { "crossentropy": 2.8107540607452393, "epoch": 0.21554950663490982, "grad_norm": 0.04541536420583725, "grad_norm_var": 1.0076803562206599e-05, "learning_rate": 0.007903152292648478, "loss": 2.8108, "step": 2534 }, { "crossentropy": 2.8508667945861816, "epoch": 0.2156345695814903, "grad_norm": 0.042654089629650116, "grad_norm_var": 9.818564421163317e-06, "learning_rate": 0.00790152082123484, "loss": 2.8509, "step": 2535 }, { "crossentropy": 2.786107063293457, "epoch": 0.21571963252807078, "grad_norm": 0.040914103388786316, "grad_norm_var": 9.609902860662397e-06, "learning_rate": 0.00789988888392056, "loss": 2.7861, "step": 2536 }, { "crossentropy": 2.740521192550659, "epoch": 0.21580469547465125, "grad_norm": 0.041147369891405106, "grad_norm_var": 9.492112729629112e-06, "learning_rate": 0.007898256480967689, "loss": 2.7405, "step": 2537 }, { "crossentropy": 2.7494068145751953, "epoch": 0.2158897584212317, "grad_norm": 0.04048162326216698, "grad_norm_var": 8.307642459062272e-06, "learning_rate": 0.007896623612638334, "loss": 2.7494, "step": 2538 }, { "crossentropy": 2.8560428619384766, "epoch": 0.21597482136781218, "grad_norm": 0.04042285308241844, "grad_norm_var": 8.21188147297752e-06, "learning_rate": 0.007894990279194691, "loss": 2.856, "step": 2539 }, { "crossentropy": 2.7087695598602295, "epoch": 0.21605988431439266, "grad_norm": 0.03938634693622589, "grad_norm_var": 8.671381633358159e-06, "learning_rate": 0.00789335648089903, "loss": 2.7088, "step": 2540 }, { "crossentropy": 2.784015417098999, "epoch": 0.2161449472609731, "grad_norm": 0.038996513932943344, "grad_norm_var": 9.218782202009372e-06, "learning_rate": 0.007891722218013686, "loss": 2.784, "step": 2541 }, { "crossentropy": 2.7828052043914795, "epoch": 0.21623001020755359, "grad_norm": 0.03949214145541191, "grad_norm_var": 9.075336335867487e-06, "learning_rate": 0.007890087490801077, "loss": 2.7828, "step": 2542 }, { "crossentropy": 2.8108766078948975, "epoch": 0.21631507315413406, "grad_norm": 0.03970196843147278, "grad_norm_var": 9.11805839795132e-06, "learning_rate": 0.007888452299523691, "loss": 2.8109, "step": 2543 }, { "crossentropy": 2.8568716049194336, "epoch": 0.21640013610071454, "grad_norm": 0.04020357131958008, "grad_norm_var": 3.6635421747234496e-06, "learning_rate": 0.007886816644444098, "loss": 2.8569, "step": 2544 }, { "crossentropy": 2.7654926776885986, "epoch": 0.216485199047295, "grad_norm": 0.04601471126079559, "grad_norm_var": 5.064598915166472e-06, "learning_rate": 0.00788518052582493, "loss": 2.7655, "step": 2545 }, { "crossentropy": 2.7560672760009766, "epoch": 0.21657026199387547, "grad_norm": 0.04246903583407402, "grad_norm_var": 4.797768443459173e-06, "learning_rate": 0.007883543943928909, "loss": 2.7561, "step": 2546 }, { "crossentropy": 2.8396403789520264, "epoch": 0.21665532494045595, "grad_norm": 0.04195253551006317, "grad_norm_var": 4.289584946422636e-06, "learning_rate": 0.007881906899018815, "loss": 2.8396, "step": 2547 }, { "crossentropy": 2.8351449966430664, "epoch": 0.2167403878870364, "grad_norm": 0.03833513334393501, "grad_norm_var": 4.893888489069381e-06, "learning_rate": 0.007880269391357513, "loss": 2.8351, "step": 2548 }, { "crossentropy": 2.769354820251465, "epoch": 0.21682545083361687, "grad_norm": 0.04089244827628136, "grad_norm_var": 4.600965143911988e-06, "learning_rate": 0.00787863142120794, "loss": 2.7694, "step": 2549 }, { "crossentropy": 2.8441169261932373, "epoch": 0.21691051378019735, "grad_norm": 0.04182463511824608, "grad_norm_var": 3.3670844223694537e-06, "learning_rate": 0.007876992988833107, "loss": 2.8441, "step": 2550 }, { "crossentropy": 2.8265955448150635, "epoch": 0.21699557672677783, "grad_norm": 0.04070174694061279, "grad_norm_var": 3.156657946228915e-06, "learning_rate": 0.0078753540944961, "loss": 2.8266, "step": 2551 }, { "crossentropy": 2.761082172393799, "epoch": 0.21708063967335828, "grad_norm": 0.03968658670783043, "grad_norm_var": 3.233556330916922e-06, "learning_rate": 0.007873714738460075, "loss": 2.7611, "step": 2552 }, { "crossentropy": 2.680694341659546, "epoch": 0.21716570261993876, "grad_norm": 0.04631089046597481, "grad_norm_var": 5.186016905683645e-06, "learning_rate": 0.007872074920988266, "loss": 2.6807, "step": 2553 }, { "crossentropy": 2.80818772315979, "epoch": 0.21725076556651923, "grad_norm": 0.043765198439359665, "grad_norm_var": 5.609052048973171e-06, "learning_rate": 0.007870434642343984, "loss": 2.8082, "step": 2554 }, { "crossentropy": 2.8045809268951416, "epoch": 0.21733582851309968, "grad_norm": 0.046731602400541306, "grad_norm_var": 7.392585538471691e-06, "learning_rate": 0.007868793902790609, "loss": 2.8046, "step": 2555 }, { "crossentropy": 2.875994920730591, "epoch": 0.21742089145968016, "grad_norm": 0.03966949135065079, "grad_norm_var": 7.311983921606813e-06, "learning_rate": 0.007867152702591593, "loss": 2.876, "step": 2556 }, { "crossentropy": 2.757310152053833, "epoch": 0.21750595440626064, "grad_norm": 0.048327043652534485, "grad_norm_var": 9.424958967525245e-06, "learning_rate": 0.007865511042010468, "loss": 2.7573, "step": 2557 }, { "crossentropy": 2.7865185737609863, "epoch": 0.21759101735284112, "grad_norm": 0.03917280212044716, "grad_norm_var": 9.548967804521615e-06, "learning_rate": 0.007863868921310841, "loss": 2.7865, "step": 2558 }, { "crossentropy": 2.7626285552978516, "epoch": 0.21767608029942156, "grad_norm": 0.043041378259658813, "grad_norm_var": 9.118119025308621e-06, "learning_rate": 0.007862226340756383, "loss": 2.7626, "step": 2559 }, { "crossentropy": 2.9168577194213867, "epoch": 0.21776114324600204, "grad_norm": 0.04116756469011307, "grad_norm_var": 8.88827312512565e-06, "learning_rate": 0.007860583300610848, "loss": 2.9169, "step": 2560 }, { "crossentropy": 2.8033549785614014, "epoch": 0.21784620619258252, "grad_norm": 0.043114759027957916, "grad_norm_var": 8.056399254504204e-06, "learning_rate": 0.007858939801138061, "loss": 2.8034, "step": 2561 }, { "crossentropy": 2.8192508220672607, "epoch": 0.21793126913916297, "grad_norm": 0.044703878462314606, "grad_norm_var": 8.412168339288808e-06, "learning_rate": 0.00785729584260192, "loss": 2.8193, "step": 2562 }, { "crossentropy": 2.8008713722229004, "epoch": 0.21801633208574345, "grad_norm": 0.0427275076508522, "grad_norm_var": 8.397025154074922e-06, "learning_rate": 0.007855651425266398, "loss": 2.8009, "step": 2563 }, { "crossentropy": 2.712921142578125, "epoch": 0.21810139503232392, "grad_norm": 0.04239227622747421, "grad_norm_var": 7.166968343108072e-06, "learning_rate": 0.007854006549395543, "loss": 2.7129, "step": 2564 }, { "crossentropy": 2.7402126789093018, "epoch": 0.2181864579789044, "grad_norm": 0.04407615587115288, "grad_norm_var": 7.0058507073172385e-06, "learning_rate": 0.00785236121525347, "loss": 2.7402, "step": 2565 }, { "crossentropy": 2.8029885292053223, "epoch": 0.21827152092548485, "grad_norm": 0.03774946555495262, "grad_norm_var": 8.66251333434259e-06, "learning_rate": 0.007850715423104376, "loss": 2.803, "step": 2566 }, { "crossentropy": 2.820040225982666, "epoch": 0.21835658387206533, "grad_norm": 0.03982187807559967, "grad_norm_var": 8.946340063597548e-06, "learning_rate": 0.007849069173212526, "loss": 2.82, "step": 2567 }, { "crossentropy": 2.8107943534851074, "epoch": 0.2184416468186458, "grad_norm": 0.04368632286787033, "grad_norm_var": 8.363876137718977e-06, "learning_rate": 0.00784742246584226, "loss": 2.8108, "step": 2568 }, { "crossentropy": 2.852053165435791, "epoch": 0.21852670976522626, "grad_norm": 0.042945731431245804, "grad_norm_var": 7.542851939446963e-06, "learning_rate": 0.007845775301257994, "loss": 2.8521, "step": 2569 }, { "crossentropy": 2.707629919052124, "epoch": 0.21861177271180673, "grad_norm": 0.046561893075704575, "grad_norm_var": 8.431392792272267e-06, "learning_rate": 0.007844127679724214, "loss": 2.7076, "step": 2570 }, { "crossentropy": 2.731527090072632, "epoch": 0.2186968356583872, "grad_norm": 0.05214757099747658, "grad_norm_var": 1.3054628420113245e-05, "learning_rate": 0.007842479601505479, "loss": 2.7315, "step": 2571 }, { "crossentropy": 2.9508304595947266, "epoch": 0.2187818986049677, "grad_norm": 0.05041477084159851, "grad_norm_var": 1.5203302724461909e-05, "learning_rate": 0.007840831066866423, "loss": 2.9508, "step": 2572 }, { "crossentropy": 2.8081188201904297, "epoch": 0.21886696155154814, "grad_norm": 0.045845046639442444, "grad_norm_var": 1.411604899879589e-05, "learning_rate": 0.007839182076071757, "loss": 2.8081, "step": 2573 }, { "crossentropy": 2.896043300628662, "epoch": 0.21895202449812862, "grad_norm": 0.04607504606246948, "grad_norm_var": 1.2906008658395268e-05, "learning_rate": 0.007837532629386256, "loss": 2.896, "step": 2574 }, { "crossentropy": 2.756696939468384, "epoch": 0.2190370874447091, "grad_norm": 0.0420374721288681, "grad_norm_var": 1.3117987533533121e-05, "learning_rate": 0.00783588272707478, "loss": 2.7567, "step": 2575 }, { "crossentropy": 2.732356309890747, "epoch": 0.21912215039128954, "grad_norm": 0.039688266813755035, "grad_norm_var": 1.383151501299634e-05, "learning_rate": 0.00783423236940225, "loss": 2.7324, "step": 2576 }, { "crossentropy": 2.833894968032837, "epoch": 0.21920721333787002, "grad_norm": 0.03770565241575241, "grad_norm_var": 1.629807650238516e-05, "learning_rate": 0.00783258155663367, "loss": 2.8339, "step": 2577 }, { "crossentropy": 2.7627995014190674, "epoch": 0.2192922762844505, "grad_norm": 0.039695728570222855, "grad_norm_var": 1.7169411277634153e-05, "learning_rate": 0.007830930289034107, "loss": 2.7628, "step": 2578 }, { "crossentropy": 2.8005216121673584, "epoch": 0.21937733923103098, "grad_norm": 0.04119172319769859, "grad_norm_var": 1.7443920541829597e-05, "learning_rate": 0.007829278566868714, "loss": 2.8005, "step": 2579 }, { "crossentropy": 2.7882463932037354, "epoch": 0.21946240217761143, "grad_norm": 0.04135899245738983, "grad_norm_var": 1.7629121239364346e-05, "learning_rate": 0.007827626390402707, "loss": 2.7882, "step": 2580 }, { "crossentropy": 2.7142584323883057, "epoch": 0.2195474651241919, "grad_norm": 0.039492931216955185, "grad_norm_var": 1.8399003982442056e-05, "learning_rate": 0.007825973759901376, "loss": 2.7143, "step": 2581 }, { "crossentropy": 2.8662400245666504, "epoch": 0.21963252807077238, "grad_norm": 0.03962722420692444, "grad_norm_var": 1.73295601293993e-05, "learning_rate": 0.00782432067563009, "loss": 2.8662, "step": 2582 }, { "crossentropy": 2.743635416030884, "epoch": 0.21971759101735283, "grad_norm": 0.043141163885593414, "grad_norm_var": 1.6603423257137892e-05, "learning_rate": 0.007822667137854282, "loss": 2.7436, "step": 2583 }, { "crossentropy": 2.8785347938537598, "epoch": 0.2198026539639333, "grad_norm": 0.04350801929831505, "grad_norm_var": 1.659446595101647e-05, "learning_rate": 0.007821013146839466, "loss": 2.8785, "step": 2584 }, { "crossentropy": 2.774153232574463, "epoch": 0.2198877169105138, "grad_norm": 0.03756476566195488, "grad_norm_var": 1.85972064172872e-05, "learning_rate": 0.007819358702851225, "loss": 2.7742, "step": 2585 }, { "crossentropy": 2.834373712539673, "epoch": 0.21997277985709424, "grad_norm": 0.04189498722553253, "grad_norm_var": 1.7666460965369093e-05, "learning_rate": 0.007817703806155213, "loss": 2.8344, "step": 2586 }, { "crossentropy": 2.794403553009033, "epoch": 0.2200578428036747, "grad_norm": 0.0400981530547142, "grad_norm_var": 1.1380566825623603e-05, "learning_rate": 0.007816048457017163, "loss": 2.7944, "step": 2587 }, { "crossentropy": 2.842526912689209, "epoch": 0.2201429057502552, "grad_norm": 0.04184971749782562, "grad_norm_var": 6.16598464577006e-06, "learning_rate": 0.007814392655702869, "loss": 2.8425, "step": 2588 }, { "crossentropy": 2.809987783432007, "epoch": 0.22022796869683567, "grad_norm": 0.04902881011366844, "grad_norm_var": 8.72955316957765e-06, "learning_rate": 0.007812736402478213, "loss": 2.81, "step": 2589 }, { "crossentropy": 2.747161865234375, "epoch": 0.22031303164341612, "grad_norm": 0.03817160055041313, "grad_norm_var": 7.809707907234914e-06, "learning_rate": 0.007811079697609136, "loss": 2.7472, "step": 2590 }, { "crossentropy": 2.863694667816162, "epoch": 0.2203980945899966, "grad_norm": 0.042120058089494705, "grad_norm_var": 7.821520273857777e-06, "learning_rate": 0.007809422541361659, "loss": 2.8637, "step": 2591 }, { "crossentropy": 2.7555673122406006, "epoch": 0.22048315753657707, "grad_norm": 0.04836765304207802, "grad_norm_var": 1.1001782748461657e-05, "learning_rate": 0.007807764934001874, "loss": 2.7556, "step": 2592 }, { "crossentropy": 2.801190137863159, "epoch": 0.22056822048315752, "grad_norm": 0.04534541815519333, "grad_norm_var": 1.073257658046328e-05, "learning_rate": 0.007806106875795944, "loss": 2.8012, "step": 2593 }, { "crossentropy": 2.8453705310821533, "epoch": 0.220653283429738, "grad_norm": 0.043037693947553635, "grad_norm_var": 1.0391123834647907e-05, "learning_rate": 0.007804448367010107, "loss": 2.8454, "step": 2594 }, { "crossentropy": 2.8189287185668945, "epoch": 0.22073834637631848, "grad_norm": 0.03890847787261009, "grad_norm_var": 1.1035297367538386e-05, "learning_rate": 0.00780278940791067, "loss": 2.8189, "step": 2595 }, { "crossentropy": 2.855724811553955, "epoch": 0.22082340932289896, "grad_norm": 0.04049380496144295, "grad_norm_var": 1.1166955062576431e-05, "learning_rate": 0.007801129998764014, "loss": 2.8557, "step": 2596 }, { "crossentropy": 2.7768051624298096, "epoch": 0.2209084722694794, "grad_norm": 0.04199334233999252, "grad_norm_var": 1.070832765643111e-05, "learning_rate": 0.007799470139836593, "loss": 2.7768, "step": 2597 }, { "crossentropy": 2.7947635650634766, "epoch": 0.22099353521605988, "grad_norm": 0.05220954865217209, "grad_norm_var": 1.6291957543577002e-05, "learning_rate": 0.007797809831394934, "loss": 2.7948, "step": 2598 }, { "crossentropy": 2.755537986755371, "epoch": 0.22107859816264036, "grad_norm": 0.050871264189481735, "grad_norm_var": 2.018929117753865e-05, "learning_rate": 0.0077961490737056315, "loss": 2.7555, "step": 2599 }, { "crossentropy": 2.812382698059082, "epoch": 0.2211636611092208, "grad_norm": 0.0429580956697464, "grad_norm_var": 2.0205144710869146e-05, "learning_rate": 0.007794487867035358, "loss": 2.8124, "step": 2600 }, { "crossentropy": 2.7426204681396484, "epoch": 0.2212487240558013, "grad_norm": 0.043607551604509354, "grad_norm_var": 1.776001971147915e-05, "learning_rate": 0.007792826211650854, "loss": 2.7426, "step": 2601 }, { "crossentropy": 2.7811336517333984, "epoch": 0.22133378700238177, "grad_norm": 0.0404309406876564, "grad_norm_var": 1.8267759961879992e-05, "learning_rate": 0.007791164107818933, "loss": 2.7811, "step": 2602 }, { "crossentropy": 2.795046806335449, "epoch": 0.22141884994896224, "grad_norm": 0.04192403703927994, "grad_norm_var": 1.7594806809640277e-05, "learning_rate": 0.007789501555806484, "loss": 2.795, "step": 2603 }, { "crossentropy": 2.780886173248291, "epoch": 0.2215039128955427, "grad_norm": 0.042715493589639664, "grad_norm_var": 1.7412783058906358e-05, "learning_rate": 0.007787838555880461, "loss": 2.7809, "step": 2604 }, { "crossentropy": 2.7320096492767334, "epoch": 0.22158897584212317, "grad_norm": 0.041989319026470184, "grad_norm_var": 1.5683356780635203e-05, "learning_rate": 0.007786175108307896, "loss": 2.732, "step": 2605 }, { "crossentropy": 2.8090739250183105, "epoch": 0.22167403878870365, "grad_norm": 0.04263390973210335, "grad_norm_var": 1.3789427382392081e-05, "learning_rate": 0.007784511213355891, "loss": 2.8091, "step": 2606 }, { "crossentropy": 2.737605333328247, "epoch": 0.2217591017352841, "grad_norm": 0.04133869335055351, "grad_norm_var": 1.399483459415627e-05, "learning_rate": 0.007782846871291618, "loss": 2.7376, "step": 2607 }, { "crossentropy": 2.7616043090820312, "epoch": 0.22184416468186458, "grad_norm": 0.040254708379507065, "grad_norm_var": 1.3034118628000614e-05, "learning_rate": 0.007781182082382324, "loss": 2.7616, "step": 2608 }, { "crossentropy": 2.7882237434387207, "epoch": 0.22192922762844505, "grad_norm": 0.04173567146062851, "grad_norm_var": 1.2801251129058478e-05, "learning_rate": 0.007779516846895326, "loss": 2.7882, "step": 2609 }, { "crossentropy": 2.79032564163208, "epoch": 0.22201429057502553, "grad_norm": 0.11775193363428116, "grad_norm_var": 0.0003626241227226469, "learning_rate": 0.007777851165098011, "loss": 2.7903, "step": 2610 }, { "crossentropy": 2.83693528175354, "epoch": 0.22209935352160598, "grad_norm": 0.04122062772512436, "grad_norm_var": 0.00036027459268384525, "learning_rate": 0.0077761850372578415, "loss": 2.8369, "step": 2611 }, { "crossentropy": 2.8754920959472656, "epoch": 0.22218441646818646, "grad_norm": 0.042596235871315, "grad_norm_var": 0.00035851451057093697, "learning_rate": 0.0077745184636423504, "loss": 2.8755, "step": 2612 }, { "crossentropy": 2.841297149658203, "epoch": 0.22226947941476694, "grad_norm": 0.049521300941705704, "grad_norm_var": 0.0003561382975153274, "learning_rate": 0.0077728514445191375, "loss": 2.8413, "step": 2613 }, { "crossentropy": 2.884787082672119, "epoch": 0.22235454236134738, "grad_norm": 0.04065770283341408, "grad_norm_var": 0.00035854930219734717, "learning_rate": 0.007771183980155883, "loss": 2.8848, "step": 2614 }, { "crossentropy": 2.714953899383545, "epoch": 0.22243960530792786, "grad_norm": 0.04191867634654045, "grad_norm_var": 0.0003596990880917692, "learning_rate": 0.0077695160708203285, "loss": 2.715, "step": 2615 }, { "crossentropy": 2.864269971847534, "epoch": 0.22252466825450834, "grad_norm": 0.03907034918665886, "grad_norm_var": 0.00036277959150645686, "learning_rate": 0.007767847716780296, "loss": 2.8643, "step": 2616 }, { "crossentropy": 2.7615885734558105, "epoch": 0.22260973120108882, "grad_norm": 0.042673490941524506, "grad_norm_var": 0.00036323612751624747, "learning_rate": 0.007766178918303675, "loss": 2.7616, "step": 2617 }, { "crossentropy": 2.7622034549713135, "epoch": 0.22269479414766927, "grad_norm": 0.04448726028203964, "grad_norm_var": 0.0003608322293887465, "learning_rate": 0.007764509675658424, "loss": 2.7622, "step": 2618 }, { "crossentropy": 2.7851314544677734, "epoch": 0.22277985709424974, "grad_norm": 0.04041849076747894, "grad_norm_var": 0.00036199898276807106, "learning_rate": 0.007762839989112577, "loss": 2.7851, "step": 2619 }, { "crossentropy": 2.857069253921509, "epoch": 0.22286492004083022, "grad_norm": 0.04182414710521698, "grad_norm_var": 0.0003625502884813766, "learning_rate": 0.007761169858934237, "loss": 2.8571, "step": 2620 }, { "crossentropy": 2.81085205078125, "epoch": 0.22294998298741067, "grad_norm": 0.03940804675221443, "grad_norm_var": 0.0003646502170450018, "learning_rate": 0.007759499285391579, "loss": 2.8109, "step": 2621 }, { "crossentropy": 2.7711455821990967, "epoch": 0.22303504593399115, "grad_norm": 0.041395314037799835, "grad_norm_var": 0.0003654208109798458, "learning_rate": 0.007757828268752849, "loss": 2.7711, "step": 2622 }, { "crossentropy": 2.8269968032836914, "epoch": 0.22312010888057163, "grad_norm": 0.041632506996393204, "grad_norm_var": 0.00036521844694472656, "learning_rate": 0.0077561568092863655, "loss": 2.827, "step": 2623 }, { "crossentropy": 2.8206238746643066, "epoch": 0.2232051718271521, "grad_norm": 0.04067142680287361, "grad_norm_var": 0.00036487338415362486, "learning_rate": 0.007754484907260513, "loss": 2.8206, "step": 2624 }, { "crossentropy": 2.7690911293029785, "epoch": 0.22329023477373255, "grad_norm": 0.04144144058227539, "grad_norm_var": 0.0003650730177692522, "learning_rate": 0.007752812562943754, "loss": 2.7691, "step": 2625 }, { "crossentropy": 2.6988272666931152, "epoch": 0.22337529772031303, "grad_norm": 0.042278826236724854, "grad_norm_var": 5.762106047894842e-06, "learning_rate": 0.007751139776604618, "loss": 2.6988, "step": 2626 }, { "crossentropy": 2.7606656551361084, "epoch": 0.2234603606668935, "grad_norm": 0.04466008022427559, "grad_norm_var": 6.166531029999413e-06, "learning_rate": 0.007749466548511706, "loss": 2.7607, "step": 2627 }, { "crossentropy": 2.744896173477173, "epoch": 0.22354542361347396, "grad_norm": 0.04453850910067558, "grad_norm_var": 6.513737089266074e-06, "learning_rate": 0.0077477928789336905, "loss": 2.7449, "step": 2628 }, { "crossentropy": 2.890873908996582, "epoch": 0.22363048656005444, "grad_norm": 0.0429055280983448, "grad_norm_var": 2.868173373761438e-06, "learning_rate": 0.007746118768139314, "loss": 2.8909, "step": 2629 }, { "crossentropy": 2.763639211654663, "epoch": 0.22371554950663491, "grad_norm": 0.040502432733774185, "grad_norm_var": 2.894857933554741e-06, "learning_rate": 0.007744444216397392, "loss": 2.7636, "step": 2630 }, { "crossentropy": 2.7472996711730957, "epoch": 0.2238006124532154, "grad_norm": 0.04457608610391617, "grad_norm_var": 3.3555391486667085e-06, "learning_rate": 0.007742769223976807, "loss": 2.7473, "step": 2631 }, { "crossentropy": 2.888401985168457, "epoch": 0.22388567539979584, "grad_norm": 0.04152083024382591, "grad_norm_var": 2.763753261562455e-06, "learning_rate": 0.0077410937911465165, "loss": 2.8884, "step": 2632 }, { "crossentropy": 2.7278201580047607, "epoch": 0.22397073834637632, "grad_norm": 0.046832356601953506, "grad_norm_var": 4.116525880868266e-06, "learning_rate": 0.007739417918175545, "loss": 2.7278, "step": 2633 }, { "crossentropy": 2.7383100986480713, "epoch": 0.2240558012929568, "grad_norm": 0.04409972205758095, "grad_norm_var": 4.0202990250479595e-06, "learning_rate": 0.007737741605332992, "loss": 2.7383, "step": 2634 }, { "crossentropy": 2.7345528602600098, "epoch": 0.22414086423953725, "grad_norm": 0.03999197855591774, "grad_norm_var": 4.1454403158729e-06, "learning_rate": 0.007736064852888023, "loss": 2.7346, "step": 2635 }, { "crossentropy": 2.7523326873779297, "epoch": 0.22422592718611772, "grad_norm": 0.03911690041422844, "grad_norm_var": 4.8086532342966426e-06, "learning_rate": 0.007734387661109875, "loss": 2.7523, "step": 2636 }, { "crossentropy": 2.814044952392578, "epoch": 0.2243109901326982, "grad_norm": 0.0424480065703392, "grad_norm_var": 4.245157697780182e-06, "learning_rate": 0.00773271003026786, "loss": 2.814, "step": 2637 }, { "crossentropy": 2.7885453701019287, "epoch": 0.22439605307927868, "grad_norm": 0.045657046139240265, "grad_norm_var": 4.801884454142385e-06, "learning_rate": 0.007731031960631354, "loss": 2.7885, "step": 2638 }, { "crossentropy": 2.840606689453125, "epoch": 0.22448111602585913, "grad_norm": 0.04327712580561638, "grad_norm_var": 4.74132240190535e-06, "learning_rate": 0.0077293534524698095, "loss": 2.8406, "step": 2639 }, { "crossentropy": 2.7981903553009033, "epoch": 0.2245661789724396, "grad_norm": 0.038850829005241394, "grad_norm_var": 5.460912936610292e-06, "learning_rate": 0.0077276745060527435, "loss": 2.7982, "step": 2640 }, { "crossentropy": 2.744480848312378, "epoch": 0.22465124191902008, "grad_norm": 0.03937523066997528, "grad_norm_var": 6.065816955164171e-06, "learning_rate": 0.0077259951216497496, "loss": 2.7445, "step": 2641 }, { "crossentropy": 2.7537004947662354, "epoch": 0.22473630486560053, "grad_norm": 0.039045270532369614, "grad_norm_var": 6.831682914178588e-06, "learning_rate": 0.007724315299530485, "loss": 2.7537, "step": 2642 }, { "crossentropy": 2.8807897567749023, "epoch": 0.224821367812181, "grad_norm": 0.03896705433726311, "grad_norm_var": 7.094242583895125e-06, "learning_rate": 0.007722635039964684, "loss": 2.8808, "step": 2643 }, { "crossentropy": 2.7318594455718994, "epoch": 0.2249064307587615, "grad_norm": 0.04051697626709938, "grad_norm_var": 6.733988900822253e-06, "learning_rate": 0.007720954343222146, "loss": 2.7319, "step": 2644 }, { "crossentropy": 2.822448253631592, "epoch": 0.22499149370534197, "grad_norm": 0.03769345581531525, "grad_norm_var": 7.615066556337693e-06, "learning_rate": 0.007719273209572743, "loss": 2.8224, "step": 2645 }, { "crossentropy": 2.753929853439331, "epoch": 0.22507655665192242, "grad_norm": 0.04168171063065529, "grad_norm_var": 7.560153541147132e-06, "learning_rate": 0.0077175916392864165, "loss": 2.7539, "step": 2646 }, { "crossentropy": 2.8629064559936523, "epoch": 0.2251616195985029, "grad_norm": 0.0378338024020195, "grad_norm_var": 7.616357771633488e-06, "learning_rate": 0.00771590963263318, "loss": 2.8629, "step": 2647 }, { "crossentropy": 2.709444284439087, "epoch": 0.22524668254508337, "grad_norm": 0.045323945581912994, "grad_norm_var": 8.755655540368331e-06, "learning_rate": 0.0077142271898831116, "loss": 2.7094, "step": 2648 }, { "crossentropy": 2.766486167907715, "epoch": 0.22533174549166382, "grad_norm": 0.047806594520807266, "grad_norm_var": 9.534340191377718e-06, "learning_rate": 0.007712544311306366, "loss": 2.7665, "step": 2649 }, { "crossentropy": 2.87127685546875, "epoch": 0.2254168084382443, "grad_norm": 0.040978383272886276, "grad_norm_var": 9.001115052864085e-06, "learning_rate": 0.007710860997173163, "loss": 2.8713, "step": 2650 }, { "crossentropy": 2.725780487060547, "epoch": 0.22550187138482478, "grad_norm": 0.03728001192212105, "grad_norm_var": 9.883236516218935e-06, "learning_rate": 0.007709177247753798, "loss": 2.7258, "step": 2651 }, { "crossentropy": 2.857248306274414, "epoch": 0.22558693433140525, "grad_norm": 0.03967561572790146, "grad_norm_var": 9.763151962323645e-06, "learning_rate": 0.007707493063318629, "loss": 2.8572, "step": 2652 }, { "crossentropy": 2.632554054260254, "epoch": 0.2256719972779857, "grad_norm": 0.040689073503017426, "grad_norm_var": 9.622949636685594e-06, "learning_rate": 0.007705808444138088, "loss": 2.6326, "step": 2653 }, { "crossentropy": 2.7397706508636475, "epoch": 0.22575706022456618, "grad_norm": 0.0431397445499897, "grad_norm_var": 8.427633097498645e-06, "learning_rate": 0.007704123390482678, "loss": 2.7398, "step": 2654 }, { "crossentropy": 2.79963755607605, "epoch": 0.22584212317114666, "grad_norm": 0.04351406171917915, "grad_norm_var": 8.510711136748966e-06, "learning_rate": 0.007702437902622967, "loss": 2.7996, "step": 2655 }, { "crossentropy": 2.6635055541992188, "epoch": 0.2259271861177271, "grad_norm": 0.041987959295511246, "grad_norm_var": 8.321698534901242e-06, "learning_rate": 0.007700751980829601, "loss": 2.6635, "step": 2656 }, { "crossentropy": 2.8126766681671143, "epoch": 0.22601224906430759, "grad_norm": 0.042094744741916656, "grad_norm_var": 8.205918747577803e-06, "learning_rate": 0.007699065625373285, "loss": 2.8127, "step": 2657 }, { "crossentropy": 2.8274083137512207, "epoch": 0.22609731201088806, "grad_norm": 0.04484741762280464, "grad_norm_var": 8.690012519320851e-06, "learning_rate": 0.007697378836524802, "loss": 2.8274, "step": 2658 }, { "crossentropy": 2.7989916801452637, "epoch": 0.22618237495746854, "grad_norm": 0.04614732787013054, "grad_norm_var": 9.48548923381994e-06, "learning_rate": 0.0076956916145550025, "loss": 2.799, "step": 2659 }, { "crossentropy": 2.801774024963379, "epoch": 0.226267437904049, "grad_norm": 0.056845955550670624, "grad_norm_var": 2.3028763628215634e-05, "learning_rate": 0.007694003959734801, "loss": 2.8018, "step": 2660 }, { "crossentropy": 2.8377292156219482, "epoch": 0.22635250085062947, "grad_norm": 0.04094783961772919, "grad_norm_var": 2.1400578240554114e-05, "learning_rate": 0.007692315872335192, "loss": 2.8377, "step": 2661 }, { "crossentropy": 2.8582475185394287, "epoch": 0.22643756379720995, "grad_norm": 0.03935139253735542, "grad_norm_var": 2.220384285154525e-05, "learning_rate": 0.007690627352627231, "loss": 2.8582, "step": 2662 }, { "crossentropy": 2.6952242851257324, "epoch": 0.2265226267437904, "grad_norm": 0.04057588428258896, "grad_norm_var": 2.0774363167236296e-05, "learning_rate": 0.007688938400882045, "loss": 2.6952, "step": 2663 }, { "crossentropy": 2.9404118061065674, "epoch": 0.22660768969037087, "grad_norm": 0.04175379499793053, "grad_norm_var": 2.056012296435978e-05, "learning_rate": 0.007687249017370832, "loss": 2.9404, "step": 2664 }, { "crossentropy": 2.7782111167907715, "epoch": 0.22669275263695135, "grad_norm": 0.04548383876681328, "grad_norm_var": 1.940166706676701e-05, "learning_rate": 0.007685559202364858, "loss": 2.7782, "step": 2665 }, { "crossentropy": 2.7462100982666016, "epoch": 0.2267778155835318, "grad_norm": 0.03956618160009384, "grad_norm_var": 1.9875348059695112e-05, "learning_rate": 0.00768386895613546, "loss": 2.7462, "step": 2666 }, { "crossentropy": 2.8746697902679443, "epoch": 0.22686287853011228, "grad_norm": 0.04371851310133934, "grad_norm_var": 1.7775758474524762e-05, "learning_rate": 0.007682178278954041, "loss": 2.8747, "step": 2667 }, { "crossentropy": 2.735507011413574, "epoch": 0.22694794147669275, "grad_norm": 0.042332328855991364, "grad_norm_var": 1.6987508470859593e-05, "learning_rate": 0.007680487171092074, "loss": 2.7355, "step": 2668 }, { "crossentropy": 2.839707374572754, "epoch": 0.22703300442327323, "grad_norm": 0.04433443769812584, "grad_norm_var": 1.654305804022274e-05, "learning_rate": 0.007678795632821105, "loss": 2.8397, "step": 2669 }, { "crossentropy": 2.6737964153289795, "epoch": 0.22711806736985368, "grad_norm": 0.03924502432346344, "grad_norm_var": 1.7699008137827114e-05, "learning_rate": 0.0076771036644127455, "loss": 2.6738, "step": 2670 }, { "crossentropy": 2.809208869934082, "epoch": 0.22720313031643416, "grad_norm": 0.04133053123950958, "grad_norm_var": 1.7933704796295005e-05, "learning_rate": 0.007675411266138676, "loss": 2.8092, "step": 2671 }, { "crossentropy": 2.857417583465576, "epoch": 0.22728819326301464, "grad_norm": 0.041801005601882935, "grad_norm_var": 1.7965109863657344e-05, "learning_rate": 0.0076737184382706485, "loss": 2.8574, "step": 2672 }, { "crossentropy": 2.820783853530884, "epoch": 0.2273732562095951, "grad_norm": 0.04340342432260513, "grad_norm_var": 1.7888277223503517e-05, "learning_rate": 0.0076720251810804795, "loss": 2.8208, "step": 2673 }, { "crossentropy": 2.846877336502075, "epoch": 0.22745831915617556, "grad_norm": 0.0434299036860466, "grad_norm_var": 1.7708224255373142e-05, "learning_rate": 0.007670331494840059, "loss": 2.8469, "step": 2674 }, { "crossentropy": 2.760169267654419, "epoch": 0.22754338210275604, "grad_norm": 0.04332960769534111, "grad_norm_var": 1.7075247825224885e-05, "learning_rate": 0.007668637379821346, "loss": 2.7602, "step": 2675 }, { "crossentropy": 2.736356019973755, "epoch": 0.22762844504933652, "grad_norm": 0.04421551525592804, "grad_norm_var": 3.670422318999972e-06, "learning_rate": 0.007666942836296364, "loss": 2.7364, "step": 2676 }, { "crossentropy": 2.7480194568634033, "epoch": 0.22771350799591697, "grad_norm": 0.03884675353765488, "grad_norm_var": 4.290451688387931e-06, "learning_rate": 0.007665247864537209, "loss": 2.748, "step": 2677 }, { "crossentropy": 2.7776846885681152, "epoch": 0.22779857094249745, "grad_norm": 0.043254412710666656, "grad_norm_var": 3.840849602248543e-06, "learning_rate": 0.007663552464816044, "loss": 2.7777, "step": 2678 }, { "crossentropy": 2.874335527420044, "epoch": 0.22788363388907792, "grad_norm": 0.039185792207717896, "grad_norm_var": 4.279107402634502e-06, "learning_rate": 0.007661856637405102, "loss": 2.8743, "step": 2679 }, { "crossentropy": 2.773653268814087, "epoch": 0.22796869683565837, "grad_norm": 0.04363133758306503, "grad_norm_var": 4.387241710251723e-06, "learning_rate": 0.007660160382576683, "loss": 2.7737, "step": 2680 }, { "crossentropy": 2.73124098777771, "epoch": 0.22805375978223885, "grad_norm": 0.0406595915555954, "grad_norm_var": 3.8062834283822748e-06, "learning_rate": 0.007658463700603158, "loss": 2.7312, "step": 2681 }, { "crossentropy": 2.7732863426208496, "epoch": 0.22813882272881933, "grad_norm": 0.04487530142068863, "grad_norm_var": 3.8325168194264585e-06, "learning_rate": 0.007656766591756964, "loss": 2.7733, "step": 2682 }, { "crossentropy": 2.7399120330810547, "epoch": 0.2282238856753998, "grad_norm": 0.050562042742967606, "grad_norm_var": 8.008735224599752e-06, "learning_rate": 0.0076550690563106085, "loss": 2.7399, "step": 2683 }, { "crossentropy": 2.8621039390563965, "epoch": 0.22830894862198026, "grad_norm": 0.04461407661437988, "grad_norm_var": 8.198754603898756e-06, "learning_rate": 0.0076533710945366675, "loss": 2.8621, "step": 2684 }, { "crossentropy": 2.7379398345947266, "epoch": 0.22839401156856073, "grad_norm": 0.039643220603466034, "grad_norm_var": 8.689451347272902e-06, "learning_rate": 0.007651672706707782, "loss": 2.7379, "step": 2685 }, { "crossentropy": 2.804276704788208, "epoch": 0.2284790745151412, "grad_norm": 0.041945379227399826, "grad_norm_var": 7.927625193221837e-06, "learning_rate": 0.007649973893096668, "loss": 2.8043, "step": 2686 }, { "crossentropy": 2.8689725399017334, "epoch": 0.22856413746172166, "grad_norm": 0.041135698556900024, "grad_norm_var": 7.968054016069201e-06, "learning_rate": 0.007648274653976102, "loss": 2.869, "step": 2687 }, { "crossentropy": 2.801682233810425, "epoch": 0.22864920040830214, "grad_norm": 0.04337947070598602, "grad_norm_var": 7.917036900413562e-06, "learning_rate": 0.007646574989618938, "loss": 2.8017, "step": 2688 }, { "crossentropy": 2.8145370483398438, "epoch": 0.22873426335488262, "grad_norm": 0.038508687168359756, "grad_norm_var": 9.074122902581287e-06, "learning_rate": 0.007644874900298086, "loss": 2.8145, "step": 2689 }, { "crossentropy": 2.80876088142395, "epoch": 0.2288193263014631, "grad_norm": 0.0356498621404171, "grad_norm_var": 1.1971452119778567e-05, "learning_rate": 0.0076431743862865375, "loss": 2.8088, "step": 2690 }, { "crossentropy": 2.7453558444976807, "epoch": 0.22890438924804354, "grad_norm": 0.03985628858208656, "grad_norm_var": 1.2151280899888075e-05, "learning_rate": 0.007641473447857344, "loss": 2.7454, "step": 2691 }, { "crossentropy": 2.745486259460449, "epoch": 0.22898945219462402, "grad_norm": 0.04004857316613197, "grad_norm_var": 1.1934851740082648e-05, "learning_rate": 0.007639772085283627, "loss": 2.7455, "step": 2692 }, { "crossentropy": 2.8322958946228027, "epoch": 0.2290745151412045, "grad_norm": 0.04262695461511612, "grad_norm_var": 1.1434072001512141e-05, "learning_rate": 0.0076380702988385766, "loss": 2.8323, "step": 2693 }, { "crossentropy": 2.7452235221862793, "epoch": 0.22915957808778495, "grad_norm": 0.04095585644245148, "grad_norm_var": 1.1333419338129213e-05, "learning_rate": 0.00763636808879545, "loss": 2.7452, "step": 2694 }, { "crossentropy": 2.8116793632507324, "epoch": 0.22924464103436543, "grad_norm": 0.04848567768931389, "grad_norm_var": 1.3615276679893908e-05, "learning_rate": 0.007634665455427574, "loss": 2.8117, "step": 2695 }, { "crossentropy": 2.7061264514923096, "epoch": 0.2293297039809459, "grad_norm": 0.043643880635499954, "grad_norm_var": 1.361753625386634e-05, "learning_rate": 0.007632962399008341, "loss": 2.7061, "step": 2696 }, { "crossentropy": 2.8467330932617188, "epoch": 0.22941476692752638, "grad_norm": 0.0408211387693882, "grad_norm_var": 1.3584115510493061e-05, "learning_rate": 0.007631258919811215, "loss": 2.8467, "step": 2697 }, { "crossentropy": 2.841611623764038, "epoch": 0.22949982987410683, "grad_norm": 0.04118550196290016, "grad_norm_var": 1.3166577259079993e-05, "learning_rate": 0.007629555018109724, "loss": 2.8416, "step": 2698 }, { "crossentropy": 2.798271417617798, "epoch": 0.2295848928206873, "grad_norm": 0.0374174490571022, "grad_norm_var": 9.075769213008102e-06, "learning_rate": 0.007627850694177466, "loss": 2.7983, "step": 2699 }, { "crossentropy": 2.7342946529388428, "epoch": 0.22966995576726779, "grad_norm": 0.038441140204668045, "grad_norm_var": 8.684277119767187e-06, "learning_rate": 0.007626145948288107, "loss": 2.7343, "step": 2700 }, { "crossentropy": 2.785051107406616, "epoch": 0.22975501871384824, "grad_norm": 0.04310668259859085, "grad_norm_var": 8.87253691831009e-06, "learning_rate": 0.007624440780715378, "loss": 2.7851, "step": 2701 }, { "crossentropy": 2.832507610321045, "epoch": 0.2298400816604287, "grad_norm": 0.0449732206761837, "grad_norm_var": 9.796700664839685e-06, "learning_rate": 0.007622735191733084, "loss": 2.8325, "step": 2702 }, { "crossentropy": 2.806694746017456, "epoch": 0.2299251446070092, "grad_norm": 0.04562501981854439, "grad_norm_var": 1.0979075779383949e-05, "learning_rate": 0.007621029181615086, "loss": 2.8067, "step": 2703 }, { "crossentropy": 2.8096442222595215, "epoch": 0.23001020755358967, "grad_norm": 0.039842549711465836, "grad_norm_var": 1.0895981137667062e-05, "learning_rate": 0.007619322750635327, "loss": 2.8096, "step": 2704 }, { "crossentropy": 2.8505568504333496, "epoch": 0.23009527050017012, "grad_norm": 0.0358809269964695, "grad_norm_var": 1.2314045122404268e-05, "learning_rate": 0.007617615899067809, "loss": 2.8506, "step": 2705 }, { "crossentropy": 2.733562469482422, "epoch": 0.2301803334467506, "grad_norm": 0.04511035606265068, "grad_norm_var": 1.0957313765154e-05, "learning_rate": 0.007615908627186603, "loss": 2.7336, "step": 2706 }, { "crossentropy": 2.8417093753814697, "epoch": 0.23026539639333107, "grad_norm": 0.04264025762677193, "grad_norm_var": 1.0738288901831113e-05, "learning_rate": 0.007614200935265845, "loss": 2.8417, "step": 2707 }, { "crossentropy": 2.7364730834960938, "epoch": 0.23035045933991152, "grad_norm": 0.040235232561826706, "grad_norm_var": 1.0693758083257556e-05, "learning_rate": 0.0076124928235797445, "loss": 2.7365, "step": 2708 }, { "crossentropy": 2.725203037261963, "epoch": 0.230435522286492, "grad_norm": 0.038426723331213, "grad_norm_var": 1.1409978239180531e-05, "learning_rate": 0.007610784292402572, "loss": 2.7252, "step": 2709 }, { "crossentropy": 2.7652645111083984, "epoch": 0.23052058523307248, "grad_norm": 0.04050518944859505, "grad_norm_var": 1.1465853110520508e-05, "learning_rate": 0.007609075342008673, "loss": 2.7653, "step": 2710 }, { "crossentropy": 2.624802589416504, "epoch": 0.23060564817965296, "grad_norm": 0.04682580754160881, "grad_norm_var": 1.0124389355272341e-05, "learning_rate": 0.0076073659726724535, "loss": 2.6248, "step": 2711 }, { "crossentropy": 2.6713385581970215, "epoch": 0.2306907111262334, "grad_norm": 0.04231613129377365, "grad_norm_var": 9.86256943244773e-06, "learning_rate": 0.0076056561846683845, "loss": 2.6713, "step": 2712 }, { "crossentropy": 2.7761106491088867, "epoch": 0.23077577407281388, "grad_norm": 0.03915045037865639, "grad_norm_var": 1.0179238257483104e-05, "learning_rate": 0.007603945978271016, "loss": 2.7761, "step": 2713 }, { "crossentropy": 2.8055739402770996, "epoch": 0.23086083701939436, "grad_norm": 0.04379977658390999, "grad_norm_var": 1.0547250824224398e-05, "learning_rate": 0.007602235353754953, "loss": 2.8056, "step": 2714 }, { "crossentropy": 2.6022727489471436, "epoch": 0.2309458999659748, "grad_norm": 0.04372694343328476, "grad_norm_var": 9.585235846358043e-06, "learning_rate": 0.007600524311394873, "loss": 2.6023, "step": 2715 }, { "crossentropy": 2.8336822986602783, "epoch": 0.2310309629125553, "grad_norm": 0.040884196758270264, "grad_norm_var": 8.82737444138082e-06, "learning_rate": 0.007598812851465522, "loss": 2.8337, "step": 2716 }, { "crossentropy": 2.8015997409820557, "epoch": 0.23111602585913577, "grad_norm": 0.03994295746088028, "grad_norm_var": 9.013783282499354e-06, "learning_rate": 0.007597100974241711, "loss": 2.8016, "step": 2717 }, { "crossentropy": 2.7773280143737793, "epoch": 0.23120108880571624, "grad_norm": 0.03833192586898804, "grad_norm_var": 9.020641521794749e-06, "learning_rate": 0.007595388679998315, "loss": 2.7773, "step": 2718 }, { "crossentropy": 2.7659096717834473, "epoch": 0.2312861517522967, "grad_norm": 0.039652690291404724, "grad_norm_var": 7.927535964188936e-06, "learning_rate": 0.007593675969010283, "loss": 2.7659, "step": 2719 }, { "crossentropy": 2.8100297451019287, "epoch": 0.23137121469887717, "grad_norm": 0.039461590349674225, "grad_norm_var": 7.999437329925656e-06, "learning_rate": 0.007591962841552627, "loss": 2.81, "step": 2720 }, { "crossentropy": 2.726746082305908, "epoch": 0.23145627764545765, "grad_norm": 0.03831042721867561, "grad_norm_var": 6.692061057499548e-06, "learning_rate": 0.0075902492979004225, "loss": 2.7267, "step": 2721 }, { "crossentropy": 2.7489354610443115, "epoch": 0.2315413405920381, "grad_norm": 0.04117703065276146, "grad_norm_var": 5.612196059007607e-06, "learning_rate": 0.007588535338328816, "loss": 2.7489, "step": 2722 }, { "crossentropy": 2.705348014831543, "epoch": 0.23162640353861857, "grad_norm": 0.04705159738659859, "grad_norm_var": 7.815727888510886e-06, "learning_rate": 0.007586820963113022, "loss": 2.7053, "step": 2723 }, { "crossentropy": 2.717064380645752, "epoch": 0.23171146648519905, "grad_norm": 0.04022694006562233, "grad_norm_var": 7.816840267674668e-06, "learning_rate": 0.00758510617252832, "loss": 2.7171, "step": 2724 }, { "crossentropy": 2.8026673793792725, "epoch": 0.23179652943177953, "grad_norm": 0.04475712403655052, "grad_norm_var": 7.949525494181596e-06, "learning_rate": 0.007583390966850052, "loss": 2.8027, "step": 2725 }, { "crossentropy": 2.762009859085083, "epoch": 0.23188159237835998, "grad_norm": 0.037358738481998444, "grad_norm_var": 9.041242502383361e-06, "learning_rate": 0.007581675346353636, "loss": 2.762, "step": 2726 }, { "crossentropy": 2.7879996299743652, "epoch": 0.23196665532494046, "grad_norm": 0.0379241518676281, "grad_norm_var": 7.596490782050639e-06, "learning_rate": 0.007579959311314549, "loss": 2.788, "step": 2727 }, { "crossentropy": 2.7909364700317383, "epoch": 0.23205171827152093, "grad_norm": 0.04154082015156746, "grad_norm_var": 7.48555283516892e-06, "learning_rate": 0.007578242862008335, "loss": 2.7909, "step": 2728 }, { "crossentropy": 2.6644341945648193, "epoch": 0.23213678121810138, "grad_norm": 0.03987224027514458, "grad_norm_var": 7.356372108235863e-06, "learning_rate": 0.007576525998710609, "loss": 2.6644, "step": 2729 }, { "crossentropy": 2.821380615234375, "epoch": 0.23222184416468186, "grad_norm": 0.04313657060265541, "grad_norm_var": 7.125337509014153e-06, "learning_rate": 0.0075748087216970465, "loss": 2.8214, "step": 2730 }, { "crossentropy": 2.796788215637207, "epoch": 0.23230690711126234, "grad_norm": 0.04130734130740166, "grad_norm_var": 6.558180067069384e-06, "learning_rate": 0.0075730910312433964, "loss": 2.7968, "step": 2731 }, { "crossentropy": 2.75942063331604, "epoch": 0.23239197005784282, "grad_norm": 0.03714658319950104, "grad_norm_var": 7.3312835147792505e-06, "learning_rate": 0.0075713729276254685, "loss": 2.7594, "step": 2732 }, { "crossentropy": 2.8564999103546143, "epoch": 0.23247703300442327, "grad_norm": 0.038284722715616226, "grad_norm_var": 7.615230932801695e-06, "learning_rate": 0.00756965441111914, "loss": 2.8565, "step": 2733 }, { "crossentropy": 2.8385262489318848, "epoch": 0.23256209595100374, "grad_norm": 0.04896543174982071, "grad_norm_var": 1.1826242241350036e-05, "learning_rate": 0.007567935482000357, "loss": 2.8385, "step": 2734 }, { "crossentropy": 2.811441421508789, "epoch": 0.23264715889758422, "grad_norm": 0.05126800015568733, "grad_norm_var": 1.8155024660822712e-05, "learning_rate": 0.007566216140545128, "loss": 2.8114, "step": 2735 }, { "crossentropy": 2.743793249130249, "epoch": 0.23273222184416467, "grad_norm": 0.041369665414094925, "grad_norm_var": 1.7803727344589162e-05, "learning_rate": 0.007564496387029531, "loss": 2.7438, "step": 2736 }, { "crossentropy": 2.6901614665985107, "epoch": 0.23281728479074515, "grad_norm": 0.04125314950942993, "grad_norm_var": 1.6953767804522613e-05, "learning_rate": 0.007562776221729708, "loss": 2.6902, "step": 2737 }, { "crossentropy": 2.8401365280151367, "epoch": 0.23290234773732563, "grad_norm": 0.043613921850919724, "grad_norm_var": 1.704452305194721e-05, "learning_rate": 0.007561055644921871, "loss": 2.8401, "step": 2738 }, { "crossentropy": 2.7209877967834473, "epoch": 0.2329874106839061, "grad_norm": 0.041948139667510986, "grad_norm_var": 1.5365799047636386e-05, "learning_rate": 0.00755933465688229, "loss": 2.721, "step": 2739 }, { "crossentropy": 2.730299949645996, "epoch": 0.23307247363048655, "grad_norm": 0.04297948256134987, "grad_norm_var": 1.5235089262085645e-05, "learning_rate": 0.007557613257887309, "loss": 2.7303, "step": 2740 }, { "crossentropy": 2.7575173377990723, "epoch": 0.23315753657706703, "grad_norm": 0.04047619551420212, "grad_norm_var": 1.4832648549775729e-05, "learning_rate": 0.007555891448213335, "loss": 2.7575, "step": 2741 }, { "crossentropy": 2.7908666133880615, "epoch": 0.2332425995236475, "grad_norm": 0.04297483712434769, "grad_norm_var": 1.3494865796312427e-05, "learning_rate": 0.007554169228136841, "loss": 2.7909, "step": 2742 }, { "crossentropy": 2.9085984230041504, "epoch": 0.23332766247022796, "grad_norm": 0.04834374785423279, "grad_norm_var": 1.4438893879833933e-05, "learning_rate": 0.007552446597934366, "loss": 2.9086, "step": 2743 }, { "crossentropy": 2.8193366527557373, "epoch": 0.23341272541680844, "grad_norm": 0.04481066018342972, "grad_norm_var": 1.4566855609188013e-05, "learning_rate": 0.007550723557882513, "loss": 2.8193, "step": 2744 }, { "crossentropy": 2.6930928230285645, "epoch": 0.23349778836338891, "grad_norm": 0.04740302637219429, "grad_norm_var": 1.4986449073639552e-05, "learning_rate": 0.007549000108257955, "loss": 2.6931, "step": 2745 }, { "crossentropy": 2.762216091156006, "epoch": 0.23358285130996936, "grad_norm": 0.044228605926036835, "grad_norm_var": 1.5014604645317783e-05, "learning_rate": 0.0075472762493374295, "loss": 2.7622, "step": 2746 }, { "crossentropy": 2.829617738723755, "epoch": 0.23366791425654984, "grad_norm": 0.03947362303733826, "grad_norm_var": 1.5766565705464113e-05, "learning_rate": 0.007545551981397732, "loss": 2.8296, "step": 2747 }, { "crossentropy": 2.756617546081543, "epoch": 0.23375297720313032, "grad_norm": 0.039994481950998306, "grad_norm_var": 1.3895609643120993e-05, "learning_rate": 0.0075438273047157365, "loss": 2.7566, "step": 2748 }, { "crossentropy": 2.9406561851501465, "epoch": 0.2338380401497108, "grad_norm": 0.04223719984292984, "grad_norm_var": 1.2077847307206958e-05, "learning_rate": 0.007542102219568377, "loss": 2.9407, "step": 2749 }, { "crossentropy": 2.74043345451355, "epoch": 0.23392310309629125, "grad_norm": 0.03694542869925499, "grad_norm_var": 1.2883516418607926e-05, "learning_rate": 0.0075403767262326475, "loss": 2.7404, "step": 2750 }, { "crossentropy": 2.731689691543579, "epoch": 0.23400816604287172, "grad_norm": 0.03932292386889458, "grad_norm_var": 8.764479370395629e-06, "learning_rate": 0.0075386508249856155, "loss": 2.7317, "step": 2751 }, { "crossentropy": 2.746471405029297, "epoch": 0.2340932289894522, "grad_norm": 0.037512507289648056, "grad_norm_var": 1.0191278439932762e-05, "learning_rate": 0.0075369245161044105, "loss": 2.7465, "step": 2752 }, { "crossentropy": 2.8110148906707764, "epoch": 0.23417829193603265, "grad_norm": 0.038488149642944336, "grad_norm_var": 1.0979419488107468e-05, "learning_rate": 0.007535197799866229, "loss": 2.811, "step": 2753 }, { "crossentropy": 2.661792039871216, "epoch": 0.23426335488261313, "grad_norm": 0.04199798032641411, "grad_norm_var": 1.0778096648469439e-05, "learning_rate": 0.007533470676548331, "loss": 2.6618, "step": 2754 }, { "crossentropy": 2.8757164478302, "epoch": 0.2343484178291936, "grad_norm": 0.041227277368307114, "grad_norm_var": 1.0798360222414686e-05, "learning_rate": 0.00753174314642804, "loss": 2.8757, "step": 2755 }, { "crossentropy": 2.680952310562134, "epoch": 0.23443348077577408, "grad_norm": 0.04245059937238693, "grad_norm_var": 1.073097623119721e-05, "learning_rate": 0.007530015209782754, "loss": 2.681, "step": 2756 }, { "crossentropy": 2.783277988433838, "epoch": 0.23451854372235453, "grad_norm": 0.03698204457759857, "grad_norm_var": 1.2084209879314963e-05, "learning_rate": 0.007528286866889924, "loss": 2.7833, "step": 2757 }, { "crossentropy": 2.7778584957122803, "epoch": 0.234603606668935, "grad_norm": 0.040216173976659775, "grad_norm_var": 1.2026408370279425e-05, "learning_rate": 0.007526558118027073, "loss": 2.7779, "step": 2758 }, { "crossentropy": 2.79962158203125, "epoch": 0.2346886696155155, "grad_norm": 0.03803286328911781, "grad_norm_var": 9.059116134957761e-06, "learning_rate": 0.007524828963471792, "loss": 2.7996, "step": 2759 }, { "crossentropy": 2.698960542678833, "epoch": 0.23477373256209594, "grad_norm": 0.038300368934869766, "grad_norm_var": 8.146599380099799e-06, "learning_rate": 0.0075230994035017295, "loss": 2.699, "step": 2760 }, { "crossentropy": 2.8242955207824707, "epoch": 0.23485879550867642, "grad_norm": 0.043007317930459976, "grad_norm_var": 5.191681299415718e-06, "learning_rate": 0.007521369438394603, "loss": 2.8243, "step": 2761 }, { "crossentropy": 2.764752149581909, "epoch": 0.2349438584552569, "grad_norm": 0.04017064347863197, "grad_norm_var": 3.947055844964684e-06, "learning_rate": 0.007519639068428198, "loss": 2.7648, "step": 2762 }, { "crossentropy": 2.78861665725708, "epoch": 0.23502892140183737, "grad_norm": 0.038357432931661606, "grad_norm_var": 4.069399961464706e-06, "learning_rate": 0.007517908293880359, "loss": 2.7886, "step": 2763 }, { "crossentropy": 2.7703030109405518, "epoch": 0.23511398434841782, "grad_norm": 0.04274633899331093, "grad_norm_var": 4.6497493105921345e-06, "learning_rate": 0.007516177115029002, "loss": 2.7703, "step": 2764 }, { "crossentropy": 2.703447103500366, "epoch": 0.2351990472949983, "grad_norm": 0.04103494808077812, "grad_norm_var": 4.361378618096488e-06, "learning_rate": 0.0075144455321521, "loss": 2.7034, "step": 2765 }, { "crossentropy": 2.7737302780151367, "epoch": 0.23528411024157878, "grad_norm": 0.04005596414208412, "grad_norm_var": 3.7823751587603403e-06, "learning_rate": 0.007512713545527698, "loss": 2.7737, "step": 2766 }, { "crossentropy": 2.8802616596221924, "epoch": 0.23536917318815923, "grad_norm": 0.040437862277030945, "grad_norm_var": 3.7603113244686254e-06, "learning_rate": 0.007510981155433904, "loss": 2.8803, "step": 2767 }, { "crossentropy": 2.772146701812744, "epoch": 0.2354542361347397, "grad_norm": 0.04015212133526802, "grad_norm_var": 3.297911420854782e-06, "learning_rate": 0.007509248362148889, "loss": 2.7721, "step": 2768 }, { "crossentropy": 2.7752997875213623, "epoch": 0.23553929908132018, "grad_norm": 0.040976233780384064, "grad_norm_var": 3.1074265742815632e-06, "learning_rate": 0.007507515165950887, "loss": 2.7753, "step": 2769 }, { "crossentropy": 2.8100712299346924, "epoch": 0.23562436202790066, "grad_norm": 0.03880362957715988, "grad_norm_var": 3.057810752286373e-06, "learning_rate": 0.0075057815671182035, "loss": 2.8101, "step": 2770 }, { "crossentropy": 2.8333308696746826, "epoch": 0.2357094249744811, "grad_norm": 0.04304995760321617, "grad_norm_var": 3.5188686478494654e-06, "learning_rate": 0.007504047565929203, "loss": 2.8333, "step": 2771 }, { "crossentropy": 2.826805591583252, "epoch": 0.23579448792106159, "grad_norm": 0.04316813871264458, "grad_norm_var": 3.756951996308535e-06, "learning_rate": 0.007502313162662315, "loss": 2.8268, "step": 2772 }, { "crossentropy": 2.8569765090942383, "epoch": 0.23587955086764206, "grad_norm": 0.042153265327215195, "grad_norm_var": 3.1107575116123253e-06, "learning_rate": 0.007500578357596036, "loss": 2.857, "step": 2773 }, { "crossentropy": 2.794832229614258, "epoch": 0.2359646138142225, "grad_norm": 0.04296094924211502, "grad_norm_var": 3.4168305521673863e-06, "learning_rate": 0.007498843151008926, "loss": 2.7948, "step": 2774 }, { "crossentropy": 2.821463108062744, "epoch": 0.236049676760803, "grad_norm": 0.042654551565647125, "grad_norm_var": 3.023233677420557e-06, "learning_rate": 0.007497107543179607, "loss": 2.8215, "step": 2775 }, { "crossentropy": 2.8231751918792725, "epoch": 0.23613473970738347, "grad_norm": 0.04456700384616852, "grad_norm_var": 3.115976786755015e-06, "learning_rate": 0.00749537153438677, "loss": 2.8232, "step": 2776 }, { "crossentropy": 2.7141358852386475, "epoch": 0.23621980265396395, "grad_norm": 0.039977215230464935, "grad_norm_var": 3.088328164859745e-06, "learning_rate": 0.007493635124909166, "loss": 2.7141, "step": 2777 }, { "crossentropy": 2.783200740814209, "epoch": 0.2363048656005444, "grad_norm": 0.03971040993928909, "grad_norm_var": 3.17265718586604e-06, "learning_rate": 0.007491898315025614, "loss": 2.7832, "step": 2778 }, { "crossentropy": 2.7890098094940186, "epoch": 0.23638992854712487, "grad_norm": 0.049589335918426514, "grad_norm_var": 6.650070891687544e-06, "learning_rate": 0.007490161105014996, "loss": 2.789, "step": 2779 }, { "crossentropy": 2.897890090942383, "epoch": 0.23647499149370535, "grad_norm": 0.041464000940322876, "grad_norm_var": 6.625642747973975e-06, "learning_rate": 0.007488423495156257, "loss": 2.8979, "step": 2780 }, { "crossentropy": 2.8393044471740723, "epoch": 0.2365600544402858, "grad_norm": 0.045991361141204834, "grad_norm_var": 7.574658710359818e-06, "learning_rate": 0.007486685485728406, "loss": 2.8393, "step": 2781 }, { "crossentropy": 2.804455280303955, "epoch": 0.23664511738686628, "grad_norm": 0.03980433940887451, "grad_norm_var": 7.651621825097849e-06, "learning_rate": 0.007484947077010519, "loss": 2.8045, "step": 2782 }, { "crossentropy": 2.8176534175872803, "epoch": 0.23673018033344675, "grad_norm": 0.040996622294187546, "grad_norm_var": 7.538641111721513e-06, "learning_rate": 0.007483208269281731, "loss": 2.8177, "step": 2783 }, { "crossentropy": 2.7470951080322266, "epoch": 0.23681524328002723, "grad_norm": 0.046151045709848404, "grad_norm_var": 8.10887578246011e-06, "learning_rate": 0.007481469062821251, "loss": 2.7471, "step": 2784 }, { "crossentropy": 2.7967097759246826, "epoch": 0.23690030622660768, "grad_norm": 0.050247736275196075, "grad_norm_var": 1.1441822460829474e-05, "learning_rate": 0.00747972945790834, "loss": 2.7967, "step": 2785 }, { "crossentropy": 2.726191282272339, "epoch": 0.23698536917318816, "grad_norm": 0.04369185119867325, "grad_norm_var": 1.0066202459671434e-05, "learning_rate": 0.007477989454822328, "loss": 2.7262, "step": 2786 }, { "crossentropy": 2.834801197052002, "epoch": 0.23707043211976864, "grad_norm": 0.044415757060050964, "grad_norm_var": 1.0098811299850269e-05, "learning_rate": 0.007476249053842612, "loss": 2.8348, "step": 2787 }, { "crossentropy": 2.823057174682617, "epoch": 0.2371554950663491, "grad_norm": 0.04016951099038124, "grad_norm_var": 1.0832052550934408e-05, "learning_rate": 0.00747450825524865, "loss": 2.8231, "step": 2788 }, { "crossentropy": 2.788226366043091, "epoch": 0.23724055801292956, "grad_norm": 0.040946587920188904, "grad_norm_var": 1.112510211732174e-05, "learning_rate": 0.007472767059319964, "loss": 2.7882, "step": 2789 }, { "crossentropy": 2.7518081665039062, "epoch": 0.23732562095951004, "grad_norm": 0.040507055819034576, "grad_norm_var": 1.16233915865034e-05, "learning_rate": 0.007471025466336139, "loss": 2.7518, "step": 2790 }, { "crossentropy": 2.829108238220215, "epoch": 0.23741068390609052, "grad_norm": 0.04129473865032196, "grad_norm_var": 1.1834277686032362e-05, "learning_rate": 0.007469283476576823, "loss": 2.8291, "step": 2791 }, { "crossentropy": 2.8455188274383545, "epoch": 0.23749574685267097, "grad_norm": 0.046161286532878876, "grad_norm_var": 1.2305980885860396e-05, "learning_rate": 0.007467541090321735, "loss": 2.8455, "step": 2792 }, { "crossentropy": 2.7661569118499756, "epoch": 0.23758080979925145, "grad_norm": 0.043264277279376984, "grad_norm_var": 1.1571036379860213e-05, "learning_rate": 0.007465798307850646, "loss": 2.7662, "step": 2793 }, { "crossentropy": 2.883883476257324, "epoch": 0.23766587274583192, "grad_norm": 0.04217343404889107, "grad_norm_var": 1.0738397179376376e-05, "learning_rate": 0.0074640551294434, "loss": 2.8839, "step": 2794 }, { "crossentropy": 2.811652660369873, "epoch": 0.23775093569241237, "grad_norm": 0.038910314440727234, "grad_norm_var": 9.27289971764204e-06, "learning_rate": 0.0074623115553799, "loss": 2.8117, "step": 2795 }, { "crossentropy": 2.8086798191070557, "epoch": 0.23783599863899285, "grad_norm": 0.040800344198942184, "grad_norm_var": 9.426333452078733e-06, "learning_rate": 0.007460567585940115, "loss": 2.8087, "step": 2796 }, { "crossentropy": 2.721907377243042, "epoch": 0.23792106158557333, "grad_norm": 0.04153269901871681, "grad_norm_var": 8.798570568555042e-06, "learning_rate": 0.007458823221404074, "loss": 2.7219, "step": 2797 }, { "crossentropy": 2.796032667160034, "epoch": 0.2380061245321538, "grad_norm": 0.044399797916412354, "grad_norm_var": 8.425870018273958e-06, "learning_rate": 0.007457078462051875, "loss": 2.796, "step": 2798 }, { "crossentropy": 2.7309255599975586, "epoch": 0.23809118747873426, "grad_norm": 0.043642595410346985, "grad_norm_var": 8.208187920708393e-06, "learning_rate": 0.007455333308163672, "loss": 2.7309, "step": 2799 }, { "crossentropy": 2.8095643520355225, "epoch": 0.23817625042531473, "grad_norm": 0.039911363273859024, "grad_norm_var": 8.036072488277051e-06, "learning_rate": 0.00745358776001969, "loss": 2.8096, "step": 2800 }, { "crossentropy": 2.7953712940216064, "epoch": 0.2382613133718952, "grad_norm": 0.03822449594736099, "grad_norm_var": 4.85791647914276e-06, "learning_rate": 0.007451841817900211, "loss": 2.7954, "step": 2801 }, { "crossentropy": 2.6551826000213623, "epoch": 0.23834637631847566, "grad_norm": 0.05789019167423248, "grad_norm_var": 2.0891518239451028e-05, "learning_rate": 0.0074500954820855845, "loss": 2.6552, "step": 2802 }, { "crossentropy": 2.82830810546875, "epoch": 0.23843143926505614, "grad_norm": 0.03924015164375305, "grad_norm_var": 2.1426735265933117e-05, "learning_rate": 0.007448348752856222, "loss": 2.8283, "step": 2803 }, { "crossentropy": 2.8446342945098877, "epoch": 0.23851650221163662, "grad_norm": 0.04041347652673721, "grad_norm_var": 2.1356540429591046e-05, "learning_rate": 0.007446601630492598, "loss": 2.8446, "step": 2804 }, { "crossentropy": 2.8883204460144043, "epoch": 0.2386015651582171, "grad_norm": 0.03885142132639885, "grad_norm_var": 2.205285425921488e-05, "learning_rate": 0.007444854115275248, "loss": 2.8883, "step": 2805 }, { "crossentropy": 2.632922649383545, "epoch": 0.23868662810479754, "grad_norm": 0.039521314203739166, "grad_norm_var": 2.2352666033077725e-05, "learning_rate": 0.007443106207484776, "loss": 2.6329, "step": 2806 }, { "crossentropy": 2.770291328430176, "epoch": 0.23877169105137802, "grad_norm": 0.05148858204483986, "grad_norm_var": 2.752924767001619e-05, "learning_rate": 0.007441357907401841, "loss": 2.7703, "step": 2807 }, { "crossentropy": 2.7458317279815674, "epoch": 0.2388567539979585, "grad_norm": 0.05152396112680435, "grad_norm_var": 3.1657385347236276e-05, "learning_rate": 0.007439609215307173, "loss": 2.7458, "step": 2808 }, { "crossentropy": 2.7940566539764404, "epoch": 0.23894181694453895, "grad_norm": 0.04145431891083717, "grad_norm_var": 3.18554954170153e-05, "learning_rate": 0.007437860131481562, "loss": 2.7941, "step": 2809 }, { "crossentropy": 2.753114938735962, "epoch": 0.23902687989111943, "grad_norm": 0.03702355921268463, "grad_norm_var": 3.416553950431403e-05, "learning_rate": 0.007436110656205859, "loss": 2.7531, "step": 2810 }, { "crossentropy": 2.7182302474975586, "epoch": 0.2391119428376999, "grad_norm": 0.04176818206906319, "grad_norm_var": 3.3193160793826684e-05, "learning_rate": 0.007434360789760978, "loss": 2.7182, "step": 2811 }, { "crossentropy": 2.7289764881134033, "epoch": 0.23919700578428038, "grad_norm": 0.04508601129055023, "grad_norm_var": 3.309536029457808e-05, "learning_rate": 0.0074326105324279, "loss": 2.729, "step": 2812 }, { "crossentropy": 2.7468369007110596, "epoch": 0.23928206873086083, "grad_norm": 0.04402649775147438, "grad_norm_var": 3.291361542665979e-05, "learning_rate": 0.0074308598844876625, "loss": 2.7468, "step": 2813 }, { "crossentropy": 2.8370840549468994, "epoch": 0.2393671316774413, "grad_norm": 0.041619058698415756, "grad_norm_var": 3.302773459593755e-05, "learning_rate": 0.007429108846221373, "loss": 2.8371, "step": 2814 }, { "crossentropy": 2.813896417617798, "epoch": 0.23945219462402179, "grad_norm": 0.037053290754556656, "grad_norm_var": 3.537920700457849e-05, "learning_rate": 0.007427357417910197, "loss": 2.8139, "step": 2815 }, { "crossentropy": 2.7925918102264404, "epoch": 0.23953725757060224, "grad_norm": 0.03965253755450249, "grad_norm_var": 3.548371922742465e-05, "learning_rate": 0.00742560559983536, "loss": 2.7926, "step": 2816 }, { "crossentropy": 2.804011583328247, "epoch": 0.2396223205171827, "grad_norm": 0.03911024332046509, "grad_norm_var": 3.4992114597017415e-05, "learning_rate": 0.007423853392278157, "loss": 2.804, "step": 2817 }, { "crossentropy": 2.813445568084717, "epoch": 0.2397073834637632, "grad_norm": 0.04444584622979164, "grad_norm_var": 1.9342036302977665e-05, "learning_rate": 0.007422100795519942, "loss": 2.8134, "step": 2818 }, { "crossentropy": 2.789562225341797, "epoch": 0.23979244641034367, "grad_norm": 0.040032077580690384, "grad_norm_var": 1.9087982622639545e-05, "learning_rate": 0.00742034780984213, "loss": 2.7896, "step": 2819 }, { "crossentropy": 2.794212818145752, "epoch": 0.23987750935692412, "grad_norm": 0.03817117214202881, "grad_norm_var": 1.989655827624633e-05, "learning_rate": 0.007418594435526199, "loss": 2.7942, "step": 2820 }, { "crossentropy": 2.7895452976226807, "epoch": 0.2399625723035046, "grad_norm": 0.038590218871831894, "grad_norm_var": 2.0007927061021134e-05, "learning_rate": 0.007416840672853693, "loss": 2.7895, "step": 2821 }, { "crossentropy": 2.6765947341918945, "epoch": 0.24004763525008507, "grad_norm": 0.04157203063368797, "grad_norm_var": 1.961751386685967e-05, "learning_rate": 0.007415086522106215, "loss": 2.6766, "step": 2822 }, { "crossentropy": 2.70969557762146, "epoch": 0.24013269819666552, "grad_norm": 0.03982764855027199, "grad_norm_var": 1.3423350055609811e-05, "learning_rate": 0.00741333198356543, "loss": 2.7097, "step": 2823 }, { "crossentropy": 2.7593929767608643, "epoch": 0.240217761143246, "grad_norm": 0.03601807355880737, "grad_norm_var": 7.33308107190387e-06, "learning_rate": 0.007411577057513066, "loss": 2.7594, "step": 2824 }, { "crossentropy": 2.786867141723633, "epoch": 0.24030282408982648, "grad_norm": 0.03813221678137779, "grad_norm_var": 7.529567647471663e-06, "learning_rate": 0.007409821744230917, "loss": 2.7869, "step": 2825 }, { "crossentropy": 2.7343950271606445, "epoch": 0.24038788703640693, "grad_norm": 0.03895770013332367, "grad_norm_var": 6.961483692303055e-06, "learning_rate": 0.007408066044000832, "loss": 2.7344, "step": 2826 }, { "crossentropy": 2.76627779006958, "epoch": 0.2404729499829874, "grad_norm": 0.03748229891061783, "grad_norm_var": 7.244209707446316e-06, "learning_rate": 0.007406309957104727, "loss": 2.7663, "step": 2827 }, { "crossentropy": 2.797173261642456, "epoch": 0.24055801292956788, "grad_norm": 0.03639025241136551, "grad_norm_var": 6.057161351915215e-06, "learning_rate": 0.00740455348382458, "loss": 2.7972, "step": 2828 }, { "crossentropy": 2.728943109512329, "epoch": 0.24064307587614836, "grad_norm": 0.04196621850132942, "grad_norm_var": 5.063236186395037e-06, "learning_rate": 0.007402796624442428, "loss": 2.7289, "step": 2829 }, { "crossentropy": 2.783353567123413, "epoch": 0.2407281388227288, "grad_norm": 0.0400133915245533, "grad_norm_var": 4.730842331668085e-06, "learning_rate": 0.007401039379240373, "loss": 2.7834, "step": 2830 }, { "crossentropy": 2.654865264892578, "epoch": 0.2408132017693093, "grad_norm": 0.04113433510065079, "grad_norm_var": 4.596346999206855e-06, "learning_rate": 0.007399281748500579, "loss": 2.6549, "step": 2831 }, { "crossentropy": 2.8293819427490234, "epoch": 0.24089826471588976, "grad_norm": 0.041187919676303864, "grad_norm_var": 4.7813567701502266e-06, "learning_rate": 0.0073975237325052704, "loss": 2.8294, "step": 2832 }, { "crossentropy": 2.813692808151245, "epoch": 0.24098332766247021, "grad_norm": 0.04155532643198967, "grad_norm_var": 5.006923277883248e-06, "learning_rate": 0.0073957653315367316, "loss": 2.8137, "step": 2833 }, { "crossentropy": 2.7320399284362793, "epoch": 0.2410683906090507, "grad_norm": 0.038180746138095856, "grad_norm_var": 3.5101615270184685e-06, "learning_rate": 0.0073940065458773145, "loss": 2.732, "step": 2834 }, { "crossentropy": 2.758049488067627, "epoch": 0.24115345355563117, "grad_norm": 0.03829457610845566, "grad_norm_var": 3.535205391048436e-06, "learning_rate": 0.007392247375809426, "loss": 2.758, "step": 2835 }, { "crossentropy": 2.831192970275879, "epoch": 0.24123851650221165, "grad_norm": 0.03850386291742325, "grad_norm_var": 3.495725557868939e-06, "learning_rate": 0.007390487821615543, "loss": 2.8312, "step": 2836 }, { "crossentropy": 2.84875750541687, "epoch": 0.2413235794487921, "grad_norm": 0.039806511253118515, "grad_norm_var": 3.4831458365931653e-06, "learning_rate": 0.007388727883578194, "loss": 2.8488, "step": 2837 }, { "crossentropy": 2.734262704849243, "epoch": 0.24140864239537257, "grad_norm": 0.04278738796710968, "grad_norm_var": 3.941381750147827e-06, "learning_rate": 0.007386967561979977, "loss": 2.7343, "step": 2838 }, { "crossentropy": 2.8061344623565674, "epoch": 0.24149370534195305, "grad_norm": 0.05223431810736656, "grad_norm_var": 1.4285849072112611e-05, "learning_rate": 0.0073852068571035505, "loss": 2.8061, "step": 2839 }, { "crossentropy": 2.7421059608459473, "epoch": 0.2415787682885335, "grad_norm": 0.04522287845611572, "grad_norm_var": 1.449142904269285e-05, "learning_rate": 0.007383445769231627, "loss": 2.7421, "step": 2840 }, { "crossentropy": 2.7654919624328613, "epoch": 0.24166383123511398, "grad_norm": 0.03829726204276085, "grad_norm_var": 1.443573089861816e-05, "learning_rate": 0.007381684298646992, "loss": 2.7655, "step": 2841 }, { "crossentropy": 2.8166706562042236, "epoch": 0.24174889418169446, "grad_norm": 0.042953647673130035, "grad_norm_var": 1.4478281845932648e-05, "learning_rate": 0.007379922445632486, "loss": 2.8167, "step": 2842 }, { "crossentropy": 2.753509998321533, "epoch": 0.24183395712827493, "grad_norm": 0.042066626250743866, "grad_norm_var": 1.3641195302552949e-05, "learning_rate": 0.007378160210471011, "loss": 2.7535, "step": 2843 }, { "crossentropy": 2.845703601837158, "epoch": 0.24191902007485538, "grad_norm": 0.03755819797515869, "grad_norm_var": 1.2963868337320662e-05, "learning_rate": 0.00737639759344553, "loss": 2.8457, "step": 2844 }, { "crossentropy": 2.7823216915130615, "epoch": 0.24200408302143586, "grad_norm": 0.03873010724782944, "grad_norm_var": 1.3356908733028599e-05, "learning_rate": 0.007374634594839072, "loss": 2.7823, "step": 2845 }, { "crossentropy": 2.7676405906677246, "epoch": 0.24208914596801634, "grad_norm": 0.04353141784667969, "grad_norm_var": 1.3593565394443009e-05, "learning_rate": 0.007372871214934721, "loss": 2.7676, "step": 2846 }, { "crossentropy": 2.7961764335632324, "epoch": 0.2421742089145968, "grad_norm": 0.0381845124065876, "grad_norm_var": 1.423317126506973e-05, "learning_rate": 0.0073711074540156254, "loss": 2.7962, "step": 2847 }, { "crossentropy": 2.8518903255462646, "epoch": 0.24225927186117727, "grad_norm": 0.043841756880283356, "grad_norm_var": 1.467139045364333e-05, "learning_rate": 0.007369343312364994, "loss": 2.8519, "step": 2848 }, { "crossentropy": 2.7942419052124023, "epoch": 0.24234433480775774, "grad_norm": 0.04684668034315109, "grad_norm_var": 1.655957665295781e-05, "learning_rate": 0.007367578790266097, "loss": 2.7942, "step": 2849 }, { "crossentropy": 2.8075826168060303, "epoch": 0.24242939775433822, "grad_norm": 0.04117203131318092, "grad_norm_var": 1.571917726188628e-05, "learning_rate": 0.007365813888002269, "loss": 2.8076, "step": 2850 }, { "crossentropy": 2.7625885009765625, "epoch": 0.24251446070091867, "grad_norm": 0.042770083993673325, "grad_norm_var": 1.4833315821604956e-05, "learning_rate": 0.007364048605856897, "loss": 2.7626, "step": 2851 }, { "crossentropy": 2.766958475112915, "epoch": 0.24259952364749915, "grad_norm": 0.04269348084926605, "grad_norm_var": 1.3889836853647752e-05, "learning_rate": 0.007362282944113439, "loss": 2.767, "step": 2852 }, { "crossentropy": 2.8278696537017822, "epoch": 0.24268458659407963, "grad_norm": 0.04231198504567146, "grad_norm_var": 1.3409586235860887e-05, "learning_rate": 0.007360516903055408, "loss": 2.8279, "step": 2853 }, { "crossentropy": 2.849299907684326, "epoch": 0.24276964954066008, "grad_norm": 0.040632251650094986, "grad_norm_var": 1.3638887160030179e-05, "learning_rate": 0.007358750482966379, "loss": 2.8493, "step": 2854 }, { "crossentropy": 2.8390464782714844, "epoch": 0.24285471248724055, "grad_norm": 0.040770772844552994, "grad_norm_var": 6.88253512624992e-06, "learning_rate": 0.00735698368412999, "loss": 2.839, "step": 2855 }, { "crossentropy": 2.741243839263916, "epoch": 0.24293977543382103, "grad_norm": 0.03744812309741974, "grad_norm_var": 7.033384992757794e-06, "learning_rate": 0.0073552165068299325, "loss": 2.7412, "step": 2856 }, { "crossentropy": 2.7400481700897217, "epoch": 0.2430248383804015, "grad_norm": 0.041164692491292953, "grad_norm_var": 6.422932553349671e-06, "learning_rate": 0.00735344895134997, "loss": 2.74, "step": 2857 }, { "crossentropy": 2.8089828491210938, "epoch": 0.24310990132698196, "grad_norm": 0.0444965660572052, "grad_norm_var": 6.887786639501493e-06, "learning_rate": 0.00735168101797392, "loss": 2.809, "step": 2858 }, { "crossentropy": 2.753152370452881, "epoch": 0.24319496427356244, "grad_norm": 0.04458259791135788, "grad_norm_var": 7.468903154533799e-06, "learning_rate": 0.0073499127069856585, "loss": 2.7532, "step": 2859 }, { "crossentropy": 2.7404093742370605, "epoch": 0.2432800272201429, "grad_norm": 0.039920028299093246, "grad_norm_var": 6.522392509228138e-06, "learning_rate": 0.007348144018669129, "loss": 2.7404, "step": 2860 }, { "crossentropy": 2.8322396278381348, "epoch": 0.24336509016672336, "grad_norm": 0.03842007741332054, "grad_norm_var": 6.656068579538541e-06, "learning_rate": 0.00734637495330833, "loss": 2.8322, "step": 2861 }, { "crossentropy": 2.8716142177581787, "epoch": 0.24345015311330384, "grad_norm": 0.03815935179591179, "grad_norm_var": 7.219010437557044e-06, "learning_rate": 0.007344605511187322, "loss": 2.8716, "step": 2862 }, { "crossentropy": 2.8503739833831787, "epoch": 0.24353521605988432, "grad_norm": 0.037563107907772064, "grad_norm_var": 7.5148162106838366e-06, "learning_rate": 0.007342835692590228, "loss": 2.8504, "step": 2863 }, { "crossentropy": 2.792025089263916, "epoch": 0.2436202790064648, "grad_norm": 0.04452351853251457, "grad_norm_var": 7.763589529657018e-06, "learning_rate": 0.007341065497801229, "loss": 2.792, "step": 2864 }, { "crossentropy": 2.8171958923339844, "epoch": 0.24370534195304525, "grad_norm": 0.041588395833969116, "grad_norm_var": 5.72011491053621e-06, "learning_rate": 0.007339294927104567, "loss": 2.8172, "step": 2865 }, { "crossentropy": 2.7099411487579346, "epoch": 0.24379040489962572, "grad_norm": 0.040079809725284576, "grad_norm_var": 5.789800704266284e-06, "learning_rate": 0.007337523980784546, "loss": 2.7099, "step": 2866 }, { "crossentropy": 2.8551509380340576, "epoch": 0.2438754678462062, "grad_norm": 0.039116110652685165, "grad_norm_var": 5.7961433810250925e-06, "learning_rate": 0.007335752659125526, "loss": 2.8552, "step": 2867 }, { "crossentropy": 2.7982723712921143, "epoch": 0.24396053079278665, "grad_norm": 0.041143592447042465, "grad_norm_var": 5.5636515903102755e-06, "learning_rate": 0.007333980962411934, "loss": 2.7983, "step": 2868 }, { "crossentropy": 2.804028272628784, "epoch": 0.24404559373936713, "grad_norm": 0.044512469321489334, "grad_norm_var": 6.326016906053195e-06, "learning_rate": 0.007332208890928251, "loss": 2.804, "step": 2869 }, { "crossentropy": 2.673600912094116, "epoch": 0.2441306566859476, "grad_norm": 0.04214152321219444, "grad_norm_var": 6.418008229471435e-06, "learning_rate": 0.007330436444959021, "loss": 2.6736, "step": 2870 }, { "crossentropy": 2.808584213256836, "epoch": 0.24421571963252808, "grad_norm": 0.04328044503927231, "grad_norm_var": 6.742679699179149e-06, "learning_rate": 0.007328663624788847, "loss": 2.8086, "step": 2871 }, { "crossentropy": 2.7791006565093994, "epoch": 0.24430078257910853, "grad_norm": 0.04047631472349167, "grad_norm_var": 5.827686289141115e-06, "learning_rate": 0.007326890430702396, "loss": 2.7791, "step": 2872 }, { "crossentropy": 2.761646032333374, "epoch": 0.244385845525689, "grad_norm": 0.04030491039156914, "grad_norm_var": 5.892040171476669e-06, "learning_rate": 0.007325116862984387, "loss": 2.7616, "step": 2873 }, { "crossentropy": 2.7629690170288086, "epoch": 0.2444709084722695, "grad_norm": 0.038293104618787766, "grad_norm_var": 5.627861792650615e-06, "learning_rate": 0.0073233429219196075, "loss": 2.763, "step": 2874 }, { "crossentropy": 2.818310022354126, "epoch": 0.24455597141884994, "grad_norm": 0.03586680814623833, "grad_norm_var": 6.074706859473568e-06, "learning_rate": 0.0073215686077929025, "loss": 2.8183, "step": 2875 }, { "crossentropy": 2.822824001312256, "epoch": 0.24464103436543042, "grad_norm": 0.03764777258038521, "grad_norm_var": 6.523686282029241e-06, "learning_rate": 0.00731979392088917, "loss": 2.8228, "step": 2876 }, { "crossentropy": 2.733560800552368, "epoch": 0.2447260973120109, "grad_norm": 0.04115423187613487, "grad_norm_var": 6.343917556920521e-06, "learning_rate": 0.007318018861493379, "loss": 2.7336, "step": 2877 }, { "crossentropy": 2.8011326789855957, "epoch": 0.24481116025859137, "grad_norm": 0.045085225254297256, "grad_norm_var": 7.304433060166017e-06, "learning_rate": 0.007316243429890551, "loss": 2.8011, "step": 2878 }, { "crossentropy": 2.8194804191589355, "epoch": 0.24489622320517182, "grad_norm": 0.04199482128024101, "grad_norm_var": 6.620111389757377e-06, "learning_rate": 0.00731446762636577, "loss": 2.8195, "step": 2879 }, { "crossentropy": 2.6837103366851807, "epoch": 0.2449812861517523, "grad_norm": 0.03855282440781593, "grad_norm_var": 6.103296219034151e-06, "learning_rate": 0.007312691451204178, "loss": 2.6837, "step": 2880 }, { "crossentropy": 2.5748445987701416, "epoch": 0.24506634909833278, "grad_norm": 0.03927692770957947, "grad_norm_var": 6.1641655942780425e-06, "learning_rate": 0.007310914904690975, "loss": 2.5748, "step": 2881 }, { "crossentropy": 2.8425381183624268, "epoch": 0.24515141204491323, "grad_norm": 0.039852846413850784, "grad_norm_var": 6.181853905468509e-06, "learning_rate": 0.007309137987111428, "loss": 2.8425, "step": 2882 }, { "crossentropy": 2.7584128379821777, "epoch": 0.2452364749914937, "grad_norm": 0.03740818053483963, "grad_norm_var": 6.689274727404252e-06, "learning_rate": 0.007307360698750856, "loss": 2.7584, "step": 2883 }, { "crossentropy": 2.869688034057617, "epoch": 0.24532153793807418, "grad_norm": 0.039300743490457535, "grad_norm_var": 6.72791137616401e-06, "learning_rate": 0.007305583039894641, "loss": 2.8697, "step": 2884 }, { "crossentropy": 2.8183794021606445, "epoch": 0.24540660088465466, "grad_norm": 0.040235091000795364, "grad_norm_var": 5.481411233383924e-06, "learning_rate": 0.007303805010828224, "loss": 2.8184, "step": 2885 }, { "crossentropy": 2.8614425659179688, "epoch": 0.2454916638312351, "grad_norm": 0.03962451219558716, "grad_norm_var": 5.176957396381352e-06, "learning_rate": 0.007302026611837105, "loss": 2.8614, "step": 2886 }, { "crossentropy": 2.8380963802337646, "epoch": 0.24557672677781558, "grad_norm": 0.04454365372657776, "grad_norm_var": 5.846525646739595e-06, "learning_rate": 0.007300247843206843, "loss": 2.8381, "step": 2887 }, { "crossentropy": 2.743953227996826, "epoch": 0.24566178972439606, "grad_norm": 0.04122104495763779, "grad_norm_var": 5.9308573095805e-06, "learning_rate": 0.007298468705223058, "loss": 2.744, "step": 2888 }, { "crossentropy": 2.920757293701172, "epoch": 0.2457468526709765, "grad_norm": 0.039348192512989044, "grad_norm_var": 5.952060700388044e-06, "learning_rate": 0.007296689198171427, "loss": 2.9208, "step": 2889 }, { "crossentropy": 2.7570178508758545, "epoch": 0.245831915617557, "grad_norm": 0.04190237820148468, "grad_norm_var": 5.962685351054188e-06, "learning_rate": 0.007294909322337688, "loss": 2.757, "step": 2890 }, { "crossentropy": 2.9601993560791016, "epoch": 0.24591697856413747, "grad_norm": 0.039865221828222275, "grad_norm_var": 4.65792895151338e-06, "learning_rate": 0.00729312907800764, "loss": 2.9602, "step": 2891 }, { "crossentropy": 2.776327133178711, "epoch": 0.24600204151071794, "grad_norm": 0.04358323663473129, "grad_norm_var": 4.651334340468003e-06, "learning_rate": 0.007291348465467136, "loss": 2.7763, "step": 2892 }, { "crossentropy": 2.770656108856201, "epoch": 0.2460871044572984, "grad_norm": 0.03931541368365288, "grad_norm_var": 4.778098765280634e-06, "learning_rate": 0.007289567485002092, "loss": 2.7707, "step": 2893 }, { "crossentropy": 2.800978899002075, "epoch": 0.24617216740387887, "grad_norm": 0.04012373089790344, "grad_norm_var": 3.41194788061368e-06, "learning_rate": 0.007287786136898483, "loss": 2.801, "step": 2894 }, { "crossentropy": 2.812333106994629, "epoch": 0.24625723035045935, "grad_norm": 0.03602316975593567, "grad_norm_var": 4.358408139237592e-06, "learning_rate": 0.007286004421442341, "loss": 2.8123, "step": 2895 }, { "crossentropy": 2.7300326824188232, "epoch": 0.2463422932970398, "grad_norm": 0.03845760598778725, "grad_norm_var": 4.377488415346677e-06, "learning_rate": 0.0072842223389197585, "loss": 2.73, "step": 2896 }, { "crossentropy": 2.817429542541504, "epoch": 0.24642735624362028, "grad_norm": 0.0373416431248188, "grad_norm_var": 4.799473035040485e-06, "learning_rate": 0.007282439889616886, "loss": 2.8174, "step": 2897 }, { "crossentropy": 2.816124439239502, "epoch": 0.24651241919020075, "grad_norm": 0.0390852726995945, "grad_norm_var": 4.839501525673248e-06, "learning_rate": 0.007280657073819935, "loss": 2.8161, "step": 2898 }, { "crossentropy": 2.664424180984497, "epoch": 0.24659748213678123, "grad_norm": 0.039988547563552856, "grad_norm_var": 4.420289723278993e-06, "learning_rate": 0.007278873891815175, "loss": 2.6644, "step": 2899 }, { "crossentropy": 2.7779016494750977, "epoch": 0.24668254508336168, "grad_norm": 0.043471559882164, "grad_norm_var": 5.1200679253987635e-06, "learning_rate": 0.00727709034388893, "loss": 2.7779, "step": 2900 }, { "crossentropy": 2.773155689239502, "epoch": 0.24676760802994216, "grad_norm": 0.04095255210995674, "grad_norm_var": 5.1500347195041854e-06, "learning_rate": 0.007275306430327589, "loss": 2.7732, "step": 2901 }, { "crossentropy": 2.7340636253356934, "epoch": 0.24685267097652264, "grad_norm": 0.039836760610342026, "grad_norm_var": 5.1336497122628565e-06, "learning_rate": 0.007273522151417597, "loss": 2.7341, "step": 2902 }, { "crossentropy": 2.864208936691284, "epoch": 0.2469377339231031, "grad_norm": 0.04059312865138054, "grad_norm_var": 3.88233612449787e-06, "learning_rate": 0.007271737507445458, "loss": 2.8642, "step": 2903 }, { "crossentropy": 2.848397731781006, "epoch": 0.24702279686968356, "grad_norm": 0.03856385126709938, "grad_norm_var": 3.915588522224121e-06, "learning_rate": 0.007269952498697734, "loss": 2.8484, "step": 2904 }, { "crossentropy": 2.898848295211792, "epoch": 0.24710785981626404, "grad_norm": 0.035884346812963486, "grad_norm_var": 4.921836504983892e-06, "learning_rate": 0.007268167125461046, "loss": 2.8988, "step": 2905 }, { "crossentropy": 2.7483534812927246, "epoch": 0.2471929227628445, "grad_norm": 0.03865783289074898, "grad_norm_var": 4.621295849654925e-06, "learning_rate": 0.007266381388022073, "loss": 2.7484, "step": 2906 }, { "crossentropy": 2.682985782623291, "epoch": 0.24727798570942497, "grad_norm": 0.038874030113220215, "grad_norm_var": 4.632316770012783e-06, "learning_rate": 0.007264595286667554, "loss": 2.683, "step": 2907 }, { "crossentropy": 2.7766313552856445, "epoch": 0.24736304865600545, "grad_norm": 0.04091593995690346, "grad_norm_var": 3.5970860638357206e-06, "learning_rate": 0.007262808821684284, "loss": 2.7766, "step": 2908 }, { "crossentropy": 2.733181953430176, "epoch": 0.24744811160258592, "grad_norm": 0.03964046761393547, "grad_norm_var": 3.606293588618153e-06, "learning_rate": 0.007261021993359118, "loss": 2.7332, "step": 2909 }, { "crossentropy": 2.8050241470336914, "epoch": 0.24753317454916637, "grad_norm": 0.04196196794509888, "grad_norm_var": 4.025350855328622e-06, "learning_rate": 0.007259234801978971, "loss": 2.805, "step": 2910 }, { "crossentropy": 2.7719485759735107, "epoch": 0.24761823749574685, "grad_norm": 0.04263031855225563, "grad_norm_var": 3.78725426932273e-06, "learning_rate": 0.0072574472478308125, "loss": 2.7719, "step": 2911 }, { "crossentropy": 2.6527509689331055, "epoch": 0.24770330044232733, "grad_norm": 0.03744357079267502, "grad_norm_var": 4.033490694489196e-06, "learning_rate": 0.007255659331201673, "loss": 2.6528, "step": 2912 }, { "crossentropy": 2.7537198066711426, "epoch": 0.24778836338890778, "grad_norm": 0.03946804627776146, "grad_norm_var": 3.6360752137961614e-06, "learning_rate": 0.007253871052378641, "loss": 2.7537, "step": 2913 }, { "crossentropy": 2.812220573425293, "epoch": 0.24787342633548826, "grad_norm": 0.04288448020815849, "grad_norm_var": 4.1391607558870105e-06, "learning_rate": 0.007252082411648861, "loss": 2.8122, "step": 2914 }, { "crossentropy": 2.704232692718506, "epoch": 0.24795848928206873, "grad_norm": 0.048162080347537994, "grad_norm_var": 8.181712125893078e-06, "learning_rate": 0.0072502934092995375, "loss": 2.7042, "step": 2915 }, { "crossentropy": 2.8187875747680664, "epoch": 0.2480435522286492, "grad_norm": 0.03899255394935608, "grad_norm_var": 7.733383056483743e-06, "learning_rate": 0.007248504045617934, "loss": 2.8188, "step": 2916 }, { "crossentropy": 2.6255650520324707, "epoch": 0.24812861517522966, "grad_norm": 0.03770789876580238, "grad_norm_var": 8.126959356286913e-06, "learning_rate": 0.00724671432089137, "loss": 2.6256, "step": 2917 }, { "crossentropy": 2.7384068965911865, "epoch": 0.24821367812181014, "grad_norm": 0.04350358620285988, "grad_norm_var": 8.819747595279793e-06, "learning_rate": 0.007244924235407223, "loss": 2.7384, "step": 2918 }, { "crossentropy": 2.78375244140625, "epoch": 0.24829874106839062, "grad_norm": 0.03882055729627609, "grad_norm_var": 8.96285802091945e-06, "learning_rate": 0.0072431337894529315, "loss": 2.7838, "step": 2919 }, { "crossentropy": 2.7970921993255615, "epoch": 0.24838380401497107, "grad_norm": 0.037244364619255066, "grad_norm_var": 9.369546441335453e-06, "learning_rate": 0.007241342983315985, "loss": 2.7971, "step": 2920 }, { "crossentropy": 2.7924013137817383, "epoch": 0.24846886696155154, "grad_norm": 0.03921159356832504, "grad_norm_var": 8.158202850053892e-06, "learning_rate": 0.0072395518172839405, "loss": 2.7924, "step": 2921 }, { "crossentropy": 2.7615411281585693, "epoch": 0.24855392990813202, "grad_norm": 0.040289342403411865, "grad_norm_var": 7.949401698349214e-06, "learning_rate": 0.007237760291644405, "loss": 2.7615, "step": 2922 }, { "crossentropy": 2.7511868476867676, "epoch": 0.2486389928547125, "grad_norm": 0.04458180069923401, "grad_norm_var": 8.759998471855914e-06, "learning_rate": 0.007235968406685045, "loss": 2.7512, "step": 2923 }, { "crossentropy": 2.7967758178710938, "epoch": 0.24872405580129295, "grad_norm": 0.049778006970882416, "grad_norm_var": 1.3756872942782596e-05, "learning_rate": 0.007234176162693589, "loss": 2.7968, "step": 2924 }, { "crossentropy": 2.6858878135681152, "epoch": 0.24880911874787343, "grad_norm": 0.04020272195339203, "grad_norm_var": 1.3645095623523023e-05, "learning_rate": 0.007232383559957815, "loss": 2.6859, "step": 2925 }, { "crossentropy": 2.68402099609375, "epoch": 0.2488941816944539, "grad_norm": 0.037608131766319275, "grad_norm_var": 1.452113008571735e-05, "learning_rate": 0.007230590598765567, "loss": 2.684, "step": 2926 }, { "crossentropy": 2.7832822799682617, "epoch": 0.24897924464103435, "grad_norm": 0.0422663539648056, "grad_norm_var": 1.4457963097217614e-05, "learning_rate": 0.007228797279404743, "loss": 2.7833, "step": 2927 }, { "crossentropy": 2.7770767211914062, "epoch": 0.24906430758761483, "grad_norm": 0.0430353544652462, "grad_norm_var": 1.3659755569752202e-05, "learning_rate": 0.007227003602163296, "loss": 2.7771, "step": 2928 }, { "crossentropy": 2.7671115398406982, "epoch": 0.2491493705341953, "grad_norm": 0.038054388016462326, "grad_norm_var": 1.4164791698173934e-05, "learning_rate": 0.007225209567329241, "loss": 2.7671, "step": 2929 }, { "crossentropy": 2.748070240020752, "epoch": 0.24923443348077579, "grad_norm": 0.036562077701091766, "grad_norm_var": 1.540870071771889e-05, "learning_rate": 0.007223415175190646, "loss": 2.7481, "step": 2930 }, { "crossentropy": 2.778503179550171, "epoch": 0.24931949642735624, "grad_norm": 0.038722578436136246, "grad_norm_var": 1.1965154016564615e-05, "learning_rate": 0.0072216204260356414, "loss": 2.7785, "step": 2931 }, { "crossentropy": 2.780172824859619, "epoch": 0.2494045593739367, "grad_norm": 0.03649140149354935, "grad_norm_var": 1.2829283243976676e-05, "learning_rate": 0.007219825320152411, "loss": 2.7802, "step": 2932 }, { "crossentropy": 2.826287031173706, "epoch": 0.2494896223205172, "grad_norm": 0.03762705624103546, "grad_norm_var": 1.2857147032864734e-05, "learning_rate": 0.007218029857829197, "loss": 2.8263, "step": 2933 }, { "crossentropy": 2.789659261703491, "epoch": 0.24957468526709764, "grad_norm": 0.03965495526790619, "grad_norm_var": 1.2113292259142925e-05, "learning_rate": 0.007216234039354298, "loss": 2.7897, "step": 2934 }, { "crossentropy": 2.8440353870391846, "epoch": 0.24965974821367812, "grad_norm": 0.03915323317050934, "grad_norm_var": 1.2067475308642715e-05, "learning_rate": 0.007214437865016072, "loss": 2.844, "step": 2935 }, { "crossentropy": 2.891225576400757, "epoch": 0.2497448111602586, "grad_norm": 0.04273241385817528, "grad_norm_var": 1.1911378836755812e-05, "learning_rate": 0.007212641335102932, "loss": 2.8912, "step": 2936 }, { "crossentropy": 2.7848458290100098, "epoch": 0.24982987410683907, "grad_norm": 0.04441874101758003, "grad_norm_var": 1.2799531297796354e-05, "learning_rate": 0.007210844449903351, "loss": 2.7848, "step": 2937 }, { "crossentropy": 2.7571966648101807, "epoch": 0.24991493705341952, "grad_norm": 0.038496844470500946, "grad_norm_var": 1.3098173604583243e-05, "learning_rate": 0.007209047209705855, "loss": 2.7572, "step": 2938 }, { "crossentropy": 2.734680414199829, "epoch": 0.25, "grad_norm": 0.04001683369278908, "grad_norm_var": 1.1968895903133325e-05, "learning_rate": 0.007207249614799027, "loss": 2.7347, "step": 2939 }, { "crossentropy": 2.7669551372528076, "epoch": 0.25008506294658045, "grad_norm": 0.03753019869327545, "grad_norm_var": 5.868624220099067e-06, "learning_rate": 0.007205451665471514, "loss": 2.767, "step": 2940 }, { "crossentropy": 2.7594845294952393, "epoch": 0.25017012589316096, "grad_norm": 0.03848531097173691, "grad_norm_var": 5.900257718417327e-06, "learning_rate": 0.007203653362012011, "loss": 2.7595, "step": 2941 }, { "crossentropy": 2.81699538230896, "epoch": 0.2502551888397414, "grad_norm": 0.03867294266819954, "grad_norm_var": 5.712676342286595e-06, "learning_rate": 0.007201854704709274, "loss": 2.817, "step": 2942 }, { "crossentropy": 2.8388071060180664, "epoch": 0.25034025178632185, "grad_norm": 0.04058294743299484, "grad_norm_var": 5.267759987630643e-06, "learning_rate": 0.0072000556938521185, "loss": 2.8388, "step": 2943 }, { "crossentropy": 2.778146743774414, "epoch": 0.25042531473290236, "grad_norm": 0.043081991374492645, "grad_norm_var": 5.290564725298162e-06, "learning_rate": 0.007198256329729411, "loss": 2.7781, "step": 2944 }, { "crossentropy": 2.745171308517456, "epoch": 0.2505103776794828, "grad_norm": 0.03884142264723778, "grad_norm_var": 5.188834291454623e-06, "learning_rate": 0.007196456612630076, "loss": 2.7452, "step": 2945 }, { "crossentropy": 2.796173095703125, "epoch": 0.25059544062606326, "grad_norm": 0.0827607735991478, "grad_norm_var": 0.00012084438718013546, "learning_rate": 0.007194656542843102, "loss": 2.7962, "step": 2946 }, { "crossentropy": 2.8236708641052246, "epoch": 0.25068050357264376, "grad_norm": 0.042274877429008484, "grad_norm_var": 0.00011992475180622411, "learning_rate": 0.007192856120657524, "loss": 2.8237, "step": 2947 }, { "crossentropy": 2.817929267883301, "epoch": 0.2507655665192242, "grad_norm": 0.03866945579648018, "grad_norm_var": 0.00011846138767344638, "learning_rate": 0.007191055346362437, "loss": 2.8179, "step": 2948 }, { "crossentropy": 2.740786552429199, "epoch": 0.2508506294658047, "grad_norm": 0.04103733226656914, "grad_norm_var": 0.00011688726029422016, "learning_rate": 0.007189254220246997, "loss": 2.7408, "step": 2949 }, { "crossentropy": 2.877697706222534, "epoch": 0.25093569241238517, "grad_norm": 0.04104858636856079, "grad_norm_var": 0.00011640554291493562, "learning_rate": 0.007187452742600409, "loss": 2.8777, "step": 2950 }, { "crossentropy": 2.892838954925537, "epoch": 0.2510207553589656, "grad_norm": 0.03914904594421387, "grad_norm_var": 0.0001164076848060672, "learning_rate": 0.007185650913711943, "loss": 2.8928, "step": 2951 }, { "crossentropy": 2.767469644546509, "epoch": 0.2511058183055461, "grad_norm": 0.03514803200960159, "grad_norm_var": 0.00012026080109277873, "learning_rate": 0.007183848733870917, "loss": 2.7675, "step": 2952 }, { "crossentropy": 2.749185085296631, "epoch": 0.2511908812521266, "grad_norm": 0.04038270190358162, "grad_norm_var": 0.00012025359587019164, "learning_rate": 0.00718204620336671, "loss": 2.7492, "step": 2953 }, { "crossentropy": 2.8189828395843506, "epoch": 0.251275944198707, "grad_norm": 0.042317572981119156, "grad_norm_var": 0.00011924828827079332, "learning_rate": 0.007180243322488759, "loss": 2.819, "step": 2954 }, { "crossentropy": 2.7002315521240234, "epoch": 0.25136100714528753, "grad_norm": 0.041195183992385864, "grad_norm_var": 0.00011894493122199904, "learning_rate": 0.0071784400915265514, "loss": 2.7002, "step": 2955 }, { "crossentropy": 2.6885859966278076, "epoch": 0.251446070091868, "grad_norm": 0.038073841482400894, "grad_norm_var": 0.00011859782492331285, "learning_rate": 0.007176636510769634, "loss": 2.6886, "step": 2956 }, { "crossentropy": 2.7931716442108154, "epoch": 0.25153113303844843, "grad_norm": 0.03749862313270569, "grad_norm_var": 0.00011920099708356228, "learning_rate": 0.007174832580507613, "loss": 2.7932, "step": 2957 }, { "crossentropy": 2.8170361518859863, "epoch": 0.25161619598502893, "grad_norm": 0.03862383961677551, "grad_norm_var": 0.00011922650469528909, "learning_rate": 0.007173028301030147, "loss": 2.817, "step": 2958 }, { "crossentropy": 2.8048322200775146, "epoch": 0.2517012589316094, "grad_norm": 0.045133523643016815, "grad_norm_var": 0.00011933155673895316, "learning_rate": 0.007171223672626947, "loss": 2.8048, "step": 2959 }, { "crossentropy": 2.8277828693389893, "epoch": 0.25178632187818983, "grad_norm": 0.03687509521842003, "grad_norm_var": 0.00012152862539369456, "learning_rate": 0.007169418695587791, "loss": 2.8278, "step": 2960 }, { "crossentropy": 2.782869815826416, "epoch": 0.25187138482477034, "grad_norm": 0.03980075195431709, "grad_norm_var": 0.00012112592952223059, "learning_rate": 0.0071676133702025, "loss": 2.7829, "step": 2961 }, { "crossentropy": 2.8222241401672363, "epoch": 0.2519564477713508, "grad_norm": 0.039329834282398224, "grad_norm_var": 5.870677868290558e-06, "learning_rate": 0.007165807696760965, "loss": 2.8222, "step": 2962 }, { "crossentropy": 2.7211787700653076, "epoch": 0.2520415107179313, "grad_norm": 0.04038414731621742, "grad_norm_var": 5.466388326613493e-06, "learning_rate": 0.007164001675553115, "loss": 2.7212, "step": 2963 }, { "crossentropy": 2.7459168434143066, "epoch": 0.25212657366451174, "grad_norm": 0.039659373462200165, "grad_norm_var": 5.396006065961669e-06, "learning_rate": 0.00716219530686895, "loss": 2.7459, "step": 2964 }, { "crossentropy": 2.773019790649414, "epoch": 0.2522116366110922, "grad_norm": 0.0400787778198719, "grad_norm_var": 5.286166331653665e-06, "learning_rate": 0.0071603885909985255, "loss": 2.773, "step": 2965 }, { "crossentropy": 2.767270088195801, "epoch": 0.2522966995576727, "grad_norm": 0.03967804089188576, "grad_norm_var": 5.151403339516645e-06, "learning_rate": 0.00715858152823194, "loss": 2.7673, "step": 2966 }, { "crossentropy": 2.7445194721221924, "epoch": 0.25238176250425315, "grad_norm": 0.03757894039154053, "grad_norm_var": 5.396332507212378e-06, "learning_rate": 0.007156774118859359, "loss": 2.7445, "step": 2967 }, { "crossentropy": 2.8423125743865967, "epoch": 0.2524668254508336, "grad_norm": 0.03985222429037094, "grad_norm_var": 4.059231481195793e-06, "learning_rate": 0.007154966363171002, "loss": 2.8423, "step": 2968 }, { "crossentropy": 2.7712557315826416, "epoch": 0.2525518883974141, "grad_norm": 0.04305204004049301, "grad_norm_var": 4.7194654448467465e-06, "learning_rate": 0.007153158261457141, "loss": 2.7713, "step": 2969 }, { "crossentropy": 2.7025516033172607, "epoch": 0.25263695134399455, "grad_norm": 0.038320526480674744, "grad_norm_var": 4.453944695953274e-06, "learning_rate": 0.0071513498140081035, "loss": 2.7026, "step": 2970 }, { "crossentropy": 2.751159429550171, "epoch": 0.252722014290575, "grad_norm": 0.03732955828309059, "grad_norm_var": 4.615141705428758e-06, "learning_rate": 0.007149541021114278, "loss": 2.7512, "step": 2971 }, { "crossentropy": 2.8592236042022705, "epoch": 0.2528070772371555, "grad_norm": 0.04237707704305649, "grad_norm_var": 4.980435917425104e-06, "learning_rate": 0.0071477318830660995, "loss": 2.8592, "step": 2972 }, { "crossentropy": 2.6761748790740967, "epoch": 0.25289214018373596, "grad_norm": 0.041539885103702545, "grad_norm_var": 4.802454060398355e-06, "learning_rate": 0.007145922400154069, "loss": 2.6762, "step": 2973 }, { "crossentropy": 2.7039437294006348, "epoch": 0.2529772031303164, "grad_norm": 0.038983702659606934, "grad_norm_var": 4.7456759816319935e-06, "learning_rate": 0.007144112572668733, "loss": 2.7039, "step": 2974 }, { "crossentropy": 2.767430543899536, "epoch": 0.2530622660768969, "grad_norm": 0.04065503180027008, "grad_norm_var": 2.9328500123733855e-06, "learning_rate": 0.007142302400900698, "loss": 2.7674, "step": 2975 }, { "crossentropy": 2.8466668128967285, "epoch": 0.25314732902347736, "grad_norm": 0.04478379338979721, "grad_norm_var": 3.8437839343346524e-06, "learning_rate": 0.007140491885140629, "loss": 2.8467, "step": 2976 }, { "crossentropy": 2.766063690185547, "epoch": 0.25323239197005787, "grad_norm": 0.042228084057569504, "grad_norm_var": 4.078695407470853e-06, "learning_rate": 0.0071386810256792365, "loss": 2.7661, "step": 2977 }, { "crossentropy": 2.739260673522949, "epoch": 0.2533174549166383, "grad_norm": 0.041830483824014664, "grad_norm_var": 4.124565296464063e-06, "learning_rate": 0.0071368698228072985, "loss": 2.7393, "step": 2978 }, { "crossentropy": 2.8969345092773438, "epoch": 0.25340251786321877, "grad_norm": 0.03920970857143402, "grad_norm_var": 4.232159744135764e-06, "learning_rate": 0.007135058276815638, "loss": 2.8969, "step": 2979 }, { "crossentropy": 2.7915985584259033, "epoch": 0.2534875808097993, "grad_norm": 0.041360996663570404, "grad_norm_var": 4.234356274460983e-06, "learning_rate": 0.0071332463879951406, "loss": 2.7916, "step": 2980 }, { "crossentropy": 2.7550432682037354, "epoch": 0.2535726437563797, "grad_norm": 0.040246255695819855, "grad_norm_var": 4.22550459123298e-06, "learning_rate": 0.00713143415663674, "loss": 2.755, "step": 2981 }, { "crossentropy": 2.7433695793151855, "epoch": 0.2536577067029602, "grad_norm": 0.039676081389188766, "grad_norm_var": 4.225736341473043e-06, "learning_rate": 0.00712962158303143, "loss": 2.7434, "step": 2982 }, { "crossentropy": 2.875088691711426, "epoch": 0.2537427696495407, "grad_norm": 0.038685038685798645, "grad_norm_var": 3.8619626771927335e-06, "learning_rate": 0.007127808667470257, "loss": 2.8751, "step": 2983 }, { "crossentropy": 2.7460403442382812, "epoch": 0.25382783259612113, "grad_norm": 0.03770405799150467, "grad_norm_var": 4.374052343948896e-06, "learning_rate": 0.007125995410244323, "loss": 2.746, "step": 2984 }, { "crossentropy": 2.8079733848571777, "epoch": 0.2539128955427016, "grad_norm": 0.03662286326289177, "grad_norm_var": 4.768830954063734e-06, "learning_rate": 0.007124181811644786, "loss": 2.808, "step": 2985 }, { "crossentropy": 2.8494279384613037, "epoch": 0.2539979584892821, "grad_norm": 0.043471984565258026, "grad_norm_var": 5.207186313177919e-06, "learning_rate": 0.007122367871962857, "loss": 2.8494, "step": 2986 }, { "crossentropy": 2.805833578109741, "epoch": 0.25408302143586253, "grad_norm": 0.04263520985841751, "grad_norm_var": 4.780997309501942e-06, "learning_rate": 0.007120553591489803, "loss": 2.8058, "step": 2987 }, { "crossentropy": 2.7496578693389893, "epoch": 0.254168084382443, "grad_norm": 0.04245132952928543, "grad_norm_var": 4.797444155858986e-06, "learning_rate": 0.007118738970516944, "loss": 2.7497, "step": 2988 }, { "crossentropy": 2.8021883964538574, "epoch": 0.2542531473290235, "grad_norm": 0.04054161161184311, "grad_norm_var": 4.755295356774723e-06, "learning_rate": 0.007116924009335656, "loss": 2.8022, "step": 2989 }, { "crossentropy": 2.7426071166992188, "epoch": 0.25433821027560394, "grad_norm": 0.03682517260313034, "grad_norm_var": 5.538409457346109e-06, "learning_rate": 0.00711510870823737, "loss": 2.7426, "step": 2990 }, { "crossentropy": 2.869509220123291, "epoch": 0.25442327322218444, "grad_norm": 0.039092887192964554, "grad_norm_var": 5.670713722957278e-06, "learning_rate": 0.007113293067513572, "loss": 2.8695, "step": 2991 }, { "crossentropy": 2.710463762283325, "epoch": 0.2545083361687649, "grad_norm": 0.03788682818412781, "grad_norm_var": 4.6679009420800065e-06, "learning_rate": 0.0071114770874558, "loss": 2.7105, "step": 2992 }, { "crossentropy": 2.6801483631134033, "epoch": 0.25459339911534534, "grad_norm": 0.03982215002179146, "grad_norm_var": 4.324328630288659e-06, "learning_rate": 0.007109660768355648, "loss": 2.6801, "step": 2993 }, { "crossentropy": 2.7681400775909424, "epoch": 0.25467846206192585, "grad_norm": 0.038325972855091095, "grad_norm_var": 4.180023111506845e-06, "learning_rate": 0.007107844110504765, "loss": 2.7681, "step": 2994 }, { "crossentropy": 2.834956169128418, "epoch": 0.2547635250085063, "grad_norm": 0.038789719343185425, "grad_norm_var": 4.216256739826794e-06, "learning_rate": 0.007106027114194855, "loss": 2.835, "step": 2995 }, { "crossentropy": 2.814835786819458, "epoch": 0.25484858795508675, "grad_norm": 0.03923526406288147, "grad_norm_var": 4.009090056396787e-06, "learning_rate": 0.007104209779717673, "loss": 2.8148, "step": 2996 }, { "crossentropy": 2.712186813354492, "epoch": 0.25493365090166725, "grad_norm": 0.04163181036710739, "grad_norm_var": 4.266795423597351e-06, "learning_rate": 0.007102392107365033, "loss": 2.7122, "step": 2997 }, { "crossentropy": 2.7248330116271973, "epoch": 0.2550187138482477, "grad_norm": 0.04096950590610504, "grad_norm_var": 4.3866528115244285e-06, "learning_rate": 0.0071005740974287995, "loss": 2.7248, "step": 2998 }, { "crossentropy": 2.8061513900756836, "epoch": 0.25510377679482815, "grad_norm": 0.041779667139053345, "grad_norm_var": 4.579523630050299e-06, "learning_rate": 0.007098755750200892, "loss": 2.8062, "step": 2999 }, { "crossentropy": 2.715620517730713, "epoch": 0.25518883974140866, "grad_norm": 0.03955734521150589, "grad_norm_var": 4.261044677216346e-06, "learning_rate": 0.007096937065973284, "loss": 2.7156, "step": 3000 }, { "crossentropy": 2.820096731185913, "epoch": 0.2552739026879891, "grad_norm": 0.03820250555872917, "grad_norm_var": 3.7104578237624638e-06, "learning_rate": 0.007095118045038007, "loss": 2.8201, "step": 3001 }, { "crossentropy": 2.8518569469451904, "epoch": 0.25535896563456956, "grad_norm": 0.044678714126348495, "grad_norm_var": 4.34784494389139e-06, "learning_rate": 0.007093298687687141, "loss": 2.8519, "step": 3002 }, { "crossentropy": 2.7693328857421875, "epoch": 0.25544402858115006, "grad_norm": 0.05069904029369354, "grad_norm_var": 1.1082244905324019e-05, "learning_rate": 0.007091478994212823, "loss": 2.7693, "step": 3003 }, { "crossentropy": 2.780498504638672, "epoch": 0.2555290915277305, "grad_norm": 0.04193528741598129, "grad_norm_var": 1.097533202011706e-05, "learning_rate": 0.007089658964907242, "loss": 2.7805, "step": 3004 }, { "crossentropy": 2.7145259380340576, "epoch": 0.255614154474311, "grad_norm": 0.041699863970279694, "grad_norm_var": 1.1046556785509105e-05, "learning_rate": 0.0070878386000626445, "loss": 2.7145, "step": 3005 }, { "crossentropy": 2.6989738941192627, "epoch": 0.25569921742089147, "grad_norm": 0.04525052383542061, "grad_norm_var": 1.1135104533676644e-05, "learning_rate": 0.0070860178999713275, "loss": 2.699, "step": 3006 }, { "crossentropy": 2.7848265171051025, "epoch": 0.2557842803674719, "grad_norm": 0.04136446490883827, "grad_norm_var": 1.0812652987918168e-05, "learning_rate": 0.0070841968649256425, "loss": 2.7848, "step": 3007 }, { "crossentropy": 2.7580442428588867, "epoch": 0.2558693433140524, "grad_norm": 0.04128528758883476, "grad_norm_var": 9.958762684530765e-06, "learning_rate": 0.007082375495217995, "loss": 2.758, "step": 3008 }, { "crossentropy": 2.7640388011932373, "epoch": 0.25595440626063287, "grad_norm": 0.04059035703539848, "grad_norm_var": 9.81593270944677e-06, "learning_rate": 0.007080553791140848, "loss": 2.764, "step": 3009 }, { "crossentropy": 2.6607112884521484, "epoch": 0.2560394692072133, "grad_norm": 0.0447898805141449, "grad_norm_var": 9.584284762200145e-06, "learning_rate": 0.007078731752986709, "loss": 2.6607, "step": 3010 }, { "crossentropy": 2.7338924407958984, "epoch": 0.2561245321537938, "grad_norm": 0.03862549364566803, "grad_norm_var": 9.656893624213764e-06, "learning_rate": 0.00707690938104815, "loss": 2.7339, "step": 3011 }, { "crossentropy": 2.7809925079345703, "epoch": 0.2562095951003743, "grad_norm": 0.04023926705121994, "grad_norm_var": 9.347319642640424e-06, "learning_rate": 0.007075086675617788, "loss": 2.781, "step": 3012 }, { "crossentropy": 2.7376673221588135, "epoch": 0.2562946580469547, "grad_norm": 0.03665187209844589, "grad_norm_var": 1.1195689537185637e-05, "learning_rate": 0.007073263636988298, "loss": 2.7377, "step": 3013 }, { "crossentropy": 2.751711368560791, "epoch": 0.25637972099353523, "grad_norm": 0.03640693426132202, "grad_norm_var": 1.2983695554921737e-05, "learning_rate": 0.007071440265452408, "loss": 2.7517, "step": 3014 }, { "crossentropy": 2.6770944595336914, "epoch": 0.2564647839401157, "grad_norm": 0.036567408591508865, "grad_norm_var": 1.4476737005576258e-05, "learning_rate": 0.007069616561302898, "loss": 2.6771, "step": 3015 }, { "crossentropy": 2.871066093444824, "epoch": 0.25654984688669613, "grad_norm": 0.04421406611800194, "grad_norm_var": 1.4837581810052634e-05, "learning_rate": 0.007067792524832603, "loss": 2.8711, "step": 3016 }, { "crossentropy": 2.7565364837646484, "epoch": 0.25663490983327664, "grad_norm": 0.03645660728216171, "grad_norm_var": 1.578407858854874e-05, "learning_rate": 0.007065968156334412, "loss": 2.7565, "step": 3017 }, { "crossentropy": 2.817035675048828, "epoch": 0.2567199727798571, "grad_norm": 0.05055125057697296, "grad_norm_var": 2.0552988317745537e-05, "learning_rate": 0.007064143456101263, "loss": 2.817, "step": 3018 }, { "crossentropy": 2.73518443107605, "epoch": 0.25680503572643754, "grad_norm": 0.04083907604217529, "grad_norm_var": 1.4808958700240672e-05, "learning_rate": 0.007062318424426153, "loss": 2.7352, "step": 3019 }, { "crossentropy": 2.825105905532837, "epoch": 0.25689009867301804, "grad_norm": 0.03953760489821434, "grad_norm_var": 1.4898585294852105e-05, "learning_rate": 0.007060493061602128, "loss": 2.8251, "step": 3020 }, { "crossentropy": 2.7705769538879395, "epoch": 0.2569751616195985, "grad_norm": 0.04263770207762718, "grad_norm_var": 1.5048339681718777e-05, "learning_rate": 0.007058667367922289, "loss": 2.7706, "step": 3021 }, { "crossentropy": 2.772564172744751, "epoch": 0.257060224566179, "grad_norm": 0.040466178208589554, "grad_norm_var": 1.3767809868155586e-05, "learning_rate": 0.00705684134367979, "loss": 2.7726, "step": 3022 }, { "crossentropy": 2.8731696605682373, "epoch": 0.25714528751275945, "grad_norm": 0.041198741644620895, "grad_norm_var": 1.3754876461184034e-05, "learning_rate": 0.007055014989167837, "loss": 2.8732, "step": 3023 }, { "crossentropy": 2.906809091567993, "epoch": 0.2572303504593399, "grad_norm": 0.04696450009942055, "grad_norm_var": 1.6220647020330028e-05, "learning_rate": 0.0070531883046796905, "loss": 2.9068, "step": 3024 }, { "crossentropy": 2.7394611835479736, "epoch": 0.2573154134059204, "grad_norm": 0.039974041283130646, "grad_norm_var": 1.6281834826239034e-05, "learning_rate": 0.007051361290508666, "loss": 2.7395, "step": 3025 }, { "crossentropy": 2.685227155685425, "epoch": 0.25740047635250085, "grad_norm": 0.038204021751880646, "grad_norm_var": 1.567135189139325e-05, "learning_rate": 0.007049533946948123, "loss": 2.6852, "step": 3026 }, { "crossentropy": 2.875068187713623, "epoch": 0.2574855392990813, "grad_norm": 0.038929399102926254, "grad_norm_var": 1.559728107413764e-05, "learning_rate": 0.007047706274291488, "loss": 2.8751, "step": 3027 }, { "crossentropy": 2.8302154541015625, "epoch": 0.2575706022456618, "grad_norm": 0.036348044872283936, "grad_norm_var": 1.6738529994064547e-05, "learning_rate": 0.007045878272832227, "loss": 2.8302, "step": 3028 }, { "crossentropy": 2.7567994594573975, "epoch": 0.25765566519224226, "grad_norm": 0.04088986665010452, "grad_norm_var": 1.5759110516520153e-05, "learning_rate": 0.007044049942863865, "loss": 2.7568, "step": 3029 }, { "crossentropy": 2.744591474533081, "epoch": 0.2577407281388227, "grad_norm": 0.037526778876781464, "grad_norm_var": 1.5205947771335105e-05, "learning_rate": 0.007042221284679981, "loss": 2.7446, "step": 3030 }, { "crossentropy": 2.8129968643188477, "epoch": 0.2578257910854032, "grad_norm": 0.040141064673662186, "grad_norm_var": 1.4031872799039983e-05, "learning_rate": 0.007040392298574204, "loss": 2.813, "step": 3031 }, { "crossentropy": 2.775784730911255, "epoch": 0.25791085403198366, "grad_norm": 0.03968580812215805, "grad_norm_var": 1.333058983179752e-05, "learning_rate": 0.007038562984840215, "loss": 2.7758, "step": 3032 }, { "crossentropy": 2.8419039249420166, "epoch": 0.2579959169785641, "grad_norm": 0.03996456041932106, "grad_norm_var": 1.2139776519180107e-05, "learning_rate": 0.007036733343771753, "loss": 2.8419, "step": 3033 }, { "crossentropy": 2.763115167617798, "epoch": 0.2580809799251446, "grad_norm": 0.04648613929748535, "grad_norm_var": 7.923137019725385e-06, "learning_rate": 0.007034903375662602, "loss": 2.7631, "step": 3034 }, { "crossentropy": 2.8921868801116943, "epoch": 0.25816604287172507, "grad_norm": 0.04328348860144615, "grad_norm_var": 8.370561945205725e-06, "learning_rate": 0.0070330730808066035, "loss": 2.8922, "step": 3035 }, { "crossentropy": 2.7395262718200684, "epoch": 0.25825110581830557, "grad_norm": 0.040086779743433, "grad_norm_var": 8.299546997878917e-06, "learning_rate": 0.00703124245949765, "loss": 2.7395, "step": 3036 }, { "crossentropy": 2.8599162101745605, "epoch": 0.258336168764886, "grad_norm": 0.04049629718065262, "grad_norm_var": 8.061216108314293e-06, "learning_rate": 0.007029411512029686, "loss": 2.8599, "step": 3037 }, { "crossentropy": 2.7125327587127686, "epoch": 0.25842123171146647, "grad_norm": 0.03955995291471481, "grad_norm_var": 8.136610646071404e-06, "learning_rate": 0.007027580238696709, "loss": 2.7125, "step": 3038 }, { "crossentropy": 2.696988582611084, "epoch": 0.258506294658047, "grad_norm": 0.041727207601070404, "grad_norm_var": 8.195639746238817e-06, "learning_rate": 0.007025748639792771, "loss": 2.697, "step": 3039 }, { "crossentropy": 2.7684895992279053, "epoch": 0.2585913576046274, "grad_norm": 0.0388154499232769, "grad_norm_var": 5.476154063456991e-06, "learning_rate": 0.007023916715611969, "loss": 2.7685, "step": 3040 }, { "crossentropy": 2.8472256660461426, "epoch": 0.2586764205512079, "grad_norm": 0.04432586953043938, "grad_norm_var": 6.567899804130137e-06, "learning_rate": 0.007022084466448462, "loss": 2.8472, "step": 3041 }, { "crossentropy": 2.7444279193878174, "epoch": 0.2587614834977884, "grad_norm": 0.041247397661209106, "grad_norm_var": 6.253897924516673e-06, "learning_rate": 0.007020251892596455, "loss": 2.7444, "step": 3042 }, { "crossentropy": 2.856757640838623, "epoch": 0.25884654644436883, "grad_norm": 0.03730424866080284, "grad_norm_var": 6.779801318358182e-06, "learning_rate": 0.007018418994350205, "loss": 2.8568, "step": 3043 }, { "crossentropy": 2.760648488998413, "epoch": 0.2589316093909493, "grad_norm": 0.04085589200258255, "grad_norm_var": 5.558498435952591e-06, "learning_rate": 0.007016585772004026, "loss": 2.7606, "step": 3044 }, { "crossentropy": 2.737550973892212, "epoch": 0.2590166723375298, "grad_norm": 0.03950886055827141, "grad_norm_var": 5.656509372151813e-06, "learning_rate": 0.007014752225852278, "loss": 2.7376, "step": 3045 }, { "crossentropy": 2.7568459510803223, "epoch": 0.25910173528411024, "grad_norm": 0.05008840933442116, "grad_norm_var": 1.0223174421261117e-05, "learning_rate": 0.007012918356189375, "loss": 2.7568, "step": 3046 }, { "crossentropy": 2.69914174079895, "epoch": 0.2591867982306907, "grad_norm": 0.04218149185180664, "grad_norm_var": 1.0120860811794566e-05, "learning_rate": 0.007011084163309786, "loss": 2.6991, "step": 3047 }, { "crossentropy": 2.7909276485443115, "epoch": 0.2592718611772712, "grad_norm": 0.04205918312072754, "grad_norm_var": 9.86681849081845e-06, "learning_rate": 0.0070092496475080285, "loss": 2.7909, "step": 3048 }, { "crossentropy": 2.823415517807007, "epoch": 0.25935692412385164, "grad_norm": 0.058149419724941254, "grad_norm_var": 2.6207154647879943e-05, "learning_rate": 0.007007414809078671, "loss": 2.8234, "step": 3049 }, { "crossentropy": 2.731820821762085, "epoch": 0.25944198707043215, "grad_norm": 0.04460888355970383, "grad_norm_var": 2.5526293968617236e-05, "learning_rate": 0.0070055796483163395, "loss": 2.7318, "step": 3050 }, { "crossentropy": 2.801698923110962, "epoch": 0.2595270500170126, "grad_norm": 0.040310751646757126, "grad_norm_var": 2.58745635174963e-05, "learning_rate": 0.0070037441655157046, "loss": 2.8017, "step": 3051 }, { "crossentropy": 2.799377918243408, "epoch": 0.25961211296359304, "grad_norm": 0.0359954908490181, "grad_norm_var": 2.8282364965048537e-05, "learning_rate": 0.007001908360971494, "loss": 2.7994, "step": 3052 }, { "crossentropy": 2.6855478286743164, "epoch": 0.25969717591017355, "grad_norm": 0.041560880839824677, "grad_norm_var": 2.8093315544256563e-05, "learning_rate": 0.007000072234978485, "loss": 2.6855, "step": 3053 }, { "crossentropy": 2.686028242111206, "epoch": 0.259782238856754, "grad_norm": 0.04010431841015816, "grad_norm_var": 2.7906156327244837e-05, "learning_rate": 0.006998235787831505, "loss": 2.686, "step": 3054 }, { "crossentropy": 2.7814276218414307, "epoch": 0.25986730180333445, "grad_norm": 0.040307044982910156, "grad_norm_var": 2.8164858514363214e-05, "learning_rate": 0.006996399019825436, "loss": 2.7814, "step": 3055 }, { "crossentropy": 2.7159695625305176, "epoch": 0.25995236474991495, "grad_norm": 0.04000150039792061, "grad_norm_var": 2.769556784134406e-05, "learning_rate": 0.006994561931255209, "loss": 2.716, "step": 3056 }, { "crossentropy": 2.8089380264282227, "epoch": 0.2600374276964954, "grad_norm": 0.04313111677765846, "grad_norm_var": 2.7480078016283288e-05, "learning_rate": 0.0069927245224158064, "loss": 2.8089, "step": 3057 }, { "crossentropy": 2.7099361419677734, "epoch": 0.26012249064307585, "grad_norm": 0.044913023710250854, "grad_norm_var": 2.7786636386770088e-05, "learning_rate": 0.006990886793602267, "loss": 2.7099, "step": 3058 }, { "crossentropy": 2.7723922729492188, "epoch": 0.26020755358965636, "grad_norm": 0.04236387833952904, "grad_norm_var": 2.583592503503923e-05, "learning_rate": 0.006989048745109674, "loss": 2.7724, "step": 3059 }, { "crossentropy": 2.7015321254730225, "epoch": 0.2602926165362368, "grad_norm": 0.03936375677585602, "grad_norm_var": 2.637852618642128e-05, "learning_rate": 0.006987210377233164, "loss": 2.7015, "step": 3060 }, { "crossentropy": 2.821794033050537, "epoch": 0.26037767948281726, "grad_norm": 0.03962545469403267, "grad_norm_var": 2.6328359824269396e-05, "learning_rate": 0.0069853716902679285, "loss": 2.8218, "step": 3061 }, { "crossentropy": 2.7563211917877197, "epoch": 0.26046274242939776, "grad_norm": 0.040821585804224014, "grad_norm_var": 2.268736556249266e-05, "learning_rate": 0.006983532684509207, "loss": 2.7563, "step": 3062 }, { "crossentropy": 2.726957321166992, "epoch": 0.2605478053759782, "grad_norm": 0.0380885973572731, "grad_norm_var": 2.3754608967848678e-05, "learning_rate": 0.00698169336025229, "loss": 2.727, "step": 3063 }, { "crossentropy": 2.796989917755127, "epoch": 0.2606328683225587, "grad_norm": 0.038184765726327896, "grad_norm_var": 2.4643015702034004e-05, "learning_rate": 0.006979853717792523, "loss": 2.797, "step": 3064 }, { "crossentropy": 2.7311811447143555, "epoch": 0.26071793126913917, "grad_norm": 0.038691744208335876, "grad_norm_var": 5.683508626085539e-06, "learning_rate": 0.006978013757425294, "loss": 2.7312, "step": 3065 }, { "crossentropy": 2.7508115768432617, "epoch": 0.2608029942157196, "grad_norm": 0.03643321618437767, "grad_norm_var": 5.3870154896990334e-06, "learning_rate": 0.006976173479446052, "loss": 2.7508, "step": 3066 }, { "crossentropy": 2.634746551513672, "epoch": 0.2608880571623001, "grad_norm": 0.045302461832761765, "grad_norm_var": 7.155442219717162e-06, "learning_rate": 0.006974332884150292, "loss": 2.6347, "step": 3067 }, { "crossentropy": 2.824479341506958, "epoch": 0.2609731201088806, "grad_norm": 0.04425586387515068, "grad_norm_var": 6.673023658246985e-06, "learning_rate": 0.006972491971833558, "loss": 2.8245, "step": 3068 }, { "crossentropy": 2.7312495708465576, "epoch": 0.261058183055461, "grad_norm": 0.03930850699543953, "grad_norm_var": 6.768147417887151e-06, "learning_rate": 0.006970650742791451, "loss": 2.7312, "step": 3069 }, { "crossentropy": 2.7605926990509033, "epoch": 0.26114324600204153, "grad_norm": 0.03658534958958626, "grad_norm_var": 7.812694887537366e-06, "learning_rate": 0.0069688091973196175, "loss": 2.7606, "step": 3070 }, { "crossentropy": 2.7168710231781006, "epoch": 0.261228308948622, "grad_norm": 0.04026555269956589, "grad_norm_var": 7.813654859799233e-06, "learning_rate": 0.006966967335713756, "loss": 2.7169, "step": 3071 }, { "crossentropy": 2.7776474952697754, "epoch": 0.26131337189520243, "grad_norm": 0.04453219845890999, "grad_norm_var": 8.820521965253956e-06, "learning_rate": 0.006965125158269619, "loss": 2.7776, "step": 3072 }, { "crossentropy": 2.7116851806640625, "epoch": 0.26139843484178293, "grad_norm": 0.04482465982437134, "grad_norm_var": 9.539323282965982e-06, "learning_rate": 0.0069632826652830035, "loss": 2.7117, "step": 3073 }, { "crossentropy": 2.762650728225708, "epoch": 0.2614834977883634, "grad_norm": 0.04403558745980263, "grad_norm_var": 9.111814468653646e-06, "learning_rate": 0.006961439857049763, "loss": 2.7627, "step": 3074 }, { "crossentropy": 2.706911087036133, "epoch": 0.26156856073494383, "grad_norm": 0.04182454198598862, "grad_norm_var": 9.01700879651704e-06, "learning_rate": 0.006959596733865801, "loss": 2.7069, "step": 3075 }, { "crossentropy": 2.7492289543151855, "epoch": 0.26165362368152434, "grad_norm": 0.03942016884684563, "grad_norm_var": 9.006713290569437e-06, "learning_rate": 0.006957753296027065, "loss": 2.7492, "step": 3076 }, { "crossentropy": 2.7017605304718018, "epoch": 0.2617386866281048, "grad_norm": 0.036656562238931656, "grad_norm_var": 1.0007716794188138e-05, "learning_rate": 0.0069559095438295636, "loss": 2.7018, "step": 3077 }, { "crossentropy": 2.8005549907684326, "epoch": 0.2618237495746853, "grad_norm": 0.04880189150571823, "grad_norm_var": 1.4248338358607606e-05, "learning_rate": 0.006954065477569347, "loss": 2.8006, "step": 3078 }, { "crossentropy": 2.765854597091675, "epoch": 0.26190881252126574, "grad_norm": 0.04567751660943031, "grad_norm_var": 1.482527224157809e-05, "learning_rate": 0.006952221097542518, "loss": 2.7659, "step": 3079 }, { "crossentropy": 2.7247660160064697, "epoch": 0.2619938754678462, "grad_norm": 0.04150429740548134, "grad_norm_var": 1.4024494731383098e-05, "learning_rate": 0.006950376404045234, "loss": 2.7248, "step": 3080 }, { "crossentropy": 2.8406801223754883, "epoch": 0.2620789384144267, "grad_norm": 0.04317726567387581, "grad_norm_var": 1.3448449060867127e-05, "learning_rate": 0.0069485313973737, "loss": 2.8407, "step": 3081 }, { "crossentropy": 2.783280849456787, "epoch": 0.26216400136100715, "grad_norm": 0.036317259073257446, "grad_norm_var": 1.3535942431711484e-05, "learning_rate": 0.006946686077824167, "loss": 2.7833, "step": 3082 }, { "crossentropy": 2.6636910438537598, "epoch": 0.2622490643075876, "grad_norm": 0.04050438851118088, "grad_norm_var": 1.2881638955430699e-05, "learning_rate": 0.006944840445692943, "loss": 2.6637, "step": 3083 }, { "crossentropy": 2.820202112197876, "epoch": 0.2623341272541681, "grad_norm": 0.044128626585006714, "grad_norm_var": 1.2839811889471473e-05, "learning_rate": 0.0069429945012763825, "loss": 2.8202, "step": 3084 }, { "crossentropy": 2.7626686096191406, "epoch": 0.26241919020074855, "grad_norm": 0.04000352695584297, "grad_norm_var": 1.2646274246027635e-05, "learning_rate": 0.006941148244870892, "loss": 2.7627, "step": 3085 }, { "crossentropy": 2.844356060028076, "epoch": 0.262504253147329, "grad_norm": 0.04400326684117317, "grad_norm_var": 1.09612065263354e-05, "learning_rate": 0.006939301676772927, "loss": 2.8444, "step": 3086 }, { "crossentropy": 2.7338194847106934, "epoch": 0.2625893160939095, "grad_norm": 0.039351124316453934, "grad_norm_var": 1.1252960086438382e-05, "learning_rate": 0.00693745479727899, "loss": 2.7338, "step": 3087 }, { "crossentropy": 2.7565677165985107, "epoch": 0.26267437904048996, "grad_norm": 0.037695884704589844, "grad_norm_var": 1.2023188193684084e-05, "learning_rate": 0.006935607606685641, "loss": 2.7566, "step": 3088 }, { "crossentropy": 2.7448811531066895, "epoch": 0.2627594419870704, "grad_norm": 0.03755329176783562, "grad_norm_var": 1.234235712458583e-05, "learning_rate": 0.006933760105289483, "loss": 2.7449, "step": 3089 }, { "crossentropy": 2.762977361679077, "epoch": 0.2628445049336509, "grad_norm": 0.0399763397872448, "grad_norm_var": 1.1886711940639384e-05, "learning_rate": 0.0069319122933871715, "loss": 2.763, "step": 3090 }, { "crossentropy": 2.7826294898986816, "epoch": 0.26292956788023136, "grad_norm": 0.047725144773721695, "grad_norm_var": 1.4682183478074926e-05, "learning_rate": 0.0069300641712754145, "loss": 2.7826, "step": 3091 }, { "crossentropy": 2.6955597400665283, "epoch": 0.26301463082681187, "grad_norm": 0.039996061474084854, "grad_norm_var": 1.4550425678917731e-05, "learning_rate": 0.006928215739250962, "loss": 2.6956, "step": 3092 }, { "crossentropy": 2.8248724937438965, "epoch": 0.2630996937733923, "grad_norm": 0.03885848447680473, "grad_norm_var": 1.3448491390779562e-05, "learning_rate": 0.006926366997610623, "loss": 2.8249, "step": 3093 }, { "crossentropy": 2.76220440864563, "epoch": 0.26318475671997277, "grad_norm": 0.040222831070423126, "grad_norm_var": 9.78716713693663e-06, "learning_rate": 0.006924517946651253, "loss": 2.7622, "step": 3094 }, { "crossentropy": 2.7177813053131104, "epoch": 0.2632698196665533, "grad_norm": 0.03731251135468483, "grad_norm_var": 8.991974658530537e-06, "learning_rate": 0.0069226685866697535, "loss": 2.7178, "step": 3095 }, { "crossentropy": 2.730039358139038, "epoch": 0.2633548826131337, "grad_norm": 0.03734411299228668, "grad_norm_var": 9.528046640206826e-06, "learning_rate": 0.00692081891796308, "loss": 2.73, "step": 3096 }, { "crossentropy": 2.781965494155884, "epoch": 0.2634399455597142, "grad_norm": 0.037245288491249084, "grad_norm_var": 9.420465213874108e-06, "learning_rate": 0.006918968940828234, "loss": 2.782, "step": 3097 }, { "crossentropy": 2.731515884399414, "epoch": 0.2635250085062947, "grad_norm": 0.039126280695199966, "grad_norm_var": 8.575550465525494e-06, "learning_rate": 0.00691711865556227, "loss": 2.7315, "step": 3098 }, { "crossentropy": 2.741694688796997, "epoch": 0.2636100714528751, "grad_norm": 0.04205583035945892, "grad_norm_var": 8.816785002022792e-06, "learning_rate": 0.006915268062462292, "loss": 2.7417, "step": 3099 }, { "crossentropy": 2.784881591796875, "epoch": 0.2636951343994556, "grad_norm": 0.03899618983268738, "grad_norm_var": 7.748975416484402e-06, "learning_rate": 0.00691341716182545, "loss": 2.7849, "step": 3100 }, { "crossentropy": 2.7745888233184814, "epoch": 0.2637801973460361, "grad_norm": 0.041490938514471054, "grad_norm_var": 7.919356527579432e-06, "learning_rate": 0.006911565953948944, "loss": 2.7746, "step": 3101 }, { "crossentropy": 2.7961783409118652, "epoch": 0.26386526029261653, "grad_norm": 0.03787272796034813, "grad_norm_var": 6.94257539721165e-06, "learning_rate": 0.006909714439130028, "loss": 2.7962, "step": 3102 }, { "crossentropy": 2.7787632942199707, "epoch": 0.263950323239197, "grad_norm": 0.03946539759635925, "grad_norm_var": 6.940339446312391e-06, "learning_rate": 0.006907862617666, "loss": 2.7788, "step": 3103 }, { "crossentropy": 2.7983837127685547, "epoch": 0.2640353861857775, "grad_norm": 0.041071902960538864, "grad_norm_var": 6.8142163653741225e-06, "learning_rate": 0.006906010489854209, "loss": 2.7984, "step": 3104 }, { "crossentropy": 2.764986515045166, "epoch": 0.26412044913235794, "grad_norm": 0.041296251118183136, "grad_norm_var": 6.583760153120304e-06, "learning_rate": 0.0069041580559920535, "loss": 2.765, "step": 3105 }, { "crossentropy": 2.8818278312683105, "epoch": 0.2642055120789384, "grad_norm": 0.039498280733823776, "grad_norm_var": 6.599776325611334e-06, "learning_rate": 0.006902305316376982, "loss": 2.8818, "step": 3106 }, { "crossentropy": 2.760835886001587, "epoch": 0.2642905750255189, "grad_norm": 0.03960902988910675, "grad_norm_var": 2.3284530621297633e-06, "learning_rate": 0.00690045227130649, "loss": 2.7608, "step": 3107 }, { "crossentropy": 2.722477912902832, "epoch": 0.26437563797209934, "grad_norm": 0.03918546065688133, "grad_norm_var": 2.3122724029554323e-06, "learning_rate": 0.006898598921078123, "loss": 2.7225, "step": 3108 }, { "crossentropy": 2.707521438598633, "epoch": 0.26446070091867985, "grad_norm": 0.040780872106552124, "grad_norm_var": 2.4004161209594786e-06, "learning_rate": 0.006896745265989476, "loss": 2.7075, "step": 3109 }, { "crossentropy": 2.7999565601348877, "epoch": 0.2645457638652603, "grad_norm": 0.038567543029785156, "grad_norm_var": 2.4200487959507553e-06, "learning_rate": 0.006894891306338195, "loss": 2.8, "step": 3110 }, { "crossentropy": 2.7409307956695557, "epoch": 0.26463082681184075, "grad_norm": 0.04032719507813454, "grad_norm_var": 2.1359573447019343e-06, "learning_rate": 0.006893037042421968, "loss": 2.7409, "step": 3111 }, { "crossentropy": 2.7970223426818848, "epoch": 0.26471588975842125, "grad_norm": 0.04125692695379257, "grad_norm_var": 1.905056016222352e-06, "learning_rate": 0.006891182474538539, "loss": 2.797, "step": 3112 }, { "crossentropy": 2.8192243576049805, "epoch": 0.2648009527050017, "grad_norm": 0.04344404861330986, "grad_norm_var": 2.141084180403337e-06, "learning_rate": 0.0068893276029856985, "loss": 2.8192, "step": 3113 }, { "crossentropy": 2.8345370292663574, "epoch": 0.26488601565158215, "grad_norm": 0.04220551997423172, "grad_norm_var": 2.2711797206928853e-06, "learning_rate": 0.006887472428061285, "loss": 2.8345, "step": 3114 }, { "crossentropy": 2.7234864234924316, "epoch": 0.26497107859816266, "grad_norm": 0.03718501329421997, "grad_norm_var": 2.7080091107721412e-06, "learning_rate": 0.006885616950063185, "loss": 2.7235, "step": 3115 }, { "crossentropy": 2.7440378665924072, "epoch": 0.2650561415447431, "grad_norm": 0.04100142419338226, "grad_norm_var": 2.653282868131321e-06, "learning_rate": 0.0068837611692893355, "loss": 2.744, "step": 3116 }, { "crossentropy": 2.8717265129089355, "epoch": 0.26514120449132356, "grad_norm": 0.03723164275288582, "grad_norm_var": 3.0915727308196303e-06, "learning_rate": 0.006881905086037721, "loss": 2.8717, "step": 3117 }, { "crossentropy": 2.7358920574188232, "epoch": 0.26522626743790406, "grad_norm": 0.038492221385240555, "grad_norm_var": 2.9398516211535825e-06, "learning_rate": 0.006880048700606377, "loss": 2.7359, "step": 3118 }, { "crossentropy": 2.812274932861328, "epoch": 0.2653113303844845, "grad_norm": 0.04222598671913147, "grad_norm_var": 3.2051453774962006e-06, "learning_rate": 0.006878192013293383, "loss": 2.8123, "step": 3119 }, { "crossentropy": 2.8475587368011475, "epoch": 0.26539639333106496, "grad_norm": 0.03978034853935242, "grad_norm_var": 3.1611844194587753e-06, "learning_rate": 0.006876335024396872, "loss": 2.8476, "step": 3120 }, { "crossentropy": 2.7154693603515625, "epoch": 0.26548145627764547, "grad_norm": 0.04155821353197098, "grad_norm_var": 3.206191681764716e-06, "learning_rate": 0.00687447773421502, "loss": 2.7155, "step": 3121 }, { "crossentropy": 2.7958476543426514, "epoch": 0.2655665192242259, "grad_norm": 0.03958914056420326, "grad_norm_var": 3.1988503687592947e-06, "learning_rate": 0.006872620143046055, "loss": 2.7958, "step": 3122 }, { "crossentropy": 2.6804230213165283, "epoch": 0.2656515821708064, "grad_norm": 0.03896813467144966, "grad_norm_var": 3.2709661588145937e-06, "learning_rate": 0.006870762251188254, "loss": 2.6804, "step": 3123 }, { "crossentropy": 2.7188832759857178, "epoch": 0.26573664511738687, "grad_norm": 0.04094788804650307, "grad_norm_var": 3.2472598048018917e-06, "learning_rate": 0.006868904058939942, "loss": 2.7189, "step": 3124 }, { "crossentropy": 2.8122010231018066, "epoch": 0.2658217080639673, "grad_norm": 0.03961419686675072, "grad_norm_var": 3.245492583947342e-06, "learning_rate": 0.006867045566599488, "loss": 2.8122, "step": 3125 }, { "crossentropy": 2.7509381771087646, "epoch": 0.2659067710105478, "grad_norm": 0.05858122184872627, "grad_norm_var": 2.4057689736381393e-05, "learning_rate": 0.006865186774465314, "loss": 2.7509, "step": 3126 }, { "crossentropy": 2.8134031295776367, "epoch": 0.2659918339571283, "grad_norm": 0.03963339701294899, "grad_norm_var": 2.418706854782343e-05, "learning_rate": 0.0068633276828358875, "loss": 2.8134, "step": 3127 }, { "crossentropy": 2.839942693710327, "epoch": 0.2660768969037087, "grad_norm": 0.0597582682967186, "grad_norm_var": 4.53334181344272e-05, "learning_rate": 0.006861468292009726, "loss": 2.8399, "step": 3128 }, { "crossentropy": 2.732344150543213, "epoch": 0.26616195985028923, "grad_norm": 0.04329502210021019, "grad_norm_var": 4.531631682957952e-05, "learning_rate": 0.006859608602285394, "loss": 2.7323, "step": 3129 }, { "crossentropy": 2.903884172439575, "epoch": 0.2662470227968697, "grad_norm": 0.04568186402320862, "grad_norm_var": 4.5933172648166856e-05, "learning_rate": 0.0068577486139615035, "loss": 2.9039, "step": 3130 }, { "crossentropy": 2.8110713958740234, "epoch": 0.26633208574345013, "grad_norm": 0.03880292922258377, "grad_norm_var": 4.490243344244567e-05, "learning_rate": 0.006855888327336717, "loss": 2.8111, "step": 3131 }, { "crossentropy": 2.7227184772491455, "epoch": 0.26641714869003064, "grad_norm": 0.039896100759506226, "grad_norm_var": 4.524719338652814e-05, "learning_rate": 0.00685402774270974, "loss": 2.7227, "step": 3132 }, { "crossentropy": 2.8572587966918945, "epoch": 0.2665022116366111, "grad_norm": 0.04643229767680168, "grad_norm_var": 4.3763941977210895e-05, "learning_rate": 0.00685216686037933, "loss": 2.8573, "step": 3133 }, { "crossentropy": 2.7574801445007324, "epoch": 0.26658727458319154, "grad_norm": 0.04553532600402832, "grad_norm_var": 4.232254687815962e-05, "learning_rate": 0.006850305680644291, "loss": 2.7575, "step": 3134 }, { "crossentropy": 2.7566325664520264, "epoch": 0.26667233752977204, "grad_norm": 0.04432905092835426, "grad_norm_var": 4.216636690146189e-05, "learning_rate": 0.006848444203803475, "loss": 2.7566, "step": 3135 }, { "crossentropy": 2.756671190261841, "epoch": 0.2667574004763525, "grad_norm": 0.03972461819648743, "grad_norm_var": 4.2197174542594505e-05, "learning_rate": 0.006846582430155782, "loss": 2.7567, "step": 3136 }, { "crossentropy": 2.755175828933716, "epoch": 0.266842463422933, "grad_norm": 0.037195831537246704, "grad_norm_var": 4.474677299633369e-05, "learning_rate": 0.006844720360000159, "loss": 2.7552, "step": 3137 }, { "crossentropy": 2.8483991622924805, "epoch": 0.26692752636951345, "grad_norm": 0.03931977227330208, "grad_norm_var": 4.4896225933796904e-05, "learning_rate": 0.006842857993635601, "loss": 2.8484, "step": 3138 }, { "crossentropy": 2.724740982055664, "epoch": 0.2670125893160939, "grad_norm": 0.03800414875149727, "grad_norm_var": 4.555057683790816e-05, "learning_rate": 0.0068409953313611485, "loss": 2.7247, "step": 3139 }, { "crossentropy": 2.788559913635254, "epoch": 0.2670976522626744, "grad_norm": 0.03963804617524147, "grad_norm_var": 4.6111729892480724e-05, "learning_rate": 0.006839132373475894, "loss": 2.7886, "step": 3140 }, { "crossentropy": 2.7494008541107178, "epoch": 0.26718271520925485, "grad_norm": 0.04349568486213684, "grad_norm_var": 4.5060371234002346e-05, "learning_rate": 0.0068372691202789725, "loss": 2.7494, "step": 3141 }, { "crossentropy": 2.6635360717773438, "epoch": 0.2672677781558353, "grad_norm": 0.04620586708188057, "grad_norm_var": 3.009023363743183e-05, "learning_rate": 0.0068354055720695715, "loss": 2.6635, "step": 3142 }, { "crossentropy": 2.886960506439209, "epoch": 0.2673528411024158, "grad_norm": 0.037685830146074295, "grad_norm_var": 3.1184451752674926e-05, "learning_rate": 0.006833541729146922, "loss": 2.887, "step": 3143 }, { "crossentropy": 2.7561588287353516, "epoch": 0.26743790404899626, "grad_norm": 0.04098402336239815, "grad_norm_var": 1.0794871143291786e-05, "learning_rate": 0.006831677591810302, "loss": 2.7562, "step": 3144 }, { "crossentropy": 2.714688539505005, "epoch": 0.2675229669955767, "grad_norm": 0.039399534463882446, "grad_norm_var": 1.0883240824717995e-05, "learning_rate": 0.006829813160359038, "loss": 2.7147, "step": 3145 }, { "crossentropy": 2.681047201156616, "epoch": 0.2676080299421572, "grad_norm": 0.087069571018219, "grad_norm_var": 0.0001415948287520304, "learning_rate": 0.006827948435092508, "loss": 2.681, "step": 3146 }, { "crossentropy": 2.660705327987671, "epoch": 0.26769309288873766, "grad_norm": 0.039226531982421875, "grad_norm_var": 0.0001413135044721345, "learning_rate": 0.006826083416310128, "loss": 2.6607, "step": 3147 }, { "crossentropy": 2.6675102710723877, "epoch": 0.2677781558353181, "grad_norm": 0.03789038956165314, "grad_norm_var": 0.00014266480988818196, "learning_rate": 0.00682421810431137, "loss": 2.6675, "step": 3148 }, { "crossentropy": 2.8383970260620117, "epoch": 0.2678632187818986, "grad_norm": 0.04023401811718941, "grad_norm_var": 0.00014295958239325297, "learning_rate": 0.00682235249939575, "loss": 2.8384, "step": 3149 }, { "crossentropy": 2.692249059677124, "epoch": 0.26794828172847907, "grad_norm": 0.037498533725738525, "grad_norm_var": 0.00014481132433070604, "learning_rate": 0.006820486601862825, "loss": 2.6922, "step": 3150 }, { "crossentropy": 2.752579927444458, "epoch": 0.26803334467505957, "grad_norm": 0.042107101529836655, "grad_norm_var": 0.0001447243212693477, "learning_rate": 0.00681862041201221, "loss": 2.7526, "step": 3151 }, { "crossentropy": 2.7007312774658203, "epoch": 0.26811840762164, "grad_norm": 0.03873421251773834, "grad_norm_var": 0.00014519900335065611, "learning_rate": 0.006816753930143558, "loss": 2.7007, "step": 3152 }, { "crossentropy": 2.7797768115997314, "epoch": 0.26820347056822047, "grad_norm": 0.03943955898284912, "grad_norm_var": 0.0001438391579907196, "learning_rate": 0.006814887156556574, "loss": 2.7798, "step": 3153 }, { "crossentropy": 2.7977988719940186, "epoch": 0.268288533514801, "grad_norm": 0.03996533527970314, "grad_norm_var": 0.00014355417019027775, "learning_rate": 0.006813020091551008, "loss": 2.7978, "step": 3154 }, { "crossentropy": 2.6706395149230957, "epoch": 0.2683735964613814, "grad_norm": 0.03913011774420738, "grad_norm_var": 0.00014288734095079526, "learning_rate": 0.006811152735426655, "loss": 2.6706, "step": 3155 }, { "crossentropy": 2.6874141693115234, "epoch": 0.2684586594079619, "grad_norm": 0.03725465014576912, "grad_norm_var": 0.00014432474898173157, "learning_rate": 0.006809285088483361, "loss": 2.6874, "step": 3156 }, { "crossentropy": 2.780839204788208, "epoch": 0.2685437223545424, "grad_norm": 0.03627642244100571, "grad_norm_var": 0.00014700396591599992, "learning_rate": 0.006807417151021015, "loss": 2.7808, "step": 3157 }, { "crossentropy": 2.7614834308624268, "epoch": 0.26862878530112283, "grad_norm": 0.037772566080093384, "grad_norm_var": 0.00014721884518188635, "learning_rate": 0.006805548923339553, "loss": 2.7615, "step": 3158 }, { "crossentropy": 2.801013708114624, "epoch": 0.2687138482477033, "grad_norm": 0.03927728533744812, "grad_norm_var": 0.00014647935960170427, "learning_rate": 0.006803680405738961, "loss": 2.801, "step": 3159 }, { "crossentropy": 2.7764601707458496, "epoch": 0.2687989111942838, "grad_norm": 0.04368389770388603, "grad_norm_var": 0.00014656336122113197, "learning_rate": 0.006801811598519268, "loss": 2.7765, "step": 3160 }, { "crossentropy": 2.77872633934021, "epoch": 0.26888397414086423, "grad_norm": 0.045397330075502396, "grad_norm_var": 0.00014658416816777184, "learning_rate": 0.006799942501980548, "loss": 2.7787, "step": 3161 }, { "crossentropy": 2.8548355102539062, "epoch": 0.2689690370874447, "grad_norm": 0.041079118847846985, "grad_norm_var": 5.842951297079825e-06, "learning_rate": 0.006798073116422929, "loss": 2.8548, "step": 3162 }, { "crossentropy": 2.7284152507781982, "epoch": 0.2690541000340252, "grad_norm": 0.042993590235710144, "grad_norm_var": 6.499373081073935e-06, "learning_rate": 0.006796203442146576, "loss": 2.7284, "step": 3163 }, { "crossentropy": 2.7875821590423584, "epoch": 0.26913916298060564, "grad_norm": 0.04328136146068573, "grad_norm_var": 6.856273116790691e-06, "learning_rate": 0.006794333479451707, "loss": 2.7876, "step": 3164 }, { "crossentropy": 2.7524704933166504, "epoch": 0.26922422592718614, "grad_norm": 0.03876982629299164, "grad_norm_var": 6.994910717929523e-06, "learning_rate": 0.006792463228638585, "loss": 2.7525, "step": 3165 }, { "crossentropy": 2.806596517562866, "epoch": 0.2693092888737666, "grad_norm": 0.041359793394804, "grad_norm_var": 6.553281676549503e-06, "learning_rate": 0.006790592690007515, "loss": 2.8066, "step": 3166 }, { "crossentropy": 2.8673510551452637, "epoch": 0.26939435182034704, "grad_norm": 0.042650461196899414, "grad_norm_var": 6.694856998706635e-06, "learning_rate": 0.006788721863858855, "loss": 2.8674, "step": 3167 }, { "crossentropy": 2.7408714294433594, "epoch": 0.26947941476692755, "grad_norm": 0.03874018043279648, "grad_norm_var": 6.693500622505696e-06, "learning_rate": 0.006786850750493005, "loss": 2.7409, "step": 3168 }, { "crossentropy": 2.8742589950561523, "epoch": 0.269564477713508, "grad_norm": 0.04121711850166321, "grad_norm_var": 6.6534039823124e-06, "learning_rate": 0.006784979350210409, "loss": 2.8743, "step": 3169 }, { "crossentropy": 2.7011828422546387, "epoch": 0.26964954066008845, "grad_norm": 0.03862644359469414, "grad_norm_var": 6.870364443665317e-06, "learning_rate": 0.006783107663311565, "loss": 2.7012, "step": 3170 }, { "crossentropy": 2.8085358142852783, "epoch": 0.26973460360666895, "grad_norm": 0.0444590300321579, "grad_norm_var": 7.693617559819711e-06, "learning_rate": 0.00678123569009701, "loss": 2.8085, "step": 3171 }, { "crossentropy": 2.83988356590271, "epoch": 0.2698196665532494, "grad_norm": 0.04359472170472145, "grad_norm_var": 7.2067985069377075e-06, "learning_rate": 0.006779363430867325, "loss": 2.8399, "step": 3172 }, { "crossentropy": 2.7218313217163086, "epoch": 0.26990472949982985, "grad_norm": 0.0454782173037529, "grad_norm_var": 6.459695058325052e-06, "learning_rate": 0.006777490885923146, "loss": 2.7218, "step": 3173 }, { "crossentropy": 2.807529926300049, "epoch": 0.26998979244641036, "grad_norm": 0.040391746908426285, "grad_norm_var": 5.491121361550499e-06, "learning_rate": 0.0067756180555651494, "loss": 2.8075, "step": 3174 }, { "crossentropy": 2.7533154487609863, "epoch": 0.2700748553929908, "grad_norm": 0.03693581745028496, "grad_norm_var": 6.6642858810634155e-06, "learning_rate": 0.006773744940094055, "loss": 2.7533, "step": 3175 }, { "crossentropy": 2.7575323581695557, "epoch": 0.27015991833957126, "grad_norm": 0.03609834238886833, "grad_norm_var": 8.346253611482036e-06, "learning_rate": 0.0067718715398106324, "loss": 2.7575, "step": 3176 }, { "crossentropy": 2.65405011177063, "epoch": 0.27024498128615176, "grad_norm": 0.03708404302597046, "grad_norm_var": 8.142957128997921e-06, "learning_rate": 0.006769997855015695, "loss": 2.6541, "step": 3177 }, { "crossentropy": 2.808955430984497, "epoch": 0.2703300442327322, "grad_norm": 0.04224945232272148, "grad_norm_var": 8.272509044611406e-06, "learning_rate": 0.006768123886010105, "loss": 2.809, "step": 3178 }, { "crossentropy": 2.6767959594726562, "epoch": 0.27041510717931266, "grad_norm": 0.040359977632761, "grad_norm_var": 7.960531229779753e-06, "learning_rate": 0.006766249633094765, "loss": 2.6768, "step": 3179 }, { "crossentropy": 2.775034189224243, "epoch": 0.27050017012589317, "grad_norm": 0.039447132498025894, "grad_norm_var": 7.5627768641107925e-06, "learning_rate": 0.006764375096570628, "loss": 2.775, "step": 3180 }, { "crossentropy": 2.7389118671417236, "epoch": 0.2705852330724736, "grad_norm": 0.03646537661552429, "grad_norm_var": 8.415969717643207e-06, "learning_rate": 0.006762500276738689, "loss": 2.7389, "step": 3181 }, { "crossentropy": 2.8374292850494385, "epoch": 0.2706702960190541, "grad_norm": 0.04063362255692482, "grad_norm_var": 8.348480864241575e-06, "learning_rate": 0.006760625173899991, "loss": 2.8374, "step": 3182 }, { "crossentropy": 2.720217227935791, "epoch": 0.2707553589656346, "grad_norm": 0.04075092077255249, "grad_norm_var": 7.972860330181647e-06, "learning_rate": 0.006758749788355622, "loss": 2.7202, "step": 3183 }, { "crossentropy": 2.7488627433776855, "epoch": 0.270840421912215, "grad_norm": 0.039370402693748474, "grad_norm_var": 7.878523463564155e-06, "learning_rate": 0.0067568741204067145, "loss": 2.7489, "step": 3184 }, { "crossentropy": 2.7370073795318604, "epoch": 0.27092548485879553, "grad_norm": 0.04396539181470871, "grad_norm_var": 8.724157540092257e-06, "learning_rate": 0.006754998170354445, "loss": 2.737, "step": 3185 }, { "crossentropy": 2.8087961673736572, "epoch": 0.271010547805376, "grad_norm": 0.04241681471467018, "grad_norm_var": 8.741221841383704e-06, "learning_rate": 0.00675312193850004, "loss": 2.8088, "step": 3186 }, { "crossentropy": 2.6551105976104736, "epoch": 0.27109561075195643, "grad_norm": 0.04449373856186867, "grad_norm_var": 8.759126752450053e-06, "learning_rate": 0.006751245425144765, "loss": 2.6551, "step": 3187 }, { "crossentropy": 2.7262933254241943, "epoch": 0.27118067369853693, "grad_norm": 0.04581195488572121, "grad_norm_var": 9.949209611228212e-06, "learning_rate": 0.006749368630589937, "loss": 2.7263, "step": 3188 }, { "crossentropy": 2.765021800994873, "epoch": 0.2712657366451174, "grad_norm": 0.04161122068762779, "grad_norm_var": 8.444430713774817e-06, "learning_rate": 0.006747491555136913, "loss": 2.765, "step": 3189 }, { "crossentropy": 2.762355089187622, "epoch": 0.27135079959169783, "grad_norm": 0.03671342134475708, "grad_norm_var": 9.345787433714023e-06, "learning_rate": 0.0067456141990871, "loss": 2.7624, "step": 3190 }, { "crossentropy": 2.801069974899292, "epoch": 0.27143586253827834, "grad_norm": 0.038846228271722794, "grad_norm_var": 8.723208911091267e-06, "learning_rate": 0.0067437365627419435, "loss": 2.8011, "step": 3191 }, { "crossentropy": 2.7583160400390625, "epoch": 0.2715209254848588, "grad_norm": 0.038043782114982605, "grad_norm_var": 7.845268202664491e-06, "learning_rate": 0.006741858646402941, "loss": 2.7583, "step": 3192 }, { "crossentropy": 2.754148006439209, "epoch": 0.27160598843143924, "grad_norm": 0.036956142634153366, "grad_norm_var": 7.904825068116619e-06, "learning_rate": 0.0067399804503716285, "loss": 2.7541, "step": 3193 }, { "crossentropy": 2.762127161026001, "epoch": 0.27169105137801974, "grad_norm": 0.03874044120311737, "grad_norm_var": 7.859849088785673e-06, "learning_rate": 0.006738101974949593, "loss": 2.7621, "step": 3194 }, { "crossentropy": 2.683718204498291, "epoch": 0.2717761143246002, "grad_norm": 0.035605523735284805, "grad_norm_var": 9.227758214256215e-06, "learning_rate": 0.006736223220438463, "loss": 2.6837, "step": 3195 }, { "crossentropy": 2.720548152923584, "epoch": 0.2718611772711807, "grad_norm": 0.040908679366111755, "grad_norm_var": 9.255084346494978e-06, "learning_rate": 0.006734344187139913, "loss": 2.7205, "step": 3196 }, { "crossentropy": 2.816019058227539, "epoch": 0.27194624021776115, "grad_norm": 0.11254648864269257, "grad_norm_var": 0.00033432476206031345, "learning_rate": 0.006732464875355657, "loss": 2.816, "step": 3197 }, { "crossentropy": 2.7078495025634766, "epoch": 0.2720313031643416, "grad_norm": 0.05391214042901993, "grad_norm_var": 0.00033790023215702573, "learning_rate": 0.006730585285387465, "loss": 2.7078, "step": 3198 }, { "crossentropy": 2.7016677856445312, "epoch": 0.2721163661109221, "grad_norm": 0.07580828666687012, "grad_norm_var": 0.00039172838821547543, "learning_rate": 0.00672870541753714, "loss": 2.7017, "step": 3199 }, { "crossentropy": 2.7591607570648193, "epoch": 0.27220142905750255, "grad_norm": 0.04316125810146332, "grad_norm_var": 0.00038833580010503525, "learning_rate": 0.006726825272106538, "loss": 2.7592, "step": 3200 }, { "crossentropy": 2.69797420501709, "epoch": 0.272286492004083, "grad_norm": 0.045537713915109634, "grad_norm_var": 0.0003876242872951498, "learning_rate": 0.006724944849397554, "loss": 2.698, "step": 3201 }, { "crossentropy": 2.805814027786255, "epoch": 0.2723715549506635, "grad_norm": 0.04506690055131912, "grad_norm_var": 0.000386021666522925, "learning_rate": 0.006723064149712131, "loss": 2.8058, "step": 3202 }, { "crossentropy": 2.7721102237701416, "epoch": 0.27245661789724396, "grad_norm": 0.037035852670669556, "grad_norm_var": 0.00039334271563999727, "learning_rate": 0.006721183173352255, "loss": 2.7721, "step": 3203 }, { "crossentropy": 2.8386712074279785, "epoch": 0.2725416808438244, "grad_norm": 0.043075282126665115, "grad_norm_var": 0.00039457056498597224, "learning_rate": 0.006719301920619953, "loss": 2.8387, "step": 3204 }, { "crossentropy": 2.684818744659424, "epoch": 0.2726267437904049, "grad_norm": 0.03867728263139725, "grad_norm_var": 0.0003974994754185968, "learning_rate": 0.006717420391817305, "loss": 2.6848, "step": 3205 }, { "crossentropy": 2.8164892196655273, "epoch": 0.27271180673698536, "grad_norm": 0.04076215624809265, "grad_norm_var": 0.00039267961944016295, "learning_rate": 0.0067155385872464295, "loss": 2.8165, "step": 3206 }, { "crossentropy": 2.7755584716796875, "epoch": 0.2727968696835658, "grad_norm": 0.038311783224344254, "grad_norm_var": 0.00039333499534949473, "learning_rate": 0.006713656507209488, "loss": 2.7756, "step": 3207 }, { "crossentropy": 2.746201515197754, "epoch": 0.2728819326301463, "grad_norm": 0.03947228938341141, "grad_norm_var": 0.00039161203230737797, "learning_rate": 0.00671177415200869, "loss": 2.7462, "step": 3208 }, { "crossentropy": 2.9126768112182617, "epoch": 0.27296699557672677, "grad_norm": 0.042430292814970016, "grad_norm_var": 0.0003855346393059478, "learning_rate": 0.006709891521946286, "loss": 2.9127, "step": 3209 }, { "crossentropy": 2.7456164360046387, "epoch": 0.2730520585233073, "grad_norm": 0.04137396439909935, "grad_norm_var": 0.0003826497487680933, "learning_rate": 0.006708008617324572, "loss": 2.7456, "step": 3210 }, { "crossentropy": 2.782581090927124, "epoch": 0.2731371214698877, "grad_norm": 0.03686530515551567, "grad_norm_var": 0.00038060733701685646, "learning_rate": 0.006706125438445891, "loss": 2.7826, "step": 3211 }, { "crossentropy": 2.668278932571411, "epoch": 0.2732221844164682, "grad_norm": 0.03812350705265999, "grad_norm_var": 0.00038388677526557017, "learning_rate": 0.006704241985612625, "loss": 2.6683, "step": 3212 }, { "crossentropy": 2.7925920486450195, "epoch": 0.2733072473630487, "grad_norm": 0.03631887212395668, "grad_norm_var": 9.366524995914223e-05, "learning_rate": 0.006702358259127202, "loss": 2.7926, "step": 3213 }, { "crossentropy": 2.82063364982605, "epoch": 0.2733923103096291, "grad_norm": 0.03860372304916382, "grad_norm_var": 8.705096387366476e-05, "learning_rate": 0.006700474259292096, "loss": 2.8206, "step": 3214 }, { "crossentropy": 2.8394360542297363, "epoch": 0.2734773732562096, "grad_norm": 0.03724249079823494, "grad_norm_var": 8.93445070760847e-06, "learning_rate": 0.006698589986409822, "loss": 2.8394, "step": 3215 }, { "crossentropy": 2.6691343784332275, "epoch": 0.2735624362027901, "grad_norm": 0.041645050048828125, "grad_norm_var": 8.46505929154902e-06, "learning_rate": 0.006696705440782939, "loss": 2.6691, "step": 3216 }, { "crossentropy": 2.7858645915985107, "epoch": 0.27364749914937053, "grad_norm": 0.044676560908555984, "grad_norm_var": 7.879458677797064e-06, "learning_rate": 0.006694820622714052, "loss": 2.7859, "step": 3217 }, { "crossentropy": 2.816089630126953, "epoch": 0.273732562095951, "grad_norm": 0.03813917934894562, "grad_norm_var": 6.180366548350485e-06, "learning_rate": 0.006692935532505808, "loss": 2.8161, "step": 3218 }, { "crossentropy": 2.8469324111938477, "epoch": 0.2738176250425315, "grad_norm": 0.04023009166121483, "grad_norm_var": 5.748527881023711e-06, "learning_rate": 0.0066910501704608985, "loss": 2.8469, "step": 3219 }, { "crossentropy": 2.7658140659332275, "epoch": 0.27390268798911194, "grad_norm": 0.05031723156571388, "grad_norm_var": 1.2240410743807652e-05, "learning_rate": 0.006689164536882059, "loss": 2.7658, "step": 3220 }, { "crossentropy": 2.783806800842285, "epoch": 0.2739877509356924, "grad_norm": 0.03747856244444847, "grad_norm_var": 1.2573491727188984e-05, "learning_rate": 0.006687278632072066, "loss": 2.7838, "step": 3221 }, { "crossentropy": 2.801213264465332, "epoch": 0.2740728138822729, "grad_norm": 0.03792816400527954, "grad_norm_var": 1.283449065354458e-05, "learning_rate": 0.006685392456333743, "loss": 2.8012, "step": 3222 }, { "crossentropy": 2.808211088180542, "epoch": 0.27415787682885334, "grad_norm": 0.04156388342380524, "grad_norm_var": 1.2786311198342605e-05, "learning_rate": 0.006683506009969954, "loss": 2.8082, "step": 3223 }, { "crossentropy": 2.868713855743408, "epoch": 0.27424293977543385, "grad_norm": 0.0391414575278759, "grad_norm_var": 1.2823071516466535e-05, "learning_rate": 0.00668161929328361, "loss": 2.8687, "step": 3224 }, { "crossentropy": 2.7721948623657227, "epoch": 0.2743280027220143, "grad_norm": 0.043732307851314545, "grad_norm_var": 1.332837770910399e-05, "learning_rate": 0.006679732306577662, "loss": 2.7722, "step": 3225 }, { "crossentropy": 2.830707550048828, "epoch": 0.27441306566859475, "grad_norm": 0.038948364555835724, "grad_norm_var": 1.3320068408602335e-05, "learning_rate": 0.006677845050155106, "loss": 2.8307, "step": 3226 }, { "crossentropy": 2.8364439010620117, "epoch": 0.27449812861517525, "grad_norm": 0.03738079592585564, "grad_norm_var": 1.3117121031497719e-05, "learning_rate": 0.0066759575243189795, "loss": 2.8364, "step": 3227 }, { "crossentropy": 2.69852876663208, "epoch": 0.2745831915617557, "grad_norm": 0.03538769856095314, "grad_norm_var": 1.4302927427835693e-05, "learning_rate": 0.006674069729372369, "loss": 2.6985, "step": 3228 }, { "crossentropy": 2.753777265548706, "epoch": 0.27466825450833615, "grad_norm": 0.03849557414650917, "grad_norm_var": 1.3553648280591595e-05, "learning_rate": 0.006672181665618396, "loss": 2.7538, "step": 3229 }, { "crossentropy": 2.7344722747802734, "epoch": 0.27475331745491666, "grad_norm": 0.03867528960108757, "grad_norm_var": 1.35401014373047e-05, "learning_rate": 0.006670293333360229, "loss": 2.7345, "step": 3230 }, { "crossentropy": 2.7847180366516113, "epoch": 0.2748383804014971, "grad_norm": 0.040173038840293884, "grad_norm_var": 1.2975391196526758e-05, "learning_rate": 0.006668404732901082, "loss": 2.7847, "step": 3231 }, { "crossentropy": 2.80804443359375, "epoch": 0.27492344334807756, "grad_norm": 0.0402042493224144, "grad_norm_var": 1.2836095243665077e-05, "learning_rate": 0.006666515864544209, "loss": 2.808, "step": 3232 }, { "crossentropy": 2.7539823055267334, "epoch": 0.27500850629465806, "grad_norm": 0.03933149203658104, "grad_norm_var": 1.1398961751821606e-05, "learning_rate": 0.006664626728592908, "loss": 2.754, "step": 3233 }, { "crossentropy": 2.745643138885498, "epoch": 0.2750935692412385, "grad_norm": 0.040510039776563644, "grad_norm_var": 1.1218794970026489e-05, "learning_rate": 0.006662737325350518, "loss": 2.7456, "step": 3234 }, { "crossentropy": 2.654755115509033, "epoch": 0.27517863218781896, "grad_norm": 0.03738642856478691, "grad_norm_var": 1.1625065512021795e-05, "learning_rate": 0.006660847655120426, "loss": 2.6548, "step": 3235 }, { "crossentropy": 2.8068435192108154, "epoch": 0.27526369513439947, "grad_norm": 0.04796069115400314, "grad_norm_var": 8.66471908376783e-06, "learning_rate": 0.006658957718206058, "loss": 2.8068, "step": 3236 }, { "crossentropy": 2.845247268676758, "epoch": 0.2753487580809799, "grad_norm": 0.03735073655843735, "grad_norm_var": 8.702640478628482e-06, "learning_rate": 0.00665706751491088, "loss": 2.8452, "step": 3237 }, { "crossentropy": 2.5901806354522705, "epoch": 0.2754338210275604, "grad_norm": 0.03605526685714722, "grad_norm_var": 9.348264278406099e-06, "learning_rate": 0.0066551770455384055, "loss": 2.5902, "step": 3238 }, { "crossentropy": 2.770564079284668, "epoch": 0.27551888397414087, "grad_norm": 0.03802674263715744, "grad_norm_var": 9.16562212217864e-06, "learning_rate": 0.006653286310392193, "loss": 2.7706, "step": 3239 }, { "crossentropy": 2.789006471633911, "epoch": 0.2756039469207213, "grad_norm": 0.04272252321243286, "grad_norm_var": 9.892612451300962e-06, "learning_rate": 0.006651395309775836, "loss": 2.789, "step": 3240 }, { "crossentropy": 2.8581032752990723, "epoch": 0.2756890098673018, "grad_norm": 0.04044999182224274, "grad_norm_var": 8.723059906869517e-06, "learning_rate": 0.0066495040439929765, "loss": 2.8581, "step": 3241 }, { "crossentropy": 2.7965967655181885, "epoch": 0.2757740728138823, "grad_norm": 0.03771347925066948, "grad_norm_var": 8.87893053083492e-06, "learning_rate": 0.006647612513347297, "loss": 2.7966, "step": 3242 }, { "crossentropy": 2.784573554992676, "epoch": 0.2758591357604627, "grad_norm": 0.037574347108602524, "grad_norm_var": 8.833317501706165e-06, "learning_rate": 0.006645720718142522, "loss": 2.7846, "step": 3243 }, { "crossentropy": 2.7686421871185303, "epoch": 0.27594419870704323, "grad_norm": 0.03747750446200371, "grad_norm_var": 8.029772012714773e-06, "learning_rate": 0.00664382865868242, "loss": 2.7686, "step": 3244 }, { "crossentropy": 2.667206287384033, "epoch": 0.2760292616536237, "grad_norm": 0.0331321619451046, "grad_norm_var": 1.0461355408526255e-05, "learning_rate": 0.006641936335270802, "loss": 2.6672, "step": 3245 }, { "crossentropy": 2.838120937347412, "epoch": 0.27611432460020413, "grad_norm": 0.0412120521068573, "grad_norm_var": 1.0737997165845686e-05, "learning_rate": 0.00664004374821152, "loss": 2.8381, "step": 3246 }, { "crossentropy": 2.758554458618164, "epoch": 0.27619938754678464, "grad_norm": 0.04516565054655075, "grad_norm_var": 1.294025747703009e-05, "learning_rate": 0.006638150897808468, "loss": 2.7586, "step": 3247 }, { "crossentropy": 2.7434849739074707, "epoch": 0.2762844504933651, "grad_norm": 0.0471009686589241, "grad_norm_var": 1.6544944325047547e-05, "learning_rate": 0.006636257784365585, "loss": 2.7435, "step": 3248 }, { "crossentropy": 2.7463772296905518, "epoch": 0.27636951343994554, "grad_norm": 0.04153762012720108, "grad_norm_var": 1.666774764672007e-05, "learning_rate": 0.006634364408186848, "loss": 2.7464, "step": 3249 }, { "crossentropy": 2.823967456817627, "epoch": 0.27645457638652604, "grad_norm": 0.03973569720983505, "grad_norm_var": 1.6661444095659947e-05, "learning_rate": 0.006632470769576282, "loss": 2.824, "step": 3250 }, { "crossentropy": 2.679765462875366, "epoch": 0.2765396393331065, "grad_norm": 0.03746313601732254, "grad_norm_var": 1.6634696399182394e-05, "learning_rate": 0.006630576868837948, "loss": 2.6798, "step": 3251 }, { "crossentropy": 2.7053451538085938, "epoch": 0.276624702279687, "grad_norm": 0.03829335793852806, "grad_norm_var": 1.2269292075984111e-05, "learning_rate": 0.006628682706275952, "loss": 2.7053, "step": 3252 }, { "crossentropy": 2.8761637210845947, "epoch": 0.27670976522626745, "grad_norm": 0.03825608268380165, "grad_norm_var": 1.2068536434515591e-05, "learning_rate": 0.006626788282194445, "loss": 2.8762, "step": 3253 }, { "crossentropy": 2.7939395904541016, "epoch": 0.2767948281728479, "grad_norm": 0.036887701600790024, "grad_norm_var": 1.1730088915866816e-05, "learning_rate": 0.0066248935968976136, "loss": 2.7939, "step": 3254 }, { "crossentropy": 2.722996950149536, "epoch": 0.2768798911194284, "grad_norm": 0.04131890833377838, "grad_norm_var": 1.1740242711830568e-05, "learning_rate": 0.006622998650689691, "loss": 2.723, "step": 3255 }, { "crossentropy": 2.704758405685425, "epoch": 0.27696495406600885, "grad_norm": 0.03933677822351456, "grad_norm_var": 1.11159649164906e-05, "learning_rate": 0.0066211034438749495, "loss": 2.7048, "step": 3256 }, { "crossentropy": 2.7209441661834717, "epoch": 0.2770500170125893, "grad_norm": 0.036334991455078125, "grad_norm_var": 1.167553883952051e-05, "learning_rate": 0.006619207976757707, "loss": 2.7209, "step": 3257 }, { "crossentropy": 2.7936668395996094, "epoch": 0.2771350799591698, "grad_norm": 0.03607573360204697, "grad_norm_var": 1.218607686661761e-05, "learning_rate": 0.00661731224964232, "loss": 2.7937, "step": 3258 }, { "crossentropy": 2.794813632965088, "epoch": 0.27722014290575026, "grad_norm": 0.037361785769462585, "grad_norm_var": 1.223444758768847e-05, "learning_rate": 0.006615416262833187, "loss": 2.7948, "step": 3259 }, { "crossentropy": 2.742884635925293, "epoch": 0.2773052058523307, "grad_norm": 0.039064399898052216, "grad_norm_var": 1.2034123944413417e-05, "learning_rate": 0.0066135200166347495, "loss": 2.7429, "step": 3260 }, { "crossentropy": 2.8342623710632324, "epoch": 0.2773902687989112, "grad_norm": 0.038847822695970535, "grad_norm_var": 9.400396083839024e-06, "learning_rate": 0.006611623511351491, "loss": 2.8343, "step": 3261 }, { "crossentropy": 2.838597297668457, "epoch": 0.27747533174549166, "grad_norm": 0.039215754717588425, "grad_norm_var": 9.226919231091328e-06, "learning_rate": 0.006609726747287933, "loss": 2.8386, "step": 3262 }, { "crossentropy": 2.7104506492614746, "epoch": 0.2775603946920721, "grad_norm": 0.036610376089811325, "grad_norm_var": 7.338380833555683e-06, "learning_rate": 0.006607829724748643, "loss": 2.7105, "step": 3263 }, { "crossentropy": 2.775991916656494, "epoch": 0.2776454576386526, "grad_norm": 0.037454456090927124, "grad_norm_var": 2.6899242399437694e-06, "learning_rate": 0.006605932444038228, "loss": 2.776, "step": 3264 }, { "crossentropy": 2.734544277191162, "epoch": 0.27773052058523306, "grad_norm": 0.04062816873192787, "grad_norm_var": 2.35656154093742e-06, "learning_rate": 0.006604034905461336, "loss": 2.7345, "step": 3265 }, { "crossentropy": 2.763427734375, "epoch": 0.2778155835318135, "grad_norm": 0.041696757078170776, "grad_norm_var": 2.97092815036374e-06, "learning_rate": 0.006602137109322658, "loss": 2.7634, "step": 3266 }, { "crossentropy": 2.812476634979248, "epoch": 0.277900646478394, "grad_norm": 0.045284729450941086, "grad_norm_var": 5.788390996611103e-06, "learning_rate": 0.006600239055926924, "loss": 2.8125, "step": 3267 }, { "crossentropy": 2.7928450107574463, "epoch": 0.27798570942497447, "grad_norm": 0.04033125564455986, "grad_norm_var": 5.8785706100517036e-06, "learning_rate": 0.006598340745578908, "loss": 2.7928, "step": 3268 }, { "crossentropy": 2.717926263809204, "epoch": 0.278070772371555, "grad_norm": 0.03810082748532295, "grad_norm_var": 5.896389757789611e-06, "learning_rate": 0.006596442178583424, "loss": 2.7179, "step": 3269 }, { "crossentropy": 2.7672770023345947, "epoch": 0.2781558353181354, "grad_norm": 0.03565994277596474, "grad_norm_var": 6.342019274405835e-06, "learning_rate": 0.006594543355245324, "loss": 2.7673, "step": 3270 }, { "crossentropy": 2.881411075592041, "epoch": 0.2782408982647159, "grad_norm": 0.041969966143369675, "grad_norm_var": 6.573485402420048e-06, "learning_rate": 0.0065926442758695095, "loss": 2.8814, "step": 3271 }, { "crossentropy": 2.7831127643585205, "epoch": 0.2783259612112964, "grad_norm": 0.03625475987792015, "grad_norm_var": 7.028094200282192e-06, "learning_rate": 0.006590744940760914, "loss": 2.7831, "step": 3272 }, { "crossentropy": 2.8635613918304443, "epoch": 0.27841102415787683, "grad_norm": 0.03902817890048027, "grad_norm_var": 6.59420019356634e-06, "learning_rate": 0.006588845350224517, "loss": 2.8636, "step": 3273 }, { "crossentropy": 2.7811872959136963, "epoch": 0.2784960871044573, "grad_norm": 0.03660158812999725, "grad_norm_var": 6.408270002859704e-06, "learning_rate": 0.00658694550456534, "loss": 2.7812, "step": 3274 }, { "crossentropy": 2.871533155441284, "epoch": 0.2785811500510378, "grad_norm": 0.03832319751381874, "grad_norm_var": 6.255152290949677e-06, "learning_rate": 0.006585045404088441, "loss": 2.8715, "step": 3275 }, { "crossentropy": 2.7014641761779785, "epoch": 0.27866621299761823, "grad_norm": 0.03784193471074104, "grad_norm_var": 6.348979262560337e-06, "learning_rate": 0.006583145049098922, "loss": 2.7015, "step": 3276 }, { "crossentropy": 2.818953275680542, "epoch": 0.2787512759441987, "grad_norm": 0.03639240562915802, "grad_norm_var": 6.772542408189688e-06, "learning_rate": 0.006581244439901926, "loss": 2.819, "step": 3277 }, { "crossentropy": 2.7315874099731445, "epoch": 0.2788363388907792, "grad_norm": 0.03787427023053169, "grad_norm_var": 6.8172960873765415e-06, "learning_rate": 0.006579343576802633, "loss": 2.7316, "step": 3278 }, { "crossentropy": 2.8390557765960693, "epoch": 0.27892140183735964, "grad_norm": 0.038047242909669876, "grad_norm_var": 6.535786386539094e-06, "learning_rate": 0.006577442460106272, "loss": 2.8391, "step": 3279 }, { "crossentropy": 2.782418727874756, "epoch": 0.2790064647839401, "grad_norm": 0.04056697338819504, "grad_norm_var": 6.564979098181804e-06, "learning_rate": 0.006575541090118105, "loss": 2.7824, "step": 3280 }, { "crossentropy": 2.824150323867798, "epoch": 0.2790915277305206, "grad_norm": 0.03970564901828766, "grad_norm_var": 6.422529729088223e-06, "learning_rate": 0.006573639467143435, "loss": 2.8242, "step": 3281 }, { "crossentropy": 2.7405240535736084, "epoch": 0.27917659067710104, "grad_norm": 0.03716843202710152, "grad_norm_var": 6.063811045768179e-06, "learning_rate": 0.006571737591487611, "loss": 2.7405, "step": 3282 }, { "crossentropy": 2.6989338397979736, "epoch": 0.27926165362368155, "grad_norm": 0.04350193962454796, "grad_norm_var": 4.696509396243109e-06, "learning_rate": 0.006569835463456019, "loss": 2.6989, "step": 3283 }, { "crossentropy": 2.7625575065612793, "epoch": 0.279346716570262, "grad_norm": 0.03661741688847542, "grad_norm_var": 4.6941035828656585e-06, "learning_rate": 0.006567933083354084, "loss": 2.7626, "step": 3284 }, { "crossentropy": 2.657655715942383, "epoch": 0.27943177951684245, "grad_norm": 0.040450841188430786, "grad_norm_var": 4.9601176391762696e-06, "learning_rate": 0.006566030451487276, "loss": 2.6577, "step": 3285 }, { "crossentropy": 2.735517978668213, "epoch": 0.27951684246342295, "grad_norm": 0.039688218384981155, "grad_norm_var": 4.448741898694224e-06, "learning_rate": 0.006564127568161101, "loss": 2.7355, "step": 3286 }, { "crossentropy": 2.831019401550293, "epoch": 0.2796019054100034, "grad_norm": 0.03954918310046196, "grad_norm_var": 3.776357891522158e-06, "learning_rate": 0.006562224433681108, "loss": 2.831, "step": 3287 }, { "crossentropy": 2.7038490772247314, "epoch": 0.27968696835658385, "grad_norm": 0.03866098076105118, "grad_norm_var": 3.3855592095991907e-06, "learning_rate": 0.006560321048352886, "loss": 2.7038, "step": 3288 }, { "crossentropy": 2.7745864391326904, "epoch": 0.27977203130316436, "grad_norm": 0.0349971204996109, "grad_norm_var": 4.252254502603153e-06, "learning_rate": 0.006558417412482062, "loss": 2.7746, "step": 3289 }, { "crossentropy": 2.8582844734191895, "epoch": 0.2798570942497448, "grad_norm": 0.038035158067941666, "grad_norm_var": 4.017982780444435e-06, "learning_rate": 0.006556513526374307, "loss": 2.8583, "step": 3290 }, { "crossentropy": 2.7670791149139404, "epoch": 0.27994215719632526, "grad_norm": 0.04224122315645218, "grad_norm_var": 4.838658589310408e-06, "learning_rate": 0.006554609390335331, "loss": 2.7671, "step": 3291 }, { "crossentropy": 2.7148895263671875, "epoch": 0.28002722014290576, "grad_norm": 0.03851458430290222, "grad_norm_var": 4.777990291905907e-06, "learning_rate": 0.006552705004670879, "loss": 2.7149, "step": 3292 }, { "crossentropy": 2.7089016437530518, "epoch": 0.2801122830894862, "grad_norm": 0.038484837859869, "grad_norm_var": 4.358807983346052e-06, "learning_rate": 0.0065508003696867455, "loss": 2.7089, "step": 3293 }, { "crossentropy": 2.6884219646453857, "epoch": 0.28019734603606666, "grad_norm": 0.043103840202093124, "grad_norm_var": 5.278603361404526e-06, "learning_rate": 0.006548895485688758, "loss": 2.6884, "step": 3294 }, { "crossentropy": 2.6916098594665527, "epoch": 0.28028240898264717, "grad_norm": 0.03810802847146988, "grad_norm_var": 5.268410705123553e-06, "learning_rate": 0.006546990352982784, "loss": 2.6916, "step": 3295 }, { "crossentropy": 2.7923905849456787, "epoch": 0.2803674719292276, "grad_norm": 0.03863811865448952, "grad_norm_var": 5.184654419472928e-06, "learning_rate": 0.006545084971874737, "loss": 2.7924, "step": 3296 }, { "crossentropy": 2.6896421909332275, "epoch": 0.2804525348758081, "grad_norm": 0.03623136132955551, "grad_norm_var": 5.7125245343809965e-06, "learning_rate": 0.006543179342670564, "loss": 2.6896, "step": 3297 }, { "crossentropy": 2.706117630004883, "epoch": 0.2805375978223886, "grad_norm": 0.03915832191705704, "grad_norm_var": 5.474198773382464e-06, "learning_rate": 0.006541273465676253, "loss": 2.7061, "step": 3298 }, { "crossentropy": 2.7627623081207275, "epoch": 0.280622660768969, "grad_norm": 0.04337676241993904, "grad_norm_var": 5.402106059951284e-06, "learning_rate": 0.006539367341197835, "loss": 2.7628, "step": 3299 }, { "crossentropy": 2.795279026031494, "epoch": 0.28070772371554953, "grad_norm": 0.04216211289167404, "grad_norm_var": 5.476400043982234e-06, "learning_rate": 0.006537460969541378, "loss": 2.7953, "step": 3300 }, { "crossentropy": 2.8207638263702393, "epoch": 0.28079278666213, "grad_norm": 0.040208086371421814, "grad_norm_var": 5.448094620557192e-06, "learning_rate": 0.00653555435101299, "loss": 2.8208, "step": 3301 }, { "crossentropy": 2.674741506576538, "epoch": 0.28087784960871043, "grad_norm": 0.0408150777220726, "grad_norm_var": 5.563644662564862e-06, "learning_rate": 0.006533647485918819, "loss": 2.6747, "step": 3302 }, { "crossentropy": 2.774320602416992, "epoch": 0.28096291255529093, "grad_norm": 0.04145258665084839, "grad_norm_var": 5.798043393140327e-06, "learning_rate": 0.006531740374565053, "loss": 2.7743, "step": 3303 }, { "crossentropy": 2.7323482036590576, "epoch": 0.2810479755018714, "grad_norm": 0.044133253395557404, "grad_norm_var": 6.9576879938479734e-06, "learning_rate": 0.006529833017257919, "loss": 2.7323, "step": 3304 }, { "crossentropy": 2.7515690326690674, "epoch": 0.28113303844845183, "grad_norm": 0.043200280517339706, "grad_norm_var": 5.714714714164653e-06, "learning_rate": 0.006527925414303684, "loss": 2.7516, "step": 3305 }, { "crossentropy": 2.6968281269073486, "epoch": 0.28121810139503234, "grad_norm": 0.043562646955251694, "grad_norm_var": 5.813982049272341e-06, "learning_rate": 0.006526017566008652, "loss": 2.6968, "step": 3306 }, { "crossentropy": 2.7639403343200684, "epoch": 0.2813031643416128, "grad_norm": 0.04209635779261589, "grad_norm_var": 5.788169507218478e-06, "learning_rate": 0.006524109472679172, "loss": 2.7639, "step": 3307 }, { "crossentropy": 2.769975423812866, "epoch": 0.28138822728819324, "grad_norm": 0.039068710058927536, "grad_norm_var": 5.63644541710373e-06, "learning_rate": 0.0065222011346216255, "loss": 2.77, "step": 3308 }, { "crossentropy": 2.7341368198394775, "epoch": 0.28147329023477374, "grad_norm": 0.03721006214618683, "grad_norm_var": 6.142146768218538e-06, "learning_rate": 0.006520292552142438, "loss": 2.7341, "step": 3309 }, { "crossentropy": 2.8571910858154297, "epoch": 0.2815583531813542, "grad_norm": 0.0660300999879837, "grad_norm_var": 4.608786531612244e-05, "learning_rate": 0.006518383725548074, "loss": 2.8572, "step": 3310 }, { "crossentropy": 2.8507494926452637, "epoch": 0.2816434161279347, "grad_norm": 0.04502669349312782, "grad_norm_var": 4.5290291755394e-05, "learning_rate": 0.006516474655145035, "loss": 2.8507, "step": 3311 }, { "crossentropy": 2.7710795402526855, "epoch": 0.28172847907451515, "grad_norm": 0.04117552936077118, "grad_norm_var": 4.433601274693537e-05, "learning_rate": 0.006514565341239861, "loss": 2.7711, "step": 3312 }, { "crossentropy": 2.76753306388855, "epoch": 0.2818135420210956, "grad_norm": 0.04022092744708061, "grad_norm_var": 4.183307812462885e-05, "learning_rate": 0.006512655784139136, "loss": 2.7675, "step": 3313 }, { "crossentropy": 2.761634349822998, "epoch": 0.2818986049676761, "grad_norm": 0.04147183522582054, "grad_norm_var": 4.0965259836676235e-05, "learning_rate": 0.006510745984149476, "loss": 2.7616, "step": 3314 }, { "crossentropy": 2.773224353790283, "epoch": 0.28198366791425655, "grad_norm": 0.037818606942892075, "grad_norm_var": 4.276559224099992e-05, "learning_rate": 0.006508835941577544, "loss": 2.7732, "step": 3315 }, { "crossentropy": 2.647928476333618, "epoch": 0.282068730860837, "grad_norm": 0.03886866942048073, "grad_norm_var": 4.374703528658979e-05, "learning_rate": 0.006506925656730036, "loss": 2.6479, "step": 3316 }, { "crossentropy": 2.8526651859283447, "epoch": 0.2821537938074175, "grad_norm": 0.03824417665600777, "grad_norm_var": 4.462685643710403e-05, "learning_rate": 0.006505015129913689, "loss": 2.8527, "step": 3317 }, { "crossentropy": 2.737004041671753, "epoch": 0.28223885675399796, "grad_norm": 0.045221976935863495, "grad_norm_var": 4.4836091380257015e-05, "learning_rate": 0.006503104361435278, "loss": 2.737, "step": 3318 }, { "crossentropy": 2.824993848800659, "epoch": 0.2823239197005784, "grad_norm": 0.03914947062730789, "grad_norm_var": 4.5581425725414044e-05, "learning_rate": 0.0065011933516016165, "loss": 2.825, "step": 3319 }, { "crossentropy": 2.7948343753814697, "epoch": 0.2824089826471589, "grad_norm": 0.03979397565126419, "grad_norm_var": 4.590368311701912e-05, "learning_rate": 0.006499282100719558, "loss": 2.7948, "step": 3320 }, { "crossentropy": 2.791339159011841, "epoch": 0.28249404559373936, "grad_norm": 0.03950536251068115, "grad_norm_var": 4.635530727916803e-05, "learning_rate": 0.0064973706090959964, "loss": 2.7913, "step": 3321 }, { "crossentropy": 2.7768518924713135, "epoch": 0.2825791085403198, "grad_norm": 0.03726853430271149, "grad_norm_var": 4.76491982357626e-05, "learning_rate": 0.006495458877037861, "loss": 2.7769, "step": 3322 }, { "crossentropy": 2.816417932510376, "epoch": 0.2826641714869003, "grad_norm": 0.0425967276096344, "grad_norm_var": 4.7687240976629756e-05, "learning_rate": 0.006493546904852119, "loss": 2.8164, "step": 3323 }, { "crossentropy": 2.6779510974884033, "epoch": 0.28274923443348077, "grad_norm": 0.0404849648475647, "grad_norm_var": 4.729835999480713e-05, "learning_rate": 0.00649163469284578, "loss": 2.678, "step": 3324 }, { "crossentropy": 2.649756908416748, "epoch": 0.2828342973800613, "grad_norm": 0.03937400504946709, "grad_norm_var": 4.624349107955241e-05, "learning_rate": 0.0064897222413258905, "loss": 2.6498, "step": 3325 }, { "crossentropy": 2.7387499809265137, "epoch": 0.2829193603266417, "grad_norm": 0.0376427099108696, "grad_norm_var": 5.714668033376426e-06, "learning_rate": 0.006487809550599536, "loss": 2.7387, "step": 3326 }, { "crossentropy": 2.5908331871032715, "epoch": 0.28300442327322217, "grad_norm": 0.08750647306442261, "grad_norm_var": 0.00014560103410516181, "learning_rate": 0.006485896620973835, "loss": 2.5908, "step": 3327 }, { "crossentropy": 2.71816349029541, "epoch": 0.2830894862198027, "grad_norm": 0.04034982621669769, "grad_norm_var": 0.0001458331134514099, "learning_rate": 0.006483983452755953, "loss": 2.7182, "step": 3328 }, { "crossentropy": 2.8081557750701904, "epoch": 0.2831745491663831, "grad_norm": 0.039792250841856, "grad_norm_var": 0.00014599457620163178, "learning_rate": 0.00648207004625309, "loss": 2.8082, "step": 3329 }, { "crossentropy": 2.710876703262329, "epoch": 0.2832596121129636, "grad_norm": 0.04283614456653595, "grad_norm_var": 0.00014586601412468013, "learning_rate": 0.00648015640177248, "loss": 2.7109, "step": 3330 }, { "crossentropy": 2.7499630451202393, "epoch": 0.2833446750595441, "grad_norm": 0.042833730578422546, "grad_norm_var": 0.0001440378871812133, "learning_rate": 0.006478242519621403, "loss": 2.75, "step": 3331 }, { "crossentropy": 2.7608773708343506, "epoch": 0.28342973800612453, "grad_norm": 0.03794462978839874, "grad_norm_var": 0.00014462696695368975, "learning_rate": 0.006476328400107171, "loss": 2.7609, "step": 3332 }, { "crossentropy": 2.8366878032684326, "epoch": 0.283514800952705, "grad_norm": 0.038278184831142426, "grad_norm_var": 0.0001446047530768731, "learning_rate": 0.006474414043537138, "loss": 2.8367, "step": 3333 }, { "crossentropy": 2.813589334487915, "epoch": 0.2835998638992855, "grad_norm": 0.03723147138953209, "grad_norm_var": 0.000146399695493074, "learning_rate": 0.006472499450218694, "loss": 2.8136, "step": 3334 }, { "crossentropy": 2.881937265396118, "epoch": 0.28368492684586594, "grad_norm": 0.03810299560427666, "grad_norm_var": 0.0001469582122451627, "learning_rate": 0.006470584620459267, "loss": 2.8819, "step": 3335 }, { "crossentropy": 2.849626302719116, "epoch": 0.2837699897924464, "grad_norm": 0.04004901647567749, "grad_norm_var": 0.00014686698080514557, "learning_rate": 0.006468669554566324, "loss": 2.8496, "step": 3336 }, { "crossentropy": 2.750284194946289, "epoch": 0.2838550527390269, "grad_norm": 0.044341303408145905, "grad_norm_var": 0.0001463252880968063, "learning_rate": 0.00646675425284737, "loss": 2.7503, "step": 3337 }, { "crossentropy": 2.7944602966308594, "epoch": 0.28394011568560734, "grad_norm": 0.038059283047914505, "grad_norm_var": 0.00014576908989978363, "learning_rate": 0.006464838715609945, "loss": 2.7945, "step": 3338 }, { "crossentropy": 2.75449538230896, "epoch": 0.2840251786321878, "grad_norm": 0.037595003843307495, "grad_norm_var": 0.00014757758832580759, "learning_rate": 0.006462922943161631, "loss": 2.7545, "step": 3339 }, { "crossentropy": 2.7514116764068604, "epoch": 0.2841102415787683, "grad_norm": 0.037197086960077286, "grad_norm_var": 0.0001492029408974795, "learning_rate": 0.0064610069358100475, "loss": 2.7514, "step": 3340 }, { "crossentropy": 2.6709609031677246, "epoch": 0.28419530452534875, "grad_norm": 0.03897674381732941, "grad_norm_var": 0.00014937551612270952, "learning_rate": 0.0064590906938628454, "loss": 2.671, "step": 3341 }, { "crossentropy": 2.687450647354126, "epoch": 0.28428036747192925, "grad_norm": 0.0411318838596344, "grad_norm_var": 0.00014791341583604654, "learning_rate": 0.006457174217627722, "loss": 2.6875, "step": 3342 }, { "crossentropy": 2.729343891143799, "epoch": 0.2843654304185097, "grad_norm": 0.040900569409132004, "grad_norm_var": 4.859210973904432e-06, "learning_rate": 0.006455257507412406, "loss": 2.7293, "step": 3343 }, { "crossentropy": 2.803065061569214, "epoch": 0.28445049336509015, "grad_norm": 0.040239181369543076, "grad_norm_var": 4.8507768321052905e-06, "learning_rate": 0.006453340563524669, "loss": 2.8031, "step": 3344 }, { "crossentropy": 2.6718454360961914, "epoch": 0.28453555631167066, "grad_norm": 0.04063883051276207, "grad_norm_var": 4.903800100787038e-06, "learning_rate": 0.006451423386272311, "loss": 2.6718, "step": 3345 }, { "crossentropy": 2.8114371299743652, "epoch": 0.2846206192582511, "grad_norm": 0.03832199424505234, "grad_norm_var": 4.333281988924166e-06, "learning_rate": 0.006449505975963181, "loss": 2.8114, "step": 3346 }, { "crossentropy": 2.6772897243499756, "epoch": 0.28470568220483156, "grad_norm": 0.036808665841817856, "grad_norm_var": 3.9160565693250235e-06, "learning_rate": 0.006447588332905159, "loss": 2.6773, "step": 3347 }, { "crossentropy": 2.800508737564087, "epoch": 0.28479074515141206, "grad_norm": 0.04270683228969574, "grad_norm_var": 4.591247631220938e-06, "learning_rate": 0.006445670457406161, "loss": 2.8005, "step": 3348 }, { "crossentropy": 2.924844980239868, "epoch": 0.2848758080979925, "grad_norm": 0.08782755583524704, "grad_norm_var": 0.0001505522127499453, "learning_rate": 0.0064437523497741424, "loss": 2.9248, "step": 3349 }, { "crossentropy": 2.6723203659057617, "epoch": 0.28496087104457296, "grad_norm": 0.0441877506673336, "grad_norm_var": 0.00014868255147686577, "learning_rate": 0.006441834010317097, "loss": 2.6723, "step": 3350 }, { "crossentropy": 2.6916282176971436, "epoch": 0.28504593399115347, "grad_norm": 0.04197731986641884, "grad_norm_var": 0.0001471205745308892, "learning_rate": 0.0064399154393430585, "loss": 2.6916, "step": 3351 }, { "crossentropy": 2.7803139686584473, "epoch": 0.2851309969377339, "grad_norm": 0.04032405465841293, "grad_norm_var": 0.00014701030262088665, "learning_rate": 0.0064379966371600865, "loss": 2.7803, "step": 3352 }, { "crossentropy": 2.823599100112915, "epoch": 0.28521605988431437, "grad_norm": 0.035585466772317886, "grad_norm_var": 0.00015047192116608425, "learning_rate": 0.00643607760407629, "loss": 2.8236, "step": 3353 }, { "crossentropy": 2.773040294647217, "epoch": 0.28530112283089487, "grad_norm": 0.046389494091272354, "grad_norm_var": 0.00014970463157706285, "learning_rate": 0.006434158340399811, "loss": 2.773, "step": 3354 }, { "crossentropy": 2.7120282649993896, "epoch": 0.2853861857774753, "grad_norm": 0.04084070399403572, "grad_norm_var": 0.00014794801480023817, "learning_rate": 0.006432238846438829, "loss": 2.712, "step": 3355 }, { "crossentropy": 2.7554056644439697, "epoch": 0.2854712487240558, "grad_norm": 0.04158950224518776, "grad_norm_var": 0.000145533737348291, "learning_rate": 0.006430319122501555, "loss": 2.7554, "step": 3356 }, { "crossentropy": 2.788694381713867, "epoch": 0.2855563116706363, "grad_norm": 0.04033810645341873, "grad_norm_var": 0.00014480077484237778, "learning_rate": 0.006428399168896244, "loss": 2.7887, "step": 3357 }, { "crossentropy": 2.7702460289001465, "epoch": 0.2856413746172167, "grad_norm": 0.038715675473213196, "grad_norm_var": 0.00014600524129805743, "learning_rate": 0.006426478985931184, "loss": 2.7702, "step": 3358 }, { "crossentropy": 2.762420177459717, "epoch": 0.28572643756379723, "grad_norm": 0.03833874315023422, "grad_norm_var": 0.0001473330420416964, "learning_rate": 0.006424558573914704, "loss": 2.7624, "step": 3359 }, { "crossentropy": 2.8036346435546875, "epoch": 0.2858115005103777, "grad_norm": 0.03907482326030731, "grad_norm_var": 0.0001479126562227771, "learning_rate": 0.006422637933155162, "loss": 2.8036, "step": 3360 }, { "crossentropy": 2.829568862915039, "epoch": 0.28589656345695813, "grad_norm": 0.04085470363497734, "grad_norm_var": 0.00014783741511624018, "learning_rate": 0.006420717063960961, "loss": 2.8296, "step": 3361 }, { "crossentropy": 2.7241711616516113, "epoch": 0.28598162640353864, "grad_norm": 0.04283832386136055, "grad_norm_var": 0.00014607390084127772, "learning_rate": 0.006418795966640539, "loss": 2.7242, "step": 3362 }, { "crossentropy": 2.9192771911621094, "epoch": 0.2860666893501191, "grad_norm": 0.042148441076278687, "grad_norm_var": 0.00014298525228146158, "learning_rate": 0.006416874641502364, "loss": 2.9193, "step": 3363 }, { "crossentropy": 2.787822723388672, "epoch": 0.28615175229669954, "grad_norm": 0.04676494374871254, "grad_norm_var": 0.00014332368711846144, "learning_rate": 0.006414953088854948, "loss": 2.7878, "step": 3364 }, { "crossentropy": 2.7367684841156006, "epoch": 0.28623681524328004, "grad_norm": 0.03863950073719025, "grad_norm_var": 8.657092509336585e-06, "learning_rate": 0.00641303130900684, "loss": 2.7368, "step": 3365 }, { "crossentropy": 2.784947633743286, "epoch": 0.2863218781898605, "grad_norm": 0.03690017759799957, "grad_norm_var": 9.037281615331683e-06, "learning_rate": 0.006411109302266615, "loss": 2.7849, "step": 3366 }, { "crossentropy": 2.7350523471832275, "epoch": 0.28640694113644094, "grad_norm": 0.037523750215768814, "grad_norm_var": 9.522892731520885e-06, "learning_rate": 0.0064091870689428985, "loss": 2.7351, "step": 3367 }, { "crossentropy": 2.749622106552124, "epoch": 0.28649200408302145, "grad_norm": 0.03758632019162178, "grad_norm_var": 1.00297054426721e-05, "learning_rate": 0.006407264609344343, "loss": 2.7496, "step": 3368 }, { "crossentropy": 2.7842295169830322, "epoch": 0.2865770670296019, "grad_norm": 0.03919463977217674, "grad_norm_var": 8.595287435934849e-06, "learning_rate": 0.006405341923779641, "loss": 2.7842, "step": 3369 }, { "crossentropy": 2.682857036590576, "epoch": 0.2866621299761824, "grad_norm": 0.04797772318124771, "grad_norm_var": 1.0003593651075384e-05, "learning_rate": 0.00640341901255752, "loss": 2.6829, "step": 3370 }, { "crossentropy": 2.797651767730713, "epoch": 0.28674719292276285, "grad_norm": 0.04108014702796936, "grad_norm_var": 1.0015408185534645e-05, "learning_rate": 0.006401495875986743, "loss": 2.7977, "step": 3371 }, { "crossentropy": 2.7612955570220947, "epoch": 0.2868322558693433, "grad_norm": 0.03681230545043945, "grad_norm_var": 1.08101132512232e-05, "learning_rate": 0.006399572514376113, "loss": 2.7613, "step": 3372 }, { "crossentropy": 2.7773940563201904, "epoch": 0.2869173188159238, "grad_norm": 0.0391472689807415, "grad_norm_var": 1.0892577779874195e-05, "learning_rate": 0.006397648928034465, "loss": 2.7774, "step": 3373 }, { "crossentropy": 2.7559919357299805, "epoch": 0.28700238176250426, "grad_norm": 0.036660388112068176, "grad_norm_var": 1.1570160378292631e-05, "learning_rate": 0.00639572511727067, "loss": 2.756, "step": 3374 }, { "crossentropy": 2.7359488010406494, "epoch": 0.2870874447090847, "grad_norm": 0.03879028931260109, "grad_norm_var": 1.1477082741217134e-05, "learning_rate": 0.00639380108239364, "loss": 2.7359, "step": 3375 }, { "crossentropy": 2.7407338619232178, "epoch": 0.2871725076556652, "grad_norm": 0.03962139040231705, "grad_norm_var": 1.141924992338871e-05, "learning_rate": 0.006391876823712317, "loss": 2.7407, "step": 3376 }, { "crossentropy": 2.6855435371398926, "epoch": 0.28725757060224566, "grad_norm": 0.041553936898708344, "grad_norm_var": 1.1514690569318306e-05, "learning_rate": 0.006389952341535681, "loss": 2.6855, "step": 3377 }, { "crossentropy": 2.7524192333221436, "epoch": 0.2873426335488261, "grad_norm": 0.03450215980410576, "grad_norm_var": 1.2928198001664557e-05, "learning_rate": 0.006388027636172751, "loss": 2.7524, "step": 3378 }, { "crossentropy": 2.737384796142578, "epoch": 0.2874276964954066, "grad_norm": 0.038145799189805984, "grad_norm_var": 1.2612927893410989e-05, "learning_rate": 0.0063861027079325765, "loss": 2.7374, "step": 3379 }, { "crossentropy": 2.724973201751709, "epoch": 0.28751275944198706, "grad_norm": 0.03961776942014694, "grad_norm_var": 8.816911617386993e-06, "learning_rate": 0.0063841775571242465, "loss": 2.725, "step": 3380 }, { "crossentropy": 2.8253469467163086, "epoch": 0.2875978223885675, "grad_norm": 0.03956224396824837, "grad_norm_var": 8.827669417293728e-06, "learning_rate": 0.006382252184056887, "loss": 2.8253, "step": 3381 }, { "crossentropy": 2.746995449066162, "epoch": 0.287682885335148, "grad_norm": 0.03717925027012825, "grad_norm_var": 8.752830445513058e-06, "learning_rate": 0.006380326589039653, "loss": 2.747, "step": 3382 }, { "crossentropy": 2.715297222137451, "epoch": 0.28776794828172847, "grad_norm": 0.03655291348695755, "grad_norm_var": 9.010560528562491e-06, "learning_rate": 0.006378400772381744, "loss": 2.7153, "step": 3383 }, { "crossentropy": 2.744431257247925, "epoch": 0.287853011228309, "grad_norm": 0.03647782653570175, "grad_norm_var": 9.296155834606577e-06, "learning_rate": 0.006376474734392388, "loss": 2.7444, "step": 3384 }, { "crossentropy": 2.6702523231506348, "epoch": 0.2879380741748894, "grad_norm": 0.03879593312740326, "grad_norm_var": 9.292009668572753e-06, "learning_rate": 0.006374548475380852, "loss": 2.6703, "step": 3385 }, { "crossentropy": 2.727024793624878, "epoch": 0.2880231371214699, "grad_norm": 0.04159041866660118, "grad_norm_var": 4.115022612180957e-06, "learning_rate": 0.006372621995656437, "loss": 2.727, "step": 3386 }, { "crossentropy": 2.805691957473755, "epoch": 0.2881082000680504, "grad_norm": 0.035812944173812866, "grad_norm_var": 4.040917879288321e-06, "learning_rate": 0.006370695295528481, "loss": 2.8057, "step": 3387 }, { "crossentropy": 2.7791807651519775, "epoch": 0.28819326301463083, "grad_norm": 0.04266025498509407, "grad_norm_var": 5.114682842510821e-06, "learning_rate": 0.006368768375306357, "loss": 2.7792, "step": 3388 }, { "crossentropy": 2.7219536304473877, "epoch": 0.2882783259612113, "grad_norm": 0.036993298679590225, "grad_norm_var": 5.230804474607581e-06, "learning_rate": 0.006366841235299471, "loss": 2.722, "step": 3389 }, { "crossentropy": 2.7285587787628174, "epoch": 0.2883633889077918, "grad_norm": 0.0378357470035553, "grad_norm_var": 5.043379634493001e-06, "learning_rate": 0.006364913875817267, "loss": 2.7286, "step": 3390 }, { "crossentropy": 2.718163013458252, "epoch": 0.28844845185437223, "grad_norm": 0.0396147184073925, "grad_norm_var": 5.11988439615696e-06, "learning_rate": 0.006362986297169224, "loss": 2.7182, "step": 3391 }, { "crossentropy": 2.863860845565796, "epoch": 0.2885335148009527, "grad_norm": 0.04130022972822189, "grad_norm_var": 5.539831172884924e-06, "learning_rate": 0.006361058499664856, "loss": 2.8639, "step": 3392 }, { "crossentropy": 2.7133655548095703, "epoch": 0.2886185777475332, "grad_norm": 0.03891792148351669, "grad_norm_var": 4.9489808294186e-06, "learning_rate": 0.00635913048361371, "loss": 2.7134, "step": 3393 }, { "crossentropy": 2.840486526489258, "epoch": 0.28870364069411364, "grad_norm": 0.0367753691971302, "grad_norm_var": 4.068570589552817e-06, "learning_rate": 0.006357202249325371, "loss": 2.8405, "step": 3394 }, { "crossentropy": 2.7130048274993896, "epoch": 0.2887887036406941, "grad_norm": 0.03938737139105797, "grad_norm_var": 4.0873177113126545e-06, "learning_rate": 0.006355273797109459, "loss": 2.713, "step": 3395 }, { "crossentropy": 2.7582168579101562, "epoch": 0.2888737665872746, "grad_norm": 0.03937627375125885, "grad_norm_var": 4.061157929080341e-06, "learning_rate": 0.006353345127275625, "loss": 2.7582, "step": 3396 }, { "crossentropy": 2.7427523136138916, "epoch": 0.28895882953385504, "grad_norm": 0.041116904467344284, "grad_norm_var": 4.395709757580117e-06, "learning_rate": 0.006351416240133559, "loss": 2.7428, "step": 3397 }, { "crossentropy": 2.732196807861328, "epoch": 0.28904389248043555, "grad_norm": 0.038371190428733826, "grad_norm_var": 4.231025148467421e-06, "learning_rate": 0.006349487135992986, "loss": 2.7322, "step": 3398 }, { "crossentropy": 2.7099034786224365, "epoch": 0.289128955427016, "grad_norm": 0.038707103580236435, "grad_norm_var": 3.8616484346692955e-06, "learning_rate": 0.006347557815163663, "loss": 2.7099, "step": 3399 }, { "crossentropy": 2.742872714996338, "epoch": 0.28921401837359645, "grad_norm": 0.03772900253534317, "grad_norm_var": 3.5415093952291993e-06, "learning_rate": 0.006345628277955384, "loss": 2.7429, "step": 3400 }, { "crossentropy": 2.7134621143341064, "epoch": 0.28929908132017695, "grad_norm": 0.04036007449030876, "grad_norm_var": 3.639024597848426e-06, "learning_rate": 0.006343698524677979, "loss": 2.7135, "step": 3401 }, { "crossentropy": 2.7101500034332275, "epoch": 0.2893841442667574, "grad_norm": 0.04306628555059433, "grad_norm_var": 4.253561760702897e-06, "learning_rate": 0.006341768555641305, "loss": 2.7102, "step": 3402 }, { "crossentropy": 2.7632980346679688, "epoch": 0.28946920721333785, "grad_norm": 0.04121417552232742, "grad_norm_var": 3.600537228972319e-06, "learning_rate": 0.006339838371155266, "loss": 2.7633, "step": 3403 }, { "crossentropy": 2.7988624572753906, "epoch": 0.28955427015991836, "grad_norm": 0.04488606005907059, "grad_norm_var": 4.821608265249908e-06, "learning_rate": 0.006337907971529789, "loss": 2.7989, "step": 3404 }, { "crossentropy": 2.7503719329833984, "epoch": 0.2896393331064988, "grad_norm": 0.03659302741289139, "grad_norm_var": 4.977583910734801e-06, "learning_rate": 0.006335977357074845, "loss": 2.7504, "step": 3405 }, { "crossentropy": 2.7918148040771484, "epoch": 0.28972439605307926, "grad_norm": 0.03709089010953903, "grad_norm_var": 5.197725929133775e-06, "learning_rate": 0.006334046528100431, "loss": 2.7918, "step": 3406 }, { "crossentropy": 2.761821985244751, "epoch": 0.28980945899965976, "grad_norm": 0.03886106610298157, "grad_norm_var": 5.237440239481452e-06, "learning_rate": 0.006332115484916585, "loss": 2.7618, "step": 3407 }, { "crossentropy": 2.6742312908172607, "epoch": 0.2898945219462402, "grad_norm": 0.0498664528131485, "grad_norm_var": 1.1754722914344e-05, "learning_rate": 0.006330184227833375, "loss": 2.6742, "step": 3408 }, { "crossentropy": 2.667529344558716, "epoch": 0.28997958489282066, "grad_norm": 0.06558865308761597, "grad_norm_var": 5.184929199830429e-05, "learning_rate": 0.006328252757160908, "loss": 2.6675, "step": 3409 }, { "crossentropy": 2.743492364883423, "epoch": 0.29006464783940117, "grad_norm": 0.0390096940100193, "grad_norm_var": 5.066088149345138e-05, "learning_rate": 0.00632632107320932, "loss": 2.7435, "step": 3410 }, { "crossentropy": 2.740050792694092, "epoch": 0.2901497107859816, "grad_norm": 0.03776910528540611, "grad_norm_var": 5.137781759532578e-05, "learning_rate": 0.006324389176288786, "loss": 2.7401, "step": 3411 }, { "crossentropy": 2.72074556350708, "epoch": 0.2902347737325621, "grad_norm": 0.03669026494026184, "grad_norm_var": 5.2714792924321516e-05, "learning_rate": 0.00632245706670951, "loss": 2.7207, "step": 3412 }, { "crossentropy": 2.6775877475738525, "epoch": 0.2903198366791426, "grad_norm": 0.04135722666978836, "grad_norm_var": 5.27002793410114e-05, "learning_rate": 0.006320524744781736, "loss": 2.6776, "step": 3413 }, { "crossentropy": 2.7874345779418945, "epoch": 0.290404899625723, "grad_norm": 0.042010027915239334, "grad_norm_var": 5.191398888771028e-05, "learning_rate": 0.006318592210815738, "loss": 2.7874, "step": 3414 }, { "crossentropy": 2.785280704498291, "epoch": 0.29048996257230353, "grad_norm": 0.042212024331092834, "grad_norm_var": 5.117799796382656e-05, "learning_rate": 0.0063166594651218235, "loss": 2.7853, "step": 3415 }, { "crossentropy": 2.6413767337799072, "epoch": 0.290575025518884, "grad_norm": 0.04303910210728645, "grad_norm_var": 4.98144421211941e-05, "learning_rate": 0.00631472650801034, "loss": 2.6414, "step": 3416 }, { "crossentropy": 2.6947126388549805, "epoch": 0.29066008846546443, "grad_norm": 0.04130445793271065, "grad_norm_var": 4.96037654211268e-05, "learning_rate": 0.0063127933397916615, "loss": 2.6947, "step": 3417 }, { "crossentropy": 2.7450594902038574, "epoch": 0.29074515141204493, "grad_norm": 0.04542578011751175, "grad_norm_var": 5.011888760503933e-05, "learning_rate": 0.0063108599607761984, "loss": 2.7451, "step": 3418 }, { "crossentropy": 2.6721794605255127, "epoch": 0.2908302143586254, "grad_norm": 0.03952263668179512, "grad_norm_var": 5.062885472736714e-05, "learning_rate": 0.0063089263712743995, "loss": 2.6722, "step": 3419 }, { "crossentropy": 2.7893593311309814, "epoch": 0.29091527730520583, "grad_norm": 0.03791343793272972, "grad_norm_var": 5.152043082052114e-05, "learning_rate": 0.006306992571596741, "loss": 2.7894, "step": 3420 }, { "crossentropy": 2.7676637172698975, "epoch": 0.29100034025178634, "grad_norm": 0.03818754106760025, "grad_norm_var": 5.0499854962152085e-05, "learning_rate": 0.006305058562053734, "loss": 2.7677, "step": 3421 }, { "crossentropy": 2.7299022674560547, "epoch": 0.2910854031983668, "grad_norm": 0.038353919982910156, "grad_norm_var": 4.973233912320124e-05, "learning_rate": 0.006303124342955927, "loss": 2.7299, "step": 3422 }, { "crossentropy": 2.811458110809326, "epoch": 0.29117046614494724, "grad_norm": 0.0372619703412056, "grad_norm_var": 5.0629532475903775e-05, "learning_rate": 0.006301189914613901, "loss": 2.8115, "step": 3423 }, { "crossentropy": 2.8507838249206543, "epoch": 0.29125552909152774, "grad_norm": 0.047908179461956024, "grad_norm_var": 4.887257126265498e-05, "learning_rate": 0.006299255277338265, "loss": 2.8508, "step": 3424 }, { "crossentropy": 2.8071141242980957, "epoch": 0.2913405920381082, "grad_norm": 0.04015914350748062, "grad_norm_var": 9.63841809823168e-06, "learning_rate": 0.006297320431439669, "loss": 2.8071, "step": 3425 }, { "crossentropy": 2.7208781242370605, "epoch": 0.29142565498468864, "grad_norm": 0.0408271923661232, "grad_norm_var": 9.481838033189213e-06, "learning_rate": 0.006295385377228794, "loss": 2.7209, "step": 3426 }, { "crossentropy": 2.7658450603485107, "epoch": 0.29151071793126915, "grad_norm": 0.03741629049181938, "grad_norm_var": 9.623794350300622e-06, "learning_rate": 0.006293450115016352, "loss": 2.7658, "step": 3427 }, { "crossentropy": 2.7284889221191406, "epoch": 0.2915957808778496, "grad_norm": 0.039097536355257034, "grad_norm_var": 8.731290023851884e-06, "learning_rate": 0.006291514645113095, "loss": 2.7285, "step": 3428 }, { "crossentropy": 2.7824511528015137, "epoch": 0.2916808438244301, "grad_norm": 0.0437375046312809, "grad_norm_var": 9.278183564331854e-06, "learning_rate": 0.006289578967829795, "loss": 2.7825, "step": 3429 }, { "crossentropy": 2.7834253311157227, "epoch": 0.29176590677101055, "grad_norm": 0.034954193979501724, "grad_norm_var": 1.1344076064200477e-05, "learning_rate": 0.0062876430834772744, "loss": 2.7834, "step": 3430 }, { "crossentropy": 2.7429463863372803, "epoch": 0.291850969717591, "grad_norm": 0.03407173231244087, "grad_norm_var": 1.3581347961923116e-05, "learning_rate": 0.006285706992366377, "loss": 2.7429, "step": 3431 }, { "crossentropy": 2.830042600631714, "epoch": 0.2919360326641715, "grad_norm": 0.0360216423869133, "grad_norm_var": 1.376765762607345e-05, "learning_rate": 0.006283770694807982, "loss": 2.83, "step": 3432 }, { "crossentropy": 2.7850821018218994, "epoch": 0.29202109561075196, "grad_norm": 0.04594522342085838, "grad_norm_var": 1.622393391563781e-05, "learning_rate": 0.0062818341911130065, "loss": 2.7851, "step": 3433 }, { "crossentropy": 2.807988166809082, "epoch": 0.2921061585573324, "grad_norm": 0.0399354062974453, "grad_norm_var": 1.3989774841128138e-05, "learning_rate": 0.006279897481592394, "loss": 2.808, "step": 3434 }, { "crossentropy": 2.7720048427581787, "epoch": 0.2921912215039129, "grad_norm": 0.03860722854733467, "grad_norm_var": 1.40341486807346e-05, "learning_rate": 0.006277960566557124, "loss": 2.772, "step": 3435 }, { "crossentropy": 2.707972526550293, "epoch": 0.29227628445049336, "grad_norm": 0.037586282938718796, "grad_norm_var": 1.4105677844847228e-05, "learning_rate": 0.006276023446318213, "loss": 2.708, "step": 3436 }, { "crossentropy": 2.736973762512207, "epoch": 0.2923613473970738, "grad_norm": 0.04308526962995529, "grad_norm_var": 1.4826567704522877e-05, "learning_rate": 0.006274086121186703, "loss": 2.737, "step": 3437 }, { "crossentropy": 2.841036319732666, "epoch": 0.2924464103436543, "grad_norm": 0.040135692805051804, "grad_norm_var": 1.4708633634972583e-05, "learning_rate": 0.006272148591473674, "loss": 2.841, "step": 3438 }, { "crossentropy": 2.7290725708007812, "epoch": 0.29253147329023477, "grad_norm": 0.037605032324790955, "grad_norm_var": 1.4600037366535358e-05, "learning_rate": 0.006270210857490237, "loss": 2.7291, "step": 3439 }, { "crossentropy": 2.7728981971740723, "epoch": 0.2926165362368152, "grad_norm": 0.0433788038790226, "grad_norm_var": 1.0996654761147711e-05, "learning_rate": 0.006268272919547536, "loss": 2.7729, "step": 3440 }, { "crossentropy": 2.6968019008636475, "epoch": 0.2927015991833957, "grad_norm": 0.04688224568963051, "grad_norm_var": 1.4380917982343792e-05, "learning_rate": 0.00626633477795675, "loss": 2.6968, "step": 3441 }, { "crossentropy": 2.7442715167999268, "epoch": 0.29278666212997617, "grad_norm": 0.044888898730278015, "grad_norm_var": 1.588410803764091e-05, "learning_rate": 0.0062643964330290885, "loss": 2.7443, "step": 3442 }, { "crossentropy": 2.6431050300598145, "epoch": 0.2928717250765567, "grad_norm": 0.03741132467985153, "grad_norm_var": 1.588595886115035e-05, "learning_rate": 0.0062624578850757895, "loss": 2.6431, "step": 3443 }, { "crossentropy": 2.817657709121704, "epoch": 0.2929567880231371, "grad_norm": 0.035246025770902634, "grad_norm_var": 1.7383868018652713e-05, "learning_rate": 0.006260519134408134, "loss": 2.8177, "step": 3444 }, { "crossentropy": 2.766749382019043, "epoch": 0.2930418509697176, "grad_norm": 0.038166239857673645, "grad_norm_var": 1.6523893146720933e-05, "learning_rate": 0.006258580181337424, "loss": 2.7667, "step": 3445 }, { "crossentropy": 2.797311305999756, "epoch": 0.2931269139162981, "grad_norm": 0.04171764850616455, "grad_norm_var": 1.5175247315400187e-05, "learning_rate": 0.0062566410261750026, "loss": 2.7973, "step": 3446 }, { "crossentropy": 2.803234577178955, "epoch": 0.29321197686287853, "grad_norm": 0.0388067364692688, "grad_norm_var": 1.2806780495602799e-05, "learning_rate": 0.006254701669232243, "loss": 2.8032, "step": 3447 }, { "crossentropy": 2.7018070220947266, "epoch": 0.293297039809459, "grad_norm": 0.03664255142211914, "grad_norm_var": 1.2473473390021279e-05, "learning_rate": 0.006252762110820548, "loss": 2.7018, "step": 3448 }, { "crossentropy": 2.624894618988037, "epoch": 0.2933821027560395, "grad_norm": 0.04052259400486946, "grad_norm_var": 1.0285747753353372e-05, "learning_rate": 0.006250822351251355, "loss": 2.6249, "step": 3449 }, { "crossentropy": 2.78865909576416, "epoch": 0.29346716570261994, "grad_norm": 0.03977733477950096, "grad_norm_var": 1.0289484850118751e-05, "learning_rate": 0.006248882390836135, "loss": 2.7887, "step": 3450 }, { "crossentropy": 2.6030750274658203, "epoch": 0.2935522286492004, "grad_norm": 0.037715256214141846, "grad_norm_var": 1.0508271137508497e-05, "learning_rate": 0.006246942229886388, "loss": 2.6031, "step": 3451 }, { "crossentropy": 2.77608323097229, "epoch": 0.2936372915957809, "grad_norm": 0.03871220350265503, "grad_norm_var": 1.0229202282519926e-05, "learning_rate": 0.006245001868713649, "loss": 2.7761, "step": 3452 }, { "crossentropy": 2.7782962322235107, "epoch": 0.29372235454236134, "grad_norm": 0.04133202135562897, "grad_norm_var": 9.710224813104749e-06, "learning_rate": 0.006243061307629483, "loss": 2.7783, "step": 3453 }, { "crossentropy": 2.8038599491119385, "epoch": 0.2938074174889418, "grad_norm": 0.040287312120199203, "grad_norm_var": 9.715743275736053e-06, "learning_rate": 0.006241120546945489, "loss": 2.8039, "step": 3454 }, { "crossentropy": 2.714796543121338, "epoch": 0.2938924804355223, "grad_norm": 0.03710014745593071, "grad_norm_var": 9.889080132765606e-06, "learning_rate": 0.006239179586973301, "loss": 2.7148, "step": 3455 }, { "crossentropy": 2.777644634246826, "epoch": 0.29397754338210275, "grad_norm": 0.036125313490629196, "grad_norm_var": 9.824261751053125e-06, "learning_rate": 0.006237238428024572, "loss": 2.7776, "step": 3456 }, { "crossentropy": 2.7169673442840576, "epoch": 0.29406260632868325, "grad_norm": 0.03585519641637802, "grad_norm_var": 6.5088684333164394e-06, "learning_rate": 0.006235297070411003, "loss": 2.717, "step": 3457 }, { "crossentropy": 2.67775559425354, "epoch": 0.2941476692752637, "grad_norm": 0.03491496667265892, "grad_norm_var": 4.587965213794359e-06, "learning_rate": 0.006233355514444321, "loss": 2.6778, "step": 3458 }, { "crossentropy": 2.733525276184082, "epoch": 0.29423273222184415, "grad_norm": 0.03569106012582779, "grad_norm_var": 4.941388720951176e-06, "learning_rate": 0.006231413760436281, "loss": 2.7335, "step": 3459 }, { "crossentropy": 2.720720052719116, "epoch": 0.29431779516842466, "grad_norm": 0.039290279150009155, "grad_norm_var": 4.457955675172635e-06, "learning_rate": 0.006229471808698673, "loss": 2.7207, "step": 3460 }, { "crossentropy": 2.7015981674194336, "epoch": 0.2944028581150051, "grad_norm": 0.041233744472265244, "grad_norm_var": 4.995005711989851e-06, "learning_rate": 0.006227529659543318, "loss": 2.7016, "step": 3461 }, { "crossentropy": 2.8086423873901367, "epoch": 0.29448792106158556, "grad_norm": 0.04181591048836708, "grad_norm_var": 5.037991214324837e-06, "learning_rate": 0.006225587313282071, "loss": 2.8086, "step": 3462 }, { "crossentropy": 2.6596336364746094, "epoch": 0.29457298400816606, "grad_norm": 0.034927722066640854, "grad_norm_var": 5.814035002716228e-06, "learning_rate": 0.006223644770226818, "loss": 2.6596, "step": 3463 }, { "crossentropy": 2.7903881072998047, "epoch": 0.2946580469547465, "grad_norm": 0.03583679720759392, "grad_norm_var": 6.026928343536714e-06, "learning_rate": 0.00622170203068947, "loss": 2.7904, "step": 3464 }, { "crossentropy": 2.8197383880615234, "epoch": 0.29474310990132696, "grad_norm": 0.0378209687769413, "grad_norm_var": 5.645065879679676e-06, "learning_rate": 0.006219759094981979, "loss": 2.8197, "step": 3465 }, { "crossentropy": 2.7546966075897217, "epoch": 0.29482817284790747, "grad_norm": 0.03722598776221275, "grad_norm_var": 5.456563441959504e-06, "learning_rate": 0.0062178159634163266, "loss": 2.7547, "step": 3466 }, { "crossentropy": 2.7848193645477295, "epoch": 0.2949132357944879, "grad_norm": 0.041958827525377274, "grad_norm_var": 6.49574322318282e-06, "learning_rate": 0.006215872636304519, "loss": 2.7848, "step": 3467 }, { "crossentropy": 2.7332265377044678, "epoch": 0.29499829874106837, "grad_norm": 0.037244319915771484, "grad_norm_var": 6.517056029852325e-06, "learning_rate": 0.0062139291139586, "loss": 2.7332, "step": 3468 }, { "crossentropy": 2.6427907943725586, "epoch": 0.29508336168764887, "grad_norm": 0.03852619603276253, "grad_norm_var": 5.777999844872964e-06, "learning_rate": 0.0062119853966906445, "loss": 2.6428, "step": 3469 }, { "crossentropy": 2.863927125930786, "epoch": 0.2951684246342293, "grad_norm": 0.0383780412375927, "grad_norm_var": 5.389420060811223e-06, "learning_rate": 0.006210041484812759, "loss": 2.8639, "step": 3470 }, { "crossentropy": 2.6659417152404785, "epoch": 0.2952534875808098, "grad_norm": 0.04079461097717285, "grad_norm_var": 5.924050707704388e-06, "learning_rate": 0.006208097378637076, "loss": 2.6659, "step": 3471 }, { "crossentropy": 2.7382733821868896, "epoch": 0.2953385505273903, "grad_norm": 0.03924545273184776, "grad_norm_var": 5.761962538125958e-06, "learning_rate": 0.006206153078475762, "loss": 2.7383, "step": 3472 }, { "crossentropy": 2.732343912124634, "epoch": 0.2954236134739707, "grad_norm": 0.040826525539159775, "grad_norm_var": 5.770580731697071e-06, "learning_rate": 0.0062042085846410215, "loss": 2.7323, "step": 3473 }, { "crossentropy": 2.740983009338379, "epoch": 0.29550867642055123, "grad_norm": 0.036717623472213745, "grad_norm_var": 5.1160357785369365e-06, "learning_rate": 0.00620226389744508, "loss": 2.741, "step": 3474 }, { "crossentropy": 2.7422618865966797, "epoch": 0.2955937393671317, "grad_norm": 0.03496674448251724, "grad_norm_var": 5.429359471705435e-06, "learning_rate": 0.006200319017200198, "loss": 2.7423, "step": 3475 }, { "crossentropy": 2.771364688873291, "epoch": 0.29567880231371213, "grad_norm": 0.03570966795086861, "grad_norm_var": 5.8775287441831244e-06, "learning_rate": 0.006198373944218669, "loss": 2.7714, "step": 3476 }, { "crossentropy": 2.761214256286621, "epoch": 0.29576386526029264, "grad_norm": 0.03845404088497162, "grad_norm_var": 5.283066106545551e-06, "learning_rate": 0.006196428678812815, "loss": 2.7612, "step": 3477 }, { "crossentropy": 2.807920455932617, "epoch": 0.2958489282068731, "grad_norm": 0.041371025145053864, "grad_norm_var": 5.078164927764759e-06, "learning_rate": 0.006194483221294989, "loss": 2.8079, "step": 3478 }, { "crossentropy": 2.7473225593566895, "epoch": 0.29593399115345354, "grad_norm": 0.039936505258083344, "grad_norm_var": 4.510706326797155e-06, "learning_rate": 0.006192537571977575, "loss": 2.7473, "step": 3479 }, { "crossentropy": 2.8398845195770264, "epoch": 0.29601905410003404, "grad_norm": 0.03777410835027695, "grad_norm_var": 4.073281715543294e-06, "learning_rate": 0.006190591731172991, "loss": 2.8399, "step": 3480 }, { "crossentropy": 2.78056001663208, "epoch": 0.2961041170466145, "grad_norm": 0.041443146765232086, "grad_norm_var": 4.5366545486937835e-06, "learning_rate": 0.00618864569919368, "loss": 2.7806, "step": 3481 }, { "crossentropy": 2.7575795650482178, "epoch": 0.29618917999319494, "grad_norm": 0.06670588254928589, "grad_norm_var": 5.2722080848453116e-05, "learning_rate": 0.006186699476352121, "loss": 2.7576, "step": 3482 }, { "crossentropy": 2.6858811378479004, "epoch": 0.29627424293977545, "grad_norm": 0.04038560017943382, "grad_norm_var": 5.259767374677371e-05, "learning_rate": 0.006184753062960818, "loss": 2.6859, "step": 3483 }, { "crossentropy": 2.7809603214263916, "epoch": 0.2963593058863559, "grad_norm": 0.03517851606011391, "grad_norm_var": 5.3769396002107706e-05, "learning_rate": 0.006182806459332313, "loss": 2.781, "step": 3484 }, { "crossentropy": 2.8278346061706543, "epoch": 0.2964443688329364, "grad_norm": 0.041317712515592575, "grad_norm_var": 5.355867890448149e-05, "learning_rate": 0.006180859665779173, "loss": 2.8278, "step": 3485 }, { "crossentropy": 2.764570474624634, "epoch": 0.29652943177951685, "grad_norm": 0.03927234932780266, "grad_norm_var": 5.334665907291604e-05, "learning_rate": 0.006178912682613993, "loss": 2.7646, "step": 3486 }, { "crossentropy": 2.75567364692688, "epoch": 0.2966144947260973, "grad_norm": 0.036947570741176605, "grad_norm_var": 5.418783167849405e-05, "learning_rate": 0.006176965510149408, "loss": 2.7557, "step": 3487 }, { "crossentropy": 2.7806172370910645, "epoch": 0.2966995576726778, "grad_norm": 0.03794625401496887, "grad_norm_var": 5.4491727451958653e-05, "learning_rate": 0.006175018148698076, "loss": 2.7806, "step": 3488 }, { "crossentropy": 2.7913129329681396, "epoch": 0.29678462061925825, "grad_norm": 0.04345804452896118, "grad_norm_var": 5.510591362944308e-05, "learning_rate": 0.0061730705985726855, "loss": 2.7913, "step": 3489 }, { "crossentropy": 2.666496515274048, "epoch": 0.2968696835658387, "grad_norm": 0.04078616201877594, "grad_norm_var": 5.4102721278634765e-05, "learning_rate": 0.006171122860085959, "loss": 2.6665, "step": 3490 }, { "crossentropy": 2.7049596309661865, "epoch": 0.2969547465124192, "grad_norm": 0.036002349108457565, "grad_norm_var": 5.337418736733601e-05, "learning_rate": 0.006169174933550646, "loss": 2.705, "step": 3491 }, { "crossentropy": 2.845407485961914, "epoch": 0.29703980945899966, "grad_norm": 0.03915822505950928, "grad_norm_var": 5.178008983593435e-05, "learning_rate": 0.0061672268192795275, "loss": 2.8454, "step": 3492 }, { "crossentropy": 2.7672841548919678, "epoch": 0.2971248724055801, "grad_norm": 0.036107663065195084, "grad_norm_var": 5.292337556826732e-05, "learning_rate": 0.006165278517585415, "loss": 2.7673, "step": 3493 }, { "crossentropy": 2.7195241451263428, "epoch": 0.2972099353521606, "grad_norm": 0.03890155255794525, "grad_norm_var": 5.313689762589183e-05, "learning_rate": 0.006163330028781149, "loss": 2.7195, "step": 3494 }, { "crossentropy": 2.770967721939087, "epoch": 0.29729499829874106, "grad_norm": 0.03772227093577385, "grad_norm_var": 5.3670976763972066e-05, "learning_rate": 0.0061613813531796015, "loss": 2.771, "step": 3495 }, { "crossentropy": 2.6905908584594727, "epoch": 0.2973800612453215, "grad_norm": 0.03590560704469681, "grad_norm_var": 5.4585537056048233e-05, "learning_rate": 0.006159432491093673, "loss": 2.6906, "step": 3496 }, { "crossentropy": 2.6541075706481934, "epoch": 0.297465124191902, "grad_norm": 0.03846299275755882, "grad_norm_var": 5.4746954944383715e-05, "learning_rate": 0.006157483442836294, "loss": 2.6541, "step": 3497 }, { "crossentropy": 2.8048179149627686, "epoch": 0.29755018713848247, "grad_norm": 0.04165840148925781, "grad_norm_var": 5.6582210284918576e-06, "learning_rate": 0.006155534208720426, "loss": 2.8048, "step": 3498 }, { "crossentropy": 2.732174873352051, "epoch": 0.2976352500850629, "grad_norm": 0.04392610117793083, "grad_norm_var": 7.237051064949244e-06, "learning_rate": 0.006153584789059061, "loss": 2.7322, "step": 3499 }, { "crossentropy": 2.7300360202789307, "epoch": 0.2977203130316434, "grad_norm": 0.043188076466321945, "grad_norm_var": 7.248810942135873e-06, "learning_rate": 0.006151635184165219, "loss": 2.73, "step": 3500 }, { "crossentropy": 2.796647310256958, "epoch": 0.2978053759782239, "grad_norm": 0.04439404234290123, "grad_norm_var": 8.617637771618272e-06, "learning_rate": 0.006149685394351949, "loss": 2.7966, "step": 3501 }, { "crossentropy": 2.822676420211792, "epoch": 0.2978904389248044, "grad_norm": 0.038263484835624695, "grad_norm_var": 8.727322839842805e-06, "learning_rate": 0.006147735419932333, "loss": 2.8227, "step": 3502 }, { "crossentropy": 2.730719804763794, "epoch": 0.29797550187138483, "grad_norm": 0.03524360805749893, "grad_norm_var": 9.500458795450433e-06, "learning_rate": 0.006145785261219478, "loss": 2.7307, "step": 3503 }, { "crossentropy": 2.7712631225585938, "epoch": 0.2980605648179653, "grad_norm": 0.038743842393159866, "grad_norm_var": 9.380801540263365e-06, "learning_rate": 0.006143834918526527, "loss": 2.7713, "step": 3504 }, { "crossentropy": 2.8992953300476074, "epoch": 0.2981456277645458, "grad_norm": 0.03623107820749283, "grad_norm_var": 8.8264907413776e-06, "learning_rate": 0.006141884392166646, "loss": 2.8993, "step": 3505 }, { "crossentropy": 2.7224109172821045, "epoch": 0.29823069071112623, "grad_norm": 0.037771377712488174, "grad_norm_var": 8.694034874285641e-06, "learning_rate": 0.006139933682453035, "loss": 2.7224, "step": 3506 }, { "crossentropy": 2.8322360515594482, "epoch": 0.2983157536577067, "grad_norm": 0.037208229303359985, "grad_norm_var": 8.32625160438009e-06, "learning_rate": 0.00613798278969892, "loss": 2.8322, "step": 3507 }, { "crossentropy": 2.755704641342163, "epoch": 0.2984008166042872, "grad_norm": 0.04093635454773903, "grad_norm_var": 8.577872021908219e-06, "learning_rate": 0.00613603171421756, "loss": 2.7557, "step": 3508 }, { "crossentropy": 2.7251594066619873, "epoch": 0.29848587955086764, "grad_norm": 0.03714914619922638, "grad_norm_var": 8.23825348186998e-06, "learning_rate": 0.0061340804563222405, "loss": 2.7252, "step": 3509 }, { "crossentropy": 2.769479990005493, "epoch": 0.2985709424974481, "grad_norm": 0.03665963187813759, "grad_norm_var": 8.6136954244711e-06, "learning_rate": 0.006132129016326279, "loss": 2.7695, "step": 3510 }, { "crossentropy": 2.651505470275879, "epoch": 0.2986560054440286, "grad_norm": 0.03549303859472275, "grad_norm_var": 9.294115706826034e-06, "learning_rate": 0.006130177394543016, "loss": 2.6515, "step": 3511 }, { "crossentropy": 2.711287260055542, "epoch": 0.29874106839060904, "grad_norm": 0.0357632152736187, "grad_norm_var": 9.350850804507906e-06, "learning_rate": 0.006128225591285831, "loss": 2.7113, "step": 3512 }, { "crossentropy": 2.7751996517181396, "epoch": 0.2988261313371895, "grad_norm": 0.039781272411346436, "grad_norm_var": 9.397016521713273e-06, "learning_rate": 0.006126273606868125, "loss": 2.7752, "step": 3513 }, { "crossentropy": 2.745420455932617, "epoch": 0.29891119428377, "grad_norm": 0.03884873539209366, "grad_norm_var": 8.857302441273955e-06, "learning_rate": 0.006124321441603329, "loss": 2.7454, "step": 3514 }, { "crossentropy": 2.6410443782806396, "epoch": 0.29899625723035045, "grad_norm": 0.03790397569537163, "grad_norm_var": 6.947764591142917e-06, "learning_rate": 0.006122369095804907, "loss": 2.641, "step": 3515 }, { "crossentropy": 2.7783758640289307, "epoch": 0.29908132017693095, "grad_norm": 0.040596626698970795, "grad_norm_var": 5.695354907185995e-06, "learning_rate": 0.006120416569786349, "loss": 2.7784, "step": 3516 }, { "crossentropy": 2.7944910526275635, "epoch": 0.2991663831235114, "grad_norm": 0.03631848841905594, "grad_norm_var": 3.0875994894816645e-06, "learning_rate": 0.006118463863861174, "loss": 2.7945, "step": 3517 }, { "crossentropy": 2.8590219020843506, "epoch": 0.29925144607009185, "grad_norm": 0.03681556507945061, "grad_norm_var": 3.106371122638986e-06, "learning_rate": 0.00611651097834293, "loss": 2.859, "step": 3518 }, { "crossentropy": 2.704519510269165, "epoch": 0.29933650901667236, "grad_norm": 0.03634272515773773, "grad_norm_var": 2.8377919791224814e-06, "learning_rate": 0.006114557913545193, "loss": 2.7045, "step": 3519 }, { "crossentropy": 2.7072019577026367, "epoch": 0.2994215719632528, "grad_norm": 0.03740864247083664, "grad_norm_var": 2.7562983199141964e-06, "learning_rate": 0.006112604669781572, "loss": 2.7072, "step": 3520 }, { "crossentropy": 2.711954355239868, "epoch": 0.29950663490983326, "grad_norm": 0.03828169405460358, "grad_norm_var": 2.65118318992022e-06, "learning_rate": 0.006110651247365701, "loss": 2.712, "step": 3521 }, { "crossentropy": 2.667745351791382, "epoch": 0.29959169785641376, "grad_norm": 0.03744254633784294, "grad_norm_var": 2.6550275323074708e-06, "learning_rate": 0.00610869764661124, "loss": 2.6677, "step": 3522 }, { "crossentropy": 2.7059478759765625, "epoch": 0.2996767608029942, "grad_norm": 0.039268989115953445, "grad_norm_var": 2.789620562975749e-06, "learning_rate": 0.006106743867831884, "loss": 2.7059, "step": 3523 }, { "crossentropy": 2.706774950027466, "epoch": 0.29976182374957466, "grad_norm": 0.03775256872177124, "grad_norm_var": 2.097342637113105e-06, "learning_rate": 0.006104789911341354, "loss": 2.7068, "step": 3524 }, { "crossentropy": 2.8004515171051025, "epoch": 0.29984688669615517, "grad_norm": 0.03665545582771301, "grad_norm_var": 2.1431867254210572e-06, "learning_rate": 0.006102835777453397, "loss": 2.8005, "step": 3525 }, { "crossentropy": 2.6778969764709473, "epoch": 0.2999319496427356, "grad_norm": 0.03878505155444145, "grad_norm_var": 2.1637605486584758e-06, "learning_rate": 0.006100881466481792, "loss": 2.6779, "step": 3526 }, { "crossentropy": 2.749321937561035, "epoch": 0.30001701258931607, "grad_norm": 0.039784986525774, "grad_norm_var": 2.042857669925085e-06, "learning_rate": 0.006098926978740344, "loss": 2.7493, "step": 3527 }, { "crossentropy": 2.648672342300415, "epoch": 0.3001020755358966, "grad_norm": 0.03467150777578354, "grad_norm_var": 2.4406658802073046e-06, "learning_rate": 0.006096972314542889, "loss": 2.6487, "step": 3528 }, { "crossentropy": 2.7334132194519043, "epoch": 0.300187138482477, "grad_norm": 0.03542328625917435, "grad_norm_var": 2.543927228951942e-06, "learning_rate": 0.006095017474203288, "loss": 2.7334, "step": 3529 }, { "crossentropy": 2.6671254634857178, "epoch": 0.30027220142905753, "grad_norm": 0.04089352488517761, "grad_norm_var": 3.1337611123995653e-06, "learning_rate": 0.006093062458035433, "loss": 2.6671, "step": 3530 }, { "crossentropy": 2.7152349948883057, "epoch": 0.300357264375638, "grad_norm": 0.046302348375320435, "grad_norm_var": 7.690282228215752e-06, "learning_rate": 0.006091107266353244, "loss": 2.7152, "step": 3531 }, { "crossentropy": 2.700798273086548, "epoch": 0.3004423273222184, "grad_norm": 0.04371014982461929, "grad_norm_var": 9.25102503661849e-06, "learning_rate": 0.006089151899470667, "loss": 2.7008, "step": 3532 }, { "crossentropy": 2.7493388652801514, "epoch": 0.30052739026879893, "grad_norm": 0.04038836434483528, "grad_norm_var": 9.10730258326849e-06, "learning_rate": 0.006087196357701679, "loss": 2.7493, "step": 3533 }, { "crossentropy": 2.7454090118408203, "epoch": 0.3006124532153794, "grad_norm": 0.038848139345645905, "grad_norm_var": 8.84249104947567e-06, "learning_rate": 0.006085240641360281, "loss": 2.7454, "step": 3534 }, { "crossentropy": 2.8504116535186768, "epoch": 0.30069751616195983, "grad_norm": 0.036730486899614334, "grad_norm_var": 8.721095240347513e-06, "learning_rate": 0.006083284750760508, "loss": 2.8504, "step": 3535 }, { "crossentropy": 2.75291109085083, "epoch": 0.30078257910854034, "grad_norm": 0.03574511781334877, "grad_norm_var": 9.224115953440116e-06, "learning_rate": 0.0060813286862164175, "loss": 2.7529, "step": 3536 }, { "crossentropy": 2.702629804611206, "epoch": 0.3008676420551208, "grad_norm": 0.03709839656949043, "grad_norm_var": 9.392260987061326e-06, "learning_rate": 0.006079372448042098, "loss": 2.7026, "step": 3537 }, { "crossentropy": 2.7632639408111572, "epoch": 0.30095270500170124, "grad_norm": 0.041446711868047714, "grad_norm_var": 9.712963370345218e-06, "learning_rate": 0.006077416036551665, "loss": 2.7633, "step": 3538 }, { "crossentropy": 2.7256946563720703, "epoch": 0.30103776794828174, "grad_norm": 0.03881564363837242, "grad_norm_var": 9.707679443436006e-06, "learning_rate": 0.006075459452059261, "loss": 2.7257, "step": 3539 }, { "crossentropy": 2.731278896331787, "epoch": 0.3011228308948622, "grad_norm": 0.03761663660407066, "grad_norm_var": 9.730368928446454e-06, "learning_rate": 0.0060735026948790585, "loss": 2.7313, "step": 3540 }, { "crossentropy": 2.7575337886810303, "epoch": 0.30120789384144264, "grad_norm": 0.03671050816774368, "grad_norm_var": 9.713846059394327e-06, "learning_rate": 0.006071545765325254, "loss": 2.7575, "step": 3541 }, { "crossentropy": 2.7732062339782715, "epoch": 0.30129295678802315, "grad_norm": 0.036600615829229355, "grad_norm_var": 1.0055952422750348e-05, "learning_rate": 0.0060695886637120765, "loss": 2.7732, "step": 3542 }, { "crossentropy": 2.7883217334747314, "epoch": 0.3013780197346036, "grad_norm": 0.03697265684604645, "grad_norm_var": 1.0180611582014804e-05, "learning_rate": 0.006067631390353779, "loss": 2.7883, "step": 3543 }, { "crossentropy": 2.679680109024048, "epoch": 0.3014630826811841, "grad_norm": 0.038290902972221375, "grad_norm_var": 9.092244164735085e-06, "learning_rate": 0.006065673945564643, "loss": 2.6797, "step": 3544 }, { "crossentropy": 2.7525620460510254, "epoch": 0.30154814562776455, "grad_norm": 0.03566271439194679, "grad_norm_var": 8.986446461314336e-06, "learning_rate": 0.006063716329658978, "loss": 2.7526, "step": 3545 }, { "crossentropy": 2.7344558238983154, "epoch": 0.301633208574345, "grad_norm": 0.03758130222558975, "grad_norm_var": 8.776070367482771e-06, "learning_rate": 0.006061758542951121, "loss": 2.7345, "step": 3546 }, { "crossentropy": 2.773449420928955, "epoch": 0.3017182715209255, "grad_norm": 0.0369100458920002, "grad_norm_var": 4.715886824405851e-06, "learning_rate": 0.006059800585755436, "loss": 2.7734, "step": 3547 }, { "crossentropy": 2.665733814239502, "epoch": 0.30180333446750596, "grad_norm": 0.03814249485731125, "grad_norm_var": 2.466712352642425e-06, "learning_rate": 0.006057842458386315, "loss": 2.6657, "step": 3548 }, { "crossentropy": 2.778336763381958, "epoch": 0.3018883974140864, "grad_norm": 0.03707760572433472, "grad_norm_var": 1.9749984382347664e-06, "learning_rate": 0.006055884161158176, "loss": 2.7783, "step": 3549 }, { "crossentropy": 2.7046892642974854, "epoch": 0.3019734603606669, "grad_norm": 0.036311469972133636, "grad_norm_var": 1.92647978331095e-06, "learning_rate": 0.006053925694385464, "loss": 2.7047, "step": 3550 }, { "crossentropy": 2.7089314460754395, "epoch": 0.30205852330724736, "grad_norm": 0.035001661628484726, "grad_norm_var": 2.2577185187814057e-06, "learning_rate": 0.006051967058382655, "loss": 2.7089, "step": 3551 }, { "crossentropy": 2.67611026763916, "epoch": 0.3021435862538278, "grad_norm": 0.03865741565823555, "grad_norm_var": 2.203832122498785e-06, "learning_rate": 0.006050008253464246, "loss": 2.6761, "step": 3552 }, { "crossentropy": 2.7278311252593994, "epoch": 0.3022286492004083, "grad_norm": 0.034374937415122986, "grad_norm_var": 2.7882043309147842e-06, "learning_rate": 0.006048049279944769, "loss": 2.7278, "step": 3553 }, { "crossentropy": 2.820253849029541, "epoch": 0.30231371214698877, "grad_norm": 0.04037189111113548, "grad_norm_var": 2.260530838597147e-06, "learning_rate": 0.006046090138138777, "loss": 2.8203, "step": 3554 }, { "crossentropy": 2.7999398708343506, "epoch": 0.3023987750935692, "grad_norm": 0.03809274733066559, "grad_norm_var": 2.136854889025983e-06, "learning_rate": 0.0060441308283608495, "loss": 2.7999, "step": 3555 }, { "crossentropy": 2.7676315307617188, "epoch": 0.3024838380401497, "grad_norm": 0.046324845403432846, "grad_norm_var": 7.419990555677415e-06, "learning_rate": 0.006042171350925598, "loss": 2.7676, "step": 3556 }, { "crossentropy": 2.805283308029175, "epoch": 0.30256890098673017, "grad_norm": 0.04130368307232857, "grad_norm_var": 8.137028297483297e-06, "learning_rate": 0.006040211706147657, "loss": 2.8053, "step": 3557 }, { "crossentropy": 2.7447524070739746, "epoch": 0.3026539639333107, "grad_norm": 0.03729305416345596, "grad_norm_var": 8.039660795162937e-06, "learning_rate": 0.006038251894341687, "loss": 2.7448, "step": 3558 }, { "crossentropy": 2.6762731075286865, "epoch": 0.3027390268798911, "grad_norm": 0.03706928342580795, "grad_norm_var": 8.026711046060727e-06, "learning_rate": 0.0060362919158223815, "loss": 2.6763, "step": 3559 }, { "crossentropy": 2.741147518157959, "epoch": 0.3028240898264716, "grad_norm": 0.03783208504319191, "grad_norm_var": 8.023853918591913e-06, "learning_rate": 0.006034331770904454, "loss": 2.7411, "step": 3560 }, { "crossentropy": 2.745164155960083, "epoch": 0.3029091527730521, "grad_norm": 0.03904379531741142, "grad_norm_var": 7.684458194822986e-06, "learning_rate": 0.006032371459902648, "loss": 2.7452, "step": 3561 }, { "crossentropy": 2.7896928787231445, "epoch": 0.30299421571963253, "grad_norm": 0.03766946494579315, "grad_norm_var": 7.67753282049366e-06, "learning_rate": 0.006030410983131732, "loss": 2.7897, "step": 3562 }, { "crossentropy": 2.8220670223236084, "epoch": 0.303079278666213, "grad_norm": 0.03888355568051338, "grad_norm_var": 7.576975535624382e-06, "learning_rate": 0.006028450340906504, "loss": 2.8221, "step": 3563 }, { "crossentropy": 2.7415685653686523, "epoch": 0.3031643416127935, "grad_norm": 0.03698012977838516, "grad_norm_var": 7.692125339922584e-06, "learning_rate": 0.006026489533541783, "loss": 2.7416, "step": 3564 }, { "crossentropy": 2.783640146255493, "epoch": 0.30324940455937394, "grad_norm": 0.03985712304711342, "grad_norm_var": 7.733828426592129e-06, "learning_rate": 0.006024528561352422, "loss": 2.7836, "step": 3565 }, { "crossentropy": 2.6636171340942383, "epoch": 0.3033344675059544, "grad_norm": 0.037123050540685654, "grad_norm_var": 7.544481477887027e-06, "learning_rate": 0.006022567424653295, "loss": 2.6636, "step": 3566 }, { "crossentropy": 2.7226898670196533, "epoch": 0.3034195304525349, "grad_norm": 0.03801117464900017, "grad_norm_var": 6.70982339127819e-06, "learning_rate": 0.006020606123759304, "loss": 2.7227, "step": 3567 }, { "crossentropy": 2.706143617630005, "epoch": 0.30350459339911534, "grad_norm": 0.04189807549118996, "grad_norm_var": 7.356209795263367e-06, "learning_rate": 0.0060186446589853785, "loss": 2.7061, "step": 3568 }, { "crossentropy": 2.726962089538574, "epoch": 0.3035896563456958, "grad_norm": 0.042325522750616074, "grad_norm_var": 6.527990181433701e-06, "learning_rate": 0.006016683030646471, "loss": 2.727, "step": 3569 }, { "crossentropy": 2.8077125549316406, "epoch": 0.3036747192922763, "grad_norm": 0.037630531936883926, "grad_norm_var": 6.635118335196882e-06, "learning_rate": 0.006014721239057564, "loss": 2.8077, "step": 3570 }, { "crossentropy": 2.7200067043304443, "epoch": 0.30375978223885675, "grad_norm": 0.036570120602846146, "grad_norm_var": 7.006561421939635e-06, "learning_rate": 0.006012759284533665, "loss": 2.72, "step": 3571 }, { "crossentropy": 2.731796979904175, "epoch": 0.30384484518543725, "grad_norm": 0.03842581808567047, "grad_norm_var": 3.3111775262134025e-06, "learning_rate": 0.006010797167389808, "loss": 2.7318, "step": 3572 }, { "crossentropy": 2.737243175506592, "epoch": 0.3039299081320177, "grad_norm": 0.04183194413781166, "grad_norm_var": 3.5176590108362248e-06, "learning_rate": 0.006008834887941049, "loss": 2.7372, "step": 3573 }, { "crossentropy": 2.750361442565918, "epoch": 0.30401497107859815, "grad_norm": 0.038822367787361145, "grad_norm_var": 3.3865712070460864e-06, "learning_rate": 0.006006872446502478, "loss": 2.7504, "step": 3574 }, { "crossentropy": 2.770756721496582, "epoch": 0.30410003402517866, "grad_norm": 0.03802741691470146, "grad_norm_var": 3.22944124927782e-06, "learning_rate": 0.006004909843389203, "loss": 2.7708, "step": 3575 }, { "crossentropy": 2.8051295280456543, "epoch": 0.3041850969717591, "grad_norm": 0.041833121329545975, "grad_norm_var": 3.7091973792535702e-06, "learning_rate": 0.0060029470789163646, "loss": 2.8051, "step": 3576 }, { "crossentropy": 2.7301559448242188, "epoch": 0.30427015991833956, "grad_norm": 0.038988057523965836, "grad_norm_var": 3.7094995344671514e-06, "learning_rate": 0.006000984153399122, "loss": 2.7302, "step": 3577 }, { "crossentropy": 2.732999324798584, "epoch": 0.30435522286492006, "grad_norm": 0.03933620825409889, "grad_norm_var": 3.575250843867485e-06, "learning_rate": 0.00599902106715267, "loss": 2.733, "step": 3578 }, { "crossentropy": 2.741483688354492, "epoch": 0.3044402858115005, "grad_norm": 0.03519868478178978, "grad_norm_var": 4.559229884267858e-06, "learning_rate": 0.00599705782049222, "loss": 2.7415, "step": 3579 }, { "crossentropy": 2.8526933193206787, "epoch": 0.30452534875808096, "grad_norm": 0.03825288638472557, "grad_norm_var": 4.3297985718428186e-06, "learning_rate": 0.0059950944137330126, "loss": 2.8527, "step": 3580 }, { "crossentropy": 2.7843356132507324, "epoch": 0.30461041170466147, "grad_norm": 0.04215386137366295, "grad_norm_var": 4.91943641251121e-06, "learning_rate": 0.005993130847190317, "loss": 2.7843, "step": 3581 }, { "crossentropy": 2.736265182495117, "epoch": 0.3046954746512419, "grad_norm": 0.039904333651065826, "grad_norm_var": 4.650569529344948e-06, "learning_rate": 0.005991167121179422, "loss": 2.7363, "step": 3582 }, { "crossentropy": 2.7282164096832275, "epoch": 0.30478053759782237, "grad_norm": 0.03548013046383858, "grad_norm_var": 5.4945496437099875e-06, "learning_rate": 0.0059892032360156465, "loss": 2.7282, "step": 3583 }, { "crossentropy": 2.723188877105713, "epoch": 0.30486560054440287, "grad_norm": 0.03525198996067047, "grad_norm_var": 5.835466975127416e-06, "learning_rate": 0.005987239192014335, "loss": 2.7232, "step": 3584 }, { "crossentropy": 2.765479326248169, "epoch": 0.3049506634909833, "grad_norm": 0.036239564418792725, "grad_norm_var": 5.250673295467631e-06, "learning_rate": 0.005985274989490858, "loss": 2.7655, "step": 3585 }, { "crossentropy": 2.7920358180999756, "epoch": 0.30503572643756377, "grad_norm": 0.03865555301308632, "grad_norm_var": 5.215046394088594e-06, "learning_rate": 0.0059833106287606064, "loss": 2.792, "step": 3586 }, { "crossentropy": 2.7158634662628174, "epoch": 0.3051207893841443, "grad_norm": 0.03770240768790245, "grad_norm_var": 5.013518391799345e-06, "learning_rate": 0.005981346110139001, "loss": 2.7159, "step": 3587 }, { "crossentropy": 2.7300832271575928, "epoch": 0.3052058523307247, "grad_norm": 0.04318922758102417, "grad_norm_var": 6.380391243184363e-06, "learning_rate": 0.005979381433941487, "loss": 2.7301, "step": 3588 }, { "crossentropy": 2.7833399772644043, "epoch": 0.30529091527730523, "grad_norm": 0.037429749965667725, "grad_norm_var": 5.814456362168476e-06, "learning_rate": 0.005977416600483536, "loss": 2.7833, "step": 3589 }, { "crossentropy": 2.691570997238159, "epoch": 0.3053759782238857, "grad_norm": 0.03717096149921417, "grad_norm_var": 5.920328344838123e-06, "learning_rate": 0.005975451610080642, "loss": 2.6916, "step": 3590 }, { "crossentropy": 2.6912548542022705, "epoch": 0.30546104117046613, "grad_norm": 0.0357484333217144, "grad_norm_var": 6.366018931885699e-06, "learning_rate": 0.0059734864630483255, "loss": 2.6913, "step": 3591 }, { "crossentropy": 2.81150484085083, "epoch": 0.30554610411704664, "grad_norm": 0.03723837435245514, "grad_norm_var": 5.510853517880163e-06, "learning_rate": 0.005971521159702136, "loss": 2.8115, "step": 3592 }, { "crossentropy": 2.7358148097991943, "epoch": 0.3056311670636271, "grad_norm": 0.036692794412374496, "grad_norm_var": 5.536598282769531e-06, "learning_rate": 0.00596955570035764, "loss": 2.7358, "step": 3593 }, { "crossentropy": 2.752497673034668, "epoch": 0.30571623001020753, "grad_norm": 0.038534630089998245, "grad_norm_var": 5.418216311747712e-06, "learning_rate": 0.005967590085330435, "loss": 2.7525, "step": 3594 }, { "crossentropy": 2.7091991901397705, "epoch": 0.30580129295678804, "grad_norm": 0.038259997963905334, "grad_norm_var": 4.941039793540658e-06, "learning_rate": 0.005965624314936143, "loss": 2.7092, "step": 3595 }, { "crossentropy": 2.6724936962127686, "epoch": 0.3058863559033685, "grad_norm": 0.03376707807183266, "grad_norm_var": 6.043886097004819e-06, "learning_rate": 0.005963658389490411, "loss": 2.6725, "step": 3596 }, { "crossentropy": 2.679625988006592, "epoch": 0.30597141884994894, "grad_norm": 0.035515639930963516, "grad_norm_var": 4.868033961624718e-06, "learning_rate": 0.005961692309308909, "loss": 2.6796, "step": 3597 }, { "crossentropy": 2.7522215843200684, "epoch": 0.30605648179652944, "grad_norm": 0.039475396275520325, "grad_norm_var": 4.7305186345989365e-06, "learning_rate": 0.005959726074707331, "loss": 2.7522, "step": 3598 }, { "crossentropy": 2.7173099517822266, "epoch": 0.3061415447431099, "grad_norm": 0.03559887781739235, "grad_norm_var": 4.703029377844716e-06, "learning_rate": 0.005957759686001401, "loss": 2.7173, "step": 3599 }, { "crossentropy": 2.747560977935791, "epoch": 0.30622660768969034, "grad_norm": 0.036459002643823624, "grad_norm_var": 4.467800289960298e-06, "learning_rate": 0.005955793143506863, "loss": 2.7476, "step": 3600 }, { "crossentropy": 2.7550337314605713, "epoch": 0.30631167063627085, "grad_norm": 0.03814342990517616, "grad_norm_var": 4.411229034022764e-06, "learning_rate": 0.005953826447539484, "loss": 2.755, "step": 3601 }, { "crossentropy": 2.663018226623535, "epoch": 0.3063967335828513, "grad_norm": 0.034929025918245316, "grad_norm_var": 4.692012277653553e-06, "learning_rate": 0.005951859598415063, "loss": 2.663, "step": 3602 }, { "crossentropy": 2.7538819313049316, "epoch": 0.3064817965294318, "grad_norm": 0.038172993808984756, "grad_norm_var": 4.734807738237858e-06, "learning_rate": 0.0059498925964494185, "loss": 2.7539, "step": 3603 }, { "crossentropy": 2.708263635635376, "epoch": 0.30656685947601225, "grad_norm": 0.03720042109489441, "grad_norm_var": 2.2501527840277275e-06, "learning_rate": 0.005947925441958393, "loss": 2.7083, "step": 3604 }, { "crossentropy": 2.7216455936431885, "epoch": 0.3066519224225927, "grad_norm": 0.03705573454499245, "grad_norm_var": 2.2322808413784955e-06, "learning_rate": 0.005945958135257855, "loss": 2.7216, "step": 3605 }, { "crossentropy": 2.774585723876953, "epoch": 0.3067369853691732, "grad_norm": 0.03821069002151489, "grad_norm_var": 2.3411972236920982e-06, "learning_rate": 0.005943990676663698, "loss": 2.7746, "step": 3606 }, { "crossentropy": 2.7111494541168213, "epoch": 0.30682204831575366, "grad_norm": 0.03538423404097557, "grad_norm_var": 2.4072359060243903e-06, "learning_rate": 0.005942023066491838, "loss": 2.7111, "step": 3607 }, { "crossentropy": 2.7439568042755127, "epoch": 0.3069071112623341, "grad_norm": 0.03481109440326691, "grad_norm_var": 2.670776428246163e-06, "learning_rate": 0.005940055305058219, "loss": 2.744, "step": 3608 }, { "crossentropy": 2.8239285945892334, "epoch": 0.3069921742089146, "grad_norm": 0.038322411477565765, "grad_norm_var": 2.8214589381715673e-06, "learning_rate": 0.0059380873926788025, "loss": 2.8239, "step": 3609 }, { "crossentropy": 2.743762969970703, "epoch": 0.30707723715549506, "grad_norm": 0.03693515062332153, "grad_norm_var": 2.6252917462549557e-06, "learning_rate": 0.005936119329669583, "loss": 2.7438, "step": 3610 }, { "crossentropy": 2.7138705253601074, "epoch": 0.3071623001020755, "grad_norm": 0.03640728443861008, "grad_norm_var": 2.4705370952901942e-06, "learning_rate": 0.005934151116346573, "loss": 2.7139, "step": 3611 }, { "crossentropy": 2.8732621669769287, "epoch": 0.307247363048656, "grad_norm": 0.03552136942744255, "grad_norm_var": 1.988720525655738e-06, "learning_rate": 0.0059321827530258095, "loss": 2.8733, "step": 3612 }, { "crossentropy": 2.8220839500427246, "epoch": 0.30733242599523647, "grad_norm": 0.04285503551363945, "grad_norm_var": 4.138732204751692e-06, "learning_rate": 0.005930214240023356, "loss": 2.8221, "step": 3613 }, { "crossentropy": 2.6706595420837402, "epoch": 0.3074174889418169, "grad_norm": 0.04016270115971565, "grad_norm_var": 4.375159214669207e-06, "learning_rate": 0.005928245577655302, "loss": 2.6707, "step": 3614 }, { "crossentropy": 2.690887212753296, "epoch": 0.3075025518883974, "grad_norm": 0.043507833033800125, "grad_norm_var": 6.53231042861413e-06, "learning_rate": 0.0059262767662377515, "loss": 2.6909, "step": 3615 }, { "crossentropy": 2.802797317504883, "epoch": 0.3075876148349779, "grad_norm": 0.04593735560774803, "grad_norm_var": 1.050952823523979e-05, "learning_rate": 0.005924307806086844, "loss": 2.8028, "step": 3616 }, { "crossentropy": 2.707747459411621, "epoch": 0.3076726777815584, "grad_norm": 0.03863692656159401, "grad_norm_var": 1.051133500211109e-05, "learning_rate": 0.0059223386975187356, "loss": 2.7077, "step": 3617 }, { "crossentropy": 2.6947028636932373, "epoch": 0.30775774072813883, "grad_norm": 0.03705110773444176, "grad_norm_var": 9.816879600967516e-06, "learning_rate": 0.0059203694408496085, "loss": 2.6947, "step": 3618 }, { "crossentropy": 2.7281198501586914, "epoch": 0.3078428036747193, "grad_norm": 0.03549870103597641, "grad_norm_var": 1.0384311896121691e-05, "learning_rate": 0.005918400036395671, "loss": 2.7281, "step": 3619 }, { "crossentropy": 2.8157246112823486, "epoch": 0.3079278666212998, "grad_norm": 0.038077306002378464, "grad_norm_var": 1.0298708371730968e-05, "learning_rate": 0.0059164304844731485, "loss": 2.8157, "step": 3620 }, { "crossentropy": 2.6947860717773438, "epoch": 0.30801292956788023, "grad_norm": 0.0459967665374279, "grad_norm_var": 1.3694404809242582e-05, "learning_rate": 0.005914460785398298, "loss": 2.6948, "step": 3621 }, { "crossentropy": 2.719005823135376, "epoch": 0.3080979925144607, "grad_norm": 0.04230862855911255, "grad_norm_var": 1.4336060426948377e-05, "learning_rate": 0.005912490939487395, "loss": 2.719, "step": 3622 }, { "crossentropy": 2.7359135150909424, "epoch": 0.3081830554610412, "grad_norm": 0.04087573662400246, "grad_norm_var": 1.3417153931184292e-05, "learning_rate": 0.005910520947056739, "loss": 2.7359, "step": 3623 }, { "crossentropy": 2.8041329383850098, "epoch": 0.30826811840762164, "grad_norm": 0.04766537994146347, "grad_norm_var": 1.5610870933892528e-05, "learning_rate": 0.0059085508084226555, "loss": 2.8041, "step": 3624 }, { "crossentropy": 2.7395102977752686, "epoch": 0.3083531813542021, "grad_norm": 0.04398934543132782, "grad_norm_var": 1.6078435204492738e-05, "learning_rate": 0.005906580523901492, "loss": 2.7395, "step": 3625 }, { "crossentropy": 2.725874900817871, "epoch": 0.3084382443007826, "grad_norm": 0.03824213892221451, "grad_norm_var": 1.5526648640615954e-05, "learning_rate": 0.005904610093809617, "loss": 2.7259, "step": 3626 }, { "crossentropy": 2.76265025138855, "epoch": 0.30852330724736304, "grad_norm": 0.038604747503995895, "grad_norm_var": 1.4542622985641978e-05, "learning_rate": 0.005902639518463429, "loss": 2.7627, "step": 3627 }, { "crossentropy": 2.7912607192993164, "epoch": 0.3086083701939435, "grad_norm": 0.03802819177508354, "grad_norm_var": 1.3126519017438107e-05, "learning_rate": 0.0059006687981793425, "loss": 2.7913, "step": 3628 }, { "crossentropy": 2.7053651809692383, "epoch": 0.308693433140524, "grad_norm": 0.032633550465106964, "grad_norm_var": 1.72507580459944e-05, "learning_rate": 0.005898697933273798, "loss": 2.7054, "step": 3629 }, { "crossentropy": 2.750894784927368, "epoch": 0.30877849608710445, "grad_norm": 0.04881361871957779, "grad_norm_var": 2.1595586472497538e-05, "learning_rate": 0.005896726924063262, "loss": 2.7509, "step": 3630 }, { "crossentropy": 2.744710922241211, "epoch": 0.30886355903368495, "grad_norm": 0.036856044083833694, "grad_norm_var": 2.2129415985461197e-05, "learning_rate": 0.0058947557708642195, "loss": 2.7447, "step": 3631 }, { "crossentropy": 2.7866873741149902, "epoch": 0.3089486219802654, "grad_norm": 0.03726916387677193, "grad_norm_var": 2.062904545175644e-05, "learning_rate": 0.005892784473993183, "loss": 2.7867, "step": 3632 }, { "crossentropy": 2.7794270515441895, "epoch": 0.30903368492684585, "grad_norm": 0.03898646682500839, "grad_norm_var": 2.057156071014176e-05, "learning_rate": 0.005890813033766686, "loss": 2.7794, "step": 3633 }, { "crossentropy": 2.685098171234131, "epoch": 0.30911874787342636, "grad_norm": 0.0333331860601902, "grad_norm_var": 2.2925116136040165e-05, "learning_rate": 0.005888841450501282, "loss": 2.6851, "step": 3634 }, { "crossentropy": 2.8016884326934814, "epoch": 0.3092038108200068, "grad_norm": 0.04153743386268616, "grad_norm_var": 2.1721935879927497e-05, "learning_rate": 0.005886869724513554, "loss": 2.8017, "step": 3635 }, { "crossentropy": 2.764902114868164, "epoch": 0.30928887376658726, "grad_norm": 0.0395975336432457, "grad_norm_var": 2.1435891090496906e-05, "learning_rate": 0.005884897856120102, "loss": 2.7649, "step": 3636 }, { "crossentropy": 2.8197505474090576, "epoch": 0.30937393671316776, "grad_norm": 0.03529860079288483, "grad_norm_var": 2.045753592688192e-05, "learning_rate": 0.005882925845637551, "loss": 2.8198, "step": 3637 }, { "crossentropy": 2.7568116188049316, "epoch": 0.3094589996597482, "grad_norm": 0.03692498430609703, "grad_norm_var": 2.034443641986122e-05, "learning_rate": 0.005880953693382551, "loss": 2.7568, "step": 3638 }, { "crossentropy": 2.701356887817383, "epoch": 0.30954406260632866, "grad_norm": 0.04156065732240677, "grad_norm_var": 2.0518478013501624e-05, "learning_rate": 0.005878981399671773, "loss": 2.7014, "step": 3639 }, { "crossentropy": 2.751488447189331, "epoch": 0.30962912555290917, "grad_norm": 0.04042472317814827, "grad_norm_var": 1.5751705685383178e-05, "learning_rate": 0.0058770089648219085, "loss": 2.7515, "step": 3640 }, { "crossentropy": 2.78189754486084, "epoch": 0.3097141884994896, "grad_norm": 0.03839188814163208, "grad_norm_var": 1.3897631432121561e-05, "learning_rate": 0.005875036389149675, "loss": 2.7819, "step": 3641 }, { "crossentropy": 2.5892536640167236, "epoch": 0.30979925144607007, "grad_norm": 0.035170797258615494, "grad_norm_var": 1.4605672156434158e-05, "learning_rate": 0.00587306367297181, "loss": 2.5893, "step": 3642 }, { "crossentropy": 2.737461805343628, "epoch": 0.3098843143926506, "grad_norm": 0.03731822967529297, "grad_norm_var": 1.4663613826077817e-05, "learning_rate": 0.005871090816605077, "loss": 2.7375, "step": 3643 }, { "crossentropy": 2.7073657512664795, "epoch": 0.309969377339231, "grad_norm": 0.040531352162361145, "grad_norm_var": 1.4978171417699e-05, "learning_rate": 0.0058691178203662585, "loss": 2.7074, "step": 3644 }, { "crossentropy": 2.7571330070495605, "epoch": 0.31005444028581153, "grad_norm": 0.03589849919080734, "grad_norm_var": 1.3127372450581071e-05, "learning_rate": 0.005867144684572161, "loss": 2.7571, "step": 3645 }, { "crossentropy": 2.6929609775543213, "epoch": 0.310139503232392, "grad_norm": 0.037052325904369354, "grad_norm_var": 5.7868531906644e-06, "learning_rate": 0.005865171409539614, "loss": 2.693, "step": 3646 }, { "crossentropy": 2.6367454528808594, "epoch": 0.3102245661789724, "grad_norm": 0.038389429450035095, "grad_norm_var": 5.723539827776012e-06, "learning_rate": 0.005863197995585468, "loss": 2.6367, "step": 3647 }, { "crossentropy": 2.6575722694396973, "epoch": 0.31030962912555293, "grad_norm": 0.03571297228336334, "grad_norm_var": 6.0224594190416704e-06, "learning_rate": 0.0058612244430265945, "loss": 2.6576, "step": 3648 }, { "crossentropy": 2.787097215652466, "epoch": 0.3103946920721334, "grad_norm": 0.037441764026880264, "grad_norm_var": 5.944334554047227e-06, "learning_rate": 0.005859250752179893, "loss": 2.7871, "step": 3649 }, { "crossentropy": 2.7458252906799316, "epoch": 0.31047975501871383, "grad_norm": 0.03568674623966217, "grad_norm_var": 4.893044352375503e-06, "learning_rate": 0.005857276923362279, "loss": 2.7458, "step": 3650 }, { "crossentropy": 2.8237428665161133, "epoch": 0.31056481796529434, "grad_norm": 0.04157672077417374, "grad_norm_var": 4.912018508503498e-06, "learning_rate": 0.005855302956890692, "loss": 2.8237, "step": 3651 }, { "crossentropy": 2.733323574066162, "epoch": 0.3106498809118748, "grad_norm": 0.043256908655166626, "grad_norm_var": 6.5596102239808e-06, "learning_rate": 0.005853328853082097, "loss": 2.7333, "step": 3652 }, { "crossentropy": 2.7135517597198486, "epoch": 0.31073494385845524, "grad_norm": 0.040780846029520035, "grad_norm_var": 6.3429634323106455e-06, "learning_rate": 0.005851354612253474, "loss": 2.7136, "step": 3653 }, { "crossentropy": 2.7594244480133057, "epoch": 0.31082000680503574, "grad_norm": 0.03941907733678818, "grad_norm_var": 6.205509850742743e-06, "learning_rate": 0.0058493802347218335, "loss": 2.7594, "step": 3654 }, { "crossentropy": 2.795374631881714, "epoch": 0.3109050697516162, "grad_norm": 0.03612048923969269, "grad_norm_var": 5.9536156108615694e-06, "learning_rate": 0.005847405720804201, "loss": 2.7954, "step": 3655 }, { "crossentropy": 2.7399046421051025, "epoch": 0.31099013269819664, "grad_norm": 0.03674434870481491, "grad_norm_var": 5.7689837136868165e-06, "learning_rate": 0.005845431070817626, "loss": 2.7399, "step": 3656 }, { "crossentropy": 2.772214889526367, "epoch": 0.31107519564477715, "grad_norm": 0.038319241255521774, "grad_norm_var": 5.766421116847516e-06, "learning_rate": 0.005843456285079184, "loss": 2.7722, "step": 3657 }, { "crossentropy": 2.6994822025299072, "epoch": 0.3111602585913576, "grad_norm": 0.037682484835386276, "grad_norm_var": 5.1835142059972654e-06, "learning_rate": 0.005841481363905965, "loss": 2.6995, "step": 3658 }, { "crossentropy": 2.7613706588745117, "epoch": 0.31124532153793805, "grad_norm": 0.03650836646556854, "grad_norm_var": 5.324658077500664e-06, "learning_rate": 0.005839506307615085, "loss": 2.7614, "step": 3659 }, { "crossentropy": 2.6942977905273438, "epoch": 0.31133038448451855, "grad_norm": 0.037691809237003326, "grad_norm_var": 4.9440767396805954e-06, "learning_rate": 0.005837531116523682, "loss": 2.6943, "step": 3660 }, { "crossentropy": 2.7940568923950195, "epoch": 0.311415447431099, "grad_norm": 0.03529432788491249, "grad_norm_var": 5.137599491949508e-06, "learning_rate": 0.005835555790948916, "loss": 2.7941, "step": 3661 }, { "crossentropy": 2.7786693572998047, "epoch": 0.3115005103776795, "grad_norm": 0.03683511167764664, "grad_norm_var": 5.167411693039227e-06, "learning_rate": 0.0058335803312079635, "loss": 2.7787, "step": 3662 }, { "crossentropy": 2.818765878677368, "epoch": 0.31158557332425996, "grad_norm": 0.03837967664003372, "grad_norm_var": 5.166867398382196e-06, "learning_rate": 0.0058316047376180315, "loss": 2.8188, "step": 3663 }, { "crossentropy": 2.8156795501708984, "epoch": 0.3116706362708404, "grad_norm": 0.040615323930978775, "grad_norm_var": 5.196457282282083e-06, "learning_rate": 0.00582962901049634, "loss": 2.8157, "step": 3664 }, { "crossentropy": 2.771190881729126, "epoch": 0.3117556992174209, "grad_norm": 0.038800109177827835, "grad_norm_var": 5.161395791895146e-06, "learning_rate": 0.005827653150160134, "loss": 2.7712, "step": 3665 }, { "crossentropy": 2.799816370010376, "epoch": 0.31184076216400136, "grad_norm": 0.03578348457813263, "grad_norm_var": 5.127538896485679e-06, "learning_rate": 0.0058256771569266804, "loss": 2.7998, "step": 3666 }, { "crossentropy": 2.7574880123138428, "epoch": 0.3119258251105818, "grad_norm": 0.03900015726685524, "grad_norm_var": 4.438415951845721e-06, "learning_rate": 0.005823701031113266, "loss": 2.7575, "step": 3667 }, { "crossentropy": 2.762082576751709, "epoch": 0.3120108880571623, "grad_norm": 0.03575879707932472, "grad_norm_var": 2.8986200827694116e-06, "learning_rate": 0.005821724773037202, "loss": 2.7621, "step": 3668 }, { "crossentropy": 2.661447286605835, "epoch": 0.31209595100374277, "grad_norm": 0.03531642258167267, "grad_norm_var": 2.5444931452613003e-06, "learning_rate": 0.0058197483830158174, "loss": 2.6614, "step": 3669 }, { "crossentropy": 2.8247313499450684, "epoch": 0.3121810139503232, "grad_norm": 0.038098640739917755, "grad_norm_var": 2.296551079028891e-06, "learning_rate": 0.00581777186136646, "loss": 2.8247, "step": 3670 }, { "crossentropy": 2.739776849746704, "epoch": 0.3122660768969037, "grad_norm": 0.0380195714533329, "grad_norm_var": 2.2209383791343498e-06, "learning_rate": 0.005815795208406508, "loss": 2.7398, "step": 3671 }, { "crossentropy": 2.7450873851776123, "epoch": 0.31235113984348417, "grad_norm": 0.03740270808339119, "grad_norm_var": 2.18801712448437e-06, "learning_rate": 0.005813818424453351, "loss": 2.7451, "step": 3672 }, { "crossentropy": 2.7383365631103516, "epoch": 0.3124362027900646, "grad_norm": 0.0361386314034462, "grad_norm_var": 2.238042980099812e-06, "learning_rate": 0.005811841509824405, "loss": 2.7383, "step": 3673 }, { "crossentropy": 2.5536794662475586, "epoch": 0.3125212657366451, "grad_norm": 0.03572650998830795, "grad_norm_var": 2.3859746132323088e-06, "learning_rate": 0.005809864464837104, "loss": 2.5537, "step": 3674 }, { "crossentropy": 2.7624855041503906, "epoch": 0.3126063286832256, "grad_norm": 0.03448844701051712, "grad_norm_var": 2.8301074442682017e-06, "learning_rate": 0.005807887289808909, "loss": 2.7625, "step": 3675 }, { "crossentropy": 2.7777538299560547, "epoch": 0.3126913916298061, "grad_norm": 0.03926927223801613, "grad_norm_var": 3.1133960264403033e-06, "learning_rate": 0.005805909985057291, "loss": 2.7778, "step": 3676 }, { "crossentropy": 2.8106179237365723, "epoch": 0.31277645457638653, "grad_norm": 0.03641148656606674, "grad_norm_var": 2.9100800778155665e-06, "learning_rate": 0.005803932550899752, "loss": 2.8106, "step": 3677 }, { "crossentropy": 2.810142755508423, "epoch": 0.312861517522967, "grad_norm": 0.03613710403442383, "grad_norm_var": 2.979401665110087e-06, "learning_rate": 0.005801954987653809, "loss": 2.8101, "step": 3678 }, { "crossentropy": 2.7367806434631348, "epoch": 0.3129465804695475, "grad_norm": 0.036956075578927994, "grad_norm_var": 2.8838842561142166e-06, "learning_rate": 0.005799977295637005, "loss": 2.7368, "step": 3679 }, { "crossentropy": 2.7794175148010254, "epoch": 0.31303164341612794, "grad_norm": 0.036737628281116486, "grad_norm_var": 2.0165819205834765e-06, "learning_rate": 0.005797999475166896, "loss": 2.7794, "step": 3680 }, { "crossentropy": 2.7727580070495605, "epoch": 0.3131167063627084, "grad_norm": 0.03610815852880478, "grad_norm_var": 1.7795316421552002e-06, "learning_rate": 0.005796021526561066, "loss": 2.7728, "step": 3681 }, { "crossentropy": 2.79093337059021, "epoch": 0.3132017693092889, "grad_norm": 0.037877362221479416, "grad_norm_var": 1.7950043538443478e-06, "learning_rate": 0.005794043450137117, "loss": 2.7909, "step": 3682 }, { "crossentropy": 2.65091872215271, "epoch": 0.31328683225586934, "grad_norm": 0.0353524275124073, "grad_norm_var": 1.5762144334111396e-06, "learning_rate": 0.005792065246212667, "loss": 2.6509, "step": 3683 }, { "crossentropy": 2.7057483196258545, "epoch": 0.3133718952024498, "grad_norm": 0.036286622285842896, "grad_norm_var": 1.5335494379355164e-06, "learning_rate": 0.005790086915105364, "loss": 2.7057, "step": 3684 }, { "crossentropy": 2.573216676712036, "epoch": 0.3134569581490303, "grad_norm": 0.03948981687426567, "grad_norm_var": 1.882589530457536e-06, "learning_rate": 0.005788108457132866, "loss": 2.5732, "step": 3685 }, { "crossentropy": 2.749264717102051, "epoch": 0.31354202109561075, "grad_norm": 0.03949911519885063, "grad_norm_var": 2.227822210517048e-06, "learning_rate": 0.0057861298726128605, "loss": 2.7493, "step": 3686 }, { "crossentropy": 2.7658638954162598, "epoch": 0.3136270840421912, "grad_norm": 0.03694596514105797, "grad_norm_var": 2.1530262180130545e-06, "learning_rate": 0.0057841511618630485, "loss": 2.7659, "step": 3687 }, { "crossentropy": 2.8451762199401855, "epoch": 0.3137121469887717, "grad_norm": 0.034860461950302124, "grad_norm_var": 2.3956168268399537e-06, "learning_rate": 0.0057821723252011546, "loss": 2.8452, "step": 3688 }, { "crossentropy": 2.6955490112304688, "epoch": 0.31379720993535215, "grad_norm": 0.036181967705488205, "grad_norm_var": 2.392098655797555e-06, "learning_rate": 0.005780193362944922, "loss": 2.6955, "step": 3689 }, { "crossentropy": 2.7079503536224365, "epoch": 0.31388227288193266, "grad_norm": 0.03545272350311279, "grad_norm_var": 2.434895270680739e-06, "learning_rate": 0.005778214275412118, "loss": 2.708, "step": 3690 }, { "crossentropy": 2.8057219982147217, "epoch": 0.3139673358285131, "grad_norm": 0.03488514572381973, "grad_norm_var": 2.3249295869718262e-06, "learning_rate": 0.005776235062920525, "loss": 2.8057, "step": 3691 }, { "crossentropy": 2.742274522781372, "epoch": 0.31405239877509356, "grad_norm": 0.04017337039113045, "grad_norm_var": 2.6763055155307656e-06, "learning_rate": 0.005774255725787946, "loss": 2.7423, "step": 3692 }, { "crossentropy": 2.7407467365264893, "epoch": 0.31413746172167406, "grad_norm": 0.03813225403428078, "grad_norm_var": 2.764266983841003e-06, "learning_rate": 0.00577227626433221, "loss": 2.7407, "step": 3693 }, { "crossentropy": 2.6127803325653076, "epoch": 0.3142225246682545, "grad_norm": 0.0359794907271862, "grad_norm_var": 2.78274009646029e-06, "learning_rate": 0.005770296678871155, "loss": 2.6128, "step": 3694 }, { "crossentropy": 2.687910318374634, "epoch": 0.31430758761483496, "grad_norm": 0.04113084822893143, "grad_norm_var": 3.885207742826818e-06, "learning_rate": 0.00576831696972265, "loss": 2.6879, "step": 3695 }, { "crossentropy": 2.7638001441955566, "epoch": 0.31439265056141547, "grad_norm": 0.03663775324821472, "grad_norm_var": 3.891899676778237e-06, "learning_rate": 0.005766337137204579, "loss": 2.7638, "step": 3696 }, { "crossentropy": 2.7065556049346924, "epoch": 0.3144777135079959, "grad_norm": 0.03855699673295021, "grad_norm_var": 3.914415513866833e-06, "learning_rate": 0.005764357181634846, "loss": 2.7066, "step": 3697 }, { "crossentropy": 2.656484365463257, "epoch": 0.31456277645457636, "grad_norm": 0.03457736223936081, "grad_norm_var": 4.358664971001955e-06, "learning_rate": 0.005762377103331372, "loss": 2.6565, "step": 3698 }, { "crossentropy": 2.7117888927459717, "epoch": 0.31464783940115687, "grad_norm": 0.037408798933029175, "grad_norm_var": 4.13450856530014e-06, "learning_rate": 0.005760396902612105, "loss": 2.7118, "step": 3699 }, { "crossentropy": 2.784985303878784, "epoch": 0.3147329023477373, "grad_norm": 0.03678393363952637, "grad_norm_var": 4.085262717988999e-06, "learning_rate": 0.005758416579795005, "loss": 2.785, "step": 3700 }, { "crossentropy": 2.7393040657043457, "epoch": 0.31481796529431777, "grad_norm": 0.036381661891937256, "grad_norm_var": 3.7788529426929205e-06, "learning_rate": 0.005756436135198055, "loss": 2.7393, "step": 3701 }, { "crossentropy": 2.847933769226074, "epoch": 0.3149030282408983, "grad_norm": 0.038657497614622116, "grad_norm_var": 3.5538193954150084e-06, "learning_rate": 0.005754455569139258, "loss": 2.8479, "step": 3702 }, { "crossentropy": 2.660533905029297, "epoch": 0.3149880911874787, "grad_norm": 0.03548815846443176, "grad_norm_var": 3.706212911332394e-06, "learning_rate": 0.0057524748819366344, "loss": 2.6605, "step": 3703 }, { "crossentropy": 2.6346659660339355, "epoch": 0.31507315413405923, "grad_norm": 0.03578620404005051, "grad_norm_var": 3.5011767131924257e-06, "learning_rate": 0.00575049407390823, "loss": 2.6347, "step": 3704 }, { "crossentropy": 2.6909940242767334, "epoch": 0.3151582170806397, "grad_norm": 0.03553590178489685, "grad_norm_var": 3.5988843755112184e-06, "learning_rate": 0.005748513145372099, "loss": 2.691, "step": 3705 }, { "crossentropy": 2.683220863342285, "epoch": 0.31524328002722013, "grad_norm": 0.042169343680143356, "grad_norm_var": 5.056957734804006e-06, "learning_rate": 0.005746532096646327, "loss": 2.6832, "step": 3706 }, { "crossentropy": 2.834059238433838, "epoch": 0.31532834297380063, "grad_norm": 0.042568497359752655, "grad_norm_var": 6.177622430252513e-06, "learning_rate": 0.005744550928049009, "loss": 2.8341, "step": 3707 }, { "crossentropy": 2.8150174617767334, "epoch": 0.3154134059203811, "grad_norm": 0.03751140460371971, "grad_norm_var": 5.804035278405886e-06, "learning_rate": 0.005742569639898268, "loss": 2.815, "step": 3708 }, { "crossentropy": 2.7751176357269287, "epoch": 0.31549846886696153, "grad_norm": 0.034991901367902756, "grad_norm_var": 6.242184810691572e-06, "learning_rate": 0.005740588232512238, "loss": 2.7751, "step": 3709 }, { "crossentropy": 2.751107931137085, "epoch": 0.31558353181354204, "grad_norm": 0.03936531022191048, "grad_norm_var": 6.267571213752534e-06, "learning_rate": 0.0057386067062090785, "loss": 2.7511, "step": 3710 }, { "crossentropy": 2.7546443939208984, "epoch": 0.3156685947601225, "grad_norm": 0.0366123765707016, "grad_norm_var": 5.489887260050804e-06, "learning_rate": 0.005736625061306963, "loss": 2.7546, "step": 3711 }, { "crossentropy": 2.6945295333862305, "epoch": 0.31575365770670294, "grad_norm": 0.03650541976094246, "grad_norm_var": 5.505129377714133e-06, "learning_rate": 0.005734643298124091, "loss": 2.6945, "step": 3712 }, { "crossentropy": 2.7555415630340576, "epoch": 0.31583872065328344, "grad_norm": 0.0386660099029541, "grad_norm_var": 5.522234250489774e-06, "learning_rate": 0.00573266141697867, "loss": 2.7555, "step": 3713 }, { "crossentropy": 2.72724986076355, "epoch": 0.3159237835998639, "grad_norm": 0.03648235276341438, "grad_norm_var": 5.022419389319408e-06, "learning_rate": 0.005730679418188937, "loss": 2.7272, "step": 3714 }, { "crossentropy": 2.751392126083374, "epoch": 0.31600884654644434, "grad_norm": 0.034537266939878464, "grad_norm_var": 5.594583615991846e-06, "learning_rate": 0.005728697302073147, "loss": 2.7514, "step": 3715 }, { "crossentropy": 2.6488037109375, "epoch": 0.31609390949302485, "grad_norm": 0.03713557496666908, "grad_norm_var": 5.574472684633363e-06, "learning_rate": 0.005726715068949564, "loss": 2.6488, "step": 3716 }, { "crossentropy": 2.8042378425598145, "epoch": 0.3161789724396053, "grad_norm": 0.038511574268341064, "grad_norm_var": 5.568900321969473e-06, "learning_rate": 0.005724732719136481, "loss": 2.8042, "step": 3717 }, { "crossentropy": 2.6775805950164795, "epoch": 0.3162640353861858, "grad_norm": 0.03617697209119797, "grad_norm_var": 5.581484315319898e-06, "learning_rate": 0.005722750252952208, "loss": 2.6776, "step": 3718 }, { "crossentropy": 2.676632881164551, "epoch": 0.31634909833276625, "grad_norm": 0.03908408433198929, "grad_norm_var": 5.483666461066535e-06, "learning_rate": 0.0057207676707150676, "loss": 2.6766, "step": 3719 }, { "crossentropy": 2.7808220386505127, "epoch": 0.3164341612793467, "grad_norm": 0.035112716257572174, "grad_norm_var": 5.675117084068722e-06, "learning_rate": 0.0057187849727434095, "loss": 2.7808, "step": 3720 }, { "crossentropy": 2.7530322074890137, "epoch": 0.3165192242259272, "grad_norm": 0.035824477672576904, "grad_norm_var": 5.602424916183642e-06, "learning_rate": 0.005716802159355594, "loss": 2.753, "step": 3721 }, { "crossentropy": 2.7302961349487305, "epoch": 0.31660428717250766, "grad_norm": 0.03660903498530388, "grad_norm_var": 4.13117182689234e-06, "learning_rate": 0.0057148192308700075, "loss": 2.7303, "step": 3722 }, { "crossentropy": 2.6967520713806152, "epoch": 0.3166893501190881, "grad_norm": 0.04190362989902496, "grad_norm_var": 3.6856304134097e-06, "learning_rate": 0.005712836187605049, "loss": 2.6968, "step": 3723 }, { "crossentropy": 2.754077196121216, "epoch": 0.3167744130656686, "grad_norm": 0.04103587940335274, "grad_norm_var": 4.613328808781662e-06, "learning_rate": 0.005710853029879139, "loss": 2.7541, "step": 3724 }, { "crossentropy": 2.805112838745117, "epoch": 0.31685947601224906, "grad_norm": 0.03821666166186333, "grad_norm_var": 4.223711729904138e-06, "learning_rate": 0.005708869758010714, "loss": 2.8051, "step": 3725 }, { "crossentropy": 2.8251280784606934, "epoch": 0.3169445389588295, "grad_norm": 0.03751588240265846, "grad_norm_var": 4.004941168407142e-06, "learning_rate": 0.005706886372318234, "loss": 2.8251, "step": 3726 }, { "crossentropy": 2.7450644969940186, "epoch": 0.31702960190541, "grad_norm": 0.036468930542469025, "grad_norm_var": 4.023120243697396e-06, "learning_rate": 0.0057049028731201695, "loss": 2.7451, "step": 3727 }, { "crossentropy": 2.7214155197143555, "epoch": 0.31711466485199047, "grad_norm": 0.03727903589606285, "grad_norm_var": 3.959312187202616e-06, "learning_rate": 0.005702919260735014, "loss": 2.7214, "step": 3728 }, { "crossentropy": 2.8122429847717285, "epoch": 0.3171997277985709, "grad_norm": 0.033507730811834335, "grad_norm_var": 4.844430698334028e-06, "learning_rate": 0.005700935535481282, "loss": 2.8122, "step": 3729 }, { "crossentropy": 2.7192654609680176, "epoch": 0.3172847907451514, "grad_norm": 0.03646943345665932, "grad_norm_var": 4.845699057247759e-06, "learning_rate": 0.005698951697677498, "loss": 2.7193, "step": 3730 }, { "crossentropy": 2.710747718811035, "epoch": 0.3173698536917319, "grad_norm": 0.035467274487018585, "grad_norm_var": 4.568110730499703e-06, "learning_rate": 0.005696967747642212, "loss": 2.7107, "step": 3731 }, { "crossentropy": 2.7320032119750977, "epoch": 0.3174549166383124, "grad_norm": 0.03705785423517227, "grad_norm_var": 4.56988056054258e-06, "learning_rate": 0.0056949836856939875, "loss": 2.732, "step": 3732 }, { "crossentropy": 2.7386865615844727, "epoch": 0.31753997958489283, "grad_norm": 0.037422411143779755, "grad_norm_var": 4.463003774507884e-06, "learning_rate": 0.005692999512151409, "loss": 2.7387, "step": 3733 }, { "crossentropy": 2.7933120727539062, "epoch": 0.3176250425314733, "grad_norm": 0.03541883826255798, "grad_norm_var": 4.602035786353299e-06, "learning_rate": 0.0056910152273330775, "loss": 2.7933, "step": 3734 }, { "crossentropy": 2.8242015838623047, "epoch": 0.3177101054780538, "grad_norm": 0.037284426391124725, "grad_norm_var": 4.340274949384861e-06, "learning_rate": 0.005689030831557611, "loss": 2.8242, "step": 3735 }, { "crossentropy": 2.64967679977417, "epoch": 0.31779516842463423, "grad_norm": 0.03824233263731003, "grad_norm_var": 4.149403961608138e-06, "learning_rate": 0.0056870463251436475, "loss": 2.6497, "step": 3736 }, { "crossentropy": 2.808476209640503, "epoch": 0.3178802313712147, "grad_norm": 0.03803545981645584, "grad_norm_var": 4.0397793204304785e-06, "learning_rate": 0.005685061708409841, "loss": 2.8085, "step": 3737 }, { "crossentropy": 2.8823249340057373, "epoch": 0.3179652943177952, "grad_norm": 0.0370979830622673, "grad_norm_var": 4.005051189638157e-06, "learning_rate": 0.0056830769816748625, "loss": 2.8823, "step": 3738 }, { "crossentropy": 2.755648136138916, "epoch": 0.31805035726437564, "grad_norm": 0.03658776730298996, "grad_norm_var": 2.58016324518028e-06, "learning_rate": 0.005681092145257405, "loss": 2.7556, "step": 3739 }, { "crossentropy": 2.6846351623535156, "epoch": 0.3181354202109561, "grad_norm": 0.03779620677232742, "grad_norm_var": 1.5227172465173111e-06, "learning_rate": 0.0056791071994761735, "loss": 2.6846, "step": 3740 }, { "crossentropy": 2.724066734313965, "epoch": 0.3182204831575366, "grad_norm": 0.03799031674861908, "grad_norm_var": 1.485180261072879e-06, "learning_rate": 0.005677122144649893, "loss": 2.7241, "step": 3741 }, { "crossentropy": 2.758680820465088, "epoch": 0.31830554610411704, "grad_norm": 0.04048452526330948, "grad_norm_var": 2.2985155724079716e-06, "learning_rate": 0.005675136981097308, "loss": 2.7587, "step": 3742 }, { "crossentropy": 2.784043073654175, "epoch": 0.3183906090506975, "grad_norm": 0.03854437544941902, "grad_norm_var": 2.410212534905328e-06, "learning_rate": 0.0056731517091371765, "loss": 2.784, "step": 3743 }, { "crossentropy": 2.6468210220336914, "epoch": 0.318475671997278, "grad_norm": 0.04260411858558655, "grad_norm_var": 4.261420854024568e-06, "learning_rate": 0.0056711663290882775, "loss": 2.6468, "step": 3744 }, { "crossentropy": 2.735426425933838, "epoch": 0.31856073494385845, "grad_norm": 0.04250408336520195, "grad_norm_var": 4.53020808145136e-06, "learning_rate": 0.005669180841269406, "loss": 2.7354, "step": 3745 }, { "crossentropy": 2.7170655727386475, "epoch": 0.3186457978904389, "grad_norm": 0.038865502923727036, "grad_norm_var": 4.379935576354482e-06, "learning_rate": 0.005667195245999372, "loss": 2.7171, "step": 3746 }, { "crossentropy": 2.796405792236328, "epoch": 0.3187308608370194, "grad_norm": 0.03935382515192032, "grad_norm_var": 3.9013083301873855e-06, "learning_rate": 0.005665209543597008, "loss": 2.7964, "step": 3747 }, { "crossentropy": 2.740366220474243, "epoch": 0.31881592378359985, "grad_norm": 0.03978121280670166, "grad_norm_var": 3.8572995350917455e-06, "learning_rate": 0.005663223734381158, "loss": 2.7404, "step": 3748 }, { "crossentropy": 2.7721359729766846, "epoch": 0.31890098673018036, "grad_norm": 0.0389128252863884, "grad_norm_var": 3.756985952900876e-06, "learning_rate": 0.005661237818670685, "loss": 2.7721, "step": 3749 }, { "crossentropy": 2.7157835960388184, "epoch": 0.3189860496767608, "grad_norm": 0.03862332925200462, "grad_norm_var": 2.988743814528553e-06, "learning_rate": 0.005659251796784474, "loss": 2.7158, "step": 3750 }, { "crossentropy": 2.740438938140869, "epoch": 0.31907111262334126, "grad_norm": 0.03747083991765976, "grad_norm_var": 2.950281473383202e-06, "learning_rate": 0.005657265669041419, "loss": 2.7404, "step": 3751 }, { "crossentropy": 2.7716360092163086, "epoch": 0.31915617556992176, "grad_norm": 0.036188796162605286, "grad_norm_var": 3.402383027445212e-06, "learning_rate": 0.005655279435760436, "loss": 2.7716, "step": 3752 }, { "crossentropy": 2.852301836013794, "epoch": 0.3192412385165022, "grad_norm": 0.03742876276373863, "grad_norm_var": 3.48744214989227e-06, "learning_rate": 0.005653293097260457, "loss": 2.8523, "step": 3753 }, { "crossentropy": 2.713761329650879, "epoch": 0.31932630146308266, "grad_norm": 0.039118099957704544, "grad_norm_var": 3.2935805360676434e-06, "learning_rate": 0.005651306653860432, "loss": 2.7138, "step": 3754 }, { "crossentropy": 2.793208599090576, "epoch": 0.31941136440966317, "grad_norm": 0.03825566917657852, "grad_norm_var": 2.955259889569674e-06, "learning_rate": 0.005649320105879325, "loss": 2.7932, "step": 3755 }, { "crossentropy": 2.787402868270874, "epoch": 0.3194964273562436, "grad_norm": 0.03487652540206909, "grad_norm_var": 3.954783403130787e-06, "learning_rate": 0.005647333453636117, "loss": 2.7874, "step": 3756 }, { "crossentropy": 2.7349114418029785, "epoch": 0.31958149030282407, "grad_norm": 0.03754482418298721, "grad_norm_var": 4.016034675219151e-06, "learning_rate": 0.0056453466974498104, "loss": 2.7349, "step": 3757 }, { "crossentropy": 2.672416925430298, "epoch": 0.3196665532494046, "grad_norm": 0.03939948230981827, "grad_norm_var": 3.843718403256693e-06, "learning_rate": 0.005643359837639419, "loss": 2.6724, "step": 3758 }, { "crossentropy": 2.775872230529785, "epoch": 0.319751616195985, "grad_norm": 0.03754998743534088, "grad_norm_var": 3.928408570984659e-06, "learning_rate": 0.005641372874523977, "loss": 2.7759, "step": 3759 }, { "crossentropy": 2.709613084793091, "epoch": 0.31983667914256547, "grad_norm": 0.0371689610183239, "grad_norm_var": 2.9127438045552696e-06, "learning_rate": 0.0056393858084225305, "loss": 2.7096, "step": 3760 }, { "crossentropy": 2.786875009536743, "epoch": 0.319921742089146, "grad_norm": 0.0374419167637825, "grad_norm_var": 1.6870094622464226e-06, "learning_rate": 0.005637398639654147, "loss": 2.7869, "step": 3761 }, { "crossentropy": 2.6753199100494385, "epoch": 0.3200068050357264, "grad_norm": 0.03977237269282341, "grad_norm_var": 1.8432102947949295e-06, "learning_rate": 0.005635411368537909, "loss": 2.6753, "step": 3762 }, { "crossentropy": 2.8025643825531006, "epoch": 0.32009186798230693, "grad_norm": 0.03946773335337639, "grad_norm_var": 1.8637404290515007e-06, "learning_rate": 0.0056334239953929124, "loss": 2.8026, "step": 3763 }, { "crossentropy": 2.7470037937164307, "epoch": 0.3201769309288874, "grad_norm": 0.036210596561431885, "grad_norm_var": 1.8423630131399655e-06, "learning_rate": 0.005631436520538276, "loss": 2.747, "step": 3764 }, { "crossentropy": 2.7849624156951904, "epoch": 0.32026199387546783, "grad_norm": 0.03318755701184273, "grad_norm_var": 3.071627239902413e-06, "learning_rate": 0.005629448944293128, "loss": 2.785, "step": 3765 }, { "crossentropy": 2.719611406326294, "epoch": 0.32034705682204834, "grad_norm": 0.03746182098984718, "grad_norm_var": 2.9791276053116205e-06, "learning_rate": 0.0056274612669766156, "loss": 2.7196, "step": 3766 }, { "crossentropy": 2.7153706550598145, "epoch": 0.3204321197686288, "grad_norm": 0.039327893406152725, "grad_norm_var": 3.209980928967071e-06, "learning_rate": 0.005625473488907905, "loss": 2.7154, "step": 3767 }, { "crossentropy": 2.713545799255371, "epoch": 0.32051718271520924, "grad_norm": 0.040005482733249664, "grad_norm_var": 3.440409840473289e-06, "learning_rate": 0.005623485610406174, "loss": 2.7135, "step": 3768 }, { "crossentropy": 2.717132091522217, "epoch": 0.32060224566178974, "grad_norm": 0.03559040278196335, "grad_norm_var": 3.733707634925807e-06, "learning_rate": 0.005621497631790619, "loss": 2.7171, "step": 3769 }, { "crossentropy": 2.778961181640625, "epoch": 0.3206873086083702, "grad_norm": 0.04062899947166443, "grad_norm_var": 4.172397559370119e-06, "learning_rate": 0.005619509553380454, "loss": 2.779, "step": 3770 }, { "crossentropy": 2.7883200645446777, "epoch": 0.32077237155495064, "grad_norm": 0.03927982226014137, "grad_norm_var": 4.3079410592787465e-06, "learning_rate": 0.005617521375494903, "loss": 2.7883, "step": 3771 }, { "crossentropy": 2.7652523517608643, "epoch": 0.32085743450153115, "grad_norm": 0.03522578999400139, "grad_norm_var": 4.179090111344532e-06, "learning_rate": 0.005615533098453215, "loss": 2.7653, "step": 3772 }, { "crossentropy": 2.657045364379883, "epoch": 0.3209424974481116, "grad_norm": 0.04060107097029686, "grad_norm_var": 4.647087957380889e-06, "learning_rate": 0.005613544722574646, "loss": 2.657, "step": 3773 }, { "crossentropy": 2.7510063648223877, "epoch": 0.32102756039469205, "grad_norm": 0.04611834138631821, "grad_norm_var": 8.704342134147925e-06, "learning_rate": 0.005611556248178474, "loss": 2.751, "step": 3774 }, { "crossentropy": 2.869666337966919, "epoch": 0.32111262334127255, "grad_norm": 0.04288019612431526, "grad_norm_var": 9.847565973458704e-06, "learning_rate": 0.00560956767558399, "loss": 2.8697, "step": 3775 }, { "crossentropy": 2.702329397201538, "epoch": 0.321197686287853, "grad_norm": 0.04112653061747551, "grad_norm_var": 9.980018927500481e-06, "learning_rate": 0.005607579005110503, "loss": 2.7023, "step": 3776 }, { "crossentropy": 2.7576711177825928, "epoch": 0.3212827492344335, "grad_norm": 0.03642747178673744, "grad_norm_var": 1.0257843250717896e-05, "learning_rate": 0.005605590237077331, "loss": 2.7577, "step": 3777 }, { "crossentropy": 2.734650135040283, "epoch": 0.32136781218101396, "grad_norm": 0.03520320728421211, "grad_norm_var": 1.1065932884487112e-05, "learning_rate": 0.00560360137180382, "loss": 2.7347, "step": 3778 }, { "crossentropy": 2.7204384803771973, "epoch": 0.3214528751275944, "grad_norm": 0.03569726273417473, "grad_norm_var": 1.1554136962084865e-05, "learning_rate": 0.00560161240960932, "loss": 2.7204, "step": 3779 }, { "crossentropy": 2.781149387359619, "epoch": 0.3215379380741749, "grad_norm": 0.037498269230127335, "grad_norm_var": 1.1275727579170235e-05, "learning_rate": 0.005599623350813202, "loss": 2.7811, "step": 3780 }, { "crossentropy": 2.700458526611328, "epoch": 0.32162300102075536, "grad_norm": 0.044062886387109756, "grad_norm_var": 1.0940927276510923e-05, "learning_rate": 0.0055976341957348534, "loss": 2.7005, "step": 3781 }, { "crossentropy": 2.7367570400238037, "epoch": 0.3217080639673358, "grad_norm": 0.03703468665480614, "grad_norm_var": 1.1051091699127805e-05, "learning_rate": 0.005595644944693671, "loss": 2.7368, "step": 3782 }, { "crossentropy": 2.7918951511383057, "epoch": 0.3217931269139163, "grad_norm": 0.03905511274933815, "grad_norm_var": 1.1049973012584406e-05, "learning_rate": 0.005593655598009074, "loss": 2.7919, "step": 3783 }, { "crossentropy": 2.8055713176727295, "epoch": 0.32187818986049677, "grad_norm": 0.03789787366986275, "grad_norm_var": 1.108782000557116e-05, "learning_rate": 0.005591666156000494, "loss": 2.8056, "step": 3784 }, { "crossentropy": 2.669956922531128, "epoch": 0.3219632528070772, "grad_norm": 0.036921095103025436, "grad_norm_var": 1.0589905021734144e-05, "learning_rate": 0.005589676618987378, "loss": 2.67, "step": 3785 }, { "crossentropy": 2.732910394668579, "epoch": 0.3220483157536577, "grad_norm": 0.037990495562553406, "grad_norm_var": 1.0488397471088613e-05, "learning_rate": 0.0055876869872891885, "loss": 2.7329, "step": 3786 }, { "crossentropy": 2.6938679218292236, "epoch": 0.32213337870023817, "grad_norm": 0.038130298256874084, "grad_norm_var": 1.0518710288552555e-05, "learning_rate": 0.005585697261225403, "loss": 2.6939, "step": 3787 }, { "crossentropy": 2.6590640544891357, "epoch": 0.3222184416468186, "grad_norm": 0.03860733285546303, "grad_norm_var": 9.59170604432896e-06, "learning_rate": 0.0055837074411155116, "loss": 2.6591, "step": 3788 }, { "crossentropy": 2.7124760150909424, "epoch": 0.3223035045933991, "grad_norm": 0.03945948928594589, "grad_norm_var": 9.441367903380756e-06, "learning_rate": 0.005581717527279027, "loss": 2.7125, "step": 3789 }, { "crossentropy": 2.717837333679199, "epoch": 0.3223885675399796, "grad_norm": 0.04036497697234154, "grad_norm_var": 6.054905036261398e-06, "learning_rate": 0.005579727520035468, "loss": 2.7178, "step": 3790 }, { "crossentropy": 2.7042367458343506, "epoch": 0.3224736304865601, "grad_norm": 0.0389859713613987, "grad_norm_var": 4.804882674821521e-06, "learning_rate": 0.005577737419704374, "loss": 2.7042, "step": 3791 }, { "crossentropy": 2.7720394134521484, "epoch": 0.32255869343314053, "grad_norm": 0.03705229237675667, "grad_norm_var": 4.363345845988647e-06, "learning_rate": 0.005575747226605298, "loss": 2.772, "step": 3792 }, { "crossentropy": 2.6655056476593018, "epoch": 0.322643756379721, "grad_norm": 0.03515936806797981, "grad_norm_var": 4.754978032717588e-06, "learning_rate": 0.005573756941057805, "loss": 2.6655, "step": 3793 }, { "crossentropy": 2.8773157596588135, "epoch": 0.3227288193263015, "grad_norm": 0.03582276403903961, "grad_norm_var": 4.54214672769455e-06, "learning_rate": 0.005571766563381482, "loss": 2.8773, "step": 3794 }, { "crossentropy": 2.7383298873901367, "epoch": 0.32281388227288194, "grad_norm": 0.035643093287944794, "grad_norm_var": 4.559747393024077e-06, "learning_rate": 0.005569776093895924, "loss": 2.7383, "step": 3795 }, { "crossentropy": 2.7640798091888428, "epoch": 0.3228989452194624, "grad_norm": 0.037343621253967285, "grad_norm_var": 4.573760507510053e-06, "learning_rate": 0.005567785532920742, "loss": 2.7641, "step": 3796 }, { "crossentropy": 2.725426197052002, "epoch": 0.3229840081660429, "grad_norm": 0.08758775144815445, "grad_norm_var": 0.00015760403495882242, "learning_rate": 0.005565794880775564, "loss": 2.7254, "step": 3797 }, { "crossentropy": 2.7752315998077393, "epoch": 0.32306907111262334, "grad_norm": 0.039506036788225174, "grad_norm_var": 0.0001567397603529562, "learning_rate": 0.005563804137780032, "loss": 2.7752, "step": 3798 }, { "crossentropy": 2.718470335006714, "epoch": 0.3231541340592038, "grad_norm": 0.03896082565188408, "grad_norm_var": 0.0001567643951533086, "learning_rate": 0.005561813304253799, "loss": 2.7185, "step": 3799 }, { "crossentropy": 2.7270822525024414, "epoch": 0.3232391970057843, "grad_norm": 0.03584275767207146, "grad_norm_var": 0.0001578686890812988, "learning_rate": 0.005559822380516539, "loss": 2.7271, "step": 3800 }, { "crossentropy": 2.7690141201019287, "epoch": 0.32332425995236475, "grad_norm": 0.03871612250804901, "grad_norm_var": 0.00015713305778926323, "learning_rate": 0.005557831366887937, "loss": 2.769, "step": 3801 }, { "crossentropy": 2.7328267097473145, "epoch": 0.3234093228989452, "grad_norm": 0.039703622460365295, "grad_norm_var": 0.0001566408647729195, "learning_rate": 0.005555840263687688, "loss": 2.7328, "step": 3802 }, { "crossentropy": 2.7018256187438965, "epoch": 0.3234943858455257, "grad_norm": 0.03652185946702957, "grad_norm_var": 0.00015742986891459156, "learning_rate": 0.005553849071235512, "loss": 2.7018, "step": 3803 }, { "crossentropy": 2.753086566925049, "epoch": 0.32357944879210615, "grad_norm": 0.04511177912354469, "grad_norm_var": 0.00015803818571733866, "learning_rate": 0.005551857789851131, "loss": 2.7531, "step": 3804 }, { "crossentropy": 2.778362989425659, "epoch": 0.32366451173868666, "grad_norm": 0.044452693313360214, "grad_norm_var": 0.00015833022686203053, "learning_rate": 0.0055498664198542925, "loss": 2.7784, "step": 3805 }, { "crossentropy": 2.7147316932678223, "epoch": 0.3237495746852671, "grad_norm": 0.040340226143598557, "grad_norm_var": 0.00015833458332458652, "learning_rate": 0.005547874961564751, "loss": 2.7147, "step": 3806 }, { "crossentropy": 2.6227502822875977, "epoch": 0.32383463763184756, "grad_norm": 0.034673113375902176, "grad_norm_var": 0.00016104168083087608, "learning_rate": 0.005545883415302275, "loss": 2.6228, "step": 3807 }, { "crossentropy": 2.6285970211029053, "epoch": 0.32391970057842806, "grad_norm": 0.036852460354566574, "grad_norm_var": 0.00016116008128589924, "learning_rate": 0.0055438917813866555, "loss": 2.6286, "step": 3808 }, { "crossentropy": 2.6796951293945312, "epoch": 0.3240047635250085, "grad_norm": 0.0354144349694252, "grad_norm_var": 0.00016095225446631755, "learning_rate": 0.0055419000601376845, "loss": 2.6797, "step": 3809 }, { "crossentropy": 2.7706522941589355, "epoch": 0.32408982647158896, "grad_norm": 0.03766447678208351, "grad_norm_var": 0.00015979326323036357, "learning_rate": 0.00553990825187518, "loss": 2.7707, "step": 3810 }, { "crossentropy": 2.810636043548584, "epoch": 0.32417488941816947, "grad_norm": 0.04330470785498619, "grad_norm_var": 0.00015745753876922434, "learning_rate": 0.005537916356918967, "loss": 2.8106, "step": 3811 }, { "crossentropy": 2.712862014770508, "epoch": 0.3242599523647499, "grad_norm": 0.03595760464668274, "grad_norm_var": 0.00015843807244875682, "learning_rate": 0.005535924375588887, "loss": 2.7129, "step": 3812 }, { "crossentropy": 2.775026321411133, "epoch": 0.32434501531133036, "grad_norm": 0.03824056312441826, "grad_norm_var": 1.0112909933647439e-05, "learning_rate": 0.005533932308204793, "loss": 2.775, "step": 3813 }, { "crossentropy": 2.7117321491241455, "epoch": 0.32443007825791087, "grad_norm": 0.03892175853252411, "grad_norm_var": 1.0081499049619895e-05, "learning_rate": 0.005531940155086556, "loss": 2.7117, "step": 3814 }, { "crossentropy": 2.7648863792419434, "epoch": 0.3245151412044913, "grad_norm": 0.03747055307030678, "grad_norm_var": 1.0186846914078442e-05, "learning_rate": 0.005529947916554059, "loss": 2.7649, "step": 3815 }, { "crossentropy": 2.7133076190948486, "epoch": 0.32460020415107177, "grad_norm": 0.03907246142625809, "grad_norm_var": 9.608680622097351e-06, "learning_rate": 0.005527955592927197, "loss": 2.7133, "step": 3816 }, { "crossentropy": 2.758838415145874, "epoch": 0.3246852670976523, "grad_norm": 0.037999242544174194, "grad_norm_var": 9.65848631611221e-06, "learning_rate": 0.005525963184525877, "loss": 2.7588, "step": 3817 }, { "crossentropy": 2.6611194610595703, "epoch": 0.3247703300442327, "grad_norm": 0.03753509372472763, "grad_norm_var": 9.707414890343784e-06, "learning_rate": 0.005523970691670026, "loss": 2.6611, "step": 3818 }, { "crossentropy": 2.731238842010498, "epoch": 0.3248553929908132, "grad_norm": 0.03593418002128601, "grad_norm_var": 9.901304408658993e-06, "learning_rate": 0.005521978114679581, "loss": 2.7312, "step": 3819 }, { "crossentropy": 2.821638584136963, "epoch": 0.3249404559373937, "grad_norm": 0.036330096423625946, "grad_norm_var": 7.195046173026862e-06, "learning_rate": 0.00551998545387449, "loss": 2.8216, "step": 3820 }, { "crossentropy": 2.794316053390503, "epoch": 0.32502551888397413, "grad_norm": 0.035971786826848984, "grad_norm_var": 4.546697386733032e-06, "learning_rate": 0.005517992709574717, "loss": 2.7943, "step": 3821 }, { "crossentropy": 2.7759242057800293, "epoch": 0.32511058183055463, "grad_norm": 0.03603758662939072, "grad_norm_var": 4.13468157059952e-06, "learning_rate": 0.005515999882100242, "loss": 2.7759, "step": 3822 }, { "crossentropy": 2.649634838104248, "epoch": 0.3251956447771351, "grad_norm": 0.03971988707780838, "grad_norm_var": 3.93451376025251e-06, "learning_rate": 0.005514006971771055, "loss": 2.6496, "step": 3823 }, { "crossentropy": 2.688539743423462, "epoch": 0.32528070772371553, "grad_norm": 0.03528084605932236, "grad_norm_var": 4.256362498014122e-06, "learning_rate": 0.005512013978907156, "loss": 2.6885, "step": 3824 }, { "crossentropy": 2.769207715988159, "epoch": 0.32536577067029604, "grad_norm": 0.03724580258131027, "grad_norm_var": 3.943670776474819e-06, "learning_rate": 0.0055100209038285684, "loss": 2.7692, "step": 3825 }, { "crossentropy": 2.7748093605041504, "epoch": 0.3254508336168765, "grad_norm": 0.039578258991241455, "grad_norm_var": 4.171703472777288e-06, "learning_rate": 0.005508027746855319, "loss": 2.7748, "step": 3826 }, { "crossentropy": 2.7213921546936035, "epoch": 0.32553589656345694, "grad_norm": 0.038310661911964417, "grad_norm_var": 2.0567436781493768e-06, "learning_rate": 0.0055060345083074525, "loss": 2.7214, "step": 3827 }, { "crossentropy": 2.7739739418029785, "epoch": 0.32562095951003744, "grad_norm": 0.035503026098012924, "grad_norm_var": 2.1616530140683433e-06, "learning_rate": 0.005504041188505023, "loss": 2.774, "step": 3828 }, { "crossentropy": 2.7453808784484863, "epoch": 0.3257060224566179, "grad_norm": 0.03807733207941055, "grad_norm_var": 2.1460468038446307e-06, "learning_rate": 0.005502047787768102, "loss": 2.7454, "step": 3829 }, { "crossentropy": 2.7210521697998047, "epoch": 0.32579108540319834, "grad_norm": 0.036591436713933945, "grad_norm_var": 2.0240515654354937e-06, "learning_rate": 0.0055000543064167755, "loss": 2.7211, "step": 3830 }, { "crossentropy": 2.658217668533325, "epoch": 0.32587614834977885, "grad_norm": 0.03854230046272278, "grad_norm_var": 2.1214796765972945e-06, "learning_rate": 0.005498060744771134, "loss": 2.6582, "step": 3831 }, { "crossentropy": 2.7419254779815674, "epoch": 0.3259612112963593, "grad_norm": 0.038323067128658295, "grad_norm_var": 1.985283969987e-06, "learning_rate": 0.005496067103151287, "loss": 2.7419, "step": 3832 }, { "crossentropy": 2.6159861087799072, "epoch": 0.32604627424293975, "grad_norm": 0.03876319155097008, "grad_norm_var": 2.091835076415786e-06, "learning_rate": 0.005494073381877359, "loss": 2.616, "step": 3833 }, { "crossentropy": 2.6968233585357666, "epoch": 0.32613133718952025, "grad_norm": 0.03646664693951607, "grad_norm_var": 2.138102433340133e-06, "learning_rate": 0.005492079581269483, "loss": 2.6968, "step": 3834 }, { "crossentropy": 2.8769595623016357, "epoch": 0.3262164001361007, "grad_norm": 0.03919896110892296, "grad_norm_var": 2.2131007396765378e-06, "learning_rate": 0.0054900857016478045, "loss": 2.877, "step": 3835 }, { "crossentropy": 2.7495691776275635, "epoch": 0.3263014630826812, "grad_norm": 0.03577961400151253, "grad_norm_var": 2.3176371863603897e-06, "learning_rate": 0.005488091743332482, "loss": 2.7496, "step": 3836 }, { "crossentropy": 2.67094087600708, "epoch": 0.32638652602926166, "grad_norm": 0.03683653473854065, "grad_norm_var": 2.19256431742675e-06, "learning_rate": 0.005486097706643691, "loss": 2.6709, "step": 3837 }, { "crossentropy": 2.769232749938965, "epoch": 0.3264715889758421, "grad_norm": 0.041121162474155426, "grad_norm_var": 2.805688080871847e-06, "learning_rate": 0.005484103591901616, "loss": 2.7692, "step": 3838 }, { "crossentropy": 2.7464230060577393, "epoch": 0.3265566519224226, "grad_norm": 0.035845592617988586, "grad_norm_var": 2.7694551987265487e-06, "learning_rate": 0.005482109399426452, "loss": 2.7464, "step": 3839 }, { "crossentropy": 2.737771987915039, "epoch": 0.32664171486900306, "grad_norm": 0.03921913355588913, "grad_norm_var": 2.5254867462863134e-06, "learning_rate": 0.0054801151295384085, "loss": 2.7378, "step": 3840 }, { "crossentropy": 2.651935577392578, "epoch": 0.3267267778155835, "grad_norm": 0.03520134463906288, "grad_norm_var": 2.9480645532976743e-06, "learning_rate": 0.005478120782557713, "loss": 2.6519, "step": 3841 }, { "crossentropy": 2.759862184524536, "epoch": 0.326811840762164, "grad_norm": 0.03918200358748436, "grad_norm_var": 2.859164774589538e-06, "learning_rate": 0.005476126358804594, "loss": 2.7599, "step": 3842 }, { "crossentropy": 2.6914620399475098, "epoch": 0.32689690370874447, "grad_norm": 0.039254330098629, "grad_norm_var": 2.9935281230556813e-06, "learning_rate": 0.0054741318585993, "loss": 2.6915, "step": 3843 }, { "crossentropy": 2.68953013420105, "epoch": 0.3269819666553249, "grad_norm": 0.0405387207865715, "grad_norm_var": 3.0736984780593254e-06, "learning_rate": 0.005472137282262094, "loss": 2.6895, "step": 3844 }, { "crossentropy": 2.7607176303863525, "epoch": 0.3270670296019054, "grad_norm": 0.03634364530444145, "grad_norm_var": 3.2572772707665602e-06, "learning_rate": 0.005470142630113244, "loss": 2.7607, "step": 3845 }, { "crossentropy": 2.70542049407959, "epoch": 0.3271520925484859, "grad_norm": 0.04003063961863518, "grad_norm_var": 3.3733309802710015e-06, "learning_rate": 0.005468147902473035, "loss": 2.7054, "step": 3846 }, { "crossentropy": 2.671130895614624, "epoch": 0.3272371554950663, "grad_norm": 0.036975543946027756, "grad_norm_var": 3.4480228995242257e-06, "learning_rate": 0.00546615309966176, "loss": 2.6711, "step": 3847 }, { "crossentropy": 2.771643877029419, "epoch": 0.32732221844164683, "grad_norm": 0.04063244163990021, "grad_norm_var": 3.86003938363007e-06, "learning_rate": 0.005464158221999731, "loss": 2.7716, "step": 3848 }, { "crossentropy": 2.6179213523864746, "epoch": 0.3274072813882273, "grad_norm": 0.041986215859651566, "grad_norm_var": 4.7462163773811496e-06, "learning_rate": 0.005462163269807267, "loss": 2.6179, "step": 3849 }, { "crossentropy": 2.7786707878112793, "epoch": 0.3274923443348078, "grad_norm": 0.039595622569322586, "grad_norm_var": 4.545991592953766e-06, "learning_rate": 0.005460168243404696, "loss": 2.7787, "step": 3850 }, { "crossentropy": 2.6597633361816406, "epoch": 0.32757740728138823, "grad_norm": 0.03836677968502045, "grad_norm_var": 4.523796547961645e-06, "learning_rate": 0.005458173143112365, "loss": 2.6598, "step": 3851 }, { "crossentropy": 2.8428499698638916, "epoch": 0.3276624702279687, "grad_norm": 0.03935489431023598, "grad_norm_var": 3.998799444281572e-06, "learning_rate": 0.005456177969250632, "loss": 2.8428, "step": 3852 }, { "crossentropy": 2.7553491592407227, "epoch": 0.3277475331745492, "grad_norm": 0.03904327377676964, "grad_norm_var": 3.7312414050753746e-06, "learning_rate": 0.0054541827221398585, "loss": 2.7553, "step": 3853 }, { "crossentropy": 2.7501771450042725, "epoch": 0.32783259612112964, "grad_norm": 0.038067057728767395, "grad_norm_var": 3.4171402330311513e-06, "learning_rate": 0.005452187402100427, "loss": 2.7502, "step": 3854 }, { "crossentropy": 2.679980516433716, "epoch": 0.3279176590677101, "grad_norm": 0.03600369766354561, "grad_norm_var": 3.3579536487598574e-06, "learning_rate": 0.0054501920094527315, "loss": 2.68, "step": 3855 }, { "crossentropy": 2.7844104766845703, "epoch": 0.3280027220142906, "grad_norm": 0.036958370357751846, "grad_norm_var": 3.532125329779439e-06, "learning_rate": 0.005448196544517168, "loss": 2.7844, "step": 3856 }, { "crossentropy": 2.744706392288208, "epoch": 0.32808778496087104, "grad_norm": 0.03367278352379799, "grad_norm_var": 4.369996879115672e-06, "learning_rate": 0.005446201007614155, "loss": 2.7447, "step": 3857 }, { "crossentropy": 2.7943801879882812, "epoch": 0.3281728479074515, "grad_norm": 0.03898288309574127, "grad_norm_var": 4.354378144429359e-06, "learning_rate": 0.005444205399064116, "loss": 2.7944, "step": 3858 }, { "crossentropy": 2.806915521621704, "epoch": 0.328257910854032, "grad_norm": 0.040131326764822006, "grad_norm_var": 4.492065573081551e-06, "learning_rate": 0.005442209719187492, "loss": 2.8069, "step": 3859 }, { "crossentropy": 2.72268009185791, "epoch": 0.32834297380061245, "grad_norm": 0.036483488976955414, "grad_norm_var": 4.440652080926234e-06, "learning_rate": 0.0054402139683047265, "loss": 2.7227, "step": 3860 }, { "crossentropy": 2.8121254444122314, "epoch": 0.3284280367471929, "grad_norm": 0.03685392439365387, "grad_norm_var": 4.3245497817590016e-06, "learning_rate": 0.005438218146736284, "loss": 2.8121, "step": 3861 }, { "crossentropy": 2.7565908432006836, "epoch": 0.3285130996937734, "grad_norm": 0.036452922970056534, "grad_norm_var": 4.309093594948715e-06, "learning_rate": 0.005436222254802632, "loss": 2.7566, "step": 3862 }, { "crossentropy": 2.74232816696167, "epoch": 0.32859816264035385, "grad_norm": 0.03720663860440254, "grad_norm_var": 4.277858621021258e-06, "learning_rate": 0.005434226292824258, "loss": 2.7423, "step": 3863 }, { "crossentropy": 2.7389256954193115, "epoch": 0.32868322558693436, "grad_norm": 0.0341406874358654, "grad_norm_var": 4.7301938950259154e-06, "learning_rate": 0.005432230261121651, "loss": 2.7389, "step": 3864 }, { "crossentropy": 2.675415515899658, "epoch": 0.3287682885335148, "grad_norm": 0.03617824614048004, "grad_norm_var": 3.5241151069377585e-06, "learning_rate": 0.00543023416001532, "loss": 2.6754, "step": 3865 }, { "crossentropy": 2.6606264114379883, "epoch": 0.32885335148009526, "grad_norm": 0.03734863921999931, "grad_norm_var": 3.1648788688119306e-06, "learning_rate": 0.005428237989825779, "loss": 2.6606, "step": 3866 }, { "crossentropy": 2.7386178970336914, "epoch": 0.32893841442667576, "grad_norm": 0.03505172207951546, "grad_norm_var": 3.3372638458414292e-06, "learning_rate": 0.005426241750873556, "loss": 2.7386, "step": 3867 }, { "crossentropy": 2.7637906074523926, "epoch": 0.3290234773732562, "grad_norm": 0.03607233986258507, "grad_norm_var": 2.9781361525496018e-06, "learning_rate": 0.005424245443479191, "loss": 2.7638, "step": 3868 }, { "crossentropy": 2.7565975189208984, "epoch": 0.32910854031983666, "grad_norm": 0.0385059230029583, "grad_norm_var": 2.8347788157615308e-06, "learning_rate": 0.005422249067963231, "loss": 2.7566, "step": 3869 }, { "crossentropy": 2.7683558464050293, "epoch": 0.32919360326641717, "grad_norm": 0.04011237993836403, "grad_norm_var": 3.4535260842174137e-06, "learning_rate": 0.005420252624646238, "loss": 2.7684, "step": 3870 }, { "crossentropy": 2.7776992321014404, "epoch": 0.3292786662129976, "grad_norm": 0.037548236548900604, "grad_norm_var": 3.4211838290067996e-06, "learning_rate": 0.005418256113848783, "loss": 2.7777, "step": 3871 }, { "crossentropy": 2.533660888671875, "epoch": 0.32936372915957807, "grad_norm": 0.034495625644922256, "grad_norm_var": 3.807776721279482e-06, "learning_rate": 0.005416259535891447, "loss": 2.5337, "step": 3872 }, { "crossentropy": 2.766867160797119, "epoch": 0.32944879210615857, "grad_norm": 0.03711795434355736, "grad_norm_var": 3.100527632753959e-06, "learning_rate": 0.005414262891094824, "loss": 2.7669, "step": 3873 }, { "crossentropy": 2.7894508838653564, "epoch": 0.329533855052739, "grad_norm": 0.040035564452409744, "grad_norm_var": 3.4421078260081043e-06, "learning_rate": 0.005412266179779519, "loss": 2.7895, "step": 3874 }, { "crossentropy": 2.7700650691986084, "epoch": 0.32961891799931947, "grad_norm": 0.038299158215522766, "grad_norm_var": 2.9134608790346362e-06, "learning_rate": 0.005410269402266143, "loss": 2.7701, "step": 3875 }, { "crossentropy": 2.7111220359802246, "epoch": 0.3297039809459, "grad_norm": 0.03529100492596626, "grad_norm_var": 3.083501736955707e-06, "learning_rate": 0.005408272558875323, "loss": 2.7111, "step": 3876 }, { "crossentropy": 2.7138583660125732, "epoch": 0.3297890438924804, "grad_norm": 0.04093480482697487, "grad_norm_var": 4.088705081633538e-06, "learning_rate": 0.0054062756499276944, "loss": 2.7139, "step": 3877 }, { "crossentropy": 2.6883490085601807, "epoch": 0.32987410683906093, "grad_norm": 0.03651198372244835, "grad_norm_var": 4.0832409161458675e-06, "learning_rate": 0.005404278675743902, "loss": 2.6883, "step": 3878 }, { "crossentropy": 2.7546849250793457, "epoch": 0.3299591697856414, "grad_norm": 0.038320429623126984, "grad_norm_var": 4.1650000577912275e-06, "learning_rate": 0.005402281636644605, "loss": 2.7547, "step": 3879 }, { "crossentropy": 2.586674690246582, "epoch": 0.33004423273222183, "grad_norm": 0.03690400347113609, "grad_norm_var": 3.4974558841077697e-06, "learning_rate": 0.005400284532950467, "loss": 2.5867, "step": 3880 }, { "crossentropy": 2.6966609954833984, "epoch": 0.33012929567880234, "grad_norm": 0.041796375066041946, "grad_norm_var": 4.5396136110139575e-06, "learning_rate": 0.005398287364982168, "loss": 2.6967, "step": 3881 }, { "crossentropy": 2.7028417587280273, "epoch": 0.3302143586253828, "grad_norm": 0.03655526787042618, "grad_norm_var": 4.623699090332741e-06, "learning_rate": 0.005396290133060394, "loss": 2.7028, "step": 3882 }, { "crossentropy": 2.7608039379119873, "epoch": 0.33029942157196324, "grad_norm": 0.03794768825173378, "grad_norm_var": 4.116772905123541e-06, "learning_rate": 0.0053942928375058435, "loss": 2.7608, "step": 3883 }, { "crossentropy": 2.7062180042266846, "epoch": 0.33038448451854374, "grad_norm": 0.04089123010635376, "grad_norm_var": 4.391866276717779e-06, "learning_rate": 0.0053922954786392256, "loss": 2.7062, "step": 3884 }, { "crossentropy": 2.7301578521728516, "epoch": 0.3304695474651242, "grad_norm": 0.040403757244348526, "grad_norm_var": 4.693319767651247e-06, "learning_rate": 0.005390298056781257, "loss": 2.7302, "step": 3885 }, { "crossentropy": 2.7526824474334717, "epoch": 0.33055461041170464, "grad_norm": 0.035950493067502975, "grad_norm_var": 4.782853634227533e-06, "learning_rate": 0.005388300572252666, "loss": 2.7527, "step": 3886 }, { "crossentropy": 2.702422618865967, "epoch": 0.33063967335828515, "grad_norm": 0.03558620065450668, "grad_norm_var": 5.158044958694594e-06, "learning_rate": 0.005386303025374192, "loss": 2.7024, "step": 3887 }, { "crossentropy": 2.795219898223877, "epoch": 0.3307247363048656, "grad_norm": 0.038638584315776825, "grad_norm_var": 4.328094985324669e-06, "learning_rate": 0.005384305416466584, "loss": 2.7952, "step": 3888 }, { "crossentropy": 2.712336301803589, "epoch": 0.33080979925144605, "grad_norm": 0.037374671548604965, "grad_norm_var": 4.2952098287823514e-06, "learning_rate": 0.005382307745850598, "loss": 2.7123, "step": 3889 }, { "crossentropy": 2.707977294921875, "epoch": 0.33089486219802655, "grad_norm": 0.037255968898534775, "grad_norm_var": 4.103398111119742e-06, "learning_rate": 0.005380310013847005, "loss": 2.708, "step": 3890 }, { "crossentropy": 2.6644275188446045, "epoch": 0.330979925144607, "grad_norm": 0.0387394018471241, "grad_norm_var": 4.130644557898076e-06, "learning_rate": 0.005378312220776583, "loss": 2.6644, "step": 3891 }, { "crossentropy": 2.729771614074707, "epoch": 0.3310649880911875, "grad_norm": 0.03607601299881935, "grad_norm_var": 3.8784069016754736e-06, "learning_rate": 0.005376314366960118, "loss": 2.7298, "step": 3892 }, { "crossentropy": 2.808762311935425, "epoch": 0.33115005103776796, "grad_norm": 0.03750170022249222, "grad_norm_var": 3.3256278758513985e-06, "learning_rate": 0.005374316452718408, "loss": 2.8088, "step": 3893 }, { "crossentropy": 2.756021499633789, "epoch": 0.3312351139843484, "grad_norm": 0.036981262266635895, "grad_norm_var": 3.252332666820769e-06, "learning_rate": 0.005372318478372262, "loss": 2.756, "step": 3894 }, { "crossentropy": 2.7831339836120605, "epoch": 0.3313201769309289, "grad_norm": 0.037428118288517, "grad_norm_var": 3.2559651806965755e-06, "learning_rate": 0.005370320444242498, "loss": 2.7831, "step": 3895 }, { "crossentropy": 2.8226025104522705, "epoch": 0.33140523987750936, "grad_norm": 0.04036244750022888, "grad_norm_var": 3.554879928711202e-06, "learning_rate": 0.005368322350649941, "loss": 2.8226, "step": 3896 }, { "crossentropy": 2.600682497024536, "epoch": 0.3314903028240898, "grad_norm": 0.037511784583330154, "grad_norm_var": 2.5866198828233358e-06, "learning_rate": 0.005366324197915425, "loss": 2.6007, "step": 3897 }, { "crossentropy": 2.809734582901001, "epoch": 0.3315753657706703, "grad_norm": 0.03441258519887924, "grad_norm_var": 3.236395969759507e-06, "learning_rate": 0.005364325986359802, "loss": 2.8097, "step": 3898 }, { "crossentropy": 2.825747013092041, "epoch": 0.33166042871725077, "grad_norm": 0.03614877164363861, "grad_norm_var": 3.377172730290902e-06, "learning_rate": 0.005362327716303921, "loss": 2.8257, "step": 3899 }, { "crossentropy": 2.777461528778076, "epoch": 0.3317454916638312, "grad_norm": 0.036967046558856964, "grad_norm_var": 2.6065508946482645e-06, "learning_rate": 0.005360329388068649, "loss": 2.7775, "step": 3900 }, { "crossentropy": 2.7881338596343994, "epoch": 0.3318305546104117, "grad_norm": 0.03672845661640167, "grad_norm_var": 1.946327413117575e-06, "learning_rate": 0.0053583310019748595, "loss": 2.7881, "step": 3901 }, { "crossentropy": 2.757858991622925, "epoch": 0.33191561755699217, "grad_norm": 0.035115838050842285, "grad_norm_var": 2.118235253944535e-06, "learning_rate": 0.0053563325583434376, "loss": 2.7579, "step": 3902 }, { "crossentropy": 2.742084503173828, "epoch": 0.3320006805035726, "grad_norm": 0.03809449449181557, "grad_norm_var": 2.0213014664388165e-06, "learning_rate": 0.005354334057495274, "loss": 2.7421, "step": 3903 }, { "crossentropy": 2.79042911529541, "epoch": 0.3320857434501531, "grad_norm": 0.03804539889097214, "grad_norm_var": 1.930191579514971e-06, "learning_rate": 0.00535233549975127, "loss": 2.7904, "step": 3904 }, { "crossentropy": 2.761115312576294, "epoch": 0.3321708063967336, "grad_norm": 0.03932543843984604, "grad_norm_var": 2.220880830046343e-06, "learning_rate": 0.005350336885432337, "loss": 2.7611, "step": 3905 }, { "crossentropy": 2.7199087142944336, "epoch": 0.332255869343314, "grad_norm": 0.040218666195869446, "grad_norm_var": 2.7546849540906193e-06, "learning_rate": 0.005348338214859395, "loss": 2.7199, "step": 3906 }, { "crossentropy": 2.658752679824829, "epoch": 0.33234093228989453, "grad_norm": 0.03642231598496437, "grad_norm_var": 2.70071883118686e-06, "learning_rate": 0.005346339488353374, "loss": 2.6588, "step": 3907 }, { "crossentropy": 2.7852659225463867, "epoch": 0.332425995236475, "grad_norm": 0.03588034212589264, "grad_norm_var": 2.735925991630544e-06, "learning_rate": 0.005344340706235209, "loss": 2.7853, "step": 3908 }, { "crossentropy": 2.7211861610412598, "epoch": 0.3325110581830555, "grad_norm": 0.042053379118442535, "grad_norm_var": 4.140123794512246e-06, "learning_rate": 0.005342341868825849, "loss": 2.7212, "step": 3909 }, { "crossentropy": 2.744457244873047, "epoch": 0.33259612112963594, "grad_norm": 0.03723656013607979, "grad_norm_var": 4.1229307261608785e-06, "learning_rate": 0.005340342976446251, "loss": 2.7445, "step": 3910 }, { "crossentropy": 2.727489471435547, "epoch": 0.3326811840762164, "grad_norm": 0.03613593056797981, "grad_norm_var": 4.260690418759518e-06, "learning_rate": 0.005338344029417375, "loss": 2.7275, "step": 3911 }, { "crossentropy": 2.753528356552124, "epoch": 0.3327662470227969, "grad_norm": 0.038010772317647934, "grad_norm_var": 3.7217229172841854e-06, "learning_rate": 0.005336345028060198, "loss": 2.7535, "step": 3912 }, { "crossentropy": 2.706636428833008, "epoch": 0.33285130996937734, "grad_norm": 0.03888712450861931, "grad_norm_var": 3.861501256568108e-06, "learning_rate": 0.0053343459726957025, "loss": 2.7066, "step": 3913 }, { "crossentropy": 2.7671918869018555, "epoch": 0.3329363729159578, "grad_norm": 0.04329173266887665, "grad_norm_var": 5.157253623762579e-06, "learning_rate": 0.005332346863644876, "loss": 2.7672, "step": 3914 }, { "crossentropy": 2.68721866607666, "epoch": 0.3330214358625383, "grad_norm": 0.03450114652514458, "grad_norm_var": 5.741324513973816e-06, "learning_rate": 0.00533034770122872, "loss": 2.6872, "step": 3915 }, { "crossentropy": 2.7347874641418457, "epoch": 0.33310649880911875, "grad_norm": 0.03657937049865723, "grad_norm_var": 5.800604928436238e-06, "learning_rate": 0.005328348485768242, "loss": 2.7348, "step": 3916 }, { "crossentropy": 2.748453378677368, "epoch": 0.3331915617556992, "grad_norm": 0.035452015697956085, "grad_norm_var": 6.103174272505986e-06, "learning_rate": 0.0053263492175844585, "loss": 2.7485, "step": 3917 }, { "crossentropy": 2.6807758808135986, "epoch": 0.3332766247022797, "grad_norm": 0.034388814121484756, "grad_norm_var": 6.399132366450428e-06, "learning_rate": 0.005324349896998395, "loss": 2.6808, "step": 3918 }, { "crossentropy": 2.6959421634674072, "epoch": 0.33336168764886015, "grad_norm": 0.04164821654558182, "grad_norm_var": 7.336169653109475e-06, "learning_rate": 0.005322350524331082, "loss": 2.6959, "step": 3919 }, { "crossentropy": 2.6774027347564697, "epoch": 0.3334467505954406, "grad_norm": 0.04043728485703468, "grad_norm_var": 7.706678821927983e-06, "learning_rate": 0.0053203510999035655, "loss": 2.6774, "step": 3920 }, { "crossentropy": 2.704451322555542, "epoch": 0.3335318135420211, "grad_norm": 0.03552907332777977, "grad_norm_var": 8.014653677763882e-06, "learning_rate": 0.005318351624036891, "loss": 2.7045, "step": 3921 }, { "crossentropy": 2.763380289077759, "epoch": 0.33361687648860155, "grad_norm": 0.036952029913663864, "grad_norm_var": 7.679111827041165e-06, "learning_rate": 0.00531635209705212, "loss": 2.7634, "step": 3922 }, { "crossentropy": 2.755882740020752, "epoch": 0.33370193943518206, "grad_norm": 0.040497083216905594, "grad_norm_var": 8.015677450262427e-06, "learning_rate": 0.0053143525192703165, "loss": 2.7559, "step": 3923 }, { "crossentropy": 2.6917178630828857, "epoch": 0.3337870023817625, "grad_norm": 0.03620888292789459, "grad_norm_var": 7.930992373089464e-06, "learning_rate": 0.005312352891012558, "loss": 2.6917, "step": 3924 }, { "crossentropy": 2.7927303314208984, "epoch": 0.33387206532834296, "grad_norm": 0.038455575704574585, "grad_norm_var": 6.789855377313988e-06, "learning_rate": 0.005310353212599924, "loss": 2.7927, "step": 3925 }, { "crossentropy": 2.7735066413879395, "epoch": 0.33395712827492346, "grad_norm": 0.03704769164323807, "grad_norm_var": 6.8053475752199425e-06, "learning_rate": 0.005308353484353508, "loss": 2.7735, "step": 3926 }, { "crossentropy": 2.775399923324585, "epoch": 0.3340421912215039, "grad_norm": 0.03345241770148277, "grad_norm_var": 7.833450606990697e-06, "learning_rate": 0.005306353706594407, "loss": 2.7754, "step": 3927 }, { "crossentropy": 2.725277900695801, "epoch": 0.33412725416808436, "grad_norm": 0.035348210483789444, "grad_norm_var": 8.124914347100384e-06, "learning_rate": 0.005304353879643726, "loss": 2.7253, "step": 3928 }, { "crossentropy": 2.753784418106079, "epoch": 0.33421231711466487, "grad_norm": 0.037791576236486435, "grad_norm_var": 7.985225462653286e-06, "learning_rate": 0.005302354003822583, "loss": 2.7538, "step": 3929 }, { "crossentropy": 2.7433254718780518, "epoch": 0.3342973800612453, "grad_norm": 0.03705449029803276, "grad_norm_var": 5.474356867542681e-06, "learning_rate": 0.0053003540794521, "loss": 2.7433, "step": 3930 }, { "crossentropy": 2.6420090198516846, "epoch": 0.33438244300782577, "grad_norm": 0.03504582867026329, "grad_norm_var": 5.314399974795658e-06, "learning_rate": 0.005298354106853405, "loss": 2.642, "step": 3931 }, { "crossentropy": 2.594432830810547, "epoch": 0.3344675059544063, "grad_norm": 0.036788780242204666, "grad_norm_var": 5.305590698834126e-06, "learning_rate": 0.00529635408634764, "loss": 2.5944, "step": 3932 }, { "crossentropy": 2.772756814956665, "epoch": 0.3345525689009867, "grad_norm": 0.03764185681939125, "grad_norm_var": 5.15153698163647e-06, "learning_rate": 0.005294354018255945, "loss": 2.7728, "step": 3933 }, { "crossentropy": 2.7175564765930176, "epoch": 0.3346376318475672, "grad_norm": 0.03684260696172714, "grad_norm_var": 4.626765972571128e-06, "learning_rate": 0.005292353902899478, "loss": 2.7176, "step": 3934 }, { "crossentropy": 2.7163584232330322, "epoch": 0.3347226947941477, "grad_norm": 0.03769154101610184, "grad_norm_var": 3.3093647176226626e-06, "learning_rate": 0.005290353740599397, "loss": 2.7164, "step": 3935 }, { "crossentropy": 2.6658852100372314, "epoch": 0.33480775774072813, "grad_norm": 0.03698708862066269, "grad_norm_var": 2.4946826483775164e-06, "learning_rate": 0.005288353531676873, "loss": 2.6659, "step": 3936 }, { "crossentropy": 2.773982286453247, "epoch": 0.33489282068730863, "grad_norm": 0.03976479545235634, "grad_norm_var": 2.8793693572642942e-06, "learning_rate": 0.00528635327645308, "loss": 2.774, "step": 3937 }, { "crossentropy": 2.7577781677246094, "epoch": 0.3349778836338891, "grad_norm": 0.0395868644118309, "grad_norm_var": 3.261931549143095e-06, "learning_rate": 0.005284352975249202, "loss": 2.7578, "step": 3938 }, { "crossentropy": 2.689645290374756, "epoch": 0.33506294658046953, "grad_norm": 0.03769826889038086, "grad_norm_var": 2.5445736621385514e-06, "learning_rate": 0.005282352628386428, "loss": 2.6896, "step": 3939 }, { "crossentropy": 2.788466691970825, "epoch": 0.33514800952705004, "grad_norm": 0.03846035525202751, "grad_norm_var": 2.5975150299718694e-06, "learning_rate": 0.005280352236185959, "loss": 2.7885, "step": 3940 }, { "crossentropy": 2.730968713760376, "epoch": 0.3352330724736305, "grad_norm": 0.03718995302915573, "grad_norm_var": 2.490579481974747e-06, "learning_rate": 0.005278351798968999, "loss": 2.731, "step": 3941 }, { "crossentropy": 2.7114596366882324, "epoch": 0.33531813542021094, "grad_norm": 0.0397467315196991, "grad_norm_var": 2.909235196308218e-06, "learning_rate": 0.005276351317056761, "loss": 2.7115, "step": 3942 }, { "crossentropy": 2.7617568969726562, "epoch": 0.33540319836679144, "grad_norm": 0.038268718868494034, "grad_norm_var": 1.8765229516964331e-06, "learning_rate": 0.0052743507907704636, "loss": 2.7618, "step": 3943 }, { "crossentropy": 2.6991324424743652, "epoch": 0.3354882613133719, "grad_norm": 0.04222501069307327, "grad_norm_var": 2.7498595316995573e-06, "learning_rate": 0.005272350220431334, "loss": 2.6991, "step": 3944 }, { "crossentropy": 2.646151304244995, "epoch": 0.33557332425995234, "grad_norm": 0.03796391934156418, "grad_norm_var": 2.745799883710429e-06, "learning_rate": 0.005270349606360609, "loss": 2.6462, "step": 3945 }, { "crossentropy": 2.7216877937316895, "epoch": 0.33565838720653285, "grad_norm": 0.03495708853006363, "grad_norm_var": 3.3018818921997745e-06, "learning_rate": 0.005268348948879525, "loss": 2.7217, "step": 3946 }, { "crossentropy": 2.7065465450286865, "epoch": 0.3357434501531133, "grad_norm": 0.03661840781569481, "grad_norm_var": 2.8519695409750616e-06, "learning_rate": 0.005266348248309332, "loss": 2.7065, "step": 3947 }, { "crossentropy": 2.7739028930664062, "epoch": 0.33582851309969375, "grad_norm": 0.039038799703121185, "grad_norm_var": 2.7969123591680665e-06, "learning_rate": 0.005264347504971286, "loss": 2.7739, "step": 3948 }, { "crossentropy": 2.7367262840270996, "epoch": 0.33591357604627425, "grad_norm": 0.04051942750811577, "grad_norm_var": 3.112713329143403e-06, "learning_rate": 0.005262346719186648, "loss": 2.7367, "step": 3949 }, { "crossentropy": 2.7232139110565186, "epoch": 0.3359986389928547, "grad_norm": 0.034263137727975845, "grad_norm_var": 4.046134800847479e-06, "learning_rate": 0.005260345891276684, "loss": 2.7232, "step": 3950 }, { "crossentropy": 2.745042324066162, "epoch": 0.3360837019394352, "grad_norm": 0.036799706518650055, "grad_norm_var": 4.154672626241474e-06, "learning_rate": 0.005258345021562674, "loss": 2.745, "step": 3951 }, { "crossentropy": 2.7176589965820312, "epoch": 0.33616876488601566, "grad_norm": 0.03553151339292526, "grad_norm_var": 4.509004154959473e-06, "learning_rate": 0.005256344110365896, "loss": 2.7177, "step": 3952 }, { "crossentropy": 2.7426981925964355, "epoch": 0.3362538278325961, "grad_norm": 0.03544677793979645, "grad_norm_var": 4.681044867076561e-06, "learning_rate": 0.005254343158007642, "loss": 2.7427, "step": 3953 }, { "crossentropy": 2.867441415786743, "epoch": 0.3363388907791766, "grad_norm": 0.03546552732586861, "grad_norm_var": 4.744062813962178e-06, "learning_rate": 0.005252342164809204, "loss": 2.8674, "step": 3954 }, { "crossentropy": 2.733011245727539, "epoch": 0.33642395372575706, "grad_norm": 0.03500880300998688, "grad_norm_var": 5.129374430660333e-06, "learning_rate": 0.005250341131091886, "loss": 2.733, "step": 3955 }, { "crossentropy": 2.7426915168762207, "epoch": 0.3365090166723375, "grad_norm": 0.04091009870171547, "grad_norm_var": 5.869092465373334e-06, "learning_rate": 0.005248340057176996, "loss": 2.7427, "step": 3956 }, { "crossentropy": 2.8393657207489014, "epoch": 0.336594079618918, "grad_norm": 0.036949217319488525, "grad_norm_var": 5.8825734464101425e-06, "learning_rate": 0.005246338943385849, "loss": 2.8394, "step": 3957 }, { "crossentropy": 2.7329585552215576, "epoch": 0.33667914256549847, "grad_norm": 0.03813765197992325, "grad_norm_var": 5.5585219657992384e-06, "learning_rate": 0.005244337790039764, "loss": 2.733, "step": 3958 }, { "crossentropy": 2.673414468765259, "epoch": 0.3367642055120789, "grad_norm": 0.03762492537498474, "grad_norm_var": 5.508267207931551e-06, "learning_rate": 0.005242336597460071, "loss": 2.6734, "step": 3959 }, { "crossentropy": 2.7424209117889404, "epoch": 0.3368492684586594, "grad_norm": 0.03729832172393799, "grad_norm_var": 3.817181591334908e-06, "learning_rate": 0.005240335365968104, "loss": 2.7424, "step": 3960 }, { "crossentropy": 2.7340519428253174, "epoch": 0.3369343314052399, "grad_norm": 0.03806261718273163, "grad_norm_var": 3.830036672077391e-06, "learning_rate": 0.005238334095885203, "loss": 2.7341, "step": 3961 }, { "crossentropy": 2.7643165588378906, "epoch": 0.3370193943518203, "grad_norm": 0.036181699484586716, "grad_norm_var": 3.583746763049141e-06, "learning_rate": 0.005236332787532712, "loss": 2.7643, "step": 3962 }, { "crossentropy": 2.7176613807678223, "epoch": 0.33710445729840083, "grad_norm": 0.03637807071208954, "grad_norm_var": 3.6033034657031424e-06, "learning_rate": 0.005234331441231985, "loss": 2.7177, "step": 3963 }, { "crossentropy": 2.6999411582946777, "epoch": 0.3371895202449813, "grad_norm": 0.039575159549713135, "grad_norm_var": 3.7598633224573846e-06, "learning_rate": 0.005232330057304383, "loss": 2.6999, "step": 3964 }, { "crossentropy": 2.7166757583618164, "epoch": 0.3372745831915618, "grad_norm": 0.041562389582395554, "grad_norm_var": 4.298556712400923e-06, "learning_rate": 0.005230328636071265, "loss": 2.7167, "step": 3965 }, { "crossentropy": 2.810485363006592, "epoch": 0.33735964613814223, "grad_norm": 0.04028260335326195, "grad_norm_var": 4.206287248814146e-06, "learning_rate": 0.005228327177854006, "loss": 2.8105, "step": 3966 }, { "crossentropy": 2.5751430988311768, "epoch": 0.3374447090847227, "grad_norm": 0.03877950459718704, "grad_norm_var": 4.246356823214316e-06, "learning_rate": 0.005226325682973983, "loss": 2.5751, "step": 3967 }, { "crossentropy": 2.7553727626800537, "epoch": 0.3375297720313032, "grad_norm": 0.03693912550806999, "grad_norm_var": 3.9632675392788084e-06, "learning_rate": 0.005224324151752576, "loss": 2.7554, "step": 3968 }, { "crossentropy": 2.747267723083496, "epoch": 0.33761483497788364, "grad_norm": 0.03609469532966614, "grad_norm_var": 3.7872787822465967e-06, "learning_rate": 0.005222322584511172, "loss": 2.7473, "step": 3969 }, { "crossentropy": 2.811997175216675, "epoch": 0.3376998979244641, "grad_norm": 0.039670128375291824, "grad_norm_var": 3.5676771768002047e-06, "learning_rate": 0.005220320981571169, "loss": 2.812, "step": 3970 }, { "crossentropy": 2.7930185794830322, "epoch": 0.3377849608710446, "grad_norm": 0.03578975424170494, "grad_norm_var": 3.284862004909681e-06, "learning_rate": 0.005218319343253964, "loss": 2.793, "step": 3971 }, { "crossentropy": 2.746995210647583, "epoch": 0.33787002381762504, "grad_norm": 0.03629619628190994, "grad_norm_var": 2.911084077011674e-06, "learning_rate": 0.005216317669880965, "loss": 2.747, "step": 3972 }, { "crossentropy": 2.813868999481201, "epoch": 0.3379550867642055, "grad_norm": 0.036047715693712234, "grad_norm_var": 3.0703181548352144e-06, "learning_rate": 0.005214315961773578, "loss": 2.8139, "step": 3973 }, { "crossentropy": 2.765022039413452, "epoch": 0.338040149710786, "grad_norm": 0.03542141988873482, "grad_norm_var": 3.4073542984252675e-06, "learning_rate": 0.005212314219253225, "loss": 2.765, "step": 3974 }, { "crossentropy": 2.703111171722412, "epoch": 0.33812521265736645, "grad_norm": 0.03475644066929817, "grad_norm_var": 3.921749051434958e-06, "learning_rate": 0.005210312442641326, "loss": 2.7031, "step": 3975 }, { "crossentropy": 2.790350914001465, "epoch": 0.3382102756039469, "grad_norm": 0.03602755814790726, "grad_norm_var": 4.047696774136094e-06, "learning_rate": 0.005208310632259307, "loss": 2.7904, "step": 3976 }, { "crossentropy": 2.6924822330474854, "epoch": 0.3382953385505274, "grad_norm": 0.03654653578996658, "grad_norm_var": 4.0506507844456065e-06, "learning_rate": 0.005206308788428604, "loss": 2.6925, "step": 3977 }, { "crossentropy": 2.756837844848633, "epoch": 0.33838040149710785, "grad_norm": 0.036441512405872345, "grad_norm_var": 4.01710631968306e-06, "learning_rate": 0.0052043069114706565, "loss": 2.7568, "step": 3978 }, { "crossentropy": 2.749642848968506, "epoch": 0.3384654644436883, "grad_norm": 0.03696556016802788, "grad_norm_var": 3.967397329838045e-06, "learning_rate": 0.005202305001706903, "loss": 2.7496, "step": 3979 }, { "crossentropy": 2.7725918292999268, "epoch": 0.3385505273902688, "grad_norm": 0.037459880113601685, "grad_norm_var": 3.6123537148717463e-06, "learning_rate": 0.005200303059458796, "loss": 2.7726, "step": 3980 }, { "crossentropy": 2.7829031944274902, "epoch": 0.33863559033684926, "grad_norm": 0.036646321415901184, "grad_norm_var": 2.25852172044382e-06, "learning_rate": 0.005198301085047791, "loss": 2.7829, "step": 3981 }, { "crossentropy": 2.7541701793670654, "epoch": 0.33872065328342976, "grad_norm": 0.03958171606063843, "grad_norm_var": 1.971741728829836e-06, "learning_rate": 0.005196299078795343, "loss": 2.7542, "step": 3982 }, { "crossentropy": 2.7523539066314697, "epoch": 0.3388057162300102, "grad_norm": 0.03750069439411163, "grad_norm_var": 1.7435067988244915e-06, "learning_rate": 0.005194297041022922, "loss": 2.7524, "step": 3983 }, { "crossentropy": 2.7458908557891846, "epoch": 0.33889077917659066, "grad_norm": 0.03577538579702377, "grad_norm_var": 1.8006007824690937e-06, "learning_rate": 0.005192294972051992, "loss": 2.7459, "step": 3984 }, { "crossentropy": 2.783661365509033, "epoch": 0.33897584212317117, "grad_norm": 0.03538407385349274, "grad_norm_var": 1.8884575837744646e-06, "learning_rate": 0.005190292872204032, "loss": 2.7837, "step": 3985 }, { "crossentropy": 2.7735819816589355, "epoch": 0.3390609050697516, "grad_norm": 0.037905383855104446, "grad_norm_var": 1.3711583636666358e-06, "learning_rate": 0.00518829074180052, "loss": 2.7736, "step": 3986 }, { "crossentropy": 2.661168098449707, "epoch": 0.33914596801633207, "grad_norm": 0.036127254366874695, "grad_norm_var": 1.3447803945904405e-06, "learning_rate": 0.005186288581162939, "loss": 2.6612, "step": 3987 }, { "crossentropy": 2.756962537765503, "epoch": 0.33923103096291257, "grad_norm": 0.038946401327848434, "grad_norm_var": 1.6922229265819525e-06, "learning_rate": 0.005184286390612781, "loss": 2.757, "step": 3988 }, { "crossentropy": 2.7679779529571533, "epoch": 0.339316093909493, "grad_norm": 0.03966990485787392, "grad_norm_var": 2.1871351854568326e-06, "learning_rate": 0.0051822841704715385, "loss": 2.768, "step": 3989 }, { "crossentropy": 2.836928367614746, "epoch": 0.33940115685607347, "grad_norm": 0.03508712723851204, "grad_norm_var": 2.2621296179486478e-06, "learning_rate": 0.00518028192106071, "loss": 2.8369, "step": 3990 }, { "crossentropy": 2.6464245319366455, "epoch": 0.339486219802654, "grad_norm": 0.03475824370980263, "grad_norm_var": 2.2616081609506954e-06, "learning_rate": 0.0051782796427018, "loss": 2.6464, "step": 3991 }, { "crossentropy": 2.7413413524627686, "epoch": 0.3395712827492344, "grad_norm": 0.04196319729089737, "grad_norm_var": 3.7521792393115095e-06, "learning_rate": 0.0051762773357163176, "loss": 2.7413, "step": 3992 }, { "crossentropy": 2.738699197769165, "epoch": 0.3396563456958149, "grad_norm": 0.03968821465969086, "grad_norm_var": 4.054512723500325e-06, "learning_rate": 0.005174275000425772, "loss": 2.7387, "step": 3993 }, { "crossentropy": 2.7901015281677246, "epoch": 0.3397414086423954, "grad_norm": 0.03503607586026192, "grad_norm_var": 4.375156595989339e-06, "learning_rate": 0.005172272637151685, "loss": 2.7901, "step": 3994 }, { "crossentropy": 2.6274542808532715, "epoch": 0.33982647158897583, "grad_norm": 0.06006244197487831, "grad_norm_var": 3.636051560901108e-05, "learning_rate": 0.005170270246215575, "loss": 2.6275, "step": 3995 }, { "crossentropy": 2.6673295497894287, "epoch": 0.33991153453555634, "grad_norm": 0.03852192312479019, "grad_norm_var": 3.623423062013523e-05, "learning_rate": 0.005168267827938971, "loss": 2.6673, "step": 3996 }, { "crossentropy": 2.7067856788635254, "epoch": 0.3399965974821368, "grad_norm": 0.034140828996896744, "grad_norm_var": 3.7384761305133314e-05, "learning_rate": 0.005166265382643401, "loss": 2.7068, "step": 3997 }, { "crossentropy": 2.6592204570770264, "epoch": 0.34008166042871724, "grad_norm": 0.039082642644643784, "grad_norm_var": 3.734560259510868e-05, "learning_rate": 0.005164262910650402, "loss": 2.6592, "step": 3998 }, { "crossentropy": 2.7474799156188965, "epoch": 0.34016672337529774, "grad_norm": 0.03679470717906952, "grad_norm_var": 3.749229254663786e-05, "learning_rate": 0.005162260412281512, "loss": 2.7475, "step": 3999 }, { "crossentropy": 2.767075300216675, "epoch": 0.3402517863218782, "grad_norm": 0.04105833172798157, "grad_norm_var": 3.7187839035985636e-05, "learning_rate": 0.005160257887858277, "loss": 2.7671, "step": 4000 }, { "crossentropy": 2.732020854949951, "epoch": 0.34033684926845864, "grad_norm": 0.03779440373182297, "grad_norm_var": 3.6384313452030895e-05, "learning_rate": 0.005158255337702241, "loss": 2.732, "step": 4001 }, { "crossentropy": 2.775350332260132, "epoch": 0.34042191221503915, "grad_norm": 0.041269365698099136, "grad_norm_var": 3.65266918237803e-05, "learning_rate": 0.0051562527621349585, "loss": 2.7754, "step": 4002 }, { "crossentropy": 2.7513253688812256, "epoch": 0.3405069751616196, "grad_norm": 0.039583392441272736, "grad_norm_var": 3.577659583897611e-05, "learning_rate": 0.0051542501614779855, "loss": 2.7513, "step": 4003 }, { "crossentropy": 2.704155683517456, "epoch": 0.34059203810820005, "grad_norm": 0.03687015175819397, "grad_norm_var": 3.622448877341547e-05, "learning_rate": 0.00515224753605288, "loss": 2.7042, "step": 4004 }, { "crossentropy": 2.7289302349090576, "epoch": 0.34067710105478055, "grad_norm": 0.038264792412519455, "grad_norm_var": 3.630880510427833e-05, "learning_rate": 0.005150244886181208, "loss": 2.7289, "step": 4005 }, { "crossentropy": 2.6990714073181152, "epoch": 0.340762164001361, "grad_norm": 0.036692406982183456, "grad_norm_var": 3.555242128056001e-05, "learning_rate": 0.0051482422121845355, "loss": 2.6991, "step": 4006 }, { "crossentropy": 2.688599109649658, "epoch": 0.34084722694794145, "grad_norm": 0.03826429694890976, "grad_norm_var": 3.411628865360305e-05, "learning_rate": 0.0051462395143844365, "loss": 2.6886, "step": 4007 }, { "crossentropy": 2.8243954181671143, "epoch": 0.34093228989452196, "grad_norm": 0.038601312786340714, "grad_norm_var": 3.3805038397987974e-05, "learning_rate": 0.005144236793102485, "loss": 2.8244, "step": 4008 }, { "crossentropy": 2.7466654777526855, "epoch": 0.3410173528411024, "grad_norm": 0.03731200844049454, "grad_norm_var": 3.4092864317103455e-05, "learning_rate": 0.005142234048660259, "loss": 2.7467, "step": 4009 }, { "crossentropy": 2.734057903289795, "epoch": 0.3411024157876829, "grad_norm": 0.03461930900812149, "grad_norm_var": 3.434256885796594e-05, "learning_rate": 0.005140231281379345, "loss": 2.7341, "step": 4010 }, { "crossentropy": 2.6864733695983887, "epoch": 0.34118747873426336, "grad_norm": 0.0393441766500473, "grad_norm_var": 3.838416645553696e-06, "learning_rate": 0.0051382284915813285, "loss": 2.6865, "step": 4011 }, { "crossentropy": 2.722444534301758, "epoch": 0.3412725416808438, "grad_norm": 0.03444952145218849, "grad_norm_var": 4.59881185732678e-06, "learning_rate": 0.005136225679587797, "loss": 2.7224, "step": 4012 }, { "crossentropy": 2.7308189868927, "epoch": 0.3413576046274243, "grad_norm": 0.0436960831284523, "grad_norm_var": 5.695756710027977e-06, "learning_rate": 0.005134222845720348, "loss": 2.7308, "step": 4013 }, { "crossentropy": 2.7197487354278564, "epoch": 0.34144266757400477, "grad_norm": 0.04183270037174225, "grad_norm_var": 6.434853339262226e-06, "learning_rate": 0.005132219990300577, "loss": 2.7197, "step": 4014 }, { "crossentropy": 2.795423746109009, "epoch": 0.3415277305205852, "grad_norm": 0.0368170440196991, "grad_norm_var": 6.429722544716981e-06, "learning_rate": 0.005130217113650085, "loss": 2.7954, "step": 4015 }, { "crossentropy": 2.639572858810425, "epoch": 0.3416127934671657, "grad_norm": 0.03622334823012352, "grad_norm_var": 6.260432291747368e-06, "learning_rate": 0.005128214216090478, "loss": 2.6396, "step": 4016 }, { "crossentropy": 2.724545955657959, "epoch": 0.34169785641374617, "grad_norm": 0.03475598618388176, "grad_norm_var": 7.01274412113802e-06, "learning_rate": 0.005126211297943362, "loss": 2.7245, "step": 4017 }, { "crossentropy": 2.7506072521209717, "epoch": 0.3417829193603266, "grad_norm": 0.03782219812273979, "grad_norm_var": 6.269873825356873e-06, "learning_rate": 0.005124208359530347, "loss": 2.7506, "step": 4018 }, { "crossentropy": 2.8022992610931396, "epoch": 0.3418679823069071, "grad_norm": 0.035929519683122635, "grad_norm_var": 6.2460778439075064e-06, "learning_rate": 0.005122205401173049, "loss": 2.8023, "step": 4019 }, { "crossentropy": 2.7495055198669434, "epoch": 0.3419530452534876, "grad_norm": 0.039034824818372726, "grad_norm_var": 6.330186574702151e-06, "learning_rate": 0.005120202423193085, "loss": 2.7495, "step": 4020 }, { "crossentropy": 2.6913013458251953, "epoch": 0.342038108200068, "grad_norm": 0.03635513037443161, "grad_norm_var": 6.421616672347419e-06, "learning_rate": 0.005118199425912076, "loss": 2.6913, "step": 4021 }, { "crossentropy": 2.8013851642608643, "epoch": 0.34212317114664853, "grad_norm": 0.033029139041900635, "grad_norm_var": 7.708213307425157e-06, "learning_rate": 0.005116196409651644, "loss": 2.8014, "step": 4022 }, { "crossentropy": 2.621983528137207, "epoch": 0.342208234093229, "grad_norm": 0.034792374819517136, "grad_norm_var": 8.052433093500706e-06, "learning_rate": 0.0051141933747334166, "loss": 2.622, "step": 4023 }, { "crossentropy": 2.840447187423706, "epoch": 0.3422932970398095, "grad_norm": 0.037426359951496124, "grad_norm_var": 7.913453981615338e-06, "learning_rate": 0.005112190321479026, "loss": 2.8404, "step": 4024 }, { "crossentropy": 2.7163474559783936, "epoch": 0.34237835998638994, "grad_norm": 0.03500517085194588, "grad_norm_var": 8.177757421597386e-06, "learning_rate": 0.0051101872502101, "loss": 2.7163, "step": 4025 }, { "crossentropy": 2.6960341930389404, "epoch": 0.3424634229329704, "grad_norm": 0.03571163862943649, "grad_norm_var": 7.913491295865004e-06, "learning_rate": 0.005108184161248277, "loss": 2.696, "step": 4026 }, { "crossentropy": 2.823711633682251, "epoch": 0.3425484858795509, "grad_norm": 0.03782778978347778, "grad_norm_var": 7.586094417457121e-06, "learning_rate": 0.005106181054915195, "loss": 2.8237, "step": 4027 }, { "crossentropy": 2.7746083736419678, "epoch": 0.34263354882613134, "grad_norm": 0.034818124026060104, "grad_norm_var": 7.473203837433477e-06, "learning_rate": 0.005104177931532497, "loss": 2.7746, "step": 4028 }, { "crossentropy": 2.669916868209839, "epoch": 0.3427186117727118, "grad_norm": 0.0390426442027092, "grad_norm_var": 4.636191904220747e-06, "learning_rate": 0.005102174791421822, "loss": 2.6699, "step": 4029 }, { "crossentropy": 2.810279369354248, "epoch": 0.3428036747192923, "grad_norm": 0.03572044521570206, "grad_norm_var": 2.748661338371437e-06, "learning_rate": 0.005100171634904821, "loss": 2.8103, "step": 4030 }, { "crossentropy": 2.730947971343994, "epoch": 0.34288873766587274, "grad_norm": 0.036772456020116806, "grad_norm_var": 2.7455303109412296e-06, "learning_rate": 0.005098168462303141, "loss": 2.7309, "step": 4031 }, { "crossentropy": 2.7321557998657227, "epoch": 0.3429738006124532, "grad_norm": 0.03998149558901787, "grad_norm_var": 3.606538372846688e-06, "learning_rate": 0.005096165273938436, "loss": 2.7322, "step": 4032 }, { "crossentropy": 2.7377822399139404, "epoch": 0.3430588635590337, "grad_norm": 0.06089309602975845, "grad_norm_var": 4.022001290275486e-05, "learning_rate": 0.005094162070132358, "loss": 2.7378, "step": 4033 }, { "crossentropy": 2.6524012088775635, "epoch": 0.34314392650561415, "grad_norm": 0.0367935374379158, "grad_norm_var": 4.032906972839528e-05, "learning_rate": 0.005092158851206566, "loss": 2.6524, "step": 4034 }, { "crossentropy": 2.733532428741455, "epoch": 0.3432289894521946, "grad_norm": 0.037122733891010284, "grad_norm_var": 4.0077377846663614e-05, "learning_rate": 0.005090155617482717, "loss": 2.7335, "step": 4035 }, { "crossentropy": 2.8132123947143555, "epoch": 0.3433140523987751, "grad_norm": 0.03517577424645424, "grad_norm_var": 4.055051812094622e-05, "learning_rate": 0.005088152369282474, "loss": 2.8132, "step": 4036 }, { "crossentropy": 2.685800790786743, "epoch": 0.34339911534535555, "grad_norm": 0.03759276494383812, "grad_norm_var": 4.039062022255646e-05, "learning_rate": 0.005086149106927499, "loss": 2.6858, "step": 4037 }, { "crossentropy": 2.6464552879333496, "epoch": 0.34348417829193606, "grad_norm": 0.036561835557222366, "grad_norm_var": 3.883787951709516e-05, "learning_rate": 0.005084145830739462, "loss": 2.6465, "step": 4038 }, { "crossentropy": 2.70041823387146, "epoch": 0.3435692412385165, "grad_norm": 0.03657183796167374, "grad_norm_var": 3.822671888945948e-05, "learning_rate": 0.005082142541040029, "loss": 2.7004, "step": 4039 }, { "crossentropy": 2.738342523574829, "epoch": 0.34365430418509696, "grad_norm": 0.034738823771476746, "grad_norm_var": 3.8996081360251284e-05, "learning_rate": 0.00508013923815087, "loss": 2.7383, "step": 4040 }, { "crossentropy": 2.742953062057495, "epoch": 0.34373936713167746, "grad_norm": 0.04722665250301361, "grad_norm_var": 4.321388556045395e-05, "learning_rate": 0.005078135922393657, "loss": 2.743, "step": 4041 }, { "crossentropy": 2.862619638442993, "epoch": 0.3438244300782579, "grad_norm": 0.039335813373327255, "grad_norm_var": 4.2489530175805235e-05, "learning_rate": 0.0050761325940900685, "loss": 2.8626, "step": 4042 }, { "crossentropy": 2.7923738956451416, "epoch": 0.34390949302483836, "grad_norm": 0.038089435547590256, "grad_norm_var": 4.2448170860068245e-05, "learning_rate": 0.005074129253561778, "loss": 2.7924, "step": 4043 }, { "crossentropy": 2.6876580715179443, "epoch": 0.34399455597141887, "grad_norm": 0.03524580970406532, "grad_norm_var": 4.22124453289742e-05, "learning_rate": 0.0050721259011304665, "loss": 2.6877, "step": 4044 }, { "crossentropy": 2.7398715019226074, "epoch": 0.3440796189179993, "grad_norm": 0.03706347569823265, "grad_norm_var": 4.2493266456202806e-05, "learning_rate": 0.005070122537117812, "loss": 2.7399, "step": 4045 }, { "crossentropy": 2.7120728492736816, "epoch": 0.34416468186457977, "grad_norm": 0.03558635711669922, "grad_norm_var": 4.2554013421045904e-05, "learning_rate": 0.0050681191618455, "loss": 2.7121, "step": 4046 }, { "crossentropy": 2.7041566371917725, "epoch": 0.3442497448111603, "grad_norm": 0.03797151520848274, "grad_norm_var": 4.228023164546211e-05, "learning_rate": 0.005066115775635213, "loss": 2.7042, "step": 4047 }, { "crossentropy": 2.8331215381622314, "epoch": 0.3443348077577407, "grad_norm": 0.03986981511116028, "grad_norm_var": 4.226821169223405e-05, "learning_rate": 0.005064112378808636, "loss": 2.8331, "step": 4048 }, { "crossentropy": 2.7044289112091064, "epoch": 0.3444198707043212, "grad_norm": 0.035190727561712265, "grad_norm_var": 8.92313796710207e-06, "learning_rate": 0.005062108971687461, "loss": 2.7044, "step": 4049 }, { "crossentropy": 2.7923033237457275, "epoch": 0.3445049336509017, "grad_norm": 0.03357474505901337, "grad_norm_var": 9.877543529583585e-06, "learning_rate": 0.0050601055545933735, "loss": 2.7923, "step": 4050 }, { "crossentropy": 2.7046048641204834, "epoch": 0.34458999659748213, "grad_norm": 0.034389130771160126, "grad_norm_var": 1.0411880894919899e-05, "learning_rate": 0.0050581021278480656, "loss": 2.7046, "step": 4051 }, { "crossentropy": 2.744168758392334, "epoch": 0.3446750595440626, "grad_norm": 0.06280050426721573, "grad_norm_var": 5.08851833880238e-05, "learning_rate": 0.00505609869177323, "loss": 2.7442, "step": 4052 }, { "crossentropy": 2.784730911254883, "epoch": 0.3447601224906431, "grad_norm": 0.0388222374022007, "grad_norm_var": 5.0771416596469506e-05, "learning_rate": 0.005054095246690562, "loss": 2.7847, "step": 4053 }, { "crossentropy": 2.686424970626831, "epoch": 0.34484518543722353, "grad_norm": 0.039505235850811005, "grad_norm_var": 5.037960475042832e-05, "learning_rate": 0.005052091792921755, "loss": 2.6864, "step": 4054 }, { "crossentropy": 2.788771390914917, "epoch": 0.34493024838380404, "grad_norm": 0.0358915701508522, "grad_norm_var": 5.0640004001462474e-05, "learning_rate": 0.005050088330788507, "loss": 2.7888, "step": 4055 }, { "crossentropy": 2.7435083389282227, "epoch": 0.3450153113303845, "grad_norm": 0.034115035086870193, "grad_norm_var": 5.102550064637982e-05, "learning_rate": 0.005048084860612517, "loss": 2.7435, "step": 4056 }, { "crossentropy": 2.681248188018799, "epoch": 0.34510037427696494, "grad_norm": 0.03930845484137535, "grad_norm_var": 4.630349025586487e-05, "learning_rate": 0.005046081382715484, "loss": 2.6812, "step": 4057 }, { "crossentropy": 2.7950761318206787, "epoch": 0.34518543722354544, "grad_norm": 0.03738968074321747, "grad_norm_var": 4.6335647559550554e-05, "learning_rate": 0.005044077897419108, "loss": 2.7951, "step": 4058 }, { "crossentropy": 2.73899245262146, "epoch": 0.3452705001701259, "grad_norm": 0.03741500899195671, "grad_norm_var": 4.639432806826197e-05, "learning_rate": 0.005042074405045092, "loss": 2.739, "step": 4059 }, { "crossentropy": 2.7247040271759033, "epoch": 0.34535556311670634, "grad_norm": 0.03725561127066612, "grad_norm_var": 4.58059111521097e-05, "learning_rate": 0.005040070905915139, "loss": 2.7247, "step": 4060 }, { "crossentropy": 2.801387310028076, "epoch": 0.34544062606328685, "grad_norm": 0.035998906940221786, "grad_norm_var": 4.6081969441587554e-05, "learning_rate": 0.005038067400350953, "loss": 2.8014, "step": 4061 }, { "crossentropy": 2.6992616653442383, "epoch": 0.3455256890098673, "grad_norm": 0.03693131357431412, "grad_norm_var": 4.568279029813216e-05, "learning_rate": 0.00503606388867424, "loss": 2.6993, "step": 4062 }, { "crossentropy": 2.774979591369629, "epoch": 0.34561075195644775, "grad_norm": 0.03868725895881653, "grad_norm_var": 4.566181201958632e-05, "learning_rate": 0.005034060371206704, "loss": 2.775, "step": 4063 }, { "crossentropy": 2.781809091567993, "epoch": 0.34569581490302825, "grad_norm": 0.044159453362226486, "grad_norm_var": 4.755440370753904e-05, "learning_rate": 0.005032056848270056, "loss": 2.7818, "step": 4064 }, { "crossentropy": 2.726499319076538, "epoch": 0.3457808778496087, "grad_norm": 0.037850189954042435, "grad_norm_var": 4.670254984810255e-05, "learning_rate": 0.005030053320186, "loss": 2.7265, "step": 4065 }, { "crossentropy": 2.7588436603546143, "epoch": 0.34586594079618915, "grad_norm": 0.03446680307388306, "grad_norm_var": 4.610629841094819e-05, "learning_rate": 0.005028049787276249, "loss": 2.7588, "step": 4066 }, { "crossentropy": 2.717195749282837, "epoch": 0.34595100374276966, "grad_norm": 0.03433260694146156, "grad_norm_var": 4.6141712583484296e-05, "learning_rate": 0.0050260462498625084, "loss": 2.7172, "step": 4067 }, { "crossentropy": 2.5779471397399902, "epoch": 0.3460360666893501, "grad_norm": 0.03419642150402069, "grad_norm_var": 6.728247978141956e-06, "learning_rate": 0.005024042708266492, "loss": 2.5779, "step": 4068 }, { "crossentropy": 2.671921730041504, "epoch": 0.3461211296359306, "grad_norm": 0.03500289097428322, "grad_norm_var": 6.849674180919805e-06, "learning_rate": 0.005022039162809909, "loss": 2.6719, "step": 4069 }, { "crossentropy": 2.6399505138397217, "epoch": 0.34620619258251106, "grad_norm": 0.03605154901742935, "grad_norm_var": 6.456106991231149e-06, "learning_rate": 0.005020035613814469, "loss": 2.64, "step": 4070 }, { "crossentropy": 2.6378612518310547, "epoch": 0.3462912555290915, "grad_norm": 0.03647990524768829, "grad_norm_var": 6.405239932228725e-06, "learning_rate": 0.005018032061601891, "loss": 2.6379, "step": 4071 }, { "crossentropy": 2.7296483516693115, "epoch": 0.346376318475672, "grad_norm": 0.038434095680713654, "grad_norm_var": 5.994656548116706e-06, "learning_rate": 0.00501602850649388, "loss": 2.7296, "step": 4072 }, { "crossentropy": 2.8042521476745605, "epoch": 0.34646138142225247, "grad_norm": 0.03611372411251068, "grad_norm_var": 5.701416316569672e-06, "learning_rate": 0.005014024948812154, "loss": 2.8043, "step": 4073 }, { "crossentropy": 2.690366744995117, "epoch": 0.3465464443688329, "grad_norm": 0.03736383467912674, "grad_norm_var": 5.699849264090227e-06, "learning_rate": 0.005012021388878425, "loss": 2.6904, "step": 4074 }, { "crossentropy": 2.7687935829162598, "epoch": 0.3466315073154134, "grad_norm": 0.03435303643345833, "grad_norm_var": 6.084234616945902e-06, "learning_rate": 0.005010017827014408, "loss": 2.7688, "step": 4075 }, { "crossentropy": 2.6967711448669434, "epoch": 0.3467165702619939, "grad_norm": 0.035892847925424576, "grad_norm_var": 6.10477312407842e-06, "learning_rate": 0.005008014263541817, "loss": 2.6968, "step": 4076 }, { "crossentropy": 2.703745126724243, "epoch": 0.3468016332085743, "grad_norm": 0.0347183533012867, "grad_norm_var": 6.317520886872594e-06, "learning_rate": 0.005006010698782365, "loss": 2.7037, "step": 4077 }, { "crossentropy": 2.6869285106658936, "epoch": 0.34688669615515483, "grad_norm": 0.03344440832734108, "grad_norm_var": 6.906954814818918e-06, "learning_rate": 0.0050040071330577675, "loss": 2.6869, "step": 4078 }, { "crossentropy": 2.7538750171661377, "epoch": 0.3469717591017353, "grad_norm": 0.03470062091946602, "grad_norm_var": 6.6561627726978314e-06, "learning_rate": 0.0050020035666897425, "loss": 2.7539, "step": 4079 }, { "crossentropy": 2.809753179550171, "epoch": 0.3470568220483157, "grad_norm": 0.03473535180091858, "grad_norm_var": 2.07685433965335e-06, "learning_rate": 0.005, "loss": 2.8098, "step": 4080 }, { "crossentropy": 2.6876511573791504, "epoch": 0.34714188499489623, "grad_norm": 0.03611426055431366, "grad_norm_var": 1.7232031516128376e-06, "learning_rate": 0.004997996433310258, "loss": 2.6877, "step": 4081 }, { "crossentropy": 2.7074499130249023, "epoch": 0.3472269479414767, "grad_norm": 0.03564818948507309, "grad_norm_var": 1.6634302836733445e-06, "learning_rate": 0.004995992866942232, "loss": 2.7074, "step": 4082 }, { "crossentropy": 2.709181547164917, "epoch": 0.3473120108880572, "grad_norm": 0.03562202304601669, "grad_norm_var": 1.5711321053500186e-06, "learning_rate": 0.004993989301217635, "loss": 2.7092, "step": 4083 }, { "crossentropy": 2.77126145362854, "epoch": 0.34739707383463764, "grad_norm": 0.04009898751974106, "grad_norm_var": 2.6798541773322136e-06, "learning_rate": 0.004991985736458185, "loss": 2.7713, "step": 4084 }, { "crossentropy": 2.707592725753784, "epoch": 0.3474821367812181, "grad_norm": 0.038541536778211594, "grad_norm_var": 3.028175471821622e-06, "learning_rate": 0.004989982172985594, "loss": 2.7076, "step": 4085 }, { "crossentropy": 2.750131845474243, "epoch": 0.3475671997277986, "grad_norm": 0.03781171143054962, "grad_norm_var": 3.1999860624500052e-06, "learning_rate": 0.004987978611121575, "loss": 2.7501, "step": 4086 }, { "crossentropy": 2.822843313217163, "epoch": 0.34765226267437904, "grad_norm": 0.03696822002530098, "grad_norm_var": 3.2295614880860243e-06, "learning_rate": 0.0049859750511878475, "loss": 2.8228, "step": 4087 }, { "crossentropy": 2.818727731704712, "epoch": 0.3477373256209595, "grad_norm": 0.039841342717409134, "grad_norm_var": 3.756560042927672e-06, "learning_rate": 0.004983971493506121, "loss": 2.8187, "step": 4088 }, { "crossentropy": 2.677370309829712, "epoch": 0.34782238856754, "grad_norm": 0.040675822645425797, "grad_norm_var": 4.899627123817971e-06, "learning_rate": 0.00498196793839811, "loss": 2.6774, "step": 4089 }, { "crossentropy": 2.762866497039795, "epoch": 0.34790745151412045, "grad_norm": 0.03609563782811165, "grad_norm_var": 4.880822608508556e-06, "learning_rate": 0.0049799643861855305, "loss": 2.7629, "step": 4090 }, { "crossentropy": 2.7020788192749023, "epoch": 0.3479925144607009, "grad_norm": 0.03563209995627403, "grad_norm_var": 4.603470566083568e-06, "learning_rate": 0.004977960837190094, "loss": 2.7021, "step": 4091 }, { "crossentropy": 2.728585720062256, "epoch": 0.3480775774072814, "grad_norm": 0.03641730174422264, "grad_norm_var": 4.567097754087692e-06, "learning_rate": 0.00497595729173351, "loss": 2.7286, "step": 4092 }, { "crossentropy": 2.729431390762329, "epoch": 0.34816264035386185, "grad_norm": 0.03599182516336441, "grad_norm_var": 4.3334031842575645e-06, "learning_rate": 0.0049739537501374935, "loss": 2.7294, "step": 4093 }, { "crossentropy": 2.5959889888763428, "epoch": 0.3482477033004423, "grad_norm": 0.03719666972756386, "grad_norm_var": 3.5489664787626807e-06, "learning_rate": 0.004971950212723753, "loss": 2.596, "step": 4094 }, { "crossentropy": 2.6975317001342773, "epoch": 0.3483327662470228, "grad_norm": 0.035566236823797226, "grad_norm_var": 3.329752516180393e-06, "learning_rate": 0.0049699466798140005, "loss": 2.6975, "step": 4095 }, { "crossentropy": 2.7662785053253174, "epoch": 0.34841782919360326, "grad_norm": 0.03562578931450844, "grad_norm_var": 3.103334238785162e-06, "learning_rate": 0.004967943151729944, "loss": 2.7663, "step": 4096 }, { "crossentropy": 2.674776554107666, "epoch": 0.34850289214018376, "grad_norm": 0.037953414022922516, "grad_norm_var": 3.0692204726743648e-06, "learning_rate": 0.004965939628793295, "loss": 2.6748, "step": 4097 }, { "crossentropy": 2.6741244792938232, "epoch": 0.3485879550867642, "grad_norm": 0.034032002091407776, "grad_norm_var": 3.5734329849506735e-06, "learning_rate": 0.004963936111325761, "loss": 2.6741, "step": 4098 }, { "crossentropy": 2.640594244003296, "epoch": 0.34867301803334466, "grad_norm": 0.03516709804534912, "grad_norm_var": 3.677801080982183e-06, "learning_rate": 0.004961932599649049, "loss": 2.6406, "step": 4099 }, { "crossentropy": 2.6392741203308105, "epoch": 0.34875808097992517, "grad_norm": 0.036411624401807785, "grad_norm_var": 3.0536263428424318e-06, "learning_rate": 0.0049599290940848615, "loss": 2.6393, "step": 4100 }, { "crossentropy": 2.7688639163970947, "epoch": 0.3488431439265056, "grad_norm": 0.04014594480395317, "grad_norm_var": 3.571974689740333e-06, "learning_rate": 0.004957925594954909, "loss": 2.7689, "step": 4101 }, { "crossentropy": 2.7360639572143555, "epoch": 0.34892820687308607, "grad_norm": 0.03727419301867485, "grad_norm_var": 3.5297649184418133e-06, "learning_rate": 0.004955922102580893, "loss": 2.7361, "step": 4102 }, { "crossentropy": 2.8119516372680664, "epoch": 0.34901326981966657, "grad_norm": 0.03783252462744713, "grad_norm_var": 3.5800284271893245e-06, "learning_rate": 0.004953918617284516, "loss": 2.812, "step": 4103 }, { "crossentropy": 2.7036263942718506, "epoch": 0.349098332766247, "grad_norm": 0.03741425275802612, "grad_norm_var": 3.025867524441596e-06, "learning_rate": 0.004951915139387484, "loss": 2.7036, "step": 4104 }, { "crossentropy": 2.6732842922210693, "epoch": 0.34918339571282747, "grad_norm": 0.03355364874005318, "grad_norm_var": 2.553167592183175e-06, "learning_rate": 0.004949911669211492, "loss": 2.6733, "step": 4105 }, { "crossentropy": 2.8128228187561035, "epoch": 0.349268458659408, "grad_norm": 0.03614502400159836, "grad_norm_var": 2.551352789610893e-06, "learning_rate": 0.004947908207078247, "loss": 2.8128, "step": 4106 }, { "crossentropy": 2.7771644592285156, "epoch": 0.3493535216059884, "grad_norm": 0.03412431851029396, "grad_norm_var": 2.8473103192215115e-06, "learning_rate": 0.00494590475330944, "loss": 2.7772, "step": 4107 }, { "crossentropy": 2.6837098598480225, "epoch": 0.3494385845525689, "grad_norm": 0.035670410841703415, "grad_norm_var": 2.8708169779419665e-06, "learning_rate": 0.004943901308226771, "loss": 2.6837, "step": 4108 }, { "crossentropy": 2.673452377319336, "epoch": 0.3495236474991494, "grad_norm": 0.038905274122953415, "grad_norm_var": 3.298489262407109e-06, "learning_rate": 0.0049418978721519355, "loss": 2.6735, "step": 4109 }, { "crossentropy": 2.7286956310272217, "epoch": 0.34960871044572983, "grad_norm": 0.03584737330675125, "grad_norm_var": 3.2759046643346854e-06, "learning_rate": 0.004939894445406628, "loss": 2.7287, "step": 4110 }, { "crossentropy": 2.69260573387146, "epoch": 0.34969377339231034, "grad_norm": 0.03587539494037628, "grad_norm_var": 3.249392669957216e-06, "learning_rate": 0.004937891028312539, "loss": 2.6926, "step": 4111 }, { "crossentropy": 2.7277987003326416, "epoch": 0.3497788363388908, "grad_norm": 0.03463957831263542, "grad_norm_var": 3.4085197959942667e-06, "learning_rate": 0.004935887621191363, "loss": 2.7278, "step": 4112 }, { "crossentropy": 2.6585757732391357, "epoch": 0.34986389928547124, "grad_norm": 0.03649973124265671, "grad_norm_var": 3.2224492848217354e-06, "learning_rate": 0.004933884224364789, "loss": 2.6586, "step": 4113 }, { "crossentropy": 2.695467710494995, "epoch": 0.34994896223205174, "grad_norm": 0.037289492785930634, "grad_norm_var": 2.93483511345684e-06, "learning_rate": 0.004931880838154501, "loss": 2.6955, "step": 4114 }, { "crossentropy": 2.6724491119384766, "epoch": 0.3500340251786322, "grad_norm": 0.03701365739107132, "grad_norm_var": 2.8383043672252615e-06, "learning_rate": 0.004929877462882189, "loss": 2.6724, "step": 4115 }, { "crossentropy": 2.806291103363037, "epoch": 0.35011908812521264, "grad_norm": 0.036635901778936386, "grad_norm_var": 2.8376046721803026e-06, "learning_rate": 0.004927874098869535, "loss": 2.8063, "step": 4116 }, { "crossentropy": 2.729677438735962, "epoch": 0.35020415107179315, "grad_norm": 0.04097140207886696, "grad_norm_var": 3.2755051149898604e-06, "learning_rate": 0.004925870746438223, "loss": 2.7297, "step": 4117 }, { "crossentropy": 2.761395215988159, "epoch": 0.3502892140183736, "grad_norm": 0.03506297990679741, "grad_norm_var": 3.384023871870421e-06, "learning_rate": 0.004923867405909932, "loss": 2.7614, "step": 4118 }, { "crossentropy": 2.777142286300659, "epoch": 0.35037427696495405, "grad_norm": 0.034623973071575165, "grad_norm_var": 3.443507957184612e-06, "learning_rate": 0.004921864077606342, "loss": 2.7771, "step": 4119 }, { "crossentropy": 2.641584634780884, "epoch": 0.35045933991153455, "grad_norm": 0.034178297966718674, "grad_norm_var": 3.602987420893375e-06, "learning_rate": 0.004919860761849132, "loss": 2.6416, "step": 4120 }, { "crossentropy": 2.663346767425537, "epoch": 0.350544402858115, "grad_norm": 0.03972181677818298, "grad_norm_var": 3.915671449254236e-06, "learning_rate": 0.004917857458959974, "loss": 2.6633, "step": 4121 }, { "crossentropy": 2.6854915618896484, "epoch": 0.35062946580469545, "grad_norm": 0.03666184842586517, "grad_norm_var": 3.911329866412618e-06, "learning_rate": 0.0049158541692605395, "loss": 2.6855, "step": 4122 }, { "crossentropy": 2.686607599258423, "epoch": 0.35071452875127596, "grad_norm": 0.03436543792486191, "grad_norm_var": 3.839146832006144e-06, "learning_rate": 0.004913850893072501, "loss": 2.6866, "step": 4123 }, { "crossentropy": 2.8689682483673096, "epoch": 0.3507995916978564, "grad_norm": 0.03521229699254036, "grad_norm_var": 3.9027935483506556e-06, "learning_rate": 0.004911847630717527, "loss": 2.869, "step": 4124 }, { "crossentropy": 2.7103517055511475, "epoch": 0.3508846546444369, "grad_norm": 0.04024951532483101, "grad_norm_var": 4.4523836283033e-06, "learning_rate": 0.004909844382517284, "loss": 2.7104, "step": 4125 }, { "crossentropy": 2.6700868606567383, "epoch": 0.35096971759101736, "grad_norm": 0.03710835427045822, "grad_norm_var": 4.433118276099488e-06, "learning_rate": 0.004907841148793435, "loss": 2.6701, "step": 4126 }, { "crossentropy": 2.6750826835632324, "epoch": 0.3510547805375978, "grad_norm": 0.0368562676012516, "grad_norm_var": 4.394318099050961e-06, "learning_rate": 0.0049058379298676435, "loss": 2.6751, "step": 4127 }, { "crossentropy": 2.594839096069336, "epoch": 0.3511398434841783, "grad_norm": 0.03498656302690506, "grad_norm_var": 4.306834828101252e-06, "learning_rate": 0.004903834726061564, "loss": 2.5948, "step": 4128 }, { "crossentropy": 2.830805778503418, "epoch": 0.35122490643075877, "grad_norm": 0.034484073519706726, "grad_norm_var": 4.618577620732882e-06, "learning_rate": 0.004901831537696859, "loss": 2.8308, "step": 4129 }, { "crossentropy": 2.7927708625793457, "epoch": 0.3513099693773392, "grad_norm": 0.03597218170762062, "grad_norm_var": 4.603975527743091e-06, "learning_rate": 0.00489982836509518, "loss": 2.7928, "step": 4130 }, { "crossentropy": 2.738170862197876, "epoch": 0.3513950323239197, "grad_norm": 0.036154087632894516, "grad_norm_var": 4.592033401138301e-06, "learning_rate": 0.004897825208578179, "loss": 2.7382, "step": 4131 }, { "crossentropy": 2.6982152462005615, "epoch": 0.35148009527050017, "grad_norm": 0.037170156836509705, "grad_norm_var": 4.622914872267024e-06, "learning_rate": 0.004895822068467504, "loss": 2.6982, "step": 4132 }, { "crossentropy": 2.694089651107788, "epoch": 0.3515651582170806, "grad_norm": 0.037251751869916916, "grad_norm_var": 3.2632028253089713e-06, "learning_rate": 0.004893818945084805, "loss": 2.6941, "step": 4133 }, { "crossentropy": 2.754080057144165, "epoch": 0.3516502211636611, "grad_norm": 0.036100734025239944, "grad_norm_var": 3.16575106798662e-06, "learning_rate": 0.004891815838751723, "loss": 2.7541, "step": 4134 }, { "crossentropy": 2.734123706817627, "epoch": 0.3517352841102416, "grad_norm": 0.03752008453011513, "grad_norm_var": 3.035596153266815e-06, "learning_rate": 0.004889812749789902, "loss": 2.7341, "step": 4135 }, { "crossentropy": 2.7889981269836426, "epoch": 0.351820347056822, "grad_norm": 0.03612133488059044, "grad_norm_var": 2.6701769064398894e-06, "learning_rate": 0.004887809678520975, "loss": 2.789, "step": 4136 }, { "crossentropy": 2.6920793056488037, "epoch": 0.35190541000340253, "grad_norm": 0.035012777894735336, "grad_norm_var": 2.109221617123865e-06, "learning_rate": 0.004885806625266584, "loss": 2.6921, "step": 4137 }, { "crossentropy": 2.7005131244659424, "epoch": 0.351990472949983, "grad_norm": 0.03746560588479042, "grad_norm_var": 2.1855135336034648e-06, "learning_rate": 0.004883803590348356, "loss": 2.7005, "step": 4138 }, { "crossentropy": 2.7452611923217773, "epoch": 0.35207553589656343, "grad_norm": 0.0403827466070652, "grad_norm_var": 2.8346606935171955e-06, "learning_rate": 0.004881800574087925, "loss": 2.7453, "step": 4139 }, { "crossentropy": 2.7649097442626953, "epoch": 0.35216059884314393, "grad_norm": 0.039275575429201126, "grad_norm_var": 3.0318247631879364e-06, "learning_rate": 0.004879797576806915, "loss": 2.7649, "step": 4140 }, { "crossentropy": 2.774909734725952, "epoch": 0.3522456617897244, "grad_norm": 0.03554948419332504, "grad_norm_var": 2.380470922618873e-06, "learning_rate": 0.00487779459882695, "loss": 2.7749, "step": 4141 }, { "crossentropy": 2.7388362884521484, "epoch": 0.3523307247363049, "grad_norm": 0.03910316899418831, "grad_norm_var": 2.7342679181280304e-06, "learning_rate": 0.004875791640469654, "loss": 2.7388, "step": 4142 }, { "crossentropy": 2.608440637588501, "epoch": 0.35241578768288534, "grad_norm": 0.04074043780565262, "grad_norm_var": 3.686697639511347e-06, "learning_rate": 0.00487378870205664, "loss": 2.6084, "step": 4143 }, { "crossentropy": 2.815117597579956, "epoch": 0.3525008506294658, "grad_norm": 0.04219522699713707, "grad_norm_var": 4.921735412464351e-06, "learning_rate": 0.004871785783909523, "loss": 2.8151, "step": 4144 }, { "crossentropy": 2.6595633029937744, "epoch": 0.3525859135760463, "grad_norm": 0.03594956919550896, "grad_norm_var": 4.460555709234701e-06, "learning_rate": 0.004869782886349916, "loss": 2.6596, "step": 4145 }, { "crossentropy": 2.7545583248138428, "epoch": 0.35267097652262674, "grad_norm": 0.03740314021706581, "grad_norm_var": 4.273603026941295e-06, "learning_rate": 0.004867780009699423, "loss": 2.7546, "step": 4146 }, { "crossentropy": 2.8111572265625, "epoch": 0.3527560394692072, "grad_norm": 0.03597693145275116, "grad_norm_var": 4.312369452781859e-06, "learning_rate": 0.004865777154279653, "loss": 2.8112, "step": 4147 }, { "crossentropy": 2.8311662673950195, "epoch": 0.3528411024157877, "grad_norm": 0.036539070308208466, "grad_norm_var": 4.3819434103475885e-06, "learning_rate": 0.004863774320412203, "loss": 2.8312, "step": 4148 }, { "crossentropy": 2.732848644256592, "epoch": 0.35292616536236815, "grad_norm": 0.03898528218269348, "grad_norm_var": 4.475003174339056e-06, "learning_rate": 0.004861771508418674, "loss": 2.7328, "step": 4149 }, { "crossentropy": 2.646127462387085, "epoch": 0.3530112283089486, "grad_norm": 0.041314393281936646, "grad_norm_var": 5.01344462615072e-06, "learning_rate": 0.004859768718620656, "loss": 2.6461, "step": 4150 }, { "crossentropy": 2.8313260078430176, "epoch": 0.3530962912555291, "grad_norm": 0.037671566009521484, "grad_norm_var": 5.003248197872103e-06, "learning_rate": 0.004857765951339741, "loss": 2.8313, "step": 4151 }, { "crossentropy": 2.794602155685425, "epoch": 0.35318135420210955, "grad_norm": 0.03756522387266159, "grad_norm_var": 4.751580923843119e-06, "learning_rate": 0.004855763206897516, "loss": 2.7946, "step": 4152 }, { "crossentropy": 2.6942341327667236, "epoch": 0.35326641714869, "grad_norm": 0.03860774263739586, "grad_norm_var": 4.033680891305652e-06, "learning_rate": 0.004853760485615565, "loss": 2.6942, "step": 4153 }, { "crossentropy": 2.770622730255127, "epoch": 0.3533514800952705, "grad_norm": 0.03669671341776848, "grad_norm_var": 4.168507238877517e-06, "learning_rate": 0.004851757787815464, "loss": 2.7706, "step": 4154 }, { "crossentropy": 2.726381540298462, "epoch": 0.35343654304185096, "grad_norm": 0.038270942866802216, "grad_norm_var": 3.881140992120259e-06, "learning_rate": 0.004849755113818793, "loss": 2.7264, "step": 4155 }, { "crossentropy": 2.7091586589813232, "epoch": 0.35352160598843146, "grad_norm": 0.03688034415245056, "grad_norm_var": 3.90907530842625e-06, "learning_rate": 0.004847752463947122, "loss": 2.7092, "step": 4156 }, { "crossentropy": 2.833453416824341, "epoch": 0.3536066689350119, "grad_norm": 0.036217689514160156, "grad_norm_var": 3.7105851930302006e-06, "learning_rate": 0.0048457498385220165, "loss": 2.8335, "step": 4157 }, { "crossentropy": 2.604290246963501, "epoch": 0.35369173188159236, "grad_norm": 0.037697818130254745, "grad_norm_var": 3.6521093689724966e-06, "learning_rate": 0.004843747237865042, "loss": 2.6043, "step": 4158 }, { "crossentropy": 2.654628276824951, "epoch": 0.35377679482817287, "grad_norm": 0.03725019842386246, "grad_norm_var": 3.158876956578053e-06, "learning_rate": 0.00484174466229776, "loss": 2.6546, "step": 4159 }, { "crossentropy": 2.6478841304779053, "epoch": 0.3538618577747533, "grad_norm": 0.038251232355833054, "grad_norm_var": 1.833634956799335e-06, "learning_rate": 0.004839742112141724, "loss": 2.6479, "step": 4160 }, { "crossentropy": 2.6326913833618164, "epoch": 0.35394692072133377, "grad_norm": 0.0382172167301178, "grad_norm_var": 1.6620988652233174e-06, "learning_rate": 0.004837739587718488, "loss": 2.6327, "step": 4161 }, { "crossentropy": 2.7805991172790527, "epoch": 0.3540319836679143, "grad_norm": 0.037074241787195206, "grad_norm_var": 1.682824948780135e-06, "learning_rate": 0.004835737089349599, "loss": 2.7806, "step": 4162 }, { "crossentropy": 2.693610668182373, "epoch": 0.3541170466144947, "grad_norm": 0.037652622908353806, "grad_norm_var": 1.4731119122803524e-06, "learning_rate": 0.004833734617356601, "loss": 2.6936, "step": 4163 }, { "crossentropy": 2.6394951343536377, "epoch": 0.3542021095610752, "grad_norm": 0.035076212137937546, "grad_norm_var": 1.8539257102517226e-06, "learning_rate": 0.004831732172061032, "loss": 2.6395, "step": 4164 }, { "crossentropy": 2.685432195663452, "epoch": 0.3542871725076557, "grad_norm": 0.03557829558849335, "grad_norm_var": 2.0020536780531705e-06, "learning_rate": 0.004829729753784427, "loss": 2.6854, "step": 4165 }, { "crossentropy": 2.7972850799560547, "epoch": 0.35437223545423613, "grad_norm": 0.035793378949165344, "grad_norm_var": 1.100277366525235e-06, "learning_rate": 0.004827727362848317, "loss": 2.7973, "step": 4166 }, { "crossentropy": 2.6547935009002686, "epoch": 0.3544572984008166, "grad_norm": 0.03789101168513298, "grad_norm_var": 1.1183623587235347e-06, "learning_rate": 0.004825724999574229, "loss": 2.6548, "step": 4167 }, { "crossentropy": 2.6925547122955322, "epoch": 0.3545423613473971, "grad_norm": 0.034428928047418594, "grad_norm_var": 1.5678855977399945e-06, "learning_rate": 0.004823722664283684, "loss": 2.6926, "step": 4168 }, { "crossentropy": 2.741861343383789, "epoch": 0.35462742429397753, "grad_norm": 0.05263078212738037, "grad_norm_var": 1.6912840682177985e-05, "learning_rate": 0.0048217203572982, "loss": 2.7419, "step": 4169 }, { "crossentropy": 2.751462697982788, "epoch": 0.35471248724055804, "grad_norm": 0.03926771134138107, "grad_norm_var": 1.6930457808893238e-05, "learning_rate": 0.004819718078939291, "loss": 2.7515, "step": 4170 }, { "crossentropy": 2.736243963241577, "epoch": 0.3547975501871385, "grad_norm": 0.035212717950344086, "grad_norm_var": 1.740907581258873e-05, "learning_rate": 0.0048177158295284635, "loss": 2.7362, "step": 4171 }, { "crossentropy": 2.7143771648406982, "epoch": 0.35488261313371894, "grad_norm": 0.03832777589559555, "grad_norm_var": 1.735866710459068e-05, "learning_rate": 0.004815713609387221, "loss": 2.7144, "step": 4172 }, { "crossentropy": 2.7253119945526123, "epoch": 0.35496767608029944, "grad_norm": 0.03617292270064354, "grad_norm_var": 1.736889652757349e-05, "learning_rate": 0.004813711418837061, "loss": 2.7253, "step": 4173 }, { "crossentropy": 2.6569528579711914, "epoch": 0.3550527390268799, "grad_norm": 0.0366111658513546, "grad_norm_var": 1.7473105292746077e-05, "learning_rate": 0.004811709258199481, "loss": 2.657, "step": 4174 }, { "crossentropy": 2.7595736980438232, "epoch": 0.35513780197346034, "grad_norm": 0.036323197185993195, "grad_norm_var": 1.7599685372663803e-05, "learning_rate": 0.004809707127795969, "loss": 2.7596, "step": 4175 }, { "crossentropy": 2.7128682136535645, "epoch": 0.35522286492004085, "grad_norm": 0.034560076892375946, "grad_norm_var": 1.8220210486072767e-05, "learning_rate": 0.004807705027948007, "loss": 2.7129, "step": 4176 }, { "crossentropy": 2.721578359603882, "epoch": 0.3553079278666213, "grad_norm": 0.03743446245789528, "grad_norm_var": 1.818898803022754e-05, "learning_rate": 0.004805702958977079, "loss": 2.7216, "step": 4177 }, { "crossentropy": 2.7797465324401855, "epoch": 0.35539299081320175, "grad_norm": 0.03645507991313934, "grad_norm_var": 1.8248279739909296e-05, "learning_rate": 0.004803700921204659, "loss": 2.7797, "step": 4178 }, { "crossentropy": 2.7568118572235107, "epoch": 0.35547805375978225, "grad_norm": 0.035642266273498535, "grad_norm_var": 1.8450187393245775e-05, "learning_rate": 0.004801698914952211, "loss": 2.7568, "step": 4179 }, { "crossentropy": 2.6378095149993896, "epoch": 0.3555631167063627, "grad_norm": 0.03567908704280853, "grad_norm_var": 1.8291103632006993e-05, "learning_rate": 0.004799696940541204, "loss": 2.6378, "step": 4180 }, { "crossentropy": 2.6826608180999756, "epoch": 0.35564817965294315, "grad_norm": 0.03813748061656952, "grad_norm_var": 1.808717406077727e-05, "learning_rate": 0.004797694998293098, "loss": 2.6827, "step": 4181 }, { "crossentropy": 2.7847023010253906, "epoch": 0.35573324259952366, "grad_norm": 0.036717064678668976, "grad_norm_var": 1.7925942109029273e-05, "learning_rate": 0.004795693088529345, "loss": 2.7847, "step": 4182 }, { "crossentropy": 2.5933475494384766, "epoch": 0.3558183055461041, "grad_norm": 0.03252488747239113, "grad_norm_var": 1.9512592139670805e-05, "learning_rate": 0.004793691211571395, "loss": 2.5933, "step": 4183 }, { "crossentropy": 2.7071664333343506, "epoch": 0.3559033684926846, "grad_norm": 0.03616100177168846, "grad_norm_var": 1.90467768239758e-05, "learning_rate": 0.004791689367740692, "loss": 2.7072, "step": 4184 }, { "crossentropy": 2.6655237674713135, "epoch": 0.35598843143926506, "grad_norm": 0.03700285404920578, "grad_norm_var": 2.503915176276829e-06, "learning_rate": 0.004789687557358676, "loss": 2.6655, "step": 4185 }, { "crossentropy": 2.7556426525115967, "epoch": 0.3560734943858455, "grad_norm": 0.035893429070711136, "grad_norm_var": 1.920543603259328e-06, "learning_rate": 0.004787685780746777, "loss": 2.7556, "step": 4186 }, { "crossentropy": 2.7736167907714844, "epoch": 0.356158557332426, "grad_norm": 0.03785233944654465, "grad_norm_var": 2.0161238269950086e-06, "learning_rate": 0.004785684038226423, "loss": 2.7736, "step": 4187 }, { "crossentropy": 2.7269446849823, "epoch": 0.35624362027900647, "grad_norm": 0.03677048534154892, "grad_norm_var": 1.7556716036731811e-06, "learning_rate": 0.004783682330119037, "loss": 2.7269, "step": 4188 }, { "crossentropy": 2.7697529792785645, "epoch": 0.3563286832255869, "grad_norm": 0.03814598172903061, "grad_norm_var": 1.9797273391005653e-06, "learning_rate": 0.004781680656746036, "loss": 2.7698, "step": 4189 }, { "crossentropy": 2.7591631412506104, "epoch": 0.3564137461721674, "grad_norm": 0.03605020418763161, "grad_norm_var": 1.9813140128540608e-06, "learning_rate": 0.004779679018428832, "loss": 2.7592, "step": 4190 }, { "crossentropy": 2.7978568077087402, "epoch": 0.3564988091187479, "grad_norm": 0.037100303918123245, "grad_norm_var": 2.01789992152294e-06, "learning_rate": 0.004777677415488828, "loss": 2.7979, "step": 4191 }, { "crossentropy": 2.619727373123169, "epoch": 0.3565838720653283, "grad_norm": 0.03818253427743912, "grad_norm_var": 1.9576058029167435e-06, "learning_rate": 0.004775675848247427, "loss": 2.6197, "step": 4192 }, { "crossentropy": 2.7431774139404297, "epoch": 0.3566689350119088, "grad_norm": 0.0376758947968483, "grad_norm_var": 1.987810355874991e-06, "learning_rate": 0.004773674317026019, "loss": 2.7432, "step": 4193 }, { "crossentropy": 2.6823320388793945, "epoch": 0.3567539979584893, "grad_norm": 0.03588896617293358, "grad_norm_var": 2.0206235774237562e-06, "learning_rate": 0.004771672822145995, "loss": 2.6823, "step": 4194 }, { "crossentropy": 2.72298002243042, "epoch": 0.3568390609050697, "grad_norm": 0.03807690739631653, "grad_norm_var": 2.0837474906868863e-06, "learning_rate": 0.004769671363928736, "loss": 2.723, "step": 4195 }, { "crossentropy": 2.6750400066375732, "epoch": 0.35692412385165023, "grad_norm": 0.0357075035572052, "grad_norm_var": 2.0797737003484345e-06, "learning_rate": 0.004767669942695618, "loss": 2.675, "step": 4196 }, { "crossentropy": 2.684784412384033, "epoch": 0.3570091867982307, "grad_norm": 0.03446290269494057, "grad_norm_var": 2.240459411065263e-06, "learning_rate": 0.004765668558768015, "loss": 2.6848, "step": 4197 }, { "crossentropy": 2.723112106323242, "epoch": 0.3570942497448112, "grad_norm": 0.03627773001790047, "grad_norm_var": 2.240588413115104e-06, "learning_rate": 0.004763667212467288, "loss": 2.7231, "step": 4198 }, { "crossentropy": 2.6956076622009277, "epoch": 0.35717931269139164, "grad_norm": 0.03678208217024803, "grad_norm_var": 1.124963363607669e-06, "learning_rate": 0.0047616659041147995, "loss": 2.6956, "step": 4199 }, { "crossentropy": 2.698446035385132, "epoch": 0.3572643756379721, "grad_norm": 0.03649434819817543, "grad_norm_var": 1.1056431768269316e-06, "learning_rate": 0.0047596646340318976, "loss": 2.6984, "step": 4200 }, { "crossentropy": 2.6591849327087402, "epoch": 0.3573494385845526, "grad_norm": 0.036928460001945496, "grad_norm_var": 1.1037069214660245e-06, "learning_rate": 0.00475766340253993, "loss": 2.6592, "step": 4201 }, { "crossentropy": 2.800201177597046, "epoch": 0.35743450153113304, "grad_norm": 0.040412381291389465, "grad_norm_var": 1.8529843649703685e-06, "learning_rate": 0.004755662209960237, "loss": 2.8002, "step": 4202 }, { "crossentropy": 2.6561429500579834, "epoch": 0.3575195644777135, "grad_norm": 0.03575563058257103, "grad_norm_var": 1.9036008482480974e-06, "learning_rate": 0.004753661056614153, "loss": 2.6561, "step": 4203 }, { "crossentropy": 2.606095314025879, "epoch": 0.357604627424294, "grad_norm": 0.034465231001377106, "grad_norm_var": 2.281546498563112e-06, "learning_rate": 0.004751659942823004, "loss": 2.6061, "step": 4204 }, { "crossentropy": 2.6810946464538574, "epoch": 0.35768969037087445, "grad_norm": 0.034100085496902466, "grad_norm_var": 2.5652843945215645e-06, "learning_rate": 0.004749658868908114, "loss": 2.6811, "step": 4205 }, { "crossentropy": 2.7348480224609375, "epoch": 0.3577747533174549, "grad_norm": 0.035793762654066086, "grad_norm_var": 2.5855458620177112e-06, "learning_rate": 0.004747657835190795, "loss": 2.7348, "step": 4206 }, { "crossentropy": 2.745830774307251, "epoch": 0.3578598162640354, "grad_norm": 0.03798295184969902, "grad_norm_var": 2.704114885735779e-06, "learning_rate": 0.0047456568419923595, "loss": 2.7458, "step": 4207 }, { "crossentropy": 2.6099050045013428, "epoch": 0.35794487921061585, "grad_norm": 0.03595081716775894, "grad_norm_var": 2.533104020918664e-06, "learning_rate": 0.0047436558896341045, "loss": 2.6099, "step": 4208 }, { "crossentropy": 2.800407886505127, "epoch": 0.3580299421571963, "grad_norm": 0.03824930265545845, "grad_norm_var": 2.6495020921182252e-06, "learning_rate": 0.004741654978437327, "loss": 2.8004, "step": 4209 }, { "crossentropy": 2.716275930404663, "epoch": 0.3581150051037768, "grad_norm": 0.03635244444012642, "grad_norm_var": 2.6277591017161035e-06, "learning_rate": 0.004739654108723316, "loss": 2.7163, "step": 4210 }, { "crossentropy": 2.7238845825195312, "epoch": 0.35820006805035726, "grad_norm": 0.04038624092936516, "grad_norm_var": 3.4506127178118852e-06, "learning_rate": 0.004737653280813353, "loss": 2.7239, "step": 4211 }, { "crossentropy": 2.8046963214874268, "epoch": 0.3582851309969377, "grad_norm": 0.03491506353020668, "grad_norm_var": 3.587474493349912e-06, "learning_rate": 0.004735652495028714, "loss": 2.8047, "step": 4212 }, { "crossentropy": 2.758301258087158, "epoch": 0.3583701939435182, "grad_norm": 0.03628582879900932, "grad_norm_var": 3.280143647769497e-06, "learning_rate": 0.004733651751690667, "loss": 2.7583, "step": 4213 }, { "crossentropy": 2.66106915473938, "epoch": 0.35845525689009866, "grad_norm": 0.03564555197954178, "grad_norm_var": 3.3403586890939916e-06, "learning_rate": 0.004731651051120477, "loss": 2.6611, "step": 4214 }, { "crossentropy": 2.7549498081207275, "epoch": 0.35854031983667917, "grad_norm": 0.0388563871383667, "grad_norm_var": 3.644078755054604e-06, "learning_rate": 0.004729650393639393, "loss": 2.7549, "step": 4215 }, { "crossentropy": 2.696037769317627, "epoch": 0.3586253827832596, "grad_norm": 0.03663983941078186, "grad_norm_var": 3.639745866156526e-06, "learning_rate": 0.004727649779568666, "loss": 2.696, "step": 4216 }, { "crossentropy": 2.7489640712738037, "epoch": 0.35871044572984007, "grad_norm": 0.03587007522583008, "grad_norm_var": 3.6909232216331135e-06, "learning_rate": 0.004725649209229537, "loss": 2.749, "step": 4217 }, { "crossentropy": 2.7557384967803955, "epoch": 0.35879550867642057, "grad_norm": 0.03571528568863869, "grad_norm_var": 2.7629223690645367e-06, "learning_rate": 0.004723648682943239, "loss": 2.7557, "step": 4218 }, { "crossentropy": 2.6766016483306885, "epoch": 0.358880571623001, "grad_norm": 0.03552062436938286, "grad_norm_var": 2.7876703951915454e-06, "learning_rate": 0.004721648201031001, "loss": 2.6766, "step": 4219 }, { "crossentropy": 2.6249287128448486, "epoch": 0.35896563456958147, "grad_norm": 0.03404043987393379, "grad_norm_var": 2.909697769808613e-06, "learning_rate": 0.00471964776381404, "loss": 2.6249, "step": 4220 }, { "crossentropy": 2.7608678340911865, "epoch": 0.359050697516162, "grad_norm": 0.0368039533495903, "grad_norm_var": 2.539621081071077e-06, "learning_rate": 0.0047176473716135725, "loss": 2.7609, "step": 4221 }, { "crossentropy": 2.782954216003418, "epoch": 0.3591357604627424, "grad_norm": 0.03969259187579155, "grad_norm_var": 3.089773584341362e-06, "learning_rate": 0.0047156470247508, "loss": 2.783, "step": 4222 }, { "crossentropy": 2.724033832550049, "epoch": 0.3592208234093229, "grad_norm": 0.03559402748942375, "grad_norm_var": 3.0717989469795e-06, "learning_rate": 0.004713646723546921, "loss": 2.724, "step": 4223 }, { "crossentropy": 2.708357810974121, "epoch": 0.3593058863559034, "grad_norm": 0.03507305681705475, "grad_norm_var": 3.202648159066531e-06, "learning_rate": 0.004711646468323128, "loss": 2.7084, "step": 4224 }, { "crossentropy": 2.8107800483703613, "epoch": 0.35939094930248383, "grad_norm": 0.03915823996067047, "grad_norm_var": 3.4538569088938057e-06, "learning_rate": 0.004709646259400603, "loss": 2.8108, "step": 4225 }, { "crossentropy": 2.7853574752807617, "epoch": 0.3594760122490643, "grad_norm": 0.03675051033496857, "grad_norm_var": 3.447471117151481e-06, "learning_rate": 0.004707646097100522, "loss": 2.7854, "step": 4226 }, { "crossentropy": 2.7540810108184814, "epoch": 0.3595610751956448, "grad_norm": 0.0382537804543972, "grad_norm_var": 2.6790979024802115e-06, "learning_rate": 0.004705645981744055, "loss": 2.7541, "step": 4227 }, { "crossentropy": 2.743894577026367, "epoch": 0.35964613814222524, "grad_norm": 0.03607086092233658, "grad_norm_var": 2.5104886438891948e-06, "learning_rate": 0.004703645913652362, "loss": 2.7439, "step": 4228 }, { "crossentropy": 2.7472548484802246, "epoch": 0.35973120108880574, "grad_norm": 0.03687350079417229, "grad_norm_var": 2.5056391112054354e-06, "learning_rate": 0.004701645893146595, "loss": 2.7473, "step": 4229 }, { "crossentropy": 2.7329704761505127, "epoch": 0.3598162640353862, "grad_norm": 0.038199953734874725, "grad_norm_var": 2.567969050402905e-06, "learning_rate": 0.004699645920547901, "loss": 2.733, "step": 4230 }, { "crossentropy": 2.7361559867858887, "epoch": 0.35990132698196664, "grad_norm": 0.034828271716833115, "grad_norm_var": 2.4881384789415355e-06, "learning_rate": 0.004697645996177417, "loss": 2.7362, "step": 4231 }, { "crossentropy": 2.7387726306915283, "epoch": 0.35998638992854715, "grad_norm": 0.03626786172389984, "grad_norm_var": 2.4932141581795558e-06, "learning_rate": 0.004695646120356275, "loss": 2.7388, "step": 4232 }, { "crossentropy": 2.8178532123565674, "epoch": 0.3600714528751276, "grad_norm": 0.038176726549863815, "grad_norm_var": 2.618312590382752e-06, "learning_rate": 0.004693646293405594, "loss": 2.8179, "step": 4233 }, { "crossentropy": 2.7091784477233887, "epoch": 0.36015651582170805, "grad_norm": 0.03554747253656387, "grad_norm_var": 2.6418535777688693e-06, "learning_rate": 0.004691646515646492, "loss": 2.7092, "step": 4234 }, { "crossentropy": 2.6489057540893555, "epoch": 0.36024157876828855, "grad_norm": 0.03481941297650337, "grad_norm_var": 2.7808159587981255e-06, "learning_rate": 0.004689646787400077, "loss": 2.6489, "step": 4235 }, { "crossentropy": 2.644970417022705, "epoch": 0.360326641714869, "grad_norm": 0.0331619530916214, "grad_norm_var": 3.13288618409979e-06, "learning_rate": 0.004687647108987444, "loss": 2.645, "step": 4236 }, { "crossentropy": 2.6799449920654297, "epoch": 0.36041170466144945, "grad_norm": 0.035495415329933167, "grad_norm_var": 3.2007443038244964e-06, "learning_rate": 0.004685647480729684, "loss": 2.6799, "step": 4237 }, { "crossentropy": 2.754880428314209, "epoch": 0.36049676760802996, "grad_norm": 0.037092503160238266, "grad_norm_var": 2.515682282065665e-06, "learning_rate": 0.004683647902947881, "loss": 2.7549, "step": 4238 }, { "crossentropy": 2.708570957183838, "epoch": 0.3605818305546104, "grad_norm": 0.037266332656145096, "grad_norm_var": 2.525203007863683e-06, "learning_rate": 0.004681648375963109, "loss": 2.7086, "step": 4239 }, { "crossentropy": 2.7108919620513916, "epoch": 0.36066689350119085, "grad_norm": 0.041995491832494736, "grad_norm_var": 4.258772194626106e-06, "learning_rate": 0.0046796489000964355, "loss": 2.7109, "step": 4240 }, { "crossentropy": 2.677832841873169, "epoch": 0.36075195644777136, "grad_norm": 0.0441468320786953, "grad_norm_var": 7.334571495594353e-06, "learning_rate": 0.004677649475668917, "loss": 2.6778, "step": 4241 }, { "crossentropy": 2.639723777770996, "epoch": 0.3608370193943518, "grad_norm": 0.03646833449602127, "grad_norm_var": 7.3558640924735345e-06, "learning_rate": 0.0046756501030016055, "loss": 2.6397, "step": 4242 }, { "crossentropy": 2.6564345359802246, "epoch": 0.3609220823409323, "grad_norm": 0.034528542309999466, "grad_norm_var": 7.683172917944716e-06, "learning_rate": 0.004673650782415543, "loss": 2.6564, "step": 4243 }, { "crossentropy": 2.7745280265808105, "epoch": 0.36100714528751277, "grad_norm": 0.036290060728788376, "grad_norm_var": 7.660957579069584e-06, "learning_rate": 0.004671651514231759, "loss": 2.7745, "step": 4244 }, { "crossentropy": 2.8035237789154053, "epoch": 0.3610922082340932, "grad_norm": 0.03464921563863754, "grad_norm_var": 7.992094170174099e-06, "learning_rate": 0.00466965229877128, "loss": 2.8035, "step": 4245 }, { "crossentropy": 2.7490103244781494, "epoch": 0.3611772711806737, "grad_norm": 0.03489166125655174, "grad_norm_var": 8.062321345491998e-06, "learning_rate": 0.004667653136355125, "loss": 2.749, "step": 4246 }, { "crossentropy": 2.656111717224121, "epoch": 0.36126233412725417, "grad_norm": 0.03693917393684387, "grad_norm_var": 7.841697339803535e-06, "learning_rate": 0.0046656540273042986, "loss": 2.6561, "step": 4247 }, { "crossentropy": 2.675386905670166, "epoch": 0.3613473970738346, "grad_norm": 0.035137586295604706, "grad_norm_var": 7.991725093342502e-06, "learning_rate": 0.004663654971939802, "loss": 2.6754, "step": 4248 }, { "crossentropy": 2.6828396320343018, "epoch": 0.3614324600204151, "grad_norm": 0.03473895415663719, "grad_norm_var": 8.036484403626103e-06, "learning_rate": 0.0046616559705826245, "loss": 2.6828, "step": 4249 }, { "crossentropy": 2.741762399673462, "epoch": 0.3615175229669956, "grad_norm": 0.03612665832042694, "grad_norm_var": 7.98790283815237e-06, "learning_rate": 0.004659657023553752, "loss": 2.7418, "step": 4250 }, { "crossentropy": 2.663679361343384, "epoch": 0.361602585913576, "grad_norm": 0.032612815499305725, "grad_norm_var": 8.782038913512693e-06, "learning_rate": 0.004657658131174152, "loss": 2.6637, "step": 4251 }, { "crossentropy": 2.75065279006958, "epoch": 0.36168764886015653, "grad_norm": 0.034102875739336014, "grad_norm_var": 8.43787009779656e-06, "learning_rate": 0.004655659293764792, "loss": 2.7507, "step": 4252 }, { "crossentropy": 2.608093500137329, "epoch": 0.361772711806737, "grad_norm": 0.039099905639886856, "grad_norm_var": 8.812673085335324e-06, "learning_rate": 0.004653660511646627, "loss": 2.6081, "step": 4253 }, { "crossentropy": 2.6959798336029053, "epoch": 0.36185777475331743, "grad_norm": 0.0363638810813427, "grad_norm_var": 8.80096387820643e-06, "learning_rate": 0.004651661785140606, "loss": 2.696, "step": 4254 }, { "crossentropy": 2.705533266067505, "epoch": 0.36194283769989793, "grad_norm": 0.039596445858478546, "grad_norm_var": 9.352013317376745e-06, "learning_rate": 0.004649663114567664, "loss": 2.7055, "step": 4255 }, { "crossentropy": 2.7582006454467773, "epoch": 0.3620279006464784, "grad_norm": 0.03947960212826729, "grad_norm_var": 7.981476914909318e-06, "learning_rate": 0.00464766450024873, "loss": 2.7582, "step": 4256 }, { "crossentropy": 2.733335494995117, "epoch": 0.3621129635930589, "grad_norm": 0.03717918321490288, "grad_norm_var": 3.97975866189276e-06, "learning_rate": 0.004645665942504727, "loss": 2.7333, "step": 4257 }, { "crossentropy": 2.646155834197998, "epoch": 0.36219802653963934, "grad_norm": 0.03821342810988426, "grad_norm_var": 4.247000229100813e-06, "learning_rate": 0.0046436674416565635, "loss": 2.6462, "step": 4258 }, { "crossentropy": 2.690615653991699, "epoch": 0.3622830894862198, "grad_norm": 0.03441791608929634, "grad_norm_var": 4.273110791701747e-06, "learning_rate": 0.004641668998025141, "loss": 2.6906, "step": 4259 }, { "crossentropy": 2.6953935623168945, "epoch": 0.3623681524328003, "grad_norm": 0.03479279577732086, "grad_norm_var": 4.403221608358041e-06, "learning_rate": 0.004639670611931352, "loss": 2.6954, "step": 4260 }, { "crossentropy": 2.6713554859161377, "epoch": 0.36245321537938074, "grad_norm": 0.03714074194431305, "grad_norm_var": 4.293839425416277e-06, "learning_rate": 0.00463767228369608, "loss": 2.6714, "step": 4261 }, { "crossentropy": 2.654512643814087, "epoch": 0.3625382783259612, "grad_norm": 0.03873113542795181, "grad_norm_var": 4.493140493422079e-06, "learning_rate": 0.004635674013640199, "loss": 2.6545, "step": 4262 }, { "crossentropy": 2.737943172454834, "epoch": 0.3626233412725417, "grad_norm": 0.038257211446762085, "grad_norm_var": 4.671503536057599e-06, "learning_rate": 0.0046336758020845735, "loss": 2.7379, "step": 4263 }, { "crossentropy": 2.7412941455841064, "epoch": 0.36270840421912215, "grad_norm": 0.0336262583732605, "grad_norm_var": 5.113878229183003e-06, "learning_rate": 0.004631677649350061, "loss": 2.7413, "step": 4264 }, { "crossentropy": 2.616281509399414, "epoch": 0.3627934671657026, "grad_norm": 0.03332168608903885, "grad_norm_var": 5.57786879707463e-06, "learning_rate": 0.004629679555757503, "loss": 2.6163, "step": 4265 }, { "crossentropy": 2.752887487411499, "epoch": 0.3628785301122831, "grad_norm": 0.033697620034217834, "grad_norm_var": 6.0485717784954905e-06, "learning_rate": 0.004627681521627738, "loss": 2.7529, "step": 4266 }, { "crossentropy": 2.673656702041626, "epoch": 0.36296359305886355, "grad_norm": 0.03587310016155243, "grad_norm_var": 5.114600218468433e-06, "learning_rate": 0.004625683547281592, "loss": 2.6737, "step": 4267 }, { "crossentropy": 2.6979804039001465, "epoch": 0.363048656005444, "grad_norm": 0.0343068391084671, "grad_norm_var": 5.0521907418885535e-06, "learning_rate": 0.004623685633039883, "loss": 2.698, "step": 4268 }, { "crossentropy": 2.697232484817505, "epoch": 0.3631337189520245, "grad_norm": 0.03678635507822037, "grad_norm_var": 4.586605839761113e-06, "learning_rate": 0.004621687779223418, "loss": 2.6972, "step": 4269 }, { "crossentropy": 2.6847941875457764, "epoch": 0.36321878189860496, "grad_norm": 0.035029541701078415, "grad_norm_var": 4.697463288917622e-06, "learning_rate": 0.004619689986152996, "loss": 2.6848, "step": 4270 }, { "crossentropy": 2.820848226547241, "epoch": 0.36330384484518546, "grad_norm": 0.037551362067461014, "grad_norm_var": 4.05402633411577e-06, "learning_rate": 0.004617692254149403, "loss": 2.8208, "step": 4271 }, { "crossentropy": 2.6669559478759766, "epoch": 0.3633889077917659, "grad_norm": 0.03436638042330742, "grad_norm_var": 3.4182954523185686e-06, "learning_rate": 0.004615694583533418, "loss": 2.667, "step": 4272 }, { "crossentropy": 2.721297264099121, "epoch": 0.36347397073834636, "grad_norm": 0.034307338297367096, "grad_norm_var": 3.4174209872704526e-06, "learning_rate": 0.004613696974625809, "loss": 2.7213, "step": 4273 }, { "crossentropy": 2.7572522163391113, "epoch": 0.36355903368492687, "grad_norm": 0.041668813675642014, "grad_norm_var": 5.344101689765529e-06, "learning_rate": 0.004611699427747335, "loss": 2.7573, "step": 4274 }, { "crossentropy": 2.708371639251709, "epoch": 0.3636440966315073, "grad_norm": 0.03389711678028107, "grad_norm_var": 5.461691373335326e-06, "learning_rate": 0.004609701943218744, "loss": 2.7084, "step": 4275 }, { "crossentropy": 2.782121419906616, "epoch": 0.36372915957808777, "grad_norm": 0.03827279433608055, "grad_norm_var": 5.735173592078847e-06, "learning_rate": 0.004607704521360776, "loss": 2.7821, "step": 4276 }, { "crossentropy": 2.7390339374542236, "epoch": 0.3638142225246683, "grad_norm": 0.038605086505413055, "grad_norm_var": 6.0817370952078705e-06, "learning_rate": 0.004605707162494156, "loss": 2.739, "step": 4277 }, { "crossentropy": 2.7163589000701904, "epoch": 0.3638992854712487, "grad_norm": 0.037191323935985565, "grad_norm_var": 5.698696909950338e-06, "learning_rate": 0.004603709866939607, "loss": 2.7164, "step": 4278 }, { "crossentropy": 2.718496322631836, "epoch": 0.3639843484178292, "grad_norm": 0.03815814480185509, "grad_norm_var": 5.670121503353656e-06, "learning_rate": 0.004601712635017834, "loss": 2.7185, "step": 4279 }, { "crossentropy": 2.8079235553741455, "epoch": 0.3640694113644097, "grad_norm": 0.03597528487443924, "grad_norm_var": 5.258612607711433e-06, "learning_rate": 0.004599715467049534, "loss": 2.8079, "step": 4280 }, { "crossentropy": 2.75616455078125, "epoch": 0.36415447431099013, "grad_norm": 0.0340469628572464, "grad_norm_var": 5.01430170991556e-06, "learning_rate": 0.004597718363355396, "loss": 2.7562, "step": 4281 }, { "crossentropy": 2.7825050354003906, "epoch": 0.3642395372575706, "grad_norm": 0.04010879620909691, "grad_norm_var": 5.415624093492386e-06, "learning_rate": 0.004595721324256098, "loss": 2.7825, "step": 4282 }, { "crossentropy": 2.6644811630249023, "epoch": 0.3643246002041511, "grad_norm": 0.03658565133810043, "grad_norm_var": 5.375059122650292e-06, "learning_rate": 0.004593724350072307, "loss": 2.6645, "step": 4283 }, { "crossentropy": 2.721961498260498, "epoch": 0.36440966315073153, "grad_norm": 0.036401182413101196, "grad_norm_var": 4.9868936834455865e-06, "learning_rate": 0.004591727441124678, "loss": 2.722, "step": 4284 }, { "crossentropy": 2.574537515640259, "epoch": 0.36449472609731204, "grad_norm": 0.035661712288856506, "grad_norm_var": 5.069416925078338e-06, "learning_rate": 0.004589730597733857, "loss": 2.5745, "step": 4285 }, { "crossentropy": 2.792705774307251, "epoch": 0.3645797890438925, "grad_norm": 0.0378287173807621, "grad_norm_var": 4.921037108200574e-06, "learning_rate": 0.004587733820220484, "loss": 2.7927, "step": 4286 }, { "crossentropy": 2.8011176586151123, "epoch": 0.36466485199047294, "grad_norm": 0.03828275948762894, "grad_norm_var": 5.0166100618901735e-06, "learning_rate": 0.004585737108905178, "loss": 2.8011, "step": 4287 }, { "crossentropy": 2.7144887447357178, "epoch": 0.36474991493705344, "grad_norm": 0.034927092492580414, "grad_norm_var": 4.842365803783312e-06, "learning_rate": 0.004583740464108554, "loss": 2.7145, "step": 4288 }, { "crossentropy": 2.5903091430664062, "epoch": 0.3648349778836339, "grad_norm": 0.03500911965966225, "grad_norm_var": 4.621667234214894e-06, "learning_rate": 0.004581743886151218, "loss": 2.5903, "step": 4289 }, { "crossentropy": 2.8353309631347656, "epoch": 0.36492004083021434, "grad_norm": 0.0339527390897274, "grad_norm_var": 3.579360564581515e-06, "learning_rate": 0.004579747375353763, "loss": 2.8353, "step": 4290 }, { "crossentropy": 2.7665414810180664, "epoch": 0.36500510377679485, "grad_norm": 0.04135267063975334, "grad_norm_var": 4.409787364961261e-06, "learning_rate": 0.004577750932036769, "loss": 2.7665, "step": 4291 }, { "crossentropy": 2.7513041496276855, "epoch": 0.3650901667233753, "grad_norm": 0.03997620567679405, "grad_norm_var": 4.875106198547139e-06, "learning_rate": 0.00457575455652081, "loss": 2.7513, "step": 4292 }, { "crossentropy": 2.635005235671997, "epoch": 0.36517522966995575, "grad_norm": 0.03549932688474655, "grad_norm_var": 4.866701565768514e-06, "learning_rate": 0.004573758249126445, "loss": 2.635, "step": 4293 }, { "crossentropy": 2.6170852184295654, "epoch": 0.36526029261653625, "grad_norm": 0.035410769283771515, "grad_norm_var": 5.003962549660936e-06, "learning_rate": 0.004571762010174222, "loss": 2.6171, "step": 4294 }, { "crossentropy": 2.746342897415161, "epoch": 0.3653453555631167, "grad_norm": 0.03730615973472595, "grad_norm_var": 4.897725028220566e-06, "learning_rate": 0.004569765839984681, "loss": 2.7463, "step": 4295 }, { "crossentropy": 2.7519121170043945, "epoch": 0.36543041850969715, "grad_norm": 0.03722413256764412, "grad_norm_var": 4.862817293480442e-06, "learning_rate": 0.004567769738878349, "loss": 2.7519, "step": 4296 }, { "crossentropy": 2.662385940551758, "epoch": 0.36551548145627766, "grad_norm": 0.03596189618110657, "grad_norm_var": 4.376733932035838e-06, "learning_rate": 0.004565773707175743, "loss": 2.6624, "step": 4297 }, { "crossentropy": 2.844398021697998, "epoch": 0.3656005444028581, "grad_norm": 0.04189557209610939, "grad_norm_var": 5.324508760140047e-06, "learning_rate": 0.004563777745197368, "loss": 2.8444, "step": 4298 }, { "crossentropy": 2.7080307006835938, "epoch": 0.36568560734943856, "grad_norm": 0.039661455899477005, "grad_norm_var": 5.71316866521468e-06, "learning_rate": 0.004561781853263717, "loss": 2.708, "step": 4299 }, { "crossentropy": 2.7383627891540527, "epoch": 0.36577067029601906, "grad_norm": 0.03425447642803192, "grad_norm_var": 6.2504335030168225e-06, "learning_rate": 0.004559786031695275, "loss": 2.7384, "step": 4300 }, { "crossentropy": 2.66235089302063, "epoch": 0.3658557332425995, "grad_norm": 0.035935353487730026, "grad_norm_var": 6.201257670795414e-06, "learning_rate": 0.00455779028081251, "loss": 2.6624, "step": 4301 }, { "crossentropy": 2.7029716968536377, "epoch": 0.36594079618918, "grad_norm": 0.03482727333903313, "grad_norm_var": 6.494643803733821e-06, "learning_rate": 0.0045557946009358846, "loss": 2.703, "step": 4302 }, { "crossentropy": 2.7248647212982178, "epoch": 0.36602585913576047, "grad_norm": 0.035867542028427124, "grad_norm_var": 6.435611503206815e-06, "learning_rate": 0.004553798992385846, "loss": 2.7249, "step": 4303 }, { "crossentropy": 2.6667656898498535, "epoch": 0.3661109220823409, "grad_norm": 0.03624870255589485, "grad_norm_var": 6.211860384935005e-06, "learning_rate": 0.004551803455482833, "loss": 2.6668, "step": 4304 }, { "crossentropy": 2.7618601322174072, "epoch": 0.3661959850289214, "grad_norm": 0.034545525908470154, "grad_norm_var": 6.342108726506366e-06, "learning_rate": 0.0045498079905472695, "loss": 2.7619, "step": 4305 }, { "crossentropy": 2.6648740768432617, "epoch": 0.36628104797550187, "grad_norm": 0.03708109259605408, "grad_norm_var": 5.736946335913966e-06, "learning_rate": 0.004547812597899572, "loss": 2.6649, "step": 4306 }, { "crossentropy": 2.7807259559631348, "epoch": 0.3663661109220823, "grad_norm": 0.0358266606926918, "grad_norm_var": 4.486709595451626e-06, "learning_rate": 0.0045458172778601435, "loss": 2.7807, "step": 4307 }, { "crossentropy": 2.72515606880188, "epoch": 0.3664511738686628, "grad_norm": 0.03703639283776283, "grad_norm_var": 3.750567006620814e-06, "learning_rate": 0.00454382203074937, "loss": 2.7252, "step": 4308 }, { "crossentropy": 2.780768871307373, "epoch": 0.3665362368152433, "grad_norm": 0.03888722136616707, "grad_norm_var": 3.99946732074404e-06, "learning_rate": 0.004541826856887636, "loss": 2.7808, "step": 4309 }, { "crossentropy": 2.68070125579834, "epoch": 0.3666212997618237, "grad_norm": 0.037027452141046524, "grad_norm_var": 3.874540887389456e-06, "learning_rate": 0.004539831756595305, "loss": 2.6807, "step": 4310 }, { "crossentropy": 2.656362771987915, "epoch": 0.36670636270840423, "grad_norm": 0.04064321145415306, "grad_norm_var": 4.773863361077352e-06, "learning_rate": 0.004537836730192734, "loss": 2.6564, "step": 4311 }, { "crossentropy": 2.659092426300049, "epoch": 0.3667914256549847, "grad_norm": 0.047622401267290115, "grad_norm_var": 1.1762294939857837e-05, "learning_rate": 0.004535841778000269, "loss": 2.6591, "step": 4312 }, { "crossentropy": 2.7200071811676025, "epoch": 0.36687648860156513, "grad_norm": 0.03599734604358673, "grad_norm_var": 1.1754121968918924e-05, "learning_rate": 0.004533846900338239, "loss": 2.72, "step": 4313 }, { "crossentropy": 2.624872922897339, "epoch": 0.36696155154814564, "grad_norm": 0.03651081398129463, "grad_norm_var": 1.0561138342533167e-05, "learning_rate": 0.004531852097526966, "loss": 2.6249, "step": 4314 }, { "crossentropy": 2.687643527984619, "epoch": 0.3670466144947261, "grad_norm": 0.03445607051253319, "grad_norm_var": 1.0666548179055768e-05, "learning_rate": 0.0045298573698867575, "loss": 2.6876, "step": 4315 }, { "crossentropy": 2.790276527404785, "epoch": 0.3671316774413066, "grad_norm": 0.035555943846702576, "grad_norm_var": 1.0287659472942485e-05, "learning_rate": 0.004527862717737907, "loss": 2.7903, "step": 4316 }, { "crossentropy": 2.7297778129577637, "epoch": 0.36721674038788704, "grad_norm": 0.037038132548332214, "grad_norm_var": 1.0188110639447409e-05, "learning_rate": 0.0045258681414007, "loss": 2.7298, "step": 4317 }, { "crossentropy": 2.705439567565918, "epoch": 0.3673018033344675, "grad_norm": 0.035679638385772705, "grad_norm_var": 9.964061714773144e-06, "learning_rate": 0.004523873641195407, "loss": 2.7054, "step": 4318 }, { "crossentropy": 2.673449754714966, "epoch": 0.367386866281048, "grad_norm": 0.1216062605381012, "grad_norm_var": 0.0004535882755211697, "learning_rate": 0.0045218792174422885, "loss": 2.6734, "step": 4319 }, { "crossentropy": 2.789414882659912, "epoch": 0.36747192922762845, "grad_norm": 0.03501173481345177, "grad_norm_var": 0.0004547330982775821, "learning_rate": 0.004519884870461592, "loss": 2.7894, "step": 4320 }, { "crossentropy": 2.6766011714935303, "epoch": 0.3675569921742089, "grad_norm": 0.038255564868450165, "grad_norm_var": 0.0004516422588778377, "learning_rate": 0.0045178906005735485, "loss": 2.6766, "step": 4321 }, { "crossentropy": 2.7392072677612305, "epoch": 0.3676420551207894, "grad_norm": 0.04112185537815094, "grad_norm_var": 0.00044960057136993816, "learning_rate": 0.004515896408098387, "loss": 2.7392, "step": 4322 }, { "crossentropy": 2.779060125350952, "epoch": 0.36772711806736985, "grad_norm": 0.0392380990087986, "grad_norm_var": 0.00044705722066177054, "learning_rate": 0.004513902293356311, "loss": 2.7791, "step": 4323 }, { "crossentropy": 2.6874232292175293, "epoch": 0.3678121810139503, "grad_norm": 0.039398662745952606, "grad_norm_var": 0.00044545503414292545, "learning_rate": 0.004511908256667518, "loss": 2.6874, "step": 4324 }, { "crossentropy": 2.7932684421539307, "epoch": 0.3678972439605308, "grad_norm": 0.04214276373386383, "grad_norm_var": 0.00044416805586652093, "learning_rate": 0.0045099142983521965, "loss": 2.7933, "step": 4325 }, { "crossentropy": 2.718691825866699, "epoch": 0.36798230690711126, "grad_norm": 0.046085916459560394, "grad_norm_var": 0.000441380446735954, "learning_rate": 0.0045079204187305185, "loss": 2.7187, "step": 4326 }, { "crossentropy": 2.646174907684326, "epoch": 0.3680673698536917, "grad_norm": 0.040947556495666504, "grad_norm_var": 0.00044124402294373007, "learning_rate": 0.00450592661812264, "loss": 2.6462, "step": 4327 }, { "crossentropy": 2.7622158527374268, "epoch": 0.3681524328002722, "grad_norm": 0.03634795919060707, "grad_norm_var": 0.00044399392099289615, "learning_rate": 0.004503932896848713, "loss": 2.7622, "step": 4328 }, { "crossentropy": 2.7647321224212646, "epoch": 0.36823749574685266, "grad_norm": 0.03715590387582779, "grad_norm_var": 0.0004429246918496119, "learning_rate": 0.004501939255228868, "loss": 2.7647, "step": 4329 }, { "crossentropy": 2.795980453491211, "epoch": 0.36832255869343317, "grad_norm": 0.036838166415691376, "grad_norm_var": 0.0004426248241726917, "learning_rate": 0.004499945693583226, "loss": 2.796, "step": 4330 }, { "crossentropy": 2.730131149291992, "epoch": 0.3684076216400136, "grad_norm": 0.037425145506858826, "grad_norm_var": 0.00043957372753749456, "learning_rate": 0.004497952212231898, "loss": 2.7301, "step": 4331 }, { "crossentropy": 2.639194965362549, "epoch": 0.36849268458659407, "grad_norm": 0.03605501726269722, "grad_norm_var": 0.0004390446633374623, "learning_rate": 0.0044959588114949775, "loss": 2.6392, "step": 4332 }, { "crossentropy": 2.7672460079193115, "epoch": 0.36857774753317457, "grad_norm": 0.03645380586385727, "grad_norm_var": 0.00043959062267697537, "learning_rate": 0.004493965491692549, "loss": 2.7672, "step": 4333 }, { "crossentropy": 2.6239378452301025, "epoch": 0.368662810479755, "grad_norm": 0.03422931954264641, "grad_norm_var": 0.0004412798480434479, "learning_rate": 0.004491972253144682, "loss": 2.6239, "step": 4334 }, { "crossentropy": 2.7011613845825195, "epoch": 0.36874787342633547, "grad_norm": 0.03530356287956238, "grad_norm_var": 9.682786756794124e-06, "learning_rate": 0.004489979096171431, "loss": 2.7012, "step": 4335 }, { "crossentropy": 2.645906686782837, "epoch": 0.368832936372916, "grad_norm": 0.038471512496471405, "grad_norm_var": 8.936773796814715e-06, "learning_rate": 0.004487986021092844, "loss": 2.6459, "step": 4336 }, { "crossentropy": 2.768720865249634, "epoch": 0.3689179993194964, "grad_norm": 0.03681190311908722, "grad_norm_var": 9.107718229080472e-06, "learning_rate": 0.004485993028228948, "loss": 2.7687, "step": 4337 }, { "crossentropy": 2.71870493888855, "epoch": 0.3690030622660769, "grad_norm": 0.03360928222537041, "grad_norm_var": 9.885379895740026e-06, "learning_rate": 0.004484000117899758, "loss": 2.7187, "step": 4338 }, { "crossentropy": 2.7132699489593506, "epoch": 0.3690881252126574, "grad_norm": 0.03904404118657112, "grad_norm_var": 9.85329635860051e-06, "learning_rate": 0.004482007290425283, "loss": 2.7133, "step": 4339 }, { "crossentropy": 2.6388115882873535, "epoch": 0.36917318815923783, "grad_norm": 0.03714191913604736, "grad_norm_var": 9.719161012457639e-06, "learning_rate": 0.004480014546125511, "loss": 2.6388, "step": 4340 }, { "crossentropy": 2.7477035522460938, "epoch": 0.3692582511058183, "grad_norm": 0.03884325921535492, "grad_norm_var": 8.468809363109247e-06, "learning_rate": 0.00447802188532042, "loss": 2.7477, "step": 4341 }, { "crossentropy": 2.689190626144409, "epoch": 0.3693433140523988, "grad_norm": 0.033331915736198425, "grad_norm_var": 4.115934061974374e-06, "learning_rate": 0.004476029308329974, "loss": 2.6892, "step": 4342 }, { "crossentropy": 2.6404967308044434, "epoch": 0.36942837699897924, "grad_norm": 0.0332510881125927, "grad_norm_var": 3.511304578472189e-06, "learning_rate": 0.004474036815474122, "loss": 2.6405, "step": 4343 }, { "crossentropy": 2.745023250579834, "epoch": 0.36951343994555974, "grad_norm": 0.03698934242129326, "grad_norm_var": 3.5437153808786742e-06, "learning_rate": 0.004472044407072805, "loss": 2.745, "step": 4344 }, { "crossentropy": 2.72067928314209, "epoch": 0.3695985028921402, "grad_norm": 0.03612183779478073, "grad_norm_var": 3.4938752794160905e-06, "learning_rate": 0.004470052083445942, "loss": 2.7207, "step": 4345 }, { "crossentropy": 2.6363730430603027, "epoch": 0.36968356583872064, "grad_norm": 0.033620283007621765, "grad_norm_var": 3.886579920010328e-06, "learning_rate": 0.004468059844913444, "loss": 2.6364, "step": 4346 }, { "crossentropy": 2.7847585678100586, "epoch": 0.36976862878530115, "grad_norm": 0.03511356562376022, "grad_norm_var": 3.7948440347969347e-06, "learning_rate": 0.004466067691795208, "loss": 2.7848, "step": 4347 }, { "crossentropy": 2.763843297958374, "epoch": 0.3698536917318816, "grad_norm": 0.03581975772976875, "grad_norm_var": 3.7934242943273022e-06, "learning_rate": 0.0044640756244111145, "loss": 2.7638, "step": 4348 }, { "crossentropy": 2.7786355018615723, "epoch": 0.36993875467846205, "grad_norm": 0.03688027337193489, "grad_norm_var": 3.83714788995537e-06, "learning_rate": 0.004462083643081033, "loss": 2.7786, "step": 4349 }, { "crossentropy": 2.695826292037964, "epoch": 0.37002381762504255, "grad_norm": 0.0367155559360981, "grad_norm_var": 3.6658673776010856e-06, "learning_rate": 0.00446009174812482, "loss": 2.6958, "step": 4350 }, { "crossentropy": 2.694186210632324, "epoch": 0.370108880571623, "grad_norm": 0.03492363169789314, "grad_norm_var": 3.7135537363055157e-06, "learning_rate": 0.0044580999398623165, "loss": 2.6942, "step": 4351 }, { "crossentropy": 2.695178270339966, "epoch": 0.37019394351820345, "grad_norm": 0.036161672323942184, "grad_norm_var": 3.29910623926005e-06, "learning_rate": 0.0044561082186133465, "loss": 2.6952, "step": 4352 }, { "crossentropy": 2.7251791954040527, "epoch": 0.37027900646478396, "grad_norm": 0.038039807230234146, "grad_norm_var": 3.542849319831722e-06, "learning_rate": 0.004454116584697725, "loss": 2.7252, "step": 4353 }, { "crossentropy": 2.814079523086548, "epoch": 0.3703640694113644, "grad_norm": 0.04242746904492378, "grad_norm_var": 5.6208318228630665e-06, "learning_rate": 0.00445212503843525, "loss": 2.8141, "step": 4354 }, { "crossentropy": 2.6066601276397705, "epoch": 0.37044913235794485, "grad_norm": 0.03664263337850571, "grad_norm_var": 5.175196955512807e-06, "learning_rate": 0.004450133580145709, "loss": 2.6067, "step": 4355 }, { "crossentropy": 2.7344918251037598, "epoch": 0.37053419530452536, "grad_norm": 0.038580041378736496, "grad_norm_var": 5.45122787656005e-06, "learning_rate": 0.004448142210148868, "loss": 2.7345, "step": 4356 }, { "crossentropy": 2.6648411750793457, "epoch": 0.3706192582511058, "grad_norm": 0.037768781185150146, "grad_norm_var": 5.1828642002982315e-06, "learning_rate": 0.004446150928764488, "loss": 2.6648, "step": 4357 }, { "crossentropy": 2.7460968494415283, "epoch": 0.3707043211976863, "grad_norm": 0.03443238511681557, "grad_norm_var": 4.808489250501292e-06, "learning_rate": 0.004444159736312312, "loss": 2.7461, "step": 4358 }, { "crossentropy": 2.80446457862854, "epoch": 0.37078938414426676, "grad_norm": 0.03694528341293335, "grad_norm_var": 4.076907720321904e-06, "learning_rate": 0.004442168633112065, "loss": 2.8045, "step": 4359 }, { "crossentropy": 2.7828874588012695, "epoch": 0.3708744470908472, "grad_norm": 0.03663874790072441, "grad_norm_var": 4.071012768010427e-06, "learning_rate": 0.004440177619483461, "loss": 2.7829, "step": 4360 }, { "crossentropy": 2.721977710723877, "epoch": 0.3709595100374277, "grad_norm": 0.03554290160536766, "grad_norm_var": 4.134813190094453e-06, "learning_rate": 0.004438186695746201, "loss": 2.722, "step": 4361 }, { "crossentropy": 2.7557826042175293, "epoch": 0.37104457298400817, "grad_norm": 0.0345630943775177, "grad_norm_var": 3.8106653972653944e-06, "learning_rate": 0.0044361958622199694, "loss": 2.7558, "step": 4362 }, { "crossentropy": 2.7899413108825684, "epoch": 0.3711296359305886, "grad_norm": 0.03517674282193184, "grad_norm_var": 3.7975536424388982e-06, "learning_rate": 0.004434205119224436, "loss": 2.7899, "step": 4363 }, { "crossentropy": 2.7083449363708496, "epoch": 0.3712146988771691, "grad_norm": 0.037523385137319565, "grad_norm_var": 3.7781685023034288e-06, "learning_rate": 0.004432214467079258, "loss": 2.7083, "step": 4364 }, { "crossentropy": 2.7023160457611084, "epoch": 0.3712997618237496, "grad_norm": 0.0342075489461422, "grad_norm_var": 4.199645231974201e-06, "learning_rate": 0.0044302239061040785, "loss": 2.7023, "step": 4365 }, { "crossentropy": 2.7525346279144287, "epoch": 0.37138482477033, "grad_norm": 0.034554366022348404, "grad_norm_var": 4.470689259779513e-06, "learning_rate": 0.004428233436618519, "loss": 2.7525, "step": 4366 }, { "crossentropy": 2.7309911251068115, "epoch": 0.37146988771691053, "grad_norm": 0.03734169527888298, "grad_norm_var": 4.325305049752518e-06, "learning_rate": 0.004426243058942196, "loss": 2.731, "step": 4367 }, { "crossentropy": 2.7031514644622803, "epoch": 0.371554950663491, "grad_norm": 0.03815304860472679, "grad_norm_var": 4.441062493865111e-06, "learning_rate": 0.004424252773394703, "loss": 2.7032, "step": 4368 }, { "crossentropy": 2.7536447048187256, "epoch": 0.37164001361007143, "grad_norm": 0.037899695336818695, "grad_norm_var": 4.418821896634041e-06, "learning_rate": 0.004422262580295627, "loss": 2.7536, "step": 4369 }, { "crossentropy": 2.6725854873657227, "epoch": 0.37172507655665193, "grad_norm": 0.036944106221199036, "grad_norm_var": 2.165321352873284e-06, "learning_rate": 0.004420272479964532, "loss": 2.6726, "step": 4370 }, { "crossentropy": 2.791525363922119, "epoch": 0.3718101395032324, "grad_norm": 0.03441550210118294, "grad_norm_var": 2.412826298143986e-06, "learning_rate": 0.004418282472720973, "loss": 2.7915, "step": 4371 }, { "crossentropy": 2.619020938873291, "epoch": 0.37189520244981283, "grad_norm": 0.03728095069527626, "grad_norm_var": 2.1221530636401806e-06, "learning_rate": 0.004416292558884489, "loss": 2.619, "step": 4372 }, { "crossentropy": 2.716952323913574, "epoch": 0.37198026539639334, "grad_norm": 0.03634600341320038, "grad_norm_var": 1.9532998024961054e-06, "learning_rate": 0.004414302738774599, "loss": 2.717, "step": 4373 }, { "crossentropy": 2.7472565174102783, "epoch": 0.3720653283429738, "grad_norm": 0.034673742949962616, "grad_norm_var": 1.902540014860297e-06, "learning_rate": 0.004412313012710813, "loss": 2.7473, "step": 4374 }, { "crossentropy": 2.679739236831665, "epoch": 0.3721503912895543, "grad_norm": 0.042077746242284775, "grad_norm_var": 4.101423571760815e-06, "learning_rate": 0.004410323381012622, "loss": 2.6797, "step": 4375 }, { "crossentropy": 2.6821398735046387, "epoch": 0.37223545423613474, "grad_norm": 0.038564372807741165, "grad_norm_var": 4.379401577079128e-06, "learning_rate": 0.004408333843999506, "loss": 2.6821, "step": 4376 }, { "crossentropy": 2.6784181594848633, "epoch": 0.3723205171827152, "grad_norm": 0.04051581025123596, "grad_norm_var": 5.237988260487862e-06, "learning_rate": 0.004406344401990926, "loss": 2.6784, "step": 4377 }, { "crossentropy": 2.6921932697296143, "epoch": 0.3724055801292957, "grad_norm": 0.03580453619360924, "grad_norm_var": 4.949172123481076e-06, "learning_rate": 0.004404355055306329, "loss": 2.6922, "step": 4378 }, { "crossentropy": 2.7826504707336426, "epoch": 0.37249064307587615, "grad_norm": 0.03612222522497177, "grad_norm_var": 4.779298483160536e-06, "learning_rate": 0.004402365804265147, "loss": 2.7827, "step": 4379 }, { "crossentropy": 2.7201011180877686, "epoch": 0.3725757060224566, "grad_norm": 0.03587094321846962, "grad_norm_var": 4.840492368059455e-06, "learning_rate": 0.004400376649186798, "loss": 2.7201, "step": 4380 }, { "crossentropy": 2.713711738586426, "epoch": 0.3726607689690371, "grad_norm": 0.0351819209754467, "grad_norm_var": 4.547013785149758e-06, "learning_rate": 0.004398387590390681, "loss": 2.7137, "step": 4381 }, { "crossentropy": 2.7333292961120605, "epoch": 0.37274583191561755, "grad_norm": 0.0366448275744915, "grad_norm_var": 4.142886671792042e-06, "learning_rate": 0.00439639862819618, "loss": 2.7333, "step": 4382 }, { "crossentropy": 2.7054519653320312, "epoch": 0.372830894862198, "grad_norm": 0.034605007618665695, "grad_norm_var": 4.528193139338721e-06, "learning_rate": 0.004394409762922669, "loss": 2.7055, "step": 4383 }, { "crossentropy": 2.6020209789276123, "epoch": 0.3729159578087785, "grad_norm": 0.03580573573708534, "grad_norm_var": 4.494088801058199e-06, "learning_rate": 0.004392420994889498, "loss": 2.602, "step": 4384 }, { "crossentropy": 2.7468349933624268, "epoch": 0.37300102075535896, "grad_norm": 0.03607796132564545, "grad_norm_var": 4.433683232449303e-06, "learning_rate": 0.00439043232441601, "loss": 2.7468, "step": 4385 }, { "crossentropy": 2.651003122329712, "epoch": 0.3730860837019394, "grad_norm": 0.03649521991610527, "grad_norm_var": 4.430662004293167e-06, "learning_rate": 0.004388443751821526, "loss": 2.651, "step": 4386 }, { "crossentropy": 2.739206552505493, "epoch": 0.3731711466485199, "grad_norm": 0.03497729077935219, "grad_norm_var": 4.282625736644746e-06, "learning_rate": 0.004386455277425355, "loss": 2.7392, "step": 4387 }, { "crossentropy": 2.709343433380127, "epoch": 0.37325620959510036, "grad_norm": 0.03302759677171707, "grad_norm_var": 5.078330365717217e-06, "learning_rate": 0.004384466901546786, "loss": 2.7093, "step": 4388 }, { "crossentropy": 2.6701130867004395, "epoch": 0.37334127254168087, "grad_norm": 0.03541431948542595, "grad_norm_var": 5.142325528543524e-06, "learning_rate": 0.004382478624505097, "loss": 2.6701, "step": 4389 }, { "crossentropy": 2.83566951751709, "epoch": 0.3734263354882613, "grad_norm": 0.03883064538240433, "grad_norm_var": 5.284262905652127e-06, "learning_rate": 0.004380490446619547, "loss": 2.8357, "step": 4390 }, { "crossentropy": 2.6656625270843506, "epoch": 0.37351139843484177, "grad_norm": 0.03908825293183327, "grad_norm_var": 3.6697725805847603e-06, "learning_rate": 0.004378502368209382, "loss": 2.6657, "step": 4391 }, { "crossentropy": 2.770620107650757, "epoch": 0.3735964613814223, "grad_norm": 0.03768514096736908, "grad_norm_var": 3.4689482622316897e-06, "learning_rate": 0.004376514389593827, "loss": 2.7706, "step": 4392 }, { "crossentropy": 2.726992607116699, "epoch": 0.3736815243280027, "grad_norm": 0.03480431064963341, "grad_norm_var": 2.3614274405026408e-06, "learning_rate": 0.004374526511092096, "loss": 2.727, "step": 4393 }, { "crossentropy": 2.755159378051758, "epoch": 0.3737665872745832, "grad_norm": 0.04024782404303551, "grad_norm_var": 3.463411081866946e-06, "learning_rate": 0.004372538733023386, "loss": 2.7552, "step": 4394 }, { "crossentropy": 2.7082786560058594, "epoch": 0.3738516502211637, "grad_norm": 0.03787479177117348, "grad_norm_var": 3.612680535071642e-06, "learning_rate": 0.004370551055706874, "loss": 2.7083, "step": 4395 }, { "crossentropy": 2.7456767559051514, "epoch": 0.37393671316774413, "grad_norm": 0.038571346551179886, "grad_norm_var": 3.872736742329084e-06, "learning_rate": 0.004368563479461725, "loss": 2.7457, "step": 4396 }, { "crossentropy": 2.64683198928833, "epoch": 0.3740217761143246, "grad_norm": 0.041667163372039795, "grad_norm_var": 5.28964661199906e-06, "learning_rate": 0.004366576004607088, "loss": 2.6468, "step": 4397 }, { "crossentropy": 2.594594955444336, "epoch": 0.3741068390609051, "grad_norm": 0.03491072356700897, "grad_norm_var": 5.557073972226971e-06, "learning_rate": 0.004364588631462092, "loss": 2.5946, "step": 4398 }, { "crossentropy": 2.6978070735931396, "epoch": 0.37419190200748553, "grad_norm": 0.035484328866004944, "grad_norm_var": 5.338648378898763e-06, "learning_rate": 0.004362601360345854, "loss": 2.6978, "step": 4399 }, { "crossentropy": 2.809148073196411, "epoch": 0.374276964954066, "grad_norm": 0.034820083528757095, "grad_norm_var": 5.547797796947156e-06, "learning_rate": 0.00436061419157747, "loss": 2.8091, "step": 4400 }, { "crossentropy": 2.646711826324463, "epoch": 0.3743620279006465, "grad_norm": 0.03696117550134659, "grad_norm_var": 5.502860504576617e-06, "learning_rate": 0.004358627125476025, "loss": 2.6467, "step": 4401 }, { "crossentropy": 2.708069324493408, "epoch": 0.37444709084722694, "grad_norm": 0.04137603938579559, "grad_norm_var": 6.709620772275363e-06, "learning_rate": 0.004356640162360581, "loss": 2.7081, "step": 4402 }, { "crossentropy": 2.8357784748077393, "epoch": 0.37453215379380744, "grad_norm": 0.03779887035489082, "grad_norm_var": 6.358274526466201e-06, "learning_rate": 0.004354653302550191, "loss": 2.8358, "step": 4403 }, { "crossentropy": 2.7382235527038574, "epoch": 0.3746172167403879, "grad_norm": 0.03919596970081329, "grad_norm_var": 5.1318854154785525e-06, "learning_rate": 0.004352666546363883, "loss": 2.7382, "step": 4404 }, { "crossentropy": 2.760298490524292, "epoch": 0.37470227968696834, "grad_norm": 0.03724553808569908, "grad_norm_var": 4.760030005531111e-06, "learning_rate": 0.004350679894120677, "loss": 2.7603, "step": 4405 }, { "crossentropy": 2.711092472076416, "epoch": 0.37478734263354885, "grad_norm": 0.034173499792814255, "grad_norm_var": 5.5440011048794755e-06, "learning_rate": 0.004348693346139569, "loss": 2.7111, "step": 4406 }, { "crossentropy": 2.626638412475586, "epoch": 0.3748724055801293, "grad_norm": 0.03501051664352417, "grad_norm_var": 5.7844527876602565e-06, "learning_rate": 0.004346706902739543, "loss": 2.6266, "step": 4407 }, { "crossentropy": 2.6227328777313232, "epoch": 0.37495746852670975, "grad_norm": 0.03404688835144043, "grad_norm_var": 6.456073066102066e-06, "learning_rate": 0.004344720564239566, "loss": 2.6227, "step": 4408 }, { "crossentropy": 2.6978602409362793, "epoch": 0.37504253147329025, "grad_norm": 0.03453761711716652, "grad_norm_var": 6.543460316451218e-06, "learning_rate": 0.0043427343309585835, "loss": 2.6979, "step": 4409 }, { "crossentropy": 2.745774984359741, "epoch": 0.3751275944198707, "grad_norm": 0.04087333008646965, "grad_norm_var": 6.828764586844227e-06, "learning_rate": 0.004340748203215528, "loss": 2.7458, "step": 4410 }, { "crossentropy": 2.7252840995788574, "epoch": 0.37521265736645115, "grad_norm": 0.03666679188609123, "grad_norm_var": 6.804717471451533e-06, "learning_rate": 0.004338762181329316, "loss": 2.7253, "step": 4411 }, { "crossentropy": 2.594118356704712, "epoch": 0.37529772031303166, "grad_norm": 0.03500580042600632, "grad_norm_var": 6.892071387822456e-06, "learning_rate": 0.004336776265618843, "loss": 2.5941, "step": 4412 }, { "crossentropy": 2.767643928527832, "epoch": 0.3753827832596121, "grad_norm": 0.03824642300605774, "grad_norm_var": 5.431280572954812e-06, "learning_rate": 0.004334790456402994, "loss": 2.7676, "step": 4413 }, { "crossentropy": 2.6735923290252686, "epoch": 0.37546784620619256, "grad_norm": 0.03615326061844826, "grad_norm_var": 5.240105982765031e-06, "learning_rate": 0.0043328047540006276, "loss": 2.6736, "step": 4414 }, { "crossentropy": 2.5942678451538086, "epoch": 0.37555290915277306, "grad_norm": 0.04205076023936272, "grad_norm_var": 6.848956297000365e-06, "learning_rate": 0.004330819158730595, "loss": 2.5943, "step": 4415 }, { "crossentropy": 2.7359659671783447, "epoch": 0.3756379720993535, "grad_norm": 0.032652754336595535, "grad_norm_var": 7.811542997294454e-06, "learning_rate": 0.004328833670911724, "loss": 2.736, "step": 4416 }, { "crossentropy": 2.679335594177246, "epoch": 0.375723035045934, "grad_norm": 0.03637153282761574, "grad_norm_var": 7.836301835510012e-06, "learning_rate": 0.004326848290862825, "loss": 2.6793, "step": 4417 }, { "crossentropy": 2.729510545730591, "epoch": 0.37580809799251447, "grad_norm": 0.04518163576722145, "grad_norm_var": 1.0980771308243854e-05, "learning_rate": 0.004324863018902693, "loss": 2.7295, "step": 4418 }, { "crossentropy": 2.6938862800598145, "epoch": 0.3758931609390949, "grad_norm": 0.03911696746945381, "grad_norm_var": 1.1194483892484376e-05, "learning_rate": 0.004322877855350108, "loss": 2.6939, "step": 4419 }, { "crossentropy": 2.684934377670288, "epoch": 0.3759782238856754, "grad_norm": 0.03517187014222145, "grad_norm_var": 1.1180215704087448e-05, "learning_rate": 0.004320892800523827, "loss": 2.6849, "step": 4420 }, { "crossentropy": 2.7588911056518555, "epoch": 0.37606328683225587, "grad_norm": 0.03734809532761574, "grad_norm_var": 1.118379888480602e-05, "learning_rate": 0.004318907854742596, "loss": 2.7589, "step": 4421 }, { "crossentropy": 2.702160358428955, "epoch": 0.3761483497788363, "grad_norm": 0.03514924645423889, "grad_norm_var": 1.0870635860952583e-05, "learning_rate": 0.004316923018325137, "loss": 2.7022, "step": 4422 }, { "crossentropy": 2.6997175216674805, "epoch": 0.3762334127254168, "grad_norm": 0.04077959805727005, "grad_norm_var": 1.1344320049685013e-05, "learning_rate": 0.004314938291590161, "loss": 2.6997, "step": 4423 }, { "crossentropy": 2.679039478302002, "epoch": 0.3763184756719973, "grad_norm": 0.040877070277929306, "grad_norm_var": 1.1152164603963251e-05, "learning_rate": 0.004312953674856354, "loss": 2.679, "step": 4424 }, { "crossentropy": 2.6158995628356934, "epoch": 0.3764035386185777, "grad_norm": 0.03572288528084755, "grad_norm_var": 1.0710737464022143e-05, "learning_rate": 0.00431096916844239, "loss": 2.6159, "step": 4425 }, { "crossentropy": 2.6161344051361084, "epoch": 0.37648860156515823, "grad_norm": 0.033002376556396484, "grad_norm_var": 1.1525833228863355e-05, "learning_rate": 0.004308984772666923, "loss": 2.6161, "step": 4426 }, { "crossentropy": 2.71864914894104, "epoch": 0.3765736645117387, "grad_norm": 0.05010290443897247, "grad_norm_var": 2.137253821168338e-05, "learning_rate": 0.004307000487848592, "loss": 2.7186, "step": 4427 }, { "crossentropy": 2.7086894512176514, "epoch": 0.37665872745831913, "grad_norm": 0.03452816978096962, "grad_norm_var": 2.1597114585529948e-05, "learning_rate": 0.004305016314306013, "loss": 2.7087, "step": 4428 }, { "crossentropy": 2.7569522857666016, "epoch": 0.37674379040489964, "grad_norm": 0.035625189542770386, "grad_norm_var": 2.2037744661572838e-05, "learning_rate": 0.004303032252357789, "loss": 2.757, "step": 4429 }, { "crossentropy": 2.7953126430511475, "epoch": 0.3768288533514801, "grad_norm": 0.03695032373070717, "grad_norm_var": 2.186900524575776e-05, "learning_rate": 0.004301048302322505, "loss": 2.7953, "step": 4430 }, { "crossentropy": 2.7042620182037354, "epoch": 0.3769139162980606, "grad_norm": 0.03816665709018707, "grad_norm_var": 2.079925818036857e-05, "learning_rate": 0.0042990644645187195, "loss": 2.7043, "step": 4431 }, { "crossentropy": 2.7393152713775635, "epoch": 0.37699897924464104, "grad_norm": 0.03512483462691307, "grad_norm_var": 1.944450450585833e-05, "learning_rate": 0.004297080739264987, "loss": 2.7393, "step": 4432 }, { "crossentropy": 2.6612236499786377, "epoch": 0.3770840421912215, "grad_norm": 0.032561253756284714, "grad_norm_var": 2.121793301875577e-05, "learning_rate": 0.004295097126879831, "loss": 2.6612, "step": 4433 }, { "crossentropy": 2.722499132156372, "epoch": 0.377169105137802, "grad_norm": 0.03470085933804512, "grad_norm_var": 1.7821177070024426e-05, "learning_rate": 0.004293113627681766, "loss": 2.7225, "step": 4434 }, { "crossentropy": 2.662733554840088, "epoch": 0.37725416808438245, "grad_norm": 0.034707825630903244, "grad_norm_var": 1.7899269811618342e-05, "learning_rate": 0.004291130241989285, "loss": 2.6627, "step": 4435 }, { "crossentropy": 2.662452459335327, "epoch": 0.3773392310309629, "grad_norm": 0.034756049513816833, "grad_norm_var": 1.8006301668758936e-05, "learning_rate": 0.004289146970120861, "loss": 2.6625, "step": 4436 }, { "crossentropy": 2.591372489929199, "epoch": 0.3774242939775434, "grad_norm": 0.03347279503941536, "grad_norm_var": 1.870380942124433e-05, "learning_rate": 0.004287163812394952, "loss": 2.5914, "step": 4437 }, { "crossentropy": 2.7796366214752197, "epoch": 0.37750935692412385, "grad_norm": 0.036457475274801254, "grad_norm_var": 1.8550873332736038e-05, "learning_rate": 0.0042851807691299945, "loss": 2.7796, "step": 4438 }, { "crossentropy": 2.744333505630493, "epoch": 0.3775944198707043, "grad_norm": 0.03478008508682251, "grad_norm_var": 1.7553906579597284e-05, "learning_rate": 0.004283197840644406, "loss": 2.7443, "step": 4439 }, { "crossentropy": 2.7648239135742188, "epoch": 0.3776794828172848, "grad_norm": 0.03542378172278404, "grad_norm_var": 1.611802365052891e-05, "learning_rate": 0.004281215027256592, "loss": 2.7648, "step": 4440 }, { "crossentropy": 2.626343250274658, "epoch": 0.37776454576386526, "grad_norm": 0.03433717414736748, "grad_norm_var": 1.6290199822617767e-05, "learning_rate": 0.0042792323292849335, "loss": 2.6263, "step": 4441 }, { "crossentropy": 2.7487635612487793, "epoch": 0.3778496087104457, "grad_norm": 0.03398279845714569, "grad_norm_var": 1.596905800373023e-05, "learning_rate": 0.004277249747047793, "loss": 2.7488, "step": 4442 }, { "crossentropy": 2.745577335357666, "epoch": 0.3779346716570262, "grad_norm": 0.03333231434226036, "grad_norm_var": 1.967171007902629e-06, "learning_rate": 0.004275267280863519, "loss": 2.7456, "step": 4443 }, { "crossentropy": 2.7658774852752686, "epoch": 0.37801973460360666, "grad_norm": 0.037269316613674164, "grad_norm_var": 2.2892952940128477e-06, "learning_rate": 0.004273284931050438, "loss": 2.7659, "step": 4444 }, { "crossentropy": 2.785372734069824, "epoch": 0.37810479755018717, "grad_norm": 0.03573736548423767, "grad_norm_var": 2.2978913535436998e-06, "learning_rate": 0.0042713026979268555, "loss": 2.7854, "step": 4445 }, { "crossentropy": 2.8104381561279297, "epoch": 0.3781898604967676, "grad_norm": 0.03771030157804489, "grad_norm_var": 2.5204641891244176e-06, "learning_rate": 0.004269320581811063, "loss": 2.8104, "step": 4446 }, { "crossentropy": 2.733665704727173, "epoch": 0.37827492344334807, "grad_norm": 0.040057264268398285, "grad_norm_var": 3.5024011215522543e-06, "learning_rate": 0.004267338583021331, "loss": 2.7337, "step": 4447 }, { "crossentropy": 2.625458002090454, "epoch": 0.37835998638992857, "grad_norm": 0.03487016633152962, "grad_norm_var": 3.511577993970706e-06, "learning_rate": 0.00426535670187591, "loss": 2.6255, "step": 4448 }, { "crossentropy": 2.62632417678833, "epoch": 0.378445049336509, "grad_norm": 0.03652026504278183, "grad_norm_var": 3.0667110933133477e-06, "learning_rate": 0.004263374938693037, "loss": 2.6263, "step": 4449 }, { "crossentropy": 2.6389055252075195, "epoch": 0.37853011228308947, "grad_norm": 0.03578343987464905, "grad_norm_var": 3.0235636331869705e-06, "learning_rate": 0.004261393293790922, "loss": 2.6389, "step": 4450 }, { "crossentropy": 2.7378244400024414, "epoch": 0.37861517522967, "grad_norm": 0.04026162251830101, "grad_norm_var": 4.309280031895325e-06, "learning_rate": 0.004259411767487762, "loss": 2.7378, "step": 4451 }, { "crossentropy": 2.7088305950164795, "epoch": 0.3787002381762504, "grad_norm": 0.03563743084669113, "grad_norm_var": 4.220810917307006e-06, "learning_rate": 0.004257430360101734, "loss": 2.7088, "step": 4452 }, { "crossentropy": 2.65185284614563, "epoch": 0.3787853011228309, "grad_norm": 0.03713143616914749, "grad_norm_var": 3.8357673683227695e-06, "learning_rate": 0.004255449071950991, "loss": 2.6519, "step": 4453 }, { "crossentropy": 2.759910821914673, "epoch": 0.3788703640694114, "grad_norm": 0.03884997218847275, "grad_norm_var": 4.273815479710561e-06, "learning_rate": 0.004253467903353675, "loss": 2.7599, "step": 4454 }, { "crossentropy": 2.6416819095611572, "epoch": 0.37895542701599183, "grad_norm": 0.03572428971529007, "grad_norm_var": 4.1312261142622516e-06, "learning_rate": 0.004251486854627902, "loss": 2.6417, "step": 4455 }, { "crossentropy": 2.6531174182891846, "epoch": 0.3790404899625723, "grad_norm": 0.03238147497177124, "grad_norm_var": 5.11150126941672e-06, "learning_rate": 0.004249505926091772, "loss": 2.6531, "step": 4456 }, { "crossentropy": 2.6847102642059326, "epoch": 0.3791255529091528, "grad_norm": 0.0326317735016346, "grad_norm_var": 5.722352346166729e-06, "learning_rate": 0.004247525118063366, "loss": 2.6847, "step": 4457 }, { "crossentropy": 2.7360455989837646, "epoch": 0.37921061585573324, "grad_norm": 0.03778119757771492, "grad_norm_var": 5.54292663301244e-06, "learning_rate": 0.0042455444308607425, "loss": 2.736, "step": 4458 }, { "crossentropy": 2.6648178100585938, "epoch": 0.3792956788023137, "grad_norm": 0.03936628997325897, "grad_norm_var": 5.386657509439073e-06, "learning_rate": 0.004243563864801947, "loss": 2.6648, "step": 4459 }, { "crossentropy": 2.6852128505706787, "epoch": 0.3793807417488942, "grad_norm": 0.03860960528254509, "grad_norm_var": 5.594934171017767e-06, "learning_rate": 0.004241583420204998, "loss": 2.6852, "step": 4460 }, { "crossentropy": 2.7217626571655273, "epoch": 0.37946580469547464, "grad_norm": 0.03714646399021149, "grad_norm_var": 5.516402659661428e-06, "learning_rate": 0.004239603097387896, "loss": 2.7218, "step": 4461 }, { "crossentropy": 2.658700942993164, "epoch": 0.37955086764205515, "grad_norm": 0.03736133500933647, "grad_norm_var": 5.486494531790435e-06, "learning_rate": 0.004237622896668628, "loss": 2.6587, "step": 4462 }, { "crossentropy": 2.592780828475952, "epoch": 0.3796359305886356, "grad_norm": 0.03316697105765343, "grad_norm_var": 5.536736141033688e-06, "learning_rate": 0.004235642818365155, "loss": 2.5928, "step": 4463 }, { "crossentropy": 2.623704195022583, "epoch": 0.37972099353521604, "grad_norm": 0.03573291376233101, "grad_norm_var": 5.401353332380211e-06, "learning_rate": 0.00423366286279542, "loss": 2.6237, "step": 4464 }, { "crossentropy": 2.738389253616333, "epoch": 0.37980605648179655, "grad_norm": 0.03587251901626587, "grad_norm_var": 5.4262933680691735e-06, "learning_rate": 0.004231683030277349, "loss": 2.7384, "step": 4465 }, { "crossentropy": 2.6636452674865723, "epoch": 0.379891119428377, "grad_norm": 0.0344502292573452, "grad_norm_var": 5.6585250709726325e-06, "learning_rate": 0.004229703321128846, "loss": 2.6636, "step": 4466 }, { "crossentropy": 2.653364896774292, "epoch": 0.37997618237495745, "grad_norm": 0.03414054214954376, "grad_norm_var": 4.833590503133753e-06, "learning_rate": 0.004227723735667792, "loss": 2.6534, "step": 4467 }, { "crossentropy": 2.591381549835205, "epoch": 0.38006124532153795, "grad_norm": 0.035102568566799164, "grad_norm_var": 4.877257631425748e-06, "learning_rate": 0.004225744274212054, "loss": 2.5914, "step": 4468 }, { "crossentropy": 2.753206729888916, "epoch": 0.3801463082681184, "grad_norm": 0.035069286823272705, "grad_norm_var": 4.822485631502706e-06, "learning_rate": 0.004223764937079476, "loss": 2.7532, "step": 4469 }, { "crossentropy": 2.71291446685791, "epoch": 0.38023137121469885, "grad_norm": 0.03805805370211601, "grad_norm_var": 4.5435143052790596e-06, "learning_rate": 0.004221785724587882, "loss": 2.7129, "step": 4470 }, { "crossentropy": 2.708773612976074, "epoch": 0.38031643416127936, "grad_norm": 0.036431413143873215, "grad_norm_var": 4.568832546159434e-06, "learning_rate": 0.004219806637055077, "loss": 2.7088, "step": 4471 }, { "crossentropy": 2.7223236560821533, "epoch": 0.3804014971078598, "grad_norm": 0.038470301777124405, "grad_norm_var": 4.085134257348725e-06, "learning_rate": 0.004217827674798845, "loss": 2.7223, "step": 4472 }, { "crossentropy": 2.754476308822632, "epoch": 0.38048656005444026, "grad_norm": 0.035641416907310486, "grad_norm_var": 3.214575765381605e-06, "learning_rate": 0.0042158488381369525, "loss": 2.7545, "step": 4473 }, { "crossentropy": 2.692960262298584, "epoch": 0.38057162300102076, "grad_norm": 0.03533712401986122, "grad_norm_var": 3.1378420074741503e-06, "learning_rate": 0.004213870127387141, "loss": 2.693, "step": 4474 }, { "crossentropy": 2.807227611541748, "epoch": 0.3806566859476012, "grad_norm": 0.0360465869307518, "grad_norm_var": 2.4460758606838104e-06, "learning_rate": 0.004211891542867134, "loss": 2.8072, "step": 4475 }, { "crossentropy": 2.6827869415283203, "epoch": 0.3807417488941817, "grad_norm": 0.03534156456589699, "grad_norm_var": 1.9938321924502117e-06, "learning_rate": 0.004209913084894637, "loss": 2.6828, "step": 4476 }, { "crossentropy": 2.7206947803497314, "epoch": 0.38082681184076217, "grad_norm": 0.05341554060578346, "grad_norm_var": 2.1380092057537068e-05, "learning_rate": 0.004207934753787333, "loss": 2.7207, "step": 4477 }, { "crossentropy": 2.70019793510437, "epoch": 0.3809118747873426, "grad_norm": 0.033103663474321365, "grad_norm_var": 2.2224159248085177e-05, "learning_rate": 0.004205956549862884, "loss": 2.7002, "step": 4478 }, { "crossentropy": 2.773164987564087, "epoch": 0.3809969377339231, "grad_norm": 0.033269092440605164, "grad_norm_var": 2.2178252921966658e-05, "learning_rate": 0.004203978473438934, "loss": 2.7732, "step": 4479 }, { "crossentropy": 2.717376947402954, "epoch": 0.3810820006805036, "grad_norm": 0.04429912194609642, "grad_norm_var": 2.5782511030313035e-05, "learning_rate": 0.004202000524833105, "loss": 2.7174, "step": 4480 }, { "crossentropy": 2.6832590103149414, "epoch": 0.381167063627084, "grad_norm": 0.036068618297576904, "grad_norm_var": 2.5752086265290867e-05, "learning_rate": 0.004200022704362997, "loss": 2.6833, "step": 4481 }, { "crossentropy": 2.7523579597473145, "epoch": 0.38125212657366453, "grad_norm": 0.036207396537065506, "grad_norm_var": 2.5314804909872846e-05, "learning_rate": 0.004198045012346192, "loss": 2.7524, "step": 4482 }, { "crossentropy": 2.5937063694000244, "epoch": 0.381337189520245, "grad_norm": 0.038846731185913086, "grad_norm_var": 2.4747818952949806e-05, "learning_rate": 0.0041960674491002495, "loss": 2.5937, "step": 4483 }, { "crossentropy": 2.7059109210968018, "epoch": 0.38142225246682543, "grad_norm": 0.03544266149401665, "grad_norm_var": 2.4644326728444816e-05, "learning_rate": 0.00419409001494271, "loss": 2.7059, "step": 4484 }, { "crossentropy": 2.725050926208496, "epoch": 0.38150731541340593, "grad_norm": 0.03464999049901962, "grad_norm_var": 2.4794870556138262e-05, "learning_rate": 0.004192112710191093, "loss": 2.7251, "step": 4485 }, { "crossentropy": 2.528923988342285, "epoch": 0.3815923783599864, "grad_norm": 0.034380316734313965, "grad_norm_var": 2.5385865894247903e-05, "learning_rate": 0.004190135535162894, "loss": 2.5289, "step": 4486 }, { "crossentropy": 2.7062039375305176, "epoch": 0.38167744130656683, "grad_norm": 0.037062764167785645, "grad_norm_var": 2.5336863597268573e-05, "learning_rate": 0.004188158490175595, "loss": 2.7062, "step": 4487 }, { "crossentropy": 2.577810525894165, "epoch": 0.38176250425314734, "grad_norm": 0.03536450117826462, "grad_norm_var": 2.5475371113801473e-05, "learning_rate": 0.00418618157554665, "loss": 2.5778, "step": 4488 }, { "crossentropy": 2.7012228965759277, "epoch": 0.3818475671997278, "grad_norm": 0.034638311713933945, "grad_norm_var": 2.5740673289993224e-05, "learning_rate": 0.004184204791593493, "loss": 2.7012, "step": 4489 }, { "crossentropy": 2.6786701679229736, "epoch": 0.3819326301463083, "grad_norm": 0.03925420343875885, "grad_norm_var": 2.578304610624706e-05, "learning_rate": 0.00418222813863354, "loss": 2.6787, "step": 4490 }, { "crossentropy": 2.7067534923553467, "epoch": 0.38201769309288874, "grad_norm": 0.03732647746801376, "grad_norm_var": 2.566522689966538e-05, "learning_rate": 0.0041802516169841845, "loss": 2.7068, "step": 4491 }, { "crossentropy": 2.739900827407837, "epoch": 0.3821027560394692, "grad_norm": 0.040253326296806335, "grad_norm_var": 2.581390143663297e-05, "learning_rate": 0.004178275226962799, "loss": 2.7399, "step": 4492 }, { "crossentropy": 2.648703098297119, "epoch": 0.3821878189860497, "grad_norm": 0.03477941080927849, "grad_norm_var": 8.529673963431998e-06, "learning_rate": 0.004176298968886734, "loss": 2.6487, "step": 4493 }, { "crossentropy": 2.7173104286193848, "epoch": 0.38227288193263015, "grad_norm": 0.036848098039627075, "grad_norm_var": 7.680788403887212e-06, "learning_rate": 0.00417432284307332, "loss": 2.7173, "step": 4494 }, { "crossentropy": 2.7005374431610107, "epoch": 0.3823579448792106, "grad_norm": 0.03563021123409271, "grad_norm_var": 6.919777145160804e-06, "learning_rate": 0.004172346849839867, "loss": 2.7005, "step": 4495 }, { "crossentropy": 2.6588711738586426, "epoch": 0.3824430078257911, "grad_norm": 0.03404427319765091, "grad_norm_var": 3.431210530814808e-06, "learning_rate": 0.004170370989503663, "loss": 2.6589, "step": 4496 }, { "crossentropy": 2.662632703781128, "epoch": 0.38252807077237155, "grad_norm": 0.0356740728020668, "grad_norm_var": 3.4531028435748807e-06, "learning_rate": 0.0041683952623819695, "loss": 2.6626, "step": 4497 }, { "crossentropy": 2.68042254447937, "epoch": 0.382613133718952, "grad_norm": 0.03676390275359154, "grad_norm_var": 3.4674300702435915e-06, "learning_rate": 0.004166419668792037, "loss": 2.6804, "step": 4498 }, { "crossentropy": 2.7447609901428223, "epoch": 0.3826981966655325, "grad_norm": 0.03649422526359558, "grad_norm_var": 3.017618164885318e-06, "learning_rate": 0.004164444209051084, "loss": 2.7448, "step": 4499 }, { "crossentropy": 2.6325204372406006, "epoch": 0.38278325961211296, "grad_norm": 0.04538555443286896, "grad_norm_var": 8.241575491363337e-06, "learning_rate": 0.004162468883476319, "loss": 2.6325, "step": 4500 }, { "crossentropy": 2.6215322017669678, "epoch": 0.3828683225586934, "grad_norm": 0.03602948039770126, "grad_norm_var": 7.967935072076583e-06, "learning_rate": 0.004160493692384915, "loss": 2.6215, "step": 4501 }, { "crossentropy": 2.654785633087158, "epoch": 0.3829533855052739, "grad_norm": 0.03386330604553223, "grad_norm_var": 8.156306375606213e-06, "learning_rate": 0.004158518636094037, "loss": 2.6548, "step": 4502 }, { "crossentropy": 2.7432303428649902, "epoch": 0.38303844845185436, "grad_norm": 0.03551256284117699, "grad_norm_var": 8.26009755411523e-06, "learning_rate": 0.004156543714920817, "loss": 2.7432, "step": 4503 }, { "crossentropy": 2.7435672283172607, "epoch": 0.38312351139843487, "grad_norm": 0.034492164850234985, "grad_norm_var": 8.46780389805272e-06, "learning_rate": 0.004154568929182374, "loss": 2.7436, "step": 4504 }, { "crossentropy": 2.6770238876342773, "epoch": 0.3832085743450153, "grad_norm": 0.03449651971459389, "grad_norm_var": 8.507789282226572e-06, "learning_rate": 0.004152594279195799, "loss": 2.677, "step": 4505 }, { "crossentropy": 2.7286229133605957, "epoch": 0.38329363729159577, "grad_norm": 0.033664435148239136, "grad_norm_var": 8.54057312159757e-06, "learning_rate": 0.004150619765278168, "loss": 2.7286, "step": 4506 }, { "crossentropy": 2.75671648979187, "epoch": 0.3833787002381763, "grad_norm": 0.03689255565404892, "grad_norm_var": 8.49460921365855e-06, "learning_rate": 0.004148645387746525, "loss": 2.7567, "step": 4507 }, { "crossentropy": 2.665898561477661, "epoch": 0.3834637631847567, "grad_norm": 0.03469757363200188, "grad_norm_var": 7.496380451715336e-06, "learning_rate": 0.004146671146917904, "loss": 2.6659, "step": 4508 }, { "crossentropy": 2.7092628479003906, "epoch": 0.3835488261313372, "grad_norm": 0.034797303378582, "grad_norm_var": 7.493597623453585e-06, "learning_rate": 0.00414469704310931, "loss": 2.7093, "step": 4509 }, { "crossentropy": 2.69781231880188, "epoch": 0.3836338890779177, "grad_norm": 0.03619474172592163, "grad_norm_var": 7.4425097501181385e-06, "learning_rate": 0.004142723076637723, "loss": 2.6978, "step": 4510 }, { "crossentropy": 2.603915214538574, "epoch": 0.38371895202449813, "grad_norm": 0.03343487158417702, "grad_norm_var": 7.826960386606424e-06, "learning_rate": 0.004140749247820107, "loss": 2.6039, "step": 4511 }, { "crossentropy": 2.6637747287750244, "epoch": 0.3838040149710786, "grad_norm": 0.03725121542811394, "grad_norm_var": 7.72869151149357e-06, "learning_rate": 0.004138775556973406, "loss": 2.6638, "step": 4512 }, { "crossentropy": 2.673497438430786, "epoch": 0.3838890779176591, "grad_norm": 0.04268443211913109, "grad_norm_var": 1.0516382746301028e-05, "learning_rate": 0.004136802004414533, "loss": 2.6735, "step": 4513 }, { "crossentropy": 2.7060704231262207, "epoch": 0.38397414086423953, "grad_norm": 0.03639503940939903, "grad_norm_var": 1.0507772479896535e-05, "learning_rate": 0.004134828590460386, "loss": 2.7061, "step": 4514 }, { "crossentropy": 2.612100839614868, "epoch": 0.38405920381082, "grad_norm": 0.033878911286592484, "grad_norm_var": 1.0899922077867163e-05, "learning_rate": 0.004132855315427839, "loss": 2.6121, "step": 4515 }, { "crossentropy": 2.7502455711364746, "epoch": 0.3841442667574005, "grad_norm": 0.03592381998896599, "grad_norm_var": 4.944139113122134e-06, "learning_rate": 0.0041308821796337426, "loss": 2.7502, "step": 4516 }, { "crossentropy": 2.627310276031494, "epoch": 0.38422932970398094, "grad_norm": 0.0340457558631897, "grad_norm_var": 5.0865569933009405e-06, "learning_rate": 0.004128909183394923, "loss": 2.6273, "step": 4517 }, { "crossentropy": 2.6834914684295654, "epoch": 0.38431439265056144, "grad_norm": 0.03389884904026985, "grad_norm_var": 5.078812844294446e-06, "learning_rate": 0.0041269363270281904, "loss": 2.6835, "step": 4518 }, { "crossentropy": 2.8248391151428223, "epoch": 0.3843994555971419, "grad_norm": 0.03685183823108673, "grad_norm_var": 5.190249699728253e-06, "learning_rate": 0.004124963610850326, "loss": 2.8248, "step": 4519 }, { "crossentropy": 2.680384635925293, "epoch": 0.38448451854372234, "grad_norm": 0.03732982277870178, "grad_norm_var": 5.274363674580227e-06, "learning_rate": 0.0041229910351780925, "loss": 2.6804, "step": 4520 }, { "crossentropy": 2.7202682495117188, "epoch": 0.38456958149030285, "grad_norm": 0.03504543751478195, "grad_norm_var": 5.199452468777436e-06, "learning_rate": 0.004121018600328227, "loss": 2.7203, "step": 4521 }, { "crossentropy": 2.6085188388824463, "epoch": 0.3846546444368833, "grad_norm": 0.03485441580414772, "grad_norm_var": 4.947267945443143e-06, "learning_rate": 0.004119046306617449, "loss": 2.6085, "step": 4522 }, { "crossentropy": 2.6616408824920654, "epoch": 0.38473970738346375, "grad_norm": 0.0370156429708004, "grad_norm_var": 4.964733483747477e-06, "learning_rate": 0.004117074154362448, "loss": 2.6616, "step": 4523 }, { "crossentropy": 2.777034044265747, "epoch": 0.38482477033004425, "grad_norm": 0.03534964844584465, "grad_norm_var": 4.887310839525825e-06, "learning_rate": 0.0041151021438799, "loss": 2.777, "step": 4524 }, { "crossentropy": 2.7266769409179688, "epoch": 0.3849098332766247, "grad_norm": 0.03509219363331795, "grad_norm_var": 4.848033387647108e-06, "learning_rate": 0.0041131302754864475, "loss": 2.7267, "step": 4525 }, { "crossentropy": 2.6382033824920654, "epoch": 0.38499489622320515, "grad_norm": 0.037183381617069244, "grad_norm_var": 4.940998747261724e-06, "learning_rate": 0.004111158549498719, "loss": 2.6382, "step": 4526 }, { "crossentropy": 2.6892478466033936, "epoch": 0.38507995916978566, "grad_norm": 0.03753206878900528, "grad_norm_var": 4.5808431173013e-06, "learning_rate": 0.004109186966233315, "loss": 2.6892, "step": 4527 }, { "crossentropy": 2.6897592544555664, "epoch": 0.3851650221163661, "grad_norm": 0.03617085888981819, "grad_norm_var": 4.5125618841602985e-06, "learning_rate": 0.004107215526006817, "loss": 2.6898, "step": 4528 }, { "crossentropy": 2.7752177715301514, "epoch": 0.38525008506294656, "grad_norm": 0.0335969477891922, "grad_norm_var": 1.8209501201520036e-06, "learning_rate": 0.00410524422913578, "loss": 2.7752, "step": 4529 }, { "crossentropy": 2.6690690517425537, "epoch": 0.38533514800952706, "grad_norm": 0.038014426827430725, "grad_norm_var": 2.1488950209369582e-06, "learning_rate": 0.004103273075936739, "loss": 2.6691, "step": 4530 }, { "crossentropy": 2.6594650745391846, "epoch": 0.3854202109561075, "grad_norm": 0.032881710678339005, "grad_norm_var": 2.4580308945393313e-06, "learning_rate": 0.004101302066726204, "loss": 2.6595, "step": 4531 }, { "crossentropy": 2.690293550491333, "epoch": 0.38550527390268796, "grad_norm": 0.0352177619934082, "grad_norm_var": 2.4656865284364627e-06, "learning_rate": 0.0040993312018206595, "loss": 2.6903, "step": 4532 }, { "crossentropy": 2.7781128883361816, "epoch": 0.38559033684926847, "grad_norm": 0.039638206362724304, "grad_norm_var": 3.2390623924927546e-06, "learning_rate": 0.0040973604815365714, "loss": 2.7781, "step": 4533 }, { "crossentropy": 2.6676416397094727, "epoch": 0.3856753997958489, "grad_norm": 0.0350588895380497, "grad_norm_var": 3.001337971156257e-06, "learning_rate": 0.004095389906190383, "loss": 2.6676, "step": 4534 }, { "crossentropy": 2.722829580307007, "epoch": 0.3857604627424294, "grad_norm": 0.03784078732132912, "grad_norm_var": 3.167920495316991e-06, "learning_rate": 0.004093419476098509, "loss": 2.7228, "step": 4535 }, { "crossentropy": 2.725126028060913, "epoch": 0.38584552568900987, "grad_norm": 0.03408295661211014, "grad_norm_var": 3.3004070494890847e-06, "learning_rate": 0.004091449191577346, "loss": 2.7251, "step": 4536 }, { "crossentropy": 2.7725017070770264, "epoch": 0.3859305886355903, "grad_norm": 0.03440723195672035, "grad_norm_var": 3.399514396054161e-06, "learning_rate": 0.004089479052943261, "loss": 2.7725, "step": 4537 }, { "crossentropy": 2.7498981952667236, "epoch": 0.3860156515821708, "grad_norm": 0.03790733963251114, "grad_norm_var": 3.5681999458164746e-06, "learning_rate": 0.004087509060512607, "loss": 2.7499, "step": 4538 }, { "crossentropy": 2.6771774291992188, "epoch": 0.3861007145287513, "grad_norm": 0.03437967970967293, "grad_norm_var": 3.6672570285035954e-06, "learning_rate": 0.004085539214601704, "loss": 2.6772, "step": 4539 }, { "crossentropy": 2.7313334941864014, "epoch": 0.3861857774753317, "grad_norm": 0.035225119441747665, "grad_norm_var": 3.6773165667504382e-06, "learning_rate": 0.004083569515526852, "loss": 2.7313, "step": 4540 }, { "crossentropy": 2.699291229248047, "epoch": 0.38627084042191223, "grad_norm": 0.03429265320301056, "grad_norm_var": 3.802251527673141e-06, "learning_rate": 0.00408159996360433, "loss": 2.6993, "step": 4541 }, { "crossentropy": 2.776062250137329, "epoch": 0.3863559033684927, "grad_norm": 0.038721080869436264, "grad_norm_var": 4.2255907663692195e-06, "learning_rate": 0.004079630559150392, "loss": 2.7761, "step": 4542 }, { "crossentropy": 2.8120861053466797, "epoch": 0.38644096631507313, "grad_norm": 0.03750777244567871, "grad_norm_var": 4.2204554996594364e-06, "learning_rate": 0.004077661302481264, "loss": 2.8121, "step": 4543 }, { "crossentropy": 2.7758758068084717, "epoch": 0.38652602926165364, "grad_norm": 0.035246968269348145, "grad_norm_var": 4.244621865894839e-06, "learning_rate": 0.004075692193913156, "loss": 2.7759, "step": 4544 }, { "crossentropy": 2.676151752471924, "epoch": 0.3866110922082341, "grad_norm": 0.038068946450948715, "grad_norm_var": 4.13549100949905e-06, "learning_rate": 0.0040737232337622505, "loss": 2.6762, "step": 4545 }, { "crossentropy": 2.667067289352417, "epoch": 0.38669615515481454, "grad_norm": 0.03742256388068199, "grad_norm_var": 4.010704964524323e-06, "learning_rate": 0.0040717544223447, "loss": 2.6671, "step": 4546 }, { "crossentropy": 2.7670650482177734, "epoch": 0.38678121810139504, "grad_norm": 0.036077629774808884, "grad_norm_var": 3.269707038393241e-06, "learning_rate": 0.004069785759976644, "loss": 2.7671, "step": 4547 }, { "crossentropy": 2.7549946308135986, "epoch": 0.3868662810479755, "grad_norm": 0.035111527889966965, "grad_norm_var": 3.286003483843105e-06, "learning_rate": 0.004067817246974191, "loss": 2.755, "step": 4548 }, { "crossentropy": 2.65639328956604, "epoch": 0.386951343994556, "grad_norm": 0.038432877510786057, "grad_norm_var": 2.8422216696174483e-06, "learning_rate": 0.004065848883653428, "loss": 2.6564, "step": 4549 }, { "crossentropy": 2.7161667346954346, "epoch": 0.38703640694113645, "grad_norm": 0.036384113132953644, "grad_norm_var": 2.7439053748687842e-06, "learning_rate": 0.004063880670330417, "loss": 2.7162, "step": 4550 }, { "crossentropy": 2.691061019897461, "epoch": 0.3871214698877169, "grad_norm": 0.03549724444746971, "grad_norm_var": 2.6117534319071895e-06, "learning_rate": 0.004061912607321197, "loss": 2.6911, "step": 4551 }, { "crossentropy": 2.6986193656921387, "epoch": 0.3872065328342974, "grad_norm": 0.033936433494091034, "grad_norm_var": 2.6539244004697426e-06, "learning_rate": 0.004059944694941783, "loss": 2.6986, "step": 4552 }, { "crossentropy": 2.6221859455108643, "epoch": 0.38729159578087785, "grad_norm": 0.03521086275577545, "grad_norm_var": 2.506081515151783e-06, "learning_rate": 0.004057976933508164, "loss": 2.6222, "step": 4553 }, { "crossentropy": 2.7303459644317627, "epoch": 0.3873766587274583, "grad_norm": 0.03954652324318886, "grad_norm_var": 3.0441230222220023e-06, "learning_rate": 0.004056009323336303, "loss": 2.7303, "step": 4554 }, { "crossentropy": 2.654237747192383, "epoch": 0.3874617216740388, "grad_norm": 0.03583652526140213, "grad_norm_var": 2.8005775437396515e-06, "learning_rate": 0.0040540418647421455, "loss": 2.6542, "step": 4555 }, { "crossentropy": 2.687166690826416, "epoch": 0.38754678462061926, "grad_norm": 0.034276459366083145, "grad_norm_var": 3.0063725999118195e-06, "learning_rate": 0.004052074558041608, "loss": 2.6872, "step": 4556 }, { "crossentropy": 2.821528673171997, "epoch": 0.3876318475671997, "grad_norm": 0.038237158209085464, "grad_norm_var": 2.8977690957478207e-06, "learning_rate": 0.004050107403550582, "loss": 2.8215, "step": 4557 }, { "crossentropy": 2.6686034202575684, "epoch": 0.3877169105137802, "grad_norm": 0.03568507730960846, "grad_norm_var": 2.6130784826100887e-06, "learning_rate": 0.004048140401584937, "loss": 2.6686, "step": 4558 }, { "crossentropy": 2.928129196166992, "epoch": 0.38780197346036066, "grad_norm": 0.1690991371870041, "grad_norm_var": 0.0011042311876853597, "learning_rate": 0.004046173552460515, "loss": 2.9281, "step": 4559 }, { "crossentropy": 2.679699182510376, "epoch": 0.3878870364069411, "grad_norm": 0.04197278246283531, "grad_norm_var": 0.0010986445611056862, "learning_rate": 0.004044206856493139, "loss": 2.6797, "step": 4560 }, { "crossentropy": 2.682225465774536, "epoch": 0.3879720993535216, "grad_norm": 0.04002527892589569, "grad_norm_var": 0.0010970628626217963, "learning_rate": 0.0040422403139986005, "loss": 2.6822, "step": 4561 }, { "crossentropy": 2.7292327880859375, "epoch": 0.38805716230010207, "grad_norm": 0.03553156927227974, "grad_norm_var": 0.0010992402425696924, "learning_rate": 0.004040273925292669, "loss": 2.7292, "step": 4562 }, { "crossentropy": 2.672220230102539, "epoch": 0.38814222524668257, "grad_norm": 0.039614368230104446, "grad_norm_var": 0.0010957891643491567, "learning_rate": 0.004038307690691092, "loss": 2.6722, "step": 4563 }, { "crossentropy": 2.744274139404297, "epoch": 0.388227288193263, "grad_norm": 0.03633484989404678, "grad_norm_var": 0.0010942249577821322, "learning_rate": 0.00403634161050959, "loss": 2.7443, "step": 4564 }, { "crossentropy": 2.7578001022338867, "epoch": 0.38831235113984347, "grad_norm": 0.03337182477116585, "grad_norm_var": 0.0011004944682852615, "learning_rate": 0.004034375685063856, "loss": 2.7578, "step": 4565 }, { "crossentropy": 2.678931474685669, "epoch": 0.388397414086424, "grad_norm": 0.03914426639676094, "grad_norm_var": 0.0010977869132298747, "learning_rate": 0.004032409914669566, "loss": 2.6789, "step": 4566 }, { "crossentropy": 2.7008860111236572, "epoch": 0.3884824770330044, "grad_norm": 0.040595587342977524, "grad_norm_var": 0.0010928106383289273, "learning_rate": 0.004030444299642363, "loss": 2.7009, "step": 4567 }, { "crossentropy": 2.692305326461792, "epoch": 0.3885675399795849, "grad_norm": 0.04016570374369621, "grad_norm_var": 0.0010856097970343749, "learning_rate": 0.004028478840297866, "loss": 2.6923, "step": 4568 }, { "crossentropy": 2.748718023300171, "epoch": 0.3886526029261654, "grad_norm": 0.03544468805193901, "grad_norm_var": 0.0010852794788962722, "learning_rate": 0.004026513536951676, "loss": 2.7487, "step": 4569 }, { "crossentropy": 2.7341582775115967, "epoch": 0.38873766587274583, "grad_norm": 0.03456219285726547, "grad_norm_var": 0.0010910745897736015, "learning_rate": 0.004024548389919359, "loss": 2.7342, "step": 4570 }, { "crossentropy": 2.5981836318969727, "epoch": 0.3888227288193263, "grad_norm": 0.03607827052474022, "grad_norm_var": 0.0010907629398873866, "learning_rate": 0.004022583399516464, "loss": 2.5982, "step": 4571 }, { "crossentropy": 2.7305288314819336, "epoch": 0.3889077917659068, "grad_norm": 0.03640188276767731, "grad_norm_var": 0.00108782675266673, "learning_rate": 0.0040206185660585135, "loss": 2.7305, "step": 4572 }, { "crossentropy": 2.6947216987609863, "epoch": 0.38899285471248723, "grad_norm": 0.03489217162132263, "grad_norm_var": 0.001091884151877992, "learning_rate": 0.004018653889860999, "loss": 2.6947, "step": 4573 }, { "crossentropy": 2.792466878890991, "epoch": 0.3890779176590677, "grad_norm": 0.03934304043650627, "grad_norm_var": 0.0010879053948055272, "learning_rate": 0.004016689371239395, "loss": 2.7925, "step": 4574 }, { "crossentropy": 2.699305772781372, "epoch": 0.3891629806056482, "grad_norm": 0.035057224333286285, "grad_norm_var": 6.97550983323747e-06, "learning_rate": 0.004014725010509144, "loss": 2.6993, "step": 4575 }, { "crossentropy": 2.6807055473327637, "epoch": 0.38924804355222864, "grad_norm": 0.03368843346834183, "grad_norm_var": 6.223277115600425e-06, "learning_rate": 0.004012760807985665, "loss": 2.6807, "step": 4576 }, { "crossentropy": 2.6212785243988037, "epoch": 0.38933310649880915, "grad_norm": 0.03421292454004288, "grad_norm_var": 5.905513168959151e-06, "learning_rate": 0.004010796763984355, "loss": 2.6213, "step": 4577 }, { "crossentropy": 2.8687615394592285, "epoch": 0.3894181694453896, "grad_norm": 0.03964030742645264, "grad_norm_var": 6.415053067191738e-06, "learning_rate": 0.00400883287882058, "loss": 2.8688, "step": 4578 }, { "crossentropy": 2.6831071376800537, "epoch": 0.38950323239197004, "grad_norm": 0.037952277809381485, "grad_norm_var": 5.96052013166759e-06, "learning_rate": 0.004006869152809685, "loss": 2.6831, "step": 4579 }, { "crossentropy": 2.7983198165893555, "epoch": 0.38958829533855055, "grad_norm": 0.03668126091361046, "grad_norm_var": 5.9520620302112744e-06, "learning_rate": 0.004004905586266988, "loss": 2.7983, "step": 4580 }, { "crossentropy": 2.7432103157043457, "epoch": 0.389673358285131, "grad_norm": 0.03662329539656639, "grad_norm_var": 5.169085387830783e-06, "learning_rate": 0.0040029421795077826, "loss": 2.7432, "step": 4581 }, { "crossentropy": 2.7433276176452637, "epoch": 0.38975842123171145, "grad_norm": 0.0346701554954052, "grad_norm_var": 5.0844910256974666e-06, "learning_rate": 0.004000978932847331, "loss": 2.7433, "step": 4582 }, { "crossentropy": 2.7386486530303955, "epoch": 0.38984348417829195, "grad_norm": 0.03482522815465927, "grad_norm_var": 4.111113809908881e-06, "learning_rate": 0.003999015846600879, "loss": 2.7386, "step": 4583 }, { "crossentropy": 2.7221603393554688, "epoch": 0.3899285471248724, "grad_norm": 0.036104291677474976, "grad_norm_var": 3.029708293660003e-06, "learning_rate": 0.0039970529210836365, "loss": 2.7222, "step": 4584 }, { "crossentropy": 2.6475040912628174, "epoch": 0.39001361007145285, "grad_norm": 0.040891166776418686, "grad_norm_var": 4.472387635778783e-06, "learning_rate": 0.003995090156610798, "loss": 2.6475, "step": 4585 }, { "crossentropy": 2.7297775745391846, "epoch": 0.39009867301803336, "grad_norm": 0.03818797320127487, "grad_norm_var": 4.429008621389894e-06, "learning_rate": 0.003993127553497523, "loss": 2.7298, "step": 4586 }, { "crossentropy": 2.742159366607666, "epoch": 0.3901837359646138, "grad_norm": 0.03366886079311371, "grad_norm_var": 4.952415723141525e-06, "learning_rate": 0.0039911651120589494, "loss": 2.7422, "step": 4587 }, { "crossentropy": 2.7029404640197754, "epoch": 0.39026879891119426, "grad_norm": 0.034756142646074295, "grad_norm_var": 5.127322534414074e-06, "learning_rate": 0.0039892028326101924, "loss": 2.7029, "step": 4588 }, { "crossentropy": 2.648836612701416, "epoch": 0.39035386185777476, "grad_norm": 0.037950895726680756, "grad_norm_var": 5.127843079333743e-06, "learning_rate": 0.003987240715466336, "loss": 2.6488, "step": 4589 }, { "crossentropy": 2.682159423828125, "epoch": 0.3904389248043552, "grad_norm": 0.03517019748687744, "grad_norm_var": 4.643137949875836e-06, "learning_rate": 0.003985278760942436, "loss": 2.6822, "step": 4590 }, { "crossentropy": 2.734224319458008, "epoch": 0.3905239877509357, "grad_norm": 0.034949224442243576, "grad_norm_var": 4.661115472865256e-06, "learning_rate": 0.00398331696935353, "loss": 2.7342, "step": 4591 }, { "crossentropy": 2.7710928916931152, "epoch": 0.39060905069751617, "grad_norm": 0.03709934279322624, "grad_norm_var": 4.224067687370418e-06, "learning_rate": 0.003981355341014623, "loss": 2.7711, "step": 4592 }, { "crossentropy": 2.706052541732788, "epoch": 0.3906941136440966, "grad_norm": 0.03838041052222252, "grad_norm_var": 4.06012546471096e-06, "learning_rate": 0.003979393876240696, "loss": 2.7061, "step": 4593 }, { "crossentropy": 2.659529685974121, "epoch": 0.3907791765906771, "grad_norm": 0.03370104357600212, "grad_norm_var": 3.953742920612124e-06, "learning_rate": 0.0039774325753467055, "loss": 2.6595, "step": 4594 }, { "crossentropy": 2.6875925064086914, "epoch": 0.3908642395372576, "grad_norm": 0.03553216531872749, "grad_norm_var": 3.803013598319138e-06, "learning_rate": 0.003975471438647578, "loss": 2.6876, "step": 4595 }, { "crossentropy": 2.6100504398345947, "epoch": 0.390949302483838, "grad_norm": 0.033000197261571884, "grad_norm_var": 4.413440018397053e-06, "learning_rate": 0.003973510466458218, "loss": 2.6101, "step": 4596 }, { "crossentropy": 2.6925747394561768, "epoch": 0.39103436543041853, "grad_norm": 0.036890506744384766, "grad_norm_var": 4.441199314212697e-06, "learning_rate": 0.003971549659093498, "loss": 2.6926, "step": 4597 }, { "crossentropy": 2.847611427307129, "epoch": 0.391119428376999, "grad_norm": 0.11872350424528122, "grad_norm_var": 0.000431253458162927, "learning_rate": 0.003969589016868269, "loss": 2.8476, "step": 4598 }, { "crossentropy": 2.684138536453247, "epoch": 0.39120449132357943, "grad_norm": 0.040478236973285675, "grad_norm_var": 0.0004284161218937747, "learning_rate": 0.003967628540097353, "loss": 2.6841, "step": 4599 }, { "crossentropy": 2.710334300994873, "epoch": 0.39128955427015993, "grad_norm": 0.038394201546907425, "grad_norm_var": 0.00042706810597023753, "learning_rate": 0.003965668229095546, "loss": 2.7103, "step": 4600 }, { "crossentropy": 2.66153883934021, "epoch": 0.3913746172167404, "grad_norm": 0.034198787063360214, "grad_norm_var": 0.00043062110427333157, "learning_rate": 0.003963708084177618, "loss": 2.6615, "step": 4601 }, { "crossentropy": 2.6692700386047363, "epoch": 0.39145968016332083, "grad_norm": 0.03383347764611244, "grad_norm_var": 0.00043362326898741346, "learning_rate": 0.003961748105658312, "loss": 2.6693, "step": 4602 }, { "crossentropy": 2.6639585494995117, "epoch": 0.39154474310990134, "grad_norm": 0.033571984618902206, "grad_norm_var": 0.00043371913764408726, "learning_rate": 0.003959788293852345, "loss": 2.664, "step": 4603 }, { "crossentropy": 2.6960179805755615, "epoch": 0.3916298060564818, "grad_norm": 0.03625320643186569, "grad_norm_var": 0.0004326050220526021, "learning_rate": 0.003957828649074403, "loss": 2.696, "step": 4604 }, { "crossentropy": 2.690896987915039, "epoch": 0.3917148690030623, "grad_norm": 0.04790274798870087, "grad_norm_var": 0.0004345726559389249, "learning_rate": 0.0039558691716391516, "loss": 2.6909, "step": 4605 }, { "crossentropy": 2.716836452484131, "epoch": 0.39179993194964274, "grad_norm": 0.03632798418402672, "grad_norm_var": 0.00043363993646661485, "learning_rate": 0.003953909861861224, "loss": 2.7168, "step": 4606 }, { "crossentropy": 2.734934091567993, "epoch": 0.3918849948962232, "grad_norm": 0.03847646713256836, "grad_norm_var": 0.00043118276779111964, "learning_rate": 0.003951950720055232, "loss": 2.7349, "step": 4607 }, { "crossentropy": 2.7255897521972656, "epoch": 0.3919700578428037, "grad_norm": 0.03367637097835541, "grad_norm_var": 0.00043417350577881205, "learning_rate": 0.003949991746535753, "loss": 2.7256, "step": 4608 }, { "crossentropy": 2.777496337890625, "epoch": 0.39205512078938415, "grad_norm": 0.037858352065086365, "grad_norm_var": 0.0004344309248091584, "learning_rate": 0.003948032941617345, "loss": 2.7775, "step": 4609 }, { "crossentropy": 2.6549651622772217, "epoch": 0.3921401837359646, "grad_norm": 0.03652013838291168, "grad_norm_var": 0.0004318829487983376, "learning_rate": 0.003946074305614537, "loss": 2.655, "step": 4610 }, { "crossentropy": 2.734113931655884, "epoch": 0.3922252466825451, "grad_norm": 0.03511131927371025, "grad_norm_var": 0.0004322556782132812, "learning_rate": 0.0039441158388418265, "loss": 2.7341, "step": 4611 }, { "crossentropy": 2.6938529014587402, "epoch": 0.39231030962912555, "grad_norm": 0.03899800032377243, "grad_norm_var": 0.0004273459364353876, "learning_rate": 0.003942157541613686, "loss": 2.6939, "step": 4612 }, { "crossentropy": 2.748156785964966, "epoch": 0.392395372575706, "grad_norm": 0.03582361713051796, "grad_norm_var": 0.00042819028043109686, "learning_rate": 0.003940199414244565, "loss": 2.7482, "step": 4613 }, { "crossentropy": 2.72709321975708, "epoch": 0.3924804355222865, "grad_norm": 0.03557277470827103, "grad_norm_var": 1.2577157193785436e-05, "learning_rate": 0.003938241457048879, "loss": 2.7271, "step": 4614 }, { "crossentropy": 2.7530999183654785, "epoch": 0.39256549846886696, "grad_norm": 0.036767926067113876, "grad_norm_var": 1.1747692661600214e-05, "learning_rate": 0.003936283670341023, "loss": 2.7531, "step": 4615 }, { "crossentropy": 2.677574396133423, "epoch": 0.3926505614154474, "grad_norm": 0.038255441933870316, "grad_norm_var": 1.1719964822908454e-05, "learning_rate": 0.003934326054435357, "loss": 2.6776, "step": 4616 }, { "crossentropy": 2.735840320587158, "epoch": 0.3927356243620279, "grad_norm": 0.034853190183639526, "grad_norm_var": 1.1517863437650264e-05, "learning_rate": 0.003932368609646223, "loss": 2.7358, "step": 4617 }, { "crossentropy": 2.688321590423584, "epoch": 0.39282068730860836, "grad_norm": 0.03491897135972977, "grad_norm_var": 1.1153081883409864e-05, "learning_rate": 0.0039304113362879246, "loss": 2.6883, "step": 4618 }, { "crossentropy": 2.6722936630249023, "epoch": 0.3929057502551888, "grad_norm": 0.03580339998006821, "grad_norm_var": 1.0465041252600132e-05, "learning_rate": 0.003928454234674747, "loss": 2.6723, "step": 4619 }, { "crossentropy": 2.7278475761413574, "epoch": 0.3929908132017693, "grad_norm": 0.03469608724117279, "grad_norm_var": 1.0786158146600516e-05, "learning_rate": 0.0039264973051209425, "loss": 2.7278, "step": 4620 }, { "crossentropy": 2.7000949382781982, "epoch": 0.39307587614834977, "grad_norm": 0.03680591657757759, "grad_norm_var": 2.3104960314222305e-06, "learning_rate": 0.00392454054794074, "loss": 2.7001, "step": 4621 }, { "crossentropy": 2.722858190536499, "epoch": 0.3931609390949303, "grad_norm": 0.036294616758823395, "grad_norm_var": 2.310348232154903e-06, "learning_rate": 0.003922583963448335, "loss": 2.7229, "step": 4622 }, { "crossentropy": 2.680856943130493, "epoch": 0.3932460020415107, "grad_norm": 0.03503810241818428, "grad_norm_var": 2.040919444010192e-06, "learning_rate": 0.0039206275519579024, "loss": 2.6809, "step": 4623 }, { "crossentropy": 2.635852813720703, "epoch": 0.3933310649880912, "grad_norm": 0.03270264342427254, "grad_norm_var": 2.4099236142895975e-06, "learning_rate": 0.003918671313783583, "loss": 2.6359, "step": 4624 }, { "crossentropy": 2.7096850872039795, "epoch": 0.3934161279346717, "grad_norm": 0.03779184818267822, "grad_norm_var": 2.3937330467700574e-06, "learning_rate": 0.003916715249239494, "loss": 2.7097, "step": 4625 }, { "crossentropy": 2.6657891273498535, "epoch": 0.3935011908812521, "grad_norm": 0.03515532240271568, "grad_norm_var": 2.4149775411088276e-06, "learning_rate": 0.003914759358639719, "loss": 2.6658, "step": 4626 }, { "crossentropy": 2.7637860774993896, "epoch": 0.3935862538278326, "grad_norm": 0.03799997270107269, "grad_norm_var": 2.6281801708248504e-06, "learning_rate": 0.0039128036422983225, "loss": 2.7638, "step": 4627 }, { "crossentropy": 2.6434643268585205, "epoch": 0.3936713167744131, "grad_norm": 0.03581644967198372, "grad_norm_var": 2.028231349248307e-06, "learning_rate": 0.003910848100529333, "loss": 2.6435, "step": 4628 }, { "crossentropy": 2.6272506713867188, "epoch": 0.39375637972099353, "grad_norm": 0.033414918929338455, "grad_norm_var": 2.4132947390564928e-06, "learning_rate": 0.003908892733646757, "loss": 2.6273, "step": 4629 }, { "crossentropy": 3.002476215362549, "epoch": 0.393841442667574, "grad_norm": 0.12565182149410248, "grad_norm_var": 0.000507508777371542, "learning_rate": 0.003906937541964566, "loss": 3.0025, "step": 4630 }, { "crossentropy": 2.65600848197937, "epoch": 0.3939265056141545, "grad_norm": 0.04620284587144852, "grad_norm_var": 0.0005072793583553397, "learning_rate": 0.003904982525796711, "loss": 2.656, "step": 4631 }, { "crossentropy": 2.6465723514556885, "epoch": 0.39401156856073494, "grad_norm": 0.037973158061504364, "grad_norm_var": 0.0005074238679551311, "learning_rate": 0.0039030276854571115, "loss": 2.6466, "step": 4632 }, { "crossentropy": 2.6687674522399902, "epoch": 0.3940966315073154, "grad_norm": 0.03494478762149811, "grad_norm_var": 0.0005073377806821042, "learning_rate": 0.003901073021259657, "loss": 2.6688, "step": 4633 }, { "crossentropy": 2.624857187271118, "epoch": 0.3941816944538959, "grad_norm": 0.041511788964271545, "grad_norm_var": 0.0005038731897122237, "learning_rate": 0.003899118533518209, "loss": 2.6249, "step": 4634 }, { "crossentropy": 2.669846296310425, "epoch": 0.39426675740047634, "grad_norm": 0.036869991570711136, "grad_norm_var": 0.000503011473978943, "learning_rate": 0.0038971642225466042, "loss": 2.6698, "step": 4635 }, { "crossentropy": 2.564476728439331, "epoch": 0.39435182034705685, "grad_norm": 0.03627486154437065, "grad_norm_var": 0.0005015393712882119, "learning_rate": 0.0038952100886586465, "loss": 2.5645, "step": 4636 }, { "crossentropy": 2.5662691593170166, "epoch": 0.3944368832936373, "grad_norm": 0.03618452325463295, "grad_norm_var": 0.0005020375984004971, "learning_rate": 0.0038932561321681166, "loss": 2.5663, "step": 4637 }, { "crossentropy": 2.7088699340820312, "epoch": 0.39452194624021775, "grad_norm": 0.03518326207995415, "grad_norm_var": 0.0005030327141302257, "learning_rate": 0.00389130235338876, "loss": 2.7089, "step": 4638 }, { "crossentropy": 2.680959701538086, "epoch": 0.39460700918679825, "grad_norm": 0.03708163648843765, "grad_norm_var": 0.0005012824245769459, "learning_rate": 0.003889348752634301, "loss": 2.681, "step": 4639 }, { "crossentropy": 2.7052526473999023, "epoch": 0.3946920721333787, "grad_norm": 0.03705994039773941, "grad_norm_var": 0.000496749462806581, "learning_rate": 0.003887395330218428, "loss": 2.7053, "step": 4640 }, { "crossentropy": 2.7462050914764404, "epoch": 0.39477713507995915, "grad_norm": 0.03553786873817444, "grad_norm_var": 0.0004985780487884246, "learning_rate": 0.0038854420864548075, "loss": 2.7462, "step": 4641 }, { "crossentropy": 2.6640446186065674, "epoch": 0.39486219802653966, "grad_norm": 0.03960898146033287, "grad_norm_var": 0.0004953500534634594, "learning_rate": 0.003883489021657071, "loss": 2.664, "step": 4642 }, { "crossentropy": 2.6308701038360596, "epoch": 0.3949472609731201, "grad_norm": 0.035365331918001175, "grad_norm_var": 0.0004975253238014822, "learning_rate": 0.0038815361361388273, "loss": 2.6309, "step": 4643 }, { "crossentropy": 2.7452008724212646, "epoch": 0.39503232391970056, "grad_norm": 0.0352759063243866, "grad_norm_var": 0.0004980463762749877, "learning_rate": 0.0038795834302136514, "loss": 2.7452, "step": 4644 }, { "crossentropy": 2.766315460205078, "epoch": 0.39511738686628106, "grad_norm": 0.0381777361035347, "grad_norm_var": 0.0004935303606248533, "learning_rate": 0.003877630904195093, "loss": 2.7663, "step": 4645 }, { "crossentropy": 2.7164039611816406, "epoch": 0.3952024498128615, "grad_norm": 0.03563465550541878, "grad_norm_var": 8.640925921088854e-06, "learning_rate": 0.0038756785583966727, "loss": 2.7164, "step": 4646 }, { "crossentropy": 2.7317426204681396, "epoch": 0.39528751275944196, "grad_norm": 0.037651464343070984, "grad_norm_var": 3.2091670059320244e-06, "learning_rate": 0.0038737263931318774, "loss": 2.7317, "step": 4647 }, { "crossentropy": 2.68416428565979, "epoch": 0.39537257570602247, "grad_norm": 0.036719128489494324, "grad_norm_var": 3.127347712555441e-06, "learning_rate": 0.00387177440871417, "loss": 2.6842, "step": 4648 }, { "crossentropy": 2.6946041584014893, "epoch": 0.3954576386526029, "grad_norm": 0.03586110845208168, "grad_norm_var": 2.9510104994856513e-06, "learning_rate": 0.0038698226054569847, "loss": 2.6946, "step": 4649 }, { "crossentropy": 2.6766698360443115, "epoch": 0.3955427015991834, "grad_norm": 0.0352887324988842, "grad_norm_var": 1.5239851220131065e-06, "learning_rate": 0.0038678709836737226, "loss": 2.6767, "step": 4650 }, { "crossentropy": 2.679589033126831, "epoch": 0.39562776454576387, "grad_norm": 0.03372375667095184, "grad_norm_var": 1.9815532270093157e-06, "learning_rate": 0.00386591954367776, "loss": 2.6796, "step": 4651 }, { "crossentropy": 2.719231128692627, "epoch": 0.3957128274923443, "grad_norm": 0.03685874864459038, "grad_norm_var": 2.0017364703674156e-06, "learning_rate": 0.00386396828578244, "loss": 2.7192, "step": 4652 }, { "crossentropy": 2.823972463607788, "epoch": 0.3957978904389248, "grad_norm": 0.03774385526776314, "grad_norm_var": 2.1243335026711884e-06, "learning_rate": 0.003862017210301081, "loss": 2.824, "step": 4653 }, { "crossentropy": 2.7446014881134033, "epoch": 0.3958829533855053, "grad_norm": 0.03422912582755089, "grad_norm_var": 2.3389818937270823e-06, "learning_rate": 0.003860066317546967, "loss": 2.7446, "step": 4654 }, { "crossentropy": 2.702277421951294, "epoch": 0.3959680163320857, "grad_norm": 0.03587893024086952, "grad_norm_var": 2.3142471389907155e-06, "learning_rate": 0.0038581156078333553, "loss": 2.7023, "step": 4655 }, { "crossentropy": 2.6416962146759033, "epoch": 0.39605307927866623, "grad_norm": 0.033205267041921616, "grad_norm_var": 2.846393619354339e-06, "learning_rate": 0.0038561650814734737, "loss": 2.6417, "step": 4656 }, { "crossentropy": 2.7015953063964844, "epoch": 0.3961381422252467, "grad_norm": 0.03708987310528755, "grad_norm_var": 2.8914707460080936e-06, "learning_rate": 0.0038542147387805228, "loss": 2.7016, "step": 4657 }, { "crossentropy": 2.741126537322998, "epoch": 0.39622320517182713, "grad_norm": 0.03431060537695885, "grad_norm_var": 2.198563213479052e-06, "learning_rate": 0.003852264580067668, "loss": 2.7411, "step": 4658 }, { "crossentropy": 2.7576801776885986, "epoch": 0.39630826811840764, "grad_norm": 0.051977887749671936, "grad_norm_var": 1.8454676394509772e-05, "learning_rate": 0.003850314605648052, "loss": 2.7577, "step": 4659 }, { "crossentropy": 2.704702138900757, "epoch": 0.3963933310649881, "grad_norm": 0.033511947840452194, "grad_norm_var": 1.9019760040350716e-05, "learning_rate": 0.0038483648158347816, "loss": 2.7047, "step": 4660 }, { "crossentropy": 2.6292243003845215, "epoch": 0.39647839401156854, "grad_norm": 0.03572126477956772, "grad_norm_var": 1.892646695301059e-05, "learning_rate": 0.0038464152109409402, "loss": 2.6292, "step": 4661 }, { "crossentropy": 2.830693244934082, "epoch": 0.39656345695814904, "grad_norm": 0.0362691767513752, "grad_norm_var": 1.88709835988301e-05, "learning_rate": 0.0038444657912795753, "loss": 2.8307, "step": 4662 }, { "crossentropy": 2.6436922550201416, "epoch": 0.3966485199047295, "grad_norm": 0.036158494651317596, "grad_norm_var": 1.880647133684811e-05, "learning_rate": 0.0038425165571637073, "loss": 2.6437, "step": 4663 }, { "crossentropy": 2.8094584941864014, "epoch": 0.39673358285131, "grad_norm": 0.03498580679297447, "grad_norm_var": 1.8951518011563425e-05, "learning_rate": 0.003840567508906328, "loss": 2.8095, "step": 4664 }, { "crossentropy": 2.8911983966827393, "epoch": 0.39681864579789045, "grad_norm": 0.03824616223573685, "grad_norm_var": 1.9127436734836926e-05, "learning_rate": 0.0038386186468204, "loss": 2.8912, "step": 4665 }, { "crossentropy": 2.705604314804077, "epoch": 0.3969037087444709, "grad_norm": 0.03877480328083038, "grad_norm_var": 1.928912114423642e-05, "learning_rate": 0.0038366699712188515, "loss": 2.7056, "step": 4666 }, { "crossentropy": 2.6896421909332275, "epoch": 0.3969887716910514, "grad_norm": 0.0344897098839283, "grad_norm_var": 1.9012350650103534e-05, "learning_rate": 0.003834721482414586, "loss": 2.6896, "step": 4667 }, { "crossentropy": 2.7342867851257324, "epoch": 0.39707383463763185, "grad_norm": 0.03255772590637207, "grad_norm_var": 2.0158191554786365e-05, "learning_rate": 0.003832773180720475, "loss": 2.7343, "step": 4668 }, { "crossentropy": 2.7287025451660156, "epoch": 0.3971588975842123, "grad_norm": 0.034548159688711166, "grad_norm_var": 2.029711561304844e-05, "learning_rate": 0.0038308250664493555, "loss": 2.7287, "step": 4669 }, { "crossentropy": 2.698305130004883, "epoch": 0.3972439605307928, "grad_norm": 0.034683894366025925, "grad_norm_var": 2.0180095467583112e-05, "learning_rate": 0.0038288771399140414, "loss": 2.6983, "step": 4670 }, { "crossentropy": 2.6656687259674072, "epoch": 0.39732902347737326, "grad_norm": 0.03505173698067665, "grad_norm_var": 2.028039799466715e-05, "learning_rate": 0.0038269294014273147, "loss": 2.6657, "step": 4671 }, { "crossentropy": 2.6908037662506104, "epoch": 0.3974140864239537, "grad_norm": 0.034690793603658676, "grad_norm_var": 1.979566084155393e-05, "learning_rate": 0.0038249818513019242, "loss": 2.6908, "step": 4672 }, { "crossentropy": 2.7749106884002686, "epoch": 0.3974991493705342, "grad_norm": 0.03514883667230606, "grad_norm_var": 1.9863400531066646e-05, "learning_rate": 0.003823034489850592, "loss": 2.7749, "step": 4673 }, { "crossentropy": 2.7082386016845703, "epoch": 0.39758421231711466, "grad_norm": 0.03487580642104149, "grad_norm_var": 1.9731905035347678e-05, "learning_rate": 0.0038210873173860063, "loss": 2.7082, "step": 4674 }, { "crossentropy": 2.608950138092041, "epoch": 0.3976692752636951, "grad_norm": 0.034406762570142746, "grad_norm_var": 2.428657875897892e-06, "learning_rate": 0.0038191403342208298, "loss": 2.609, "step": 4675 }, { "crossentropy": 2.6074001789093018, "epoch": 0.3977543382102756, "grad_norm": 0.033510636538267136, "grad_norm_var": 2.4289631880438274e-06, "learning_rate": 0.003817193540667688, "loss": 2.6074, "step": 4676 }, { "crossentropy": 2.7517378330230713, "epoch": 0.39783940115685607, "grad_norm": 0.034669600427150726, "grad_norm_var": 2.43305606670204e-06, "learning_rate": 0.003815246937039182, "loss": 2.7517, "step": 4677 }, { "crossentropy": 2.70638370513916, "epoch": 0.39792446410343657, "grad_norm": 0.032269351184368134, "grad_norm_var": 2.8583698687388465e-06, "learning_rate": 0.0038133005236478798, "loss": 2.7064, "step": 4678 }, { "crossentropy": 2.7826457023620605, "epoch": 0.398009527050017, "grad_norm": 0.03498126566410065, "grad_norm_var": 2.7540044397917144e-06, "learning_rate": 0.0038113543008063203, "loss": 2.7826, "step": 4679 }, { "crossentropy": 2.687995672225952, "epoch": 0.39809458999659747, "grad_norm": 0.03350928798317909, "grad_norm_var": 2.867106206378272e-06, "learning_rate": 0.0038094082688270086, "loss": 2.688, "step": 4680 }, { "crossentropy": 2.607617139816284, "epoch": 0.398179652943178, "grad_norm": 0.034404437988996506, "grad_norm_var": 2.0119662250755547e-06, "learning_rate": 0.0038074624280224245, "loss": 2.6076, "step": 4681 }, { "crossentropy": 2.6139919757843018, "epoch": 0.3982647158897584, "grad_norm": 0.03617256507277489, "grad_norm_var": 9.64407980897676e-07, "learning_rate": 0.0038055167787050133, "loss": 2.614, "step": 4682 }, { "crossentropy": 2.74717378616333, "epoch": 0.3983497788363389, "grad_norm": 0.03746368736028671, "grad_norm_var": 1.563407150304702e-06, "learning_rate": 0.003803571321187187, "loss": 2.7472, "step": 4683 }, { "crossentropy": 2.70796275138855, "epoch": 0.3984348417829194, "grad_norm": 0.034356195479631424, "grad_norm_var": 1.2856573225638578e-06, "learning_rate": 0.0038016260557813333, "loss": 2.708, "step": 4684 }, { "crossentropy": 2.7569801807403564, "epoch": 0.39851990472949983, "grad_norm": 0.03802334517240524, "grad_norm_var": 1.9833421673440727e-06, "learning_rate": 0.003799680982799803, "loss": 2.757, "step": 4685 }, { "crossentropy": 2.663196325302124, "epoch": 0.3986049676760803, "grad_norm": 0.034630320966243744, "grad_norm_var": 1.9849840554919096e-06, "learning_rate": 0.0037977361025549206, "loss": 2.6632, "step": 4686 }, { "crossentropy": 2.696042537689209, "epoch": 0.3986900306226608, "grad_norm": 0.034473467618227005, "grad_norm_var": 1.9930502317655823e-06, "learning_rate": 0.0037957914153589796, "loss": 2.696, "step": 4687 }, { "crossentropy": 2.7044057846069336, "epoch": 0.39877509356924123, "grad_norm": 0.03361271694302559, "grad_norm_var": 2.088453170615417e-06, "learning_rate": 0.003793846921524237, "loss": 2.7044, "step": 4688 }, { "crossentropy": 2.7758684158325195, "epoch": 0.3988601565158217, "grad_norm": 0.033190637826919556, "grad_norm_var": 2.232272899530853e-06, "learning_rate": 0.0037919026213629266, "loss": 2.7759, "step": 4689 }, { "crossentropy": 2.7213504314422607, "epoch": 0.3989452194624022, "grad_norm": 0.035591039806604385, "grad_norm_var": 2.2848846800893953e-06, "learning_rate": 0.003789958515187244, "loss": 2.7214, "step": 4690 }, { "crossentropy": 2.7209081649780273, "epoch": 0.39903028240898264, "grad_norm": 0.03499817103147507, "grad_norm_var": 2.283299932020801e-06, "learning_rate": 0.0037880146033093553, "loss": 2.7209, "step": 4691 }, { "crossentropy": 2.64823842048645, "epoch": 0.3991153453555631, "grad_norm": 0.0329066663980484, "grad_norm_var": 2.4051827093695665e-06, "learning_rate": 0.0037860708860414007, "loss": 2.6482, "step": 4692 }, { "crossentropy": 2.6697044372558594, "epoch": 0.3992004083021436, "grad_norm": 0.03557722270488739, "grad_norm_var": 2.4525909748777825e-06, "learning_rate": 0.003784127363695482, "loss": 2.6697, "step": 4693 }, { "crossentropy": 2.7018845081329346, "epoch": 0.39928547124872404, "grad_norm": 0.03722809627652168, "grad_norm_var": 2.3426650153155853e-06, "learning_rate": 0.0037821840365836737, "loss": 2.7019, "step": 4694 }, { "crossentropy": 2.8346500396728516, "epoch": 0.39937053419530455, "grad_norm": 0.037739839404821396, "grad_norm_var": 2.7856558526596792e-06, "learning_rate": 0.00378024090501802, "loss": 2.8347, "step": 4695 }, { "crossentropy": 2.6843361854553223, "epoch": 0.399455597141885, "grad_norm": 0.03193223476409912, "grad_norm_var": 3.3055181655211134e-06, "learning_rate": 0.003778297969310529, "loss": 2.6843, "step": 4696 }, { "crossentropy": 2.689286470413208, "epoch": 0.39954066008846545, "grad_norm": 0.03368934988975525, "grad_norm_var": 3.407971207631105e-06, "learning_rate": 0.003776355229773184, "loss": 2.6893, "step": 4697 }, { "crossentropy": 2.8194820880889893, "epoch": 0.39962572303504595, "grad_norm": 0.03639744594693184, "grad_norm_var": 3.4433189035777576e-06, "learning_rate": 0.0037744126867179295, "loss": 2.8195, "step": 4698 }, { "crossentropy": 2.7293434143066406, "epoch": 0.3997107859816264, "grad_norm": 0.03739164024591446, "grad_norm_var": 3.421063425776821e-06, "learning_rate": 0.003772470340456682, "loss": 2.7293, "step": 4699 }, { "crossentropy": 2.729308605194092, "epoch": 0.39979584892820685, "grad_norm": 0.037079039961099625, "grad_norm_var": 3.6112557195035086e-06, "learning_rate": 0.0037705281913013285, "loss": 2.7293, "step": 4700 }, { "crossentropy": 2.611123561859131, "epoch": 0.39988091187478736, "grad_norm": 0.03473776578903198, "grad_norm_var": 3.0836343166366795e-06, "learning_rate": 0.0037685862395637195, "loss": 2.6111, "step": 4701 }, { "crossentropy": 2.752781391143799, "epoch": 0.3999659748213678, "grad_norm": 0.034624695777893066, "grad_norm_var": 3.0839686735677105e-06, "learning_rate": 0.0037666444855556785, "loss": 2.7528, "step": 4702 }, { "crossentropy": 2.7397122383117676, "epoch": 0.40005103776794826, "grad_norm": 0.03879879042506218, "grad_norm_var": 3.907415431519436e-06, "learning_rate": 0.0037647029295889956, "loss": 2.7397, "step": 4703 }, { "crossentropy": 2.6207387447357178, "epoch": 0.40013610071452876, "grad_norm": 0.03820415586233139, "grad_norm_var": 4.16545084721276e-06, "learning_rate": 0.003762761571975429, "loss": 2.6207, "step": 4704 }, { "crossentropy": 2.715243339538574, "epoch": 0.4002211636611092, "grad_norm": 0.034715380519628525, "grad_norm_var": 3.814747109693167e-06, "learning_rate": 0.003760820413026702, "loss": 2.7152, "step": 4705 }, { "crossentropy": 2.6725802421569824, "epoch": 0.40030622660768966, "grad_norm": 0.03784816712141037, "grad_norm_var": 4.092628767669397e-06, "learning_rate": 0.003758879453054511, "loss": 2.6726, "step": 4706 }, { "crossentropy": 2.721752166748047, "epoch": 0.40039128955427017, "grad_norm": 0.03620711714029312, "grad_norm_var": 4.043960326935436e-06, "learning_rate": 0.0037569386923705173, "loss": 2.7218, "step": 4707 }, { "crossentropy": 2.657444953918457, "epoch": 0.4004763525008506, "grad_norm": 0.03294201195240021, "grad_norm_var": 4.029732016975577e-06, "learning_rate": 0.0037549981312863514, "loss": 2.6574, "step": 4708 }, { "crossentropy": 2.7878551483154297, "epoch": 0.4005614154474311, "grad_norm": 0.03790251910686493, "grad_norm_var": 4.253780768527041e-06, "learning_rate": 0.003753057770113613, "loss": 2.7879, "step": 4709 }, { "crossentropy": 2.7745702266693115, "epoch": 0.4006464783940116, "grad_norm": 0.03558974713087082, "grad_norm_var": 4.172905415331512e-06, "learning_rate": 0.0037511176091638647, "loss": 2.7746, "step": 4710 }, { "crossentropy": 2.705465078353882, "epoch": 0.400731541340592, "grad_norm": 0.03605213016271591, "grad_norm_var": 3.9566014243698955e-06, "learning_rate": 0.003749177648748646, "loss": 2.7055, "step": 4711 }, { "crossentropy": 2.7379517555236816, "epoch": 0.40081660428717253, "grad_norm": 0.03729593753814697, "grad_norm_var": 2.929959000052181e-06, "learning_rate": 0.003747237889179453, "loss": 2.738, "step": 4712 }, { "crossentropy": 2.763129472732544, "epoch": 0.400901667233753, "grad_norm": 0.035323403775691986, "grad_norm_var": 2.5460803589632634e-06, "learning_rate": 0.003745298330767758, "loss": 2.7631, "step": 4713 }, { "crossentropy": 2.7254536151885986, "epoch": 0.40098673018033343, "grad_norm": 0.03264145180583, "grad_norm_var": 3.388699125661532e-06, "learning_rate": 0.003743358973824998, "loss": 2.7255, "step": 4714 }, { "crossentropy": 2.7108864784240723, "epoch": 0.40107179312691393, "grad_norm": 0.0329708531498909, "grad_norm_var": 3.839752534118279e-06, "learning_rate": 0.0037414198186625763, "loss": 2.7109, "step": 4715 }, { "crossentropy": 2.5631284713745117, "epoch": 0.4011568560734944, "grad_norm": 0.03339553251862526, "grad_norm_var": 4.063673984773993e-06, "learning_rate": 0.0037394808655918667, "loss": 2.5631, "step": 4716 }, { "crossentropy": 2.7619147300720215, "epoch": 0.40124191902007483, "grad_norm": 0.033125389367341995, "grad_norm_var": 4.406817654868609e-06, "learning_rate": 0.00373754211492421, "loss": 2.7619, "step": 4717 }, { "crossentropy": 2.6739354133605957, "epoch": 0.40132698196665534, "grad_norm": 0.03490840643644333, "grad_norm_var": 4.379594858326616e-06, "learning_rate": 0.0037356035669709143, "loss": 2.6739, "step": 4718 }, { "crossentropy": 2.618117094039917, "epoch": 0.4014120449132358, "grad_norm": 0.03523910045623779, "grad_norm_var": 3.6035237689358013e-06, "learning_rate": 0.00373366522204325, "loss": 2.6181, "step": 4719 }, { "crossentropy": 2.6456220149993896, "epoch": 0.40149710785981624, "grad_norm": 0.03491584211587906, "grad_norm_var": 2.994011889518681e-06, "learning_rate": 0.003731727080452464, "loss": 2.6456, "step": 4720 }, { "crossentropy": 2.686171054840088, "epoch": 0.40158217080639674, "grad_norm": 0.03455912694334984, "grad_norm_var": 3.002864701261617e-06, "learning_rate": 0.0037297891425097634, "loss": 2.6862, "step": 4721 }, { "crossentropy": 2.7071800231933594, "epoch": 0.4016672337529772, "grad_norm": 0.033437926322221756, "grad_norm_var": 2.5773818973168564e-06, "learning_rate": 0.0037278514085263273, "loss": 2.7072, "step": 4722 }, { "crossentropy": 2.771832227706909, "epoch": 0.4017522966995577, "grad_norm": 0.035173483192920685, "grad_norm_var": 2.447702839355505e-06, "learning_rate": 0.0037259138788132973, "loss": 2.7718, "step": 4723 }, { "crossentropy": 2.7293295860290527, "epoch": 0.40183735964613815, "grad_norm": 0.037227075546979904, "grad_norm_var": 2.5811577963143205e-06, "learning_rate": 0.003723976553681787, "loss": 2.7293, "step": 4724 }, { "crossentropy": 2.5621302127838135, "epoch": 0.4019224225927186, "grad_norm": 0.03609056398272514, "grad_norm_var": 2.081470149546162e-06, "learning_rate": 0.0037220394334428765, "loss": 2.5621, "step": 4725 }, { "crossentropy": 2.6893575191497803, "epoch": 0.4020074855392991, "grad_norm": 0.03526938334107399, "grad_norm_var": 2.057209920191168e-06, "learning_rate": 0.003720102518407608, "loss": 2.6894, "step": 4726 }, { "crossentropy": 2.637662172317505, "epoch": 0.40209254848587955, "grad_norm": 0.03632108494639397, "grad_norm_var": 2.1047827248635647e-06, "learning_rate": 0.003718165808886995, "loss": 2.6377, "step": 4727 }, { "crossentropy": 2.7011208534240723, "epoch": 0.40217761143246, "grad_norm": 0.03638489544391632, "grad_norm_var": 1.861780288931319e-06, "learning_rate": 0.003716229305192018, "loss": 2.7011, "step": 4728 }, { "crossentropy": 2.652233839035034, "epoch": 0.4022626743790405, "grad_norm": 0.03322518989443779, "grad_norm_var": 1.993717080113709e-06, "learning_rate": 0.0037142930076336234, "loss": 2.6522, "step": 4729 }, { "crossentropy": 2.6596643924713135, "epoch": 0.40234773732562096, "grad_norm": 0.03590596094727516, "grad_norm_var": 1.772321932867286e-06, "learning_rate": 0.003712356916522726, "loss": 2.6597, "step": 4730 }, { "crossentropy": 2.692784547805786, "epoch": 0.4024328002722014, "grad_norm": 0.03425733745098114, "grad_norm_var": 1.5475352604892947e-06, "learning_rate": 0.0037104210321702038, "loss": 2.6928, "step": 4731 }, { "crossentropy": 2.653015613555908, "epoch": 0.4025178632187819, "grad_norm": 0.035224247723817825, "grad_norm_var": 1.3739229125913726e-06, "learning_rate": 0.0037084853548869055, "loss": 2.653, "step": 4732 }, { "crossentropy": 2.675846576690674, "epoch": 0.40260292616536236, "grad_norm": 0.034096021205186844, "grad_norm_var": 1.179966037961657e-06, "learning_rate": 0.0037065498849836475, "loss": 2.6758, "step": 4733 }, { "crossentropy": 2.6909146308898926, "epoch": 0.4026879891119428, "grad_norm": 0.0348779670894146, "grad_norm_var": 1.1809627840368592e-06, "learning_rate": 0.0037046146227712076, "loss": 2.6909, "step": 4734 }, { "crossentropy": 2.7168407440185547, "epoch": 0.4027730520585233, "grad_norm": 0.03416219353675842, "grad_norm_var": 1.2389039767555948e-06, "learning_rate": 0.003702679568560331, "loss": 2.7168, "step": 4735 }, { "crossentropy": 2.697909116744995, "epoch": 0.40285811500510377, "grad_norm": 0.033391788601875305, "grad_norm_var": 1.4155065630326474e-06, "learning_rate": 0.003700744722661736, "loss": 2.6979, "step": 4736 }, { "crossentropy": 2.7502970695495605, "epoch": 0.4029431779516843, "grad_norm": 0.03456571698188782, "grad_norm_var": 1.4151436282044497e-06, "learning_rate": 0.003698810085386101, "loss": 2.7503, "step": 4737 }, { "crossentropy": 2.706132650375366, "epoch": 0.4030282408982647, "grad_norm": 0.035528842359781265, "grad_norm_var": 1.2596814982851015e-06, "learning_rate": 0.0036968756570440733, "loss": 2.7061, "step": 4738 }, { "crossentropy": 2.6464736461639404, "epoch": 0.40311330384484517, "grad_norm": 0.03415025398135185, "grad_norm_var": 1.3159611534728383e-06, "learning_rate": 0.003694941437946266, "loss": 2.6465, "step": 4739 }, { "crossentropy": 2.7162792682647705, "epoch": 0.4031983667914257, "grad_norm": 0.03429367020726204, "grad_norm_var": 9.992964861421995e-07, "learning_rate": 0.003693007428403261, "loss": 2.7163, "step": 4740 }, { "crossentropy": 2.8259410858154297, "epoch": 0.4032834297380061, "grad_norm": 0.034298356622457504, "grad_norm_var": 9.057678924293335e-07, "learning_rate": 0.0036910736287256007, "loss": 2.8259, "step": 4741 }, { "crossentropy": 2.634864568710327, "epoch": 0.4033684926845866, "grad_norm": 0.03399466350674629, "grad_norm_var": 9.185488244712117e-07, "learning_rate": 0.0036891400392238018, "loss": 2.6349, "step": 4742 }, { "crossentropy": 2.7193684577941895, "epoch": 0.4034535556311671, "grad_norm": 0.035078130662441254, "grad_norm_var": 7.410444755181119e-07, "learning_rate": 0.003687206660208339, "loss": 2.7194, "step": 4743 }, { "crossentropy": 2.57658314704895, "epoch": 0.40353861857774753, "grad_norm": 0.03488192334771156, "grad_norm_var": 5.224772613077536e-07, "learning_rate": 0.0036852734919896606, "loss": 2.5766, "step": 4744 }, { "crossentropy": 2.7343695163726807, "epoch": 0.403623681524328, "grad_norm": 0.03541998192667961, "grad_norm_var": 4.5172657514158436e-07, "learning_rate": 0.003683340534878176, "loss": 2.7344, "step": 4745 }, { "crossentropy": 2.7401843070983887, "epoch": 0.4037087444709085, "grad_norm": 0.03437371179461479, "grad_norm_var": 3.383854622876839e-07, "learning_rate": 0.0036814077891842616, "loss": 2.7402, "step": 4746 }, { "crossentropy": 2.8015244007110596, "epoch": 0.40379380741748894, "grad_norm": 0.033651355654001236, "grad_norm_var": 3.8394656354893513e-07, "learning_rate": 0.003679475255218265, "loss": 2.8015, "step": 4747 }, { "crossentropy": 2.6148934364318848, "epoch": 0.4038788703640694, "grad_norm": 0.03374779596924782, "grad_norm_var": 3.77477856594588e-07, "learning_rate": 0.0036775429332904907, "loss": 2.6149, "step": 4748 }, { "crossentropy": 2.7452704906463623, "epoch": 0.4039639333106499, "grad_norm": 0.03688323497772217, "grad_norm_var": 7.474356369935667e-07, "learning_rate": 0.0036756108237112153, "loss": 2.7453, "step": 4749 }, { "crossentropy": 2.659623384475708, "epoch": 0.40404899625723034, "grad_norm": 0.037125635892152786, "grad_norm_var": 1.1521172083745272e-06, "learning_rate": 0.00367367892679068, "loss": 2.6596, "step": 4750 }, { "crossentropy": 2.6700327396392822, "epoch": 0.40413405920381085, "grad_norm": 0.03401578962802887, "grad_norm_var": 1.1643787660118103e-06, "learning_rate": 0.003671747242839092, "loss": 2.67, "step": 4751 }, { "crossentropy": 2.736515522003174, "epoch": 0.4042191221503913, "grad_norm": 0.03781251236796379, "grad_norm_var": 1.6073055860673277e-06, "learning_rate": 0.003669815772166625, "loss": 2.7365, "step": 4752 }, { "crossentropy": 2.6763362884521484, "epoch": 0.40430418509697175, "grad_norm": 0.03503665700554848, "grad_norm_var": 1.5945978425357364e-06, "learning_rate": 0.003667884515083415, "loss": 2.6763, "step": 4753 }, { "crossentropy": 2.413273811340332, "epoch": 0.40438924804355225, "grad_norm": 0.036359433084726334, "grad_norm_var": 1.6942576016198395e-06, "learning_rate": 0.00366595347189957, "loss": 2.4133, "step": 4754 }, { "crossentropy": 2.744844913482666, "epoch": 0.4044743109901327, "grad_norm": 0.03565219044685364, "grad_norm_var": 1.6510203623585259e-06, "learning_rate": 0.0036640226429251565, "loss": 2.7448, "step": 4755 }, { "crossentropy": 2.6521379947662354, "epoch": 0.40455937393671315, "grad_norm": 0.034719523042440414, "grad_norm_var": 1.6129334208433062e-06, "learning_rate": 0.003662092028470212, "loss": 2.6521, "step": 4756 }, { "crossentropy": 2.6500847339630127, "epoch": 0.40464443688329366, "grad_norm": 0.03342881426215172, "grad_norm_var": 1.7636450944917097e-06, "learning_rate": 0.003660161628844735, "loss": 2.6501, "step": 4757 }, { "crossentropy": 2.7023637294769287, "epoch": 0.4047294998298741, "grad_norm": 0.031871501356363297, "grad_norm_var": 2.3685774025293076e-06, "learning_rate": 0.003658231444358695, "loss": 2.7024, "step": 4758 }, { "crossentropy": 2.7507684230804443, "epoch": 0.40481456277645456, "grad_norm": 0.03414342179894447, "grad_norm_var": 2.4138984543453065e-06, "learning_rate": 0.0036563014753220224, "loss": 2.7508, "step": 4759 }, { "crossentropy": 2.511788845062256, "epoch": 0.40489962572303506, "grad_norm": 0.03255946561694145, "grad_norm_var": 2.7706113556874e-06, "learning_rate": 0.003654371722044616, "loss": 2.5118, "step": 4760 }, { "crossentropy": 2.764857530593872, "epoch": 0.4049846886696155, "grad_norm": 0.03532204404473305, "grad_norm_var": 2.763115719159268e-06, "learning_rate": 0.0036524421848363386, "loss": 2.7649, "step": 4761 }, { "crossentropy": 2.7685158252716064, "epoch": 0.40506975161619596, "grad_norm": 0.03709320351481438, "grad_norm_var": 3.0729675729923e-06, "learning_rate": 0.003650512864007015, "loss": 2.7685, "step": 4762 }, { "crossentropy": 2.7459616661071777, "epoch": 0.40515481456277647, "grad_norm": 0.03957337141036987, "grad_norm_var": 4.2284628965769585e-06, "learning_rate": 0.003648583759866441, "loss": 2.746, "step": 4763 }, { "crossentropy": 2.667402505874634, "epoch": 0.4052398775093569, "grad_norm": 0.04040304198861122, "grad_norm_var": 5.589154911970345e-06, "learning_rate": 0.0036466548727243763, "loss": 2.6674, "step": 4764 }, { "crossentropy": 2.7222747802734375, "epoch": 0.4053249404559374, "grad_norm": 0.03452513739466667, "grad_norm_var": 5.580386968055163e-06, "learning_rate": 0.003644726202890542, "loss": 2.7223, "step": 4765 }, { "crossentropy": 2.698028087615967, "epoch": 0.40541000340251787, "grad_norm": 0.03315749764442444, "grad_norm_var": 5.7587086671945235e-06, "learning_rate": 0.003642797750674629, "loss": 2.698, "step": 4766 }, { "crossentropy": 2.7760939598083496, "epoch": 0.4054950663490983, "grad_norm": 0.03396499529480934, "grad_norm_var": 5.767937120516355e-06, "learning_rate": 0.0036408695163862893, "loss": 2.7761, "step": 4767 }, { "crossentropy": 2.6770012378692627, "epoch": 0.4055801292956788, "grad_norm": 0.0350477509200573, "grad_norm_var": 5.338438877073616e-06, "learning_rate": 0.0036389415003351445, "loss": 2.677, "step": 4768 }, { "crossentropy": 2.755073308944702, "epoch": 0.4056651922422593, "grad_norm": 0.03365974873304367, "grad_norm_var": 5.482995299216133e-06, "learning_rate": 0.0036370137028307772, "loss": 2.7551, "step": 4769 }, { "crossentropy": 2.7083606719970703, "epoch": 0.4057502551888397, "grad_norm": 0.0362323634326458, "grad_norm_var": 5.462540509323007e-06, "learning_rate": 0.0036350861241827336, "loss": 2.7084, "step": 4770 }, { "crossentropy": 2.6740994453430176, "epoch": 0.40583531813542023, "grad_norm": 0.03813975304365158, "grad_norm_var": 6.037534296021487e-06, "learning_rate": 0.00363315876470053, "loss": 2.6741, "step": 4771 }, { "crossentropy": 2.7222249507904053, "epoch": 0.4059203810820007, "grad_norm": 0.033695802092552185, "grad_norm_var": 6.174091606582441e-06, "learning_rate": 0.003631231624693645, "loss": 2.7222, "step": 4772 }, { "crossentropy": 2.664931297302246, "epoch": 0.40600544402858113, "grad_norm": 0.03741100803017616, "grad_norm_var": 6.237460551552533e-06, "learning_rate": 0.0036293047044715187, "loss": 2.6649, "step": 4773 }, { "crossentropy": 2.7127974033355713, "epoch": 0.40609050697516164, "grad_norm": 0.036736633628606796, "grad_norm_var": 5.411701952089063e-06, "learning_rate": 0.0036273780043435634, "loss": 2.7128, "step": 4774 }, { "crossentropy": 2.6663756370544434, "epoch": 0.4061755699217421, "grad_norm": 0.03649241477251053, "grad_norm_var": 5.2599365990381916e-06, "learning_rate": 0.0036254515246191485, "loss": 2.6664, "step": 4775 }, { "crossentropy": 2.6953444480895996, "epoch": 0.40626063286832254, "grad_norm": 0.034513067454099655, "grad_norm_var": 4.634608709361276e-06, "learning_rate": 0.0036235252656076134, "loss": 2.6953, "step": 4776 }, { "crossentropy": 2.764742374420166, "epoch": 0.40634569581490304, "grad_norm": 0.032719820737838745, "grad_norm_var": 5.292359759117605e-06, "learning_rate": 0.0036215992276182572, "loss": 2.7647, "step": 4777 }, { "crossentropy": 2.8122122287750244, "epoch": 0.4064307587614835, "grad_norm": 0.03415670990943909, "grad_norm_var": 5.338806667465335e-06, "learning_rate": 0.0036196734109603473, "loss": 2.8122, "step": 4778 }, { "crossentropy": 2.649975538253784, "epoch": 0.40651582170806394, "grad_norm": 0.034287575632333755, "grad_norm_var": 4.321231173918562e-06, "learning_rate": 0.0036177478159431136, "loss": 2.65, "step": 4779 }, { "crossentropy": 2.671006917953491, "epoch": 0.40660088465464445, "grad_norm": 0.03524044156074524, "grad_norm_var": 2.4891168690347835e-06, "learning_rate": 0.0036158224428757537, "loss": 2.671, "step": 4780 }, { "crossentropy": 2.73392391204834, "epoch": 0.4066859476012249, "grad_norm": 0.03685693070292473, "grad_norm_var": 2.6816827309872338e-06, "learning_rate": 0.0036138972920674237, "loss": 2.7339, "step": 4781 }, { "crossentropy": 2.7341885566711426, "epoch": 0.4067710105478054, "grad_norm": 0.03564606234431267, "grad_norm_var": 2.4094271989788875e-06, "learning_rate": 0.00361197236382725, "loss": 2.7342, "step": 4782 }, { "crossentropy": 2.6159632205963135, "epoch": 0.40685607349438585, "grad_norm": 0.03360053524374962, "grad_norm_var": 2.4826065372120116e-06, "learning_rate": 0.0036100476584643203, "loss": 2.616, "step": 4783 }, { "crossentropy": 2.702320098876953, "epoch": 0.4069411364409663, "grad_norm": 0.03351838141679764, "grad_norm_var": 2.6755986280361615e-06, "learning_rate": 0.003608123176287685, "loss": 2.7023, "step": 4784 }, { "crossentropy": 2.7014527320861816, "epoch": 0.4070261993875468, "grad_norm": 0.0402056947350502, "grad_norm_var": 4.025335927707041e-06, "learning_rate": 0.003606198917606361, "loss": 2.7015, "step": 4785 }, { "crossentropy": 2.734872817993164, "epoch": 0.40711126233412726, "grad_norm": 0.03709238022565842, "grad_norm_var": 4.1451272807477634e-06, "learning_rate": 0.0036042748827293304, "loss": 2.7349, "step": 4786 }, { "crossentropy": 2.6301841735839844, "epoch": 0.4071963252807077, "grad_norm": 0.036382969468832016, "grad_norm_var": 3.7535554421541662e-06, "learning_rate": 0.0036023510719655357, "loss": 2.6302, "step": 4787 }, { "crossentropy": 2.6760144233703613, "epoch": 0.4072813882272882, "grad_norm": 0.033028021454811096, "grad_norm_var": 3.9451636841048025e-06, "learning_rate": 0.0036004274856238873, "loss": 2.676, "step": 4788 }, { "crossentropy": 2.7835800647735596, "epoch": 0.40736645117386866, "grad_norm": 0.03680916875600815, "grad_norm_var": 3.813894079514266e-06, "learning_rate": 0.0035985041240132565, "loss": 2.7836, "step": 4789 }, { "crossentropy": 2.7151119709014893, "epoch": 0.4074515141204491, "grad_norm": 0.03753058239817619, "grad_norm_var": 3.988919727071436e-06, "learning_rate": 0.0035965809874424813, "loss": 2.7151, "step": 4790 }, { "crossentropy": 2.708517074584961, "epoch": 0.4075365770670296, "grad_norm": 0.0340300053358078, "grad_norm_var": 4.043712261059667e-06, "learning_rate": 0.0035946580762203605, "loss": 2.7085, "step": 4791 }, { "crossentropy": 2.626035451889038, "epoch": 0.40762164001361006, "grad_norm": 0.03539835661649704, "grad_norm_var": 3.9937701529229685e-06, "learning_rate": 0.003592735390655658, "loss": 2.626, "step": 4792 }, { "crossentropy": 2.647449493408203, "epoch": 0.4077067029601905, "grad_norm": 0.03512345254421234, "grad_norm_var": 3.4938294356909778e-06, "learning_rate": 0.0035908129310571017, "loss": 2.6474, "step": 4793 }, { "crossentropy": 2.683112382888794, "epoch": 0.407791765906771, "grad_norm": 0.03587848320603371, "grad_norm_var": 3.357714503245066e-06, "learning_rate": 0.003588890697733386, "loss": 2.6831, "step": 4794 }, { "crossentropy": 2.7898831367492676, "epoch": 0.40787682885335147, "grad_norm": 0.033096421509981155, "grad_norm_var": 3.6650470315868186e-06, "learning_rate": 0.0035869686909931614, "loss": 2.7899, "step": 4795 }, { "crossentropy": 2.7279088497161865, "epoch": 0.407961891799932, "grad_norm": 0.035244446247816086, "grad_norm_var": 3.6648614548086268e-06, "learning_rate": 0.003585046911145051, "loss": 2.7279, "step": 4796 }, { "crossentropy": 2.701958179473877, "epoch": 0.4080469547465124, "grad_norm": 0.033816929906606674, "grad_norm_var": 3.7289803138882325e-06, "learning_rate": 0.0035831253584976373, "loss": 2.702, "step": 4797 }, { "crossentropy": 2.7046356201171875, "epoch": 0.4081320176930929, "grad_norm": 0.03487616032361984, "grad_norm_var": 3.740780082838307e-06, "learning_rate": 0.0035812040333594625, "loss": 2.7046, "step": 4798 }, { "crossentropy": 2.597212553024292, "epoch": 0.4082170806396734, "grad_norm": 0.03713144361972809, "grad_norm_var": 3.695419307101627e-06, "learning_rate": 0.0035792829360390393, "loss": 2.5972, "step": 4799 }, { "crossentropy": 2.727877616882324, "epoch": 0.40830214358625383, "grad_norm": 0.0362967886030674, "grad_norm_var": 3.4168668253864788e-06, "learning_rate": 0.003577362066844838, "loss": 2.7279, "step": 4800 }, { "crossentropy": 2.704909324645996, "epoch": 0.4083872065328343, "grad_norm": 0.03244968131184578, "grad_norm_var": 2.5650167527202505e-06, "learning_rate": 0.0035754414260852976, "loss": 2.7049, "step": 4801 }, { "crossentropy": 2.68448543548584, "epoch": 0.4084722694794148, "grad_norm": 0.03297021985054016, "grad_norm_var": 2.620783080971532e-06, "learning_rate": 0.003573521014068816, "loss": 2.6845, "step": 4802 }, { "crossentropy": 2.6837027072906494, "epoch": 0.40855733242599523, "grad_norm": 0.03239108622074127, "grad_norm_var": 2.8827419035407433e-06, "learning_rate": 0.0035716008311037564, "loss": 2.6837, "step": 4803 }, { "crossentropy": 2.6305952072143555, "epoch": 0.4086423953725757, "grad_norm": 0.03396289050579071, "grad_norm_var": 2.7221673404843767e-06, "learning_rate": 0.003569680877498446, "loss": 2.6306, "step": 4804 }, { "crossentropy": 2.711082696914673, "epoch": 0.4087274583191562, "grad_norm": 0.034032732248306274, "grad_norm_var": 2.4649464780136683e-06, "learning_rate": 0.0035677611535611733, "loss": 2.7111, "step": 4805 }, { "crossentropy": 2.8047935962677, "epoch": 0.40881252126573664, "grad_norm": 0.03607873618602753, "grad_norm_var": 2.037005232892695e-06, "learning_rate": 0.0035658416596001884, "loss": 2.8048, "step": 4806 }, { "crossentropy": 2.5610432624816895, "epoch": 0.4088975842123171, "grad_norm": 0.032676175236701965, "grad_norm_var": 2.2451732470246213e-06, "learning_rate": 0.00356392239592371, "loss": 2.561, "step": 4807 }, { "crossentropy": 2.7062277793884277, "epoch": 0.4089826471588976, "grad_norm": 0.03721117228269577, "grad_norm_var": 2.6764091494562805e-06, "learning_rate": 0.003562003362839914, "loss": 2.7062, "step": 4808 }, { "crossentropy": 2.6050045490264893, "epoch": 0.40906771010547804, "grad_norm": 0.03435488045215607, "grad_norm_var": 2.6573605371225014e-06, "learning_rate": 0.003560084560656943, "loss": 2.605, "step": 4809 }, { "crossentropy": 2.7323782444000244, "epoch": 0.40915277305205855, "grad_norm": 0.03603193536400795, "grad_norm_var": 2.6864376427661318e-06, "learning_rate": 0.0035581659896829023, "loss": 2.7324, "step": 4810 }, { "crossentropy": 2.6928019523620605, "epoch": 0.409237835998639, "grad_norm": 0.03586959093809128, "grad_norm_var": 2.6337431483497777e-06, "learning_rate": 0.0035562476502258578, "loss": 2.6928, "step": 4811 }, { "crossentropy": 2.7428646087646484, "epoch": 0.40932289894521945, "grad_norm": 0.03714402765035629, "grad_norm_var": 2.9940799872264778e-06, "learning_rate": 0.003554329542593841, "loss": 2.7429, "step": 4812 }, { "crossentropy": 2.782604694366455, "epoch": 0.40940796189179995, "grad_norm": 0.03252236545085907, "grad_norm_var": 3.2738440570930757e-06, "learning_rate": 0.003552411667094843, "loss": 2.7826, "step": 4813 }, { "crossentropy": 2.732093334197998, "epoch": 0.4094930248383804, "grad_norm": 0.03481264039874077, "grad_norm_var": 3.2730276781115878e-06, "learning_rate": 0.003550494024036819, "loss": 2.7321, "step": 4814 }, { "crossentropy": 2.633500099182129, "epoch": 0.40957808778496085, "grad_norm": 0.04162704572081566, "grad_norm_var": 5.966033725106288e-06, "learning_rate": 0.0035485766137276892, "loss": 2.6335, "step": 4815 }, { "crossentropy": 2.6833789348602295, "epoch": 0.40966315073154136, "grad_norm": 0.03501080349087715, "grad_norm_var": 5.85166934492781e-06, "learning_rate": 0.003546659436475332, "loss": 2.6834, "step": 4816 }, { "crossentropy": 2.672161340713501, "epoch": 0.4097482136781218, "grad_norm": 0.0338423028588295, "grad_norm_var": 5.5092420343914405e-06, "learning_rate": 0.0035447424925875927, "loss": 2.6722, "step": 4817 }, { "crossentropy": 2.75761342048645, "epoch": 0.40983327662470226, "grad_norm": 0.03509349375963211, "grad_norm_var": 5.206843012224802e-06, "learning_rate": 0.0035428257823722775, "loss": 2.7576, "step": 4818 }, { "crossentropy": 2.620152235031128, "epoch": 0.40991833957128276, "grad_norm": 0.03433608263731003, "grad_norm_var": 4.723559604443336e-06, "learning_rate": 0.0035409093061371556, "loss": 2.6202, "step": 4819 }, { "crossentropy": 2.6209940910339355, "epoch": 0.4100034025178632, "grad_norm": 0.036202166229486465, "grad_norm_var": 4.641339760143246e-06, "learning_rate": 0.0035389930641899535, "loss": 2.621, "step": 4820 }, { "crossentropy": 2.675509452819824, "epoch": 0.41008846546444366, "grad_norm": 0.03448232263326645, "grad_norm_var": 4.570340045497758e-06, "learning_rate": 0.003537077056838369, "loss": 2.6755, "step": 4821 }, { "crossentropy": 2.758866310119629, "epoch": 0.41017352841102417, "grad_norm": 0.03362958878278732, "grad_norm_var": 4.7418735912902256e-06, "learning_rate": 0.0035351612843900555, "loss": 2.7589, "step": 4822 }, { "crossentropy": 2.6111342906951904, "epoch": 0.4102585913576046, "grad_norm": 0.03492416813969612, "grad_norm_var": 4.270397481435234e-06, "learning_rate": 0.00353324574715263, "loss": 2.6111, "step": 4823 }, { "crossentropy": 2.8330376148223877, "epoch": 0.4103436543041851, "grad_norm": 0.03495493531227112, "grad_norm_var": 4.056762065350143e-06, "learning_rate": 0.0035313304454336765, "loss": 2.833, "step": 4824 }, { "crossentropy": 2.699077606201172, "epoch": 0.4104287172507656, "grad_norm": 0.03593452274799347, "grad_norm_var": 4.013151493485282e-06, "learning_rate": 0.0035294153795407323, "loss": 2.6991, "step": 4825 }, { "crossentropy": 2.6907289028167725, "epoch": 0.410513780197346, "grad_norm": 0.03624663129448891, "grad_norm_var": 4.034090059743904e-06, "learning_rate": 0.003527500549781307, "loss": 2.6907, "step": 4826 }, { "crossentropy": 2.6607296466827393, "epoch": 0.41059884314392653, "grad_norm": 0.03640148043632507, "grad_norm_var": 4.0840430745330924e-06, "learning_rate": 0.0035255859564628633, "loss": 2.6607, "step": 4827 }, { "crossentropy": 2.685192108154297, "epoch": 0.410683906090507, "grad_norm": 0.03454463928937912, "grad_norm_var": 3.918452241559687e-06, "learning_rate": 0.0035236715998928298, "loss": 2.6852, "step": 4828 }, { "crossentropy": 2.6265833377838135, "epoch": 0.41076896903708743, "grad_norm": 0.03586659952998161, "grad_norm_var": 3.3854485937224892e-06, "learning_rate": 0.003521757480378599, "loss": 2.6266, "step": 4829 }, { "crossentropy": 2.692803382873535, "epoch": 0.41085403198366793, "grad_norm": 0.03417355567216873, "grad_norm_var": 3.4690638421671267e-06, "learning_rate": 0.003519843598227521, "loss": 2.6928, "step": 4830 }, { "crossentropy": 2.6753458976745605, "epoch": 0.4109390949302484, "grad_norm": 0.035014182329177856, "grad_norm_var": 7.596680392351594e-07, "learning_rate": 0.0035179299537469112, "loss": 2.6753, "step": 4831 }, { "crossentropy": 2.6748437881469727, "epoch": 0.41102415787682883, "grad_norm": 0.03516187146306038, "grad_norm_var": 7.604842978874959e-07, "learning_rate": 0.003516016547244047, "loss": 2.6748, "step": 4832 }, { "crossentropy": 2.7796313762664795, "epoch": 0.41110922082340934, "grad_norm": 0.03381246328353882, "grad_norm_var": 7.653470282308871e-07, "learning_rate": 0.0035141033790261644, "loss": 2.7796, "step": 4833 }, { "crossentropy": 2.6618592739105225, "epoch": 0.4111942837699898, "grad_norm": 0.036113251000642776, "grad_norm_var": 8.364358004754601e-07, "learning_rate": 0.0035121904494004657, "loss": 2.6619, "step": 4834 }, { "crossentropy": 2.7062344551086426, "epoch": 0.41127934671657024, "grad_norm": 0.0354854092001915, "grad_norm_var": 8.000290679301136e-07, "learning_rate": 0.0035102777586741097, "loss": 2.7062, "step": 4835 }, { "crossentropy": 2.6867921352386475, "epoch": 0.41136440966315074, "grad_norm": 0.033075377345085144, "grad_norm_var": 9.866995187831605e-07, "learning_rate": 0.00350836530715422, "loss": 2.6868, "step": 4836 }, { "crossentropy": 2.63877010345459, "epoch": 0.4114494726097312, "grad_norm": 0.03417880833148956, "grad_norm_var": 1.012953995534136e-06, "learning_rate": 0.003506453095147882, "loss": 2.6388, "step": 4837 }, { "crossentropy": 2.772402286529541, "epoch": 0.4115345355563117, "grad_norm": 0.03472102805972099, "grad_norm_var": 8.923656989290319e-07, "learning_rate": 0.00350454112296214, "loss": 2.7724, "step": 4838 }, { "crossentropy": 2.705497980117798, "epoch": 0.41161959850289215, "grad_norm": 0.03473562002182007, "grad_norm_var": 8.974507538112367e-07, "learning_rate": 0.003502629390904003, "loss": 2.7055, "step": 4839 }, { "crossentropy": 2.702972173690796, "epoch": 0.4117046614494726, "grad_norm": 0.0347050242125988, "grad_norm_var": 9.037313187407655e-07, "learning_rate": 0.0035007178992804418, "loss": 2.703, "step": 4840 }, { "crossentropy": 2.7489352226257324, "epoch": 0.4117897243960531, "grad_norm": 0.033883851021528244, "grad_norm_var": 9.139528611714608e-07, "learning_rate": 0.0034988066483983855, "loss": 2.7489, "step": 4841 }, { "crossentropy": 2.629674196243286, "epoch": 0.41187478734263355, "grad_norm": 0.03422129526734352, "grad_norm_var": 8.019469177763504e-07, "learning_rate": 0.003496895638564724, "loss": 2.6297, "step": 4842 }, { "crossentropy": 2.7916619777679443, "epoch": 0.411959850289214, "grad_norm": 0.034988753497600555, "grad_norm_var": 6.167174785931887e-07, "learning_rate": 0.003494984870086313, "loss": 2.7917, "step": 4843 }, { "crossentropy": 2.625054121017456, "epoch": 0.4120449132357945, "grad_norm": 0.03328855335712433, "grad_norm_var": 7.359215553873547e-07, "learning_rate": 0.003493074343269964, "loss": 2.6251, "step": 4844 }, { "crossentropy": 2.7333719730377197, "epoch": 0.41212997618237496, "grad_norm": 0.04134337604045868, "grad_norm_var": 3.543489380641631e-06, "learning_rate": 0.003491164058422456, "loss": 2.7334, "step": 4845 }, { "crossentropy": 2.6744463443756104, "epoch": 0.4122150391289554, "grad_norm": 0.0371120423078537, "grad_norm_var": 3.7862358128794757e-06, "learning_rate": 0.003489254015850523, "loss": 2.6744, "step": 4846 }, { "crossentropy": 2.722062349319458, "epoch": 0.4123001020755359, "grad_norm": 0.03273826837539673, "grad_norm_var": 4.140583176205174e-06, "learning_rate": 0.0034873442158608638, "loss": 2.7221, "step": 4847 }, { "crossentropy": 2.6983630657196045, "epoch": 0.41238516502211636, "grad_norm": 0.03482262045145035, "grad_norm_var": 4.139224566267601e-06, "learning_rate": 0.0034854346587601397, "loss": 2.6984, "step": 4848 }, { "crossentropy": 2.6995530128479004, "epoch": 0.4124702279686968, "grad_norm": 0.033749837428331375, "grad_norm_var": 4.14898168659116e-06, "learning_rate": 0.003483525344854967, "loss": 2.6996, "step": 4849 }, { "crossentropy": 2.6454150676727295, "epoch": 0.4125552909152773, "grad_norm": 0.03350690007209778, "grad_norm_var": 4.168501775343204e-06, "learning_rate": 0.0034816162744519263, "loss": 2.6454, "step": 4850 }, { "crossentropy": 2.695767402648926, "epoch": 0.41264035386185777, "grad_norm": 0.03390652686357498, "grad_norm_var": 4.1768150554533435e-06, "learning_rate": 0.003479707447857563, "loss": 2.6958, "step": 4851 }, { "crossentropy": 2.6912076473236084, "epoch": 0.4127254168084382, "grad_norm": 0.03597263619303703, "grad_norm_var": 4.079215971313407e-06, "learning_rate": 0.0034777988653783747, "loss": 2.6912, "step": 4852 }, { "crossentropy": 2.6408326625823975, "epoch": 0.4128104797550187, "grad_norm": 0.036185212433338165, "grad_norm_var": 4.146661622505867e-06, "learning_rate": 0.0034758905273208286, "loss": 2.6408, "step": 4853 }, { "crossentropy": 2.6728944778442383, "epoch": 0.41289554270159917, "grad_norm": 0.03644966334104538, "grad_norm_var": 4.270830473778776e-06, "learning_rate": 0.0034739824339913477, "loss": 2.6729, "step": 4854 }, { "crossentropy": 2.654214382171631, "epoch": 0.4129806056481797, "grad_norm": 0.034783173352479935, "grad_norm_var": 4.268657440895801e-06, "learning_rate": 0.0034720745856963186, "loss": 2.6542, "step": 4855 }, { "crossentropy": 2.6929702758789062, "epoch": 0.4130656685947601, "grad_norm": 0.036660972982645035, "grad_norm_var": 4.403817884866361e-06, "learning_rate": 0.0034701669827420824, "loss": 2.693, "step": 4856 }, { "crossentropy": 2.7697668075561523, "epoch": 0.4131507315413406, "grad_norm": 0.03372930735349655, "grad_norm_var": 4.432963718440225e-06, "learning_rate": 0.0034682596254349486, "loss": 2.7698, "step": 4857 }, { "crossentropy": 2.6008505821228027, "epoch": 0.4132357944879211, "grad_norm": 0.03677123412489891, "grad_norm_var": 4.501092270681126e-06, "learning_rate": 0.0034663525140811815, "loss": 2.6009, "step": 4858 }, { "crossentropy": 2.667245388031006, "epoch": 0.41332085743450153, "grad_norm": 0.03348087519407272, "grad_norm_var": 4.720967434449809e-06, "learning_rate": 0.0034644456489870113, "loss": 2.6672, "step": 4859 }, { "crossentropy": 2.7163641452789307, "epoch": 0.413405920381082, "grad_norm": 0.035237472504377365, "grad_norm_var": 4.44052686024432e-06, "learning_rate": 0.0034625390304586223, "loss": 2.7164, "step": 4860 }, { "crossentropy": 2.747616767883301, "epoch": 0.4134909833276625, "grad_norm": 0.034754447638988495, "grad_norm_var": 1.9352551487862578e-06, "learning_rate": 0.0034606326588021642, "loss": 2.7476, "step": 4861 }, { "crossentropy": 2.7135419845581055, "epoch": 0.41357604627424294, "grad_norm": 0.03661347180604935, "grad_norm_var": 1.8098139482336517e-06, "learning_rate": 0.003458726534323747, "loss": 2.7135, "step": 4862 }, { "crossentropy": 2.6297073364257812, "epoch": 0.4136611092208234, "grad_norm": 0.03337038308382034, "grad_norm_var": 1.6475213108322248e-06, "learning_rate": 0.0034568206573294376, "loss": 2.6297, "step": 4863 }, { "crossentropy": 2.754331111907959, "epoch": 0.4137461721674039, "grad_norm": 0.03281104937195778, "grad_norm_var": 1.947909064180647e-06, "learning_rate": 0.003454915028125263, "loss": 2.7543, "step": 4864 }, { "crossentropy": 2.73761248588562, "epoch": 0.41383123511398434, "grad_norm": 0.03890969231724739, "grad_norm_var": 2.8385493279871215e-06, "learning_rate": 0.003453009647017216, "loss": 2.7376, "step": 4865 }, { "crossentropy": 2.6911048889160156, "epoch": 0.4139162980605648, "grad_norm": 0.04073673486709595, "grad_norm_var": 4.476778288559022e-06, "learning_rate": 0.003451104514311243, "loss": 2.6911, "step": 4866 }, { "crossentropy": 2.7065374851226807, "epoch": 0.4140013610071453, "grad_norm": 0.03830505162477493, "grad_norm_var": 4.664467449004041e-06, "learning_rate": 0.003449199630313255, "loss": 2.7065, "step": 4867 }, { "crossentropy": 2.696499824523926, "epoch": 0.41408642395372575, "grad_norm": 0.03316329047083855, "grad_norm_var": 5.139230301318999e-06, "learning_rate": 0.0034472949953291204, "loss": 2.6965, "step": 4868 }, { "crossentropy": 2.7576849460601807, "epoch": 0.41417148690030625, "grad_norm": 0.03494202718138695, "grad_norm_var": 5.1632913533658975e-06, "learning_rate": 0.003445390609664669, "loss": 2.7577, "step": 4869 }, { "crossentropy": 2.766545057296753, "epoch": 0.4142565498468867, "grad_norm": 0.03735773637890816, "grad_norm_var": 5.309236201418652e-06, "learning_rate": 0.0034434864736256933, "loss": 2.7665, "step": 4870 }, { "crossentropy": 2.7307841777801514, "epoch": 0.41434161279346715, "grad_norm": 0.03311750292778015, "grad_norm_var": 5.69218319865783e-06, "learning_rate": 0.003441582587517939, "loss": 2.7308, "step": 4871 }, { "crossentropy": 2.6873350143432617, "epoch": 0.41442667574004766, "grad_norm": 0.03356644883751869, "grad_norm_var": 5.862243113308186e-06, "learning_rate": 0.003439678951647115, "loss": 2.6873, "step": 4872 }, { "crossentropy": 2.7087080478668213, "epoch": 0.4145117386866281, "grad_norm": 0.032558463513851166, "grad_norm_var": 6.2132927025319395e-06, "learning_rate": 0.0034377755663188925, "loss": 2.7087, "step": 4873 }, { "crossentropy": 2.8336286544799805, "epoch": 0.41459680163320856, "grad_norm": 0.03844507783651352, "grad_norm_var": 6.704254664232206e-06, "learning_rate": 0.003435872431838899, "loss": 2.8336, "step": 4874 }, { "crossentropy": 2.6908020973205566, "epoch": 0.41468186457978906, "grad_norm": 0.03436826914548874, "grad_norm_var": 6.519231044887994e-06, "learning_rate": 0.0034339695485127246, "loss": 2.6908, "step": 4875 }, { "crossentropy": 2.6823532581329346, "epoch": 0.4147669275263695, "grad_norm": 0.03505758196115494, "grad_norm_var": 6.527935856096706e-06, "learning_rate": 0.0034320669166459164, "loss": 2.6824, "step": 4876 }, { "crossentropy": 2.7527856826782227, "epoch": 0.41485199047294996, "grad_norm": 0.03448159992694855, "grad_norm_var": 6.559887288556943e-06, "learning_rate": 0.0034301645365439827, "loss": 2.7528, "step": 4877 }, { "crossentropy": 2.7444021701812744, "epoch": 0.41493705341953047, "grad_norm": 0.04128661006689072, "grad_norm_var": 8.62618175258567e-06, "learning_rate": 0.0034282624085123898, "loss": 2.7444, "step": 4878 }, { "crossentropy": 2.6163525581359863, "epoch": 0.4150221163661109, "grad_norm": 0.03292683884501457, "grad_norm_var": 8.780971195891141e-06, "learning_rate": 0.003426360532856566, "loss": 2.6164, "step": 4879 }, { "crossentropy": 2.7544965744018555, "epoch": 0.41510717931269137, "grad_norm": 0.037651412189006805, "grad_norm_var": 8.347175290867916e-06, "learning_rate": 0.003424458909881897, "loss": 2.7545, "step": 4880 }, { "crossentropy": 2.6730356216430664, "epoch": 0.41519224225927187, "grad_norm": 0.0353691391646862, "grad_norm_var": 7.782852709778409e-06, "learning_rate": 0.0034225575398937292, "loss": 2.673, "step": 4881 }, { "crossentropy": 2.798682689666748, "epoch": 0.4152773052058523, "grad_norm": 0.033415786921978, "grad_norm_var": 6.346308036142325e-06, "learning_rate": 0.0034206564231973663, "loss": 2.7987, "step": 4882 }, { "crossentropy": 2.7087721824645996, "epoch": 0.4153623681524328, "grad_norm": 0.03378095105290413, "grad_norm_var": 5.858563444718392e-06, "learning_rate": 0.003418755560098076, "loss": 2.7088, "step": 4883 }, { "crossentropy": 2.6411962509155273, "epoch": 0.4154474310990133, "grad_norm": 0.03457372263073921, "grad_norm_var": 5.619990635083392e-06, "learning_rate": 0.0034168549509010805, "loss": 2.6412, "step": 4884 }, { "crossentropy": 2.7091126441955566, "epoch": 0.4155324940455937, "grad_norm": 0.035354144871234894, "grad_norm_var": 5.61746349161558e-06, "learning_rate": 0.00341495459591156, "loss": 2.7091, "step": 4885 }, { "crossentropy": 2.658395290374756, "epoch": 0.41561755699217423, "grad_norm": 0.035088829696178436, "grad_norm_var": 5.28855340695174e-06, "learning_rate": 0.0034130544954346607, "loss": 2.6584, "step": 4886 }, { "crossentropy": 2.809386730194092, "epoch": 0.4157026199387547, "grad_norm": 0.03726162761449814, "grad_norm_var": 5.285742392162745e-06, "learning_rate": 0.003411154649775483, "loss": 2.8094, "step": 4887 }, { "crossentropy": 2.7203774452209473, "epoch": 0.41578768288533513, "grad_norm": 0.036889512091875076, "grad_norm_var": 5.197117722240997e-06, "learning_rate": 0.003409255059239086, "loss": 2.7204, "step": 4888 }, { "crossentropy": 2.74196457862854, "epoch": 0.41587274583191564, "grad_norm": 0.03694528713822365, "grad_norm_var": 4.660719809496117e-06, "learning_rate": 0.0034073557241304916, "loss": 2.742, "step": 4889 }, { "crossentropy": 2.702108860015869, "epoch": 0.4159578087784961, "grad_norm": 0.03368893265724182, "grad_norm_var": 4.400964312426675e-06, "learning_rate": 0.0034054566447546753, "loss": 2.7021, "step": 4890 }, { "crossentropy": 2.7416343688964844, "epoch": 0.41604287172507654, "grad_norm": 0.03800758719444275, "grad_norm_var": 4.675336919510164e-06, "learning_rate": 0.0034035578214165786, "loss": 2.7416, "step": 4891 }, { "crossentropy": 2.671708345413208, "epoch": 0.41612793467165704, "grad_norm": 0.03609414026141167, "grad_norm_var": 4.648696806333098e-06, "learning_rate": 0.003401659254421093, "loss": 2.6717, "step": 4892 }, { "crossentropy": 2.7265613079071045, "epoch": 0.4162129976182375, "grad_norm": 0.0346066877245903, "grad_norm_var": 4.627669167063935e-06, "learning_rate": 0.003399760944073077, "loss": 2.7266, "step": 4893 }, { "crossentropy": 2.603764533996582, "epoch": 0.41629806056481794, "grad_norm": 0.03389819338917732, "grad_norm_var": 2.643175796910452e-06, "learning_rate": 0.003397862890677343, "loss": 2.6038, "step": 4894 }, { "crossentropy": 2.6889491081237793, "epoch": 0.41638312351139845, "grad_norm": 0.03658827394247055, "grad_norm_var": 2.299531540627247e-06, "learning_rate": 0.003395965094538665, "loss": 2.6889, "step": 4895 }, { "crossentropy": 2.728461265563965, "epoch": 0.4164681864579789, "grad_norm": 0.034511350095272064, "grad_norm_var": 2.04681147274634e-06, "learning_rate": 0.0033940675559617725, "loss": 2.7285, "step": 4896 }, { "crossentropy": 2.6600780487060547, "epoch": 0.4165532494045594, "grad_norm": 0.03657149150967598, "grad_norm_var": 2.1354819801176647e-06, "learning_rate": 0.003392170275251357, "loss": 2.6601, "step": 4897 }, { "crossentropy": 2.5066447257995605, "epoch": 0.41663831235113985, "grad_norm": 0.03479532524943352, "grad_norm_var": 1.8793776955609391e-06, "learning_rate": 0.0033902732527120692, "loss": 2.5066, "step": 4898 }, { "crossentropy": 2.698448419570923, "epoch": 0.4167233752977203, "grad_norm": 0.04227978736162186, "grad_norm_var": 4.39931325991501e-06, "learning_rate": 0.0033883764886485108, "loss": 2.6984, "step": 4899 }, { "crossentropy": 2.671337842941284, "epoch": 0.4168084382443008, "grad_norm": 0.034274015575647354, "grad_norm_var": 4.46480707942485e-06, "learning_rate": 0.0033864799833652503, "loss": 2.6713, "step": 4900 }, { "crossentropy": 2.644556999206543, "epoch": 0.41689350119088125, "grad_norm": 0.03687479346990585, "grad_norm_var": 4.4675441969338824e-06, "learning_rate": 0.0033845837371668137, "loss": 2.6446, "step": 4901 }, { "crossentropy": 2.6480798721313477, "epoch": 0.4169785641374617, "grad_norm": 0.03366211801767349, "grad_norm_var": 4.796340572927639e-06, "learning_rate": 0.00338268775035768, "loss": 2.6481, "step": 4902 }, { "crossentropy": 2.695490837097168, "epoch": 0.4170636270840422, "grad_norm": 0.03301114961504936, "grad_norm_var": 5.244116615861725e-06, "learning_rate": 0.0033807920232422936, "loss": 2.6955, "step": 4903 }, { "crossentropy": 2.7814369201660156, "epoch": 0.41714869003062266, "grad_norm": 0.03576121851801872, "grad_norm_var": 5.158823767377011e-06, "learning_rate": 0.0033788965561250502, "loss": 2.7814, "step": 4904 }, { "crossentropy": 2.596130847930908, "epoch": 0.4172337529772031, "grad_norm": 0.032686877995729446, "grad_norm_var": 5.598285418921082e-06, "learning_rate": 0.0033770013493103104, "loss": 2.5961, "step": 4905 }, { "crossentropy": 2.652937889099121, "epoch": 0.4173188159237836, "grad_norm": 0.03390937298536301, "grad_norm_var": 5.549355532857616e-06, "learning_rate": 0.003375106403102389, "loss": 2.6529, "step": 4906 }, { "crossentropy": 2.6220767498016357, "epoch": 0.41740387887036406, "grad_norm": 0.034114327281713486, "grad_norm_var": 5.179834461967123e-06, "learning_rate": 0.0033732117178055564, "loss": 2.6221, "step": 4907 }, { "crossentropy": 2.6381924152374268, "epoch": 0.4174889418169445, "grad_norm": 0.0344681441783905, "grad_norm_var": 5.1571768763634025e-06, "learning_rate": 0.0033713172937240477, "loss": 2.6382, "step": 4908 }, { "crossentropy": 2.7226569652557373, "epoch": 0.417574004763525, "grad_norm": 0.03544631600379944, "grad_norm_var": 5.1431207207584356e-06, "learning_rate": 0.003369423131162053, "loss": 2.7227, "step": 4909 }, { "crossentropy": 2.686415433883667, "epoch": 0.41765906771010547, "grad_norm": 0.037799376994371414, "grad_norm_var": 5.428466816243908e-06, "learning_rate": 0.003367529230423718, "loss": 2.6864, "step": 4910 }, { "crossentropy": 2.720865249633789, "epoch": 0.417744130656686, "grad_norm": 0.03436525538563728, "grad_norm_var": 5.391679469511577e-06, "learning_rate": 0.003365635591813152, "loss": 2.7209, "step": 4911 }, { "crossentropy": 2.6409506797790527, "epoch": 0.4178291936032664, "grad_norm": 0.03664221614599228, "grad_norm_var": 5.456176819102337e-06, "learning_rate": 0.0033637422156344156, "loss": 2.641, "step": 4912 }, { "crossentropy": 2.734724521636963, "epoch": 0.4179142565498469, "grad_norm": 0.03522700443863869, "grad_norm_var": 5.362080372382262e-06, "learning_rate": 0.003361849102191533, "loss": 2.7347, "step": 4913 }, { "crossentropy": 2.674431562423706, "epoch": 0.4179993194964274, "grad_norm": 0.034766510128974915, "grad_norm_var": 5.364195452321289e-06, "learning_rate": 0.003359956251788482, "loss": 2.6744, "step": 4914 }, { "crossentropy": 2.6048574447631836, "epoch": 0.41808438244300783, "grad_norm": 0.03564918041229248, "grad_norm_var": 1.968299073669642e-06, "learning_rate": 0.003358063664729199, "loss": 2.6049, "step": 4915 }, { "crossentropy": 2.635906457901001, "epoch": 0.4181694453895883, "grad_norm": 0.03290606662631035, "grad_norm_var": 2.2023693320198193e-06, "learning_rate": 0.00335617134131758, "loss": 2.6359, "step": 4916 }, { "crossentropy": 2.7216362953186035, "epoch": 0.4182545083361688, "grad_norm": 0.0356065072119236, "grad_norm_var": 1.957224179566635e-06, "learning_rate": 0.003354279281857479, "loss": 2.7216, "step": 4917 }, { "crossentropy": 2.6651992797851562, "epoch": 0.41833957128274923, "grad_norm": 0.03650522977113724, "grad_norm_var": 2.049520696742873e-06, "learning_rate": 0.003352387486652704, "loss": 2.6652, "step": 4918 }, { "crossentropy": 2.621594190597534, "epoch": 0.4184246342293297, "grad_norm": 0.035066764801740646, "grad_norm_var": 1.7879565811677763e-06, "learning_rate": 0.0033504959560070246, "loss": 2.6216, "step": 4919 }, { "crossentropy": 2.656951665878296, "epoch": 0.4185096971759102, "grad_norm": 0.03698248043656349, "grad_norm_var": 1.995760319437913e-06, "learning_rate": 0.0033486046902241663, "loss": 2.657, "step": 4920 }, { "crossentropy": 2.7637076377868652, "epoch": 0.41859476012249064, "grad_norm": 0.034932397305965424, "grad_norm_var": 1.5782773356118793e-06, "learning_rate": 0.0033467136896078086, "loss": 2.7637, "step": 4921 }, { "crossentropy": 2.6745975017547607, "epoch": 0.4186798230690711, "grad_norm": 0.033332161605358124, "grad_norm_var": 1.7041395646508657e-06, "learning_rate": 0.0033448229544615947, "loss": 2.6746, "step": 4922 }, { "crossentropy": 2.6554341316223145, "epoch": 0.4187648860156516, "grad_norm": 0.03326293081045151, "grad_norm_var": 1.8770168651232576e-06, "learning_rate": 0.003342932485089122, "loss": 2.6554, "step": 4923 }, { "crossentropy": 2.626131534576416, "epoch": 0.41884994896223204, "grad_norm": 0.03669522702693939, "grad_norm_var": 1.9741712379353046e-06, "learning_rate": 0.0033410422817939434, "loss": 2.6261, "step": 4924 }, { "crossentropy": 2.70367693901062, "epoch": 0.4189350119088125, "grad_norm": 0.03869180381298065, "grad_norm_var": 2.685381743949775e-06, "learning_rate": 0.0033391523448795734, "loss": 2.7037, "step": 4925 }, { "crossentropy": 2.621281385421753, "epoch": 0.419020074855393, "grad_norm": 0.032734308391809464, "grad_norm_var": 2.75414408221619e-06, "learning_rate": 0.0033372626746494805, "loss": 2.6213, "step": 4926 }, { "crossentropy": 2.6730797290802, "epoch": 0.41910513780197345, "grad_norm": 0.03485806658864021, "grad_norm_var": 2.713791575169264e-06, "learning_rate": 0.0033353732714070934, "loss": 2.6731, "step": 4927 }, { "crossentropy": 2.7800512313842773, "epoch": 0.41919020074855395, "grad_norm": 0.03669919818639755, "grad_norm_var": 2.724639041124371e-06, "learning_rate": 0.0033334841354557923, "loss": 2.7801, "step": 4928 }, { "crossentropy": 2.689194679260254, "epoch": 0.4192752636951344, "grad_norm": 0.03520428389310837, "grad_norm_var": 2.7247250328282327e-06, "learning_rate": 0.003331595267098919, "loss": 2.6892, "step": 4929 }, { "crossentropy": 2.7825441360473633, "epoch": 0.41936032664171485, "grad_norm": 0.04092821478843689, "grad_norm_var": 4.705909741866086e-06, "learning_rate": 0.003329706666639771, "loss": 2.7825, "step": 4930 }, { "crossentropy": 2.6309006214141846, "epoch": 0.41944538958829536, "grad_norm": 0.036439500749111176, "grad_norm_var": 4.7471346120792065e-06, "learning_rate": 0.003327818334381606, "loss": 2.6309, "step": 4931 }, { "crossentropy": 2.6790108680725098, "epoch": 0.4195304525348758, "grad_norm": 0.03378153219819069, "grad_norm_var": 4.471493662510975e-06, "learning_rate": 0.003325930270627632, "loss": 2.679, "step": 4932 }, { "crossentropy": 2.701077699661255, "epoch": 0.41961551548145626, "grad_norm": 0.038090549409389496, "grad_norm_var": 4.815405667553801e-06, "learning_rate": 0.00332404247568102, "loss": 2.7011, "step": 4933 }, { "crossentropy": 2.6974120140075684, "epoch": 0.41970057842803676, "grad_norm": 0.037575505673885345, "grad_norm_var": 4.975109524903097e-06, "learning_rate": 0.0033221549498448967, "loss": 2.6974, "step": 4934 }, { "crossentropy": 2.594881296157837, "epoch": 0.4197856413746172, "grad_norm": 0.03340035304427147, "grad_norm_var": 5.345952478460978e-06, "learning_rate": 0.0033202676934223395, "loss": 2.5949, "step": 4935 }, { "crossentropy": 2.6099889278411865, "epoch": 0.41987070432119766, "grad_norm": 0.033698972314596176, "grad_norm_var": 5.5242233788799566e-06, "learning_rate": 0.0033183807067163917, "loss": 2.61, "step": 4936 }, { "crossentropy": 2.6792216300964355, "epoch": 0.41995576726777817, "grad_norm": 0.03287459537386894, "grad_norm_var": 5.98448787964348e-06, "learning_rate": 0.003316493990030047, "loss": 2.6792, "step": 4937 }, { "crossentropy": 2.7233834266662598, "epoch": 0.4200408302143586, "grad_norm": 0.03649970144033432, "grad_norm_var": 5.688954580634666e-06, "learning_rate": 0.003314607543666258, "loss": 2.7234, "step": 4938 }, { "crossentropy": 2.778653621673584, "epoch": 0.42012589316093907, "grad_norm": 0.038773927837610245, "grad_norm_var": 5.7856095220238175e-06, "learning_rate": 0.003312721367927935, "loss": 2.7787, "step": 4939 }, { "crossentropy": 2.7460179328918457, "epoch": 0.4202109561075196, "grad_norm": 0.043458953499794006, "grad_norm_var": 9.218529888990225e-06, "learning_rate": 0.0033108354631179414, "loss": 2.746, "step": 4940 }, { "crossentropy": 2.7927956581115723, "epoch": 0.4202960190541, "grad_norm": 0.033687107264995575, "grad_norm_var": 9.309274642444778e-06, "learning_rate": 0.0033089498295391017, "loss": 2.7928, "step": 4941 }, { "crossentropy": 2.739823341369629, "epoch": 0.42038108200068053, "grad_norm": 0.035390522330999374, "grad_norm_var": 8.53378785105833e-06, "learning_rate": 0.0033070644674941937, "loss": 2.7398, "step": 4942 }, { "crossentropy": 2.6556384563446045, "epoch": 0.420466144947261, "grad_norm": 0.03353341668844223, "grad_norm_var": 8.904323264082257e-06, "learning_rate": 0.003305179377285949, "loss": 2.6556, "step": 4943 }, { "crossentropy": 2.6223018169403076, "epoch": 0.42055120789384143, "grad_norm": 0.036712463945150375, "grad_norm_var": 8.905124773430212e-06, "learning_rate": 0.003303294559217063, "loss": 2.6223, "step": 4944 }, { "crossentropy": 2.667161703109741, "epoch": 0.42063627084042193, "grad_norm": 0.03396731987595558, "grad_norm_var": 9.173734481469214e-06, "learning_rate": 0.003301410013590179, "loss": 2.6672, "step": 4945 }, { "crossentropy": 2.6491611003875732, "epoch": 0.4207213337870024, "grad_norm": 0.03717277944087982, "grad_norm_var": 7.675533747227246e-06, "learning_rate": 0.0032995257407079036, "loss": 2.6492, "step": 4946 }, { "crossentropy": 2.6284470558166504, "epoch": 0.42080639673358283, "grad_norm": 0.03486320748925209, "grad_norm_var": 7.726072185897508e-06, "learning_rate": 0.0032976417408727976, "loss": 2.6284, "step": 4947 }, { "crossentropy": 2.749894142150879, "epoch": 0.42089145968016334, "grad_norm": 0.033254534006118774, "grad_norm_var": 7.888250956892643e-06, "learning_rate": 0.003295758014387375, "loss": 2.7499, "step": 4948 }, { "crossentropy": 2.5943784713745117, "epoch": 0.4209765226267438, "grad_norm": 0.03374539315700531, "grad_norm_var": 7.746808552274118e-06, "learning_rate": 0.00329387456155411, "loss": 2.5944, "step": 4949 }, { "crossentropy": 2.7442514896392822, "epoch": 0.42106158557332424, "grad_norm": 0.03373732790350914, "grad_norm_var": 7.6248503218050965e-06, "learning_rate": 0.003291991382675429, "loss": 2.7443, "step": 4950 }, { "crossentropy": 2.6675827503204346, "epoch": 0.42114664851990474, "grad_norm": 0.03588000312447548, "grad_norm_var": 7.381688575704553e-06, "learning_rate": 0.0032901084780537155, "loss": 2.6676, "step": 4951 }, { "crossentropy": 2.553556442260742, "epoch": 0.4212317114664852, "grad_norm": 0.034816451370716095, "grad_norm_var": 7.198370087408793e-06, "learning_rate": 0.003288225847991312, "loss": 2.5536, "step": 4952 }, { "crossentropy": 2.6956799030303955, "epoch": 0.42131677441306564, "grad_norm": 0.033376798033714294, "grad_norm_var": 7.0367961795005905e-06, "learning_rate": 0.003286343492790513, "loss": 2.6957, "step": 4953 }, { "crossentropy": 2.6300199031829834, "epoch": 0.42140183735964615, "grad_norm": 0.037701357156038284, "grad_norm_var": 7.2785065580791935e-06, "learning_rate": 0.0032844614127535703, "loss": 2.63, "step": 4954 }, { "crossentropy": 2.658905267715454, "epoch": 0.4214869003062266, "grad_norm": 0.04007943347096443, "grad_norm_var": 7.932375304949464e-06, "learning_rate": 0.003282579608182694, "loss": 2.6589, "step": 4955 }, { "crossentropy": 2.663597583770752, "epoch": 0.4215719632528071, "grad_norm": 0.03961090371012688, "grad_norm_var": 4.882609414520213e-06, "learning_rate": 0.003280698079380048, "loss": 2.6636, "step": 4956 }, { "crossentropy": 2.7055516242980957, "epoch": 0.42165702619938755, "grad_norm": 0.038274575024843216, "grad_norm_var": 5.107040015130358e-06, "learning_rate": 0.003278816826647747, "loss": 2.7056, "step": 4957 }, { "crossentropy": 2.7950923442840576, "epoch": 0.421742089145968, "grad_norm": 0.03465353697538376, "grad_norm_var": 5.177026107333095e-06, "learning_rate": 0.0032769358502878704, "loss": 2.7951, "step": 4958 }, { "crossentropy": 2.7498364448547363, "epoch": 0.4218271520925485, "grad_norm": 0.03629893809556961, "grad_norm_var": 4.851998436967465e-06, "learning_rate": 0.003275055150602446, "loss": 2.7498, "step": 4959 }, { "crossentropy": 2.7126123905181885, "epoch": 0.42191221503912896, "grad_norm": 0.03403285890817642, "grad_norm_var": 5.004794818100344e-06, "learning_rate": 0.0032731747278934627, "loss": 2.7126, "step": 4960 }, { "crossentropy": 2.697681427001953, "epoch": 0.4219972779857094, "grad_norm": 0.03644043579697609, "grad_norm_var": 4.810244468373785e-06, "learning_rate": 0.0032712945824628593, "loss": 2.6977, "step": 4961 }, { "crossentropy": 2.6066319942474365, "epoch": 0.4220823409322899, "grad_norm": 0.035014793276786804, "grad_norm_var": 4.72678361329489e-06, "learning_rate": 0.003269414714612534, "loss": 2.6066, "step": 4962 }, { "crossentropy": 2.679190158843994, "epoch": 0.42216740387887036, "grad_norm": 0.03429240733385086, "grad_norm_var": 4.81359389209775e-06, "learning_rate": 0.003267535124644343, "loss": 2.6792, "step": 4963 }, { "crossentropy": 2.6963894367218018, "epoch": 0.4222524668254508, "grad_norm": 0.03388750180602074, "grad_norm_var": 4.632196161553452e-06, "learning_rate": 0.00326565581286009, "loss": 2.6964, "step": 4964 }, { "crossentropy": 2.8024818897247314, "epoch": 0.4223375297720313, "grad_norm": 0.036036986857652664, "grad_norm_var": 4.350913120156006e-06, "learning_rate": 0.0032637767795615376, "loss": 2.8025, "step": 4965 }, { "crossentropy": 2.578542947769165, "epoch": 0.42242259271861177, "grad_norm": 0.034824296832084656, "grad_norm_var": 4.113729294927953e-06, "learning_rate": 0.0032618980250504075, "loss": 2.5785, "step": 4966 }, { "crossentropy": 2.786428689956665, "epoch": 0.4225076556651922, "grad_norm": 0.03535538911819458, "grad_norm_var": 4.135919736557708e-06, "learning_rate": 0.0032600195496283713, "loss": 2.7864, "step": 4967 }, { "crossentropy": 2.767082452774048, "epoch": 0.4225927186117727, "grad_norm": 0.03494551405310631, "grad_norm_var": 4.1179956491978195e-06, "learning_rate": 0.0032581413535970596, "loss": 2.7671, "step": 4968 }, { "crossentropy": 2.6317129135131836, "epoch": 0.42267778155835317, "grad_norm": 0.034185364842414856, "grad_norm_var": 3.883964699772507e-06, "learning_rate": 0.0032562634372580567, "loss": 2.6317, "step": 4969 }, { "crossentropy": 2.6077442169189453, "epoch": 0.4227628445049337, "grad_norm": 0.03407033160328865, "grad_norm_var": 3.873231097974393e-06, "learning_rate": 0.0032543858009129022, "loss": 2.6077, "step": 4970 }, { "crossentropy": 2.6321840286254883, "epoch": 0.4228479074515141, "grad_norm": 0.033398471772670746, "grad_norm_var": 2.806478876334467e-06, "learning_rate": 0.003252508444863087, "loss": 2.6322, "step": 4971 }, { "crossentropy": 2.715488910675049, "epoch": 0.4229329703980946, "grad_norm": 0.03606090322136879, "grad_norm_var": 1.5690921943343923e-06, "learning_rate": 0.003250631369410064, "loss": 2.7155, "step": 4972 }, { "crossentropy": 2.5997653007507324, "epoch": 0.4230180333446751, "grad_norm": 0.03780835494399071, "grad_norm_var": 1.3860066195432614e-06, "learning_rate": 0.003248754574855235, "loss": 2.5998, "step": 4973 }, { "crossentropy": 2.713257074356079, "epoch": 0.42310309629125553, "grad_norm": 0.03793519735336304, "grad_norm_var": 1.8717732758539879e-06, "learning_rate": 0.0032468780614999617, "loss": 2.7133, "step": 4974 }, { "crossentropy": 2.7616703510284424, "epoch": 0.423188159237836, "grad_norm": 0.03651517257094383, "grad_norm_var": 1.9038787297307365e-06, "learning_rate": 0.0032450018296455553, "loss": 2.7617, "step": 4975 }, { "crossentropy": 2.6324241161346436, "epoch": 0.4232732221844165, "grad_norm": 0.0338045172393322, "grad_norm_var": 1.945723863350177e-06, "learning_rate": 0.003243125879593286, "loss": 2.6324, "step": 4976 }, { "crossentropy": 2.629157543182373, "epoch": 0.42335828513099694, "grad_norm": 0.03536604717373848, "grad_norm_var": 1.8524900350854722e-06, "learning_rate": 0.003241250211644378, "loss": 2.6292, "step": 4977 }, { "crossentropy": 2.69688081741333, "epoch": 0.4234433480775774, "grad_norm": 0.03320356830954552, "grad_norm_var": 2.106797262526004e-06, "learning_rate": 0.00323937482610001, "loss": 2.6969, "step": 4978 }, { "crossentropy": 2.6783623695373535, "epoch": 0.4235284110241579, "grad_norm": 0.0322997160255909, "grad_norm_var": 2.5710394138279855e-06, "learning_rate": 0.0032374997232613124, "loss": 2.6784, "step": 4979 }, { "crossentropy": 2.633687734603882, "epoch": 0.42361347397073834, "grad_norm": 0.03433997929096222, "grad_norm_var": 2.5178592686319706e-06, "learning_rate": 0.0032356249034293737, "loss": 2.6337, "step": 4980 }, { "crossentropy": 2.7206344604492188, "epoch": 0.4236985369173188, "grad_norm": 0.0343390628695488, "grad_norm_var": 2.4653998043753024e-06, "learning_rate": 0.003233750366905236, "loss": 2.7206, "step": 4981 }, { "crossentropy": 2.6272335052490234, "epoch": 0.4237835998638993, "grad_norm": 0.03705957531929016, "grad_norm_var": 2.7541502991736943e-06, "learning_rate": 0.003231876113989897, "loss": 2.6272, "step": 4982 }, { "crossentropy": 2.770416498184204, "epoch": 0.42386866281047975, "grad_norm": 0.037352554500103, "grad_norm_var": 3.0866417296844415e-06, "learning_rate": 0.0032300021449843054, "loss": 2.7704, "step": 4983 }, { "crossentropy": 2.6244828701019287, "epoch": 0.42395372575706025, "grad_norm": 0.03504335135221481, "grad_norm_var": 3.084340656337818e-06, "learning_rate": 0.0032281284601893678, "loss": 2.6245, "step": 4984 }, { "crossentropy": 2.7021353244781494, "epoch": 0.4240387887036407, "grad_norm": 0.03559878468513489, "grad_norm_var": 3.022907751684678e-06, "learning_rate": 0.003226255059905947, "loss": 2.7021, "step": 4985 }, { "crossentropy": 2.5906131267547607, "epoch": 0.42412385165022115, "grad_norm": 0.03643806651234627, "grad_norm_var": 2.997015174269202e-06, "learning_rate": 0.0032243819444348525, "loss": 2.5906, "step": 4986 }, { "crossentropy": 2.6681206226348877, "epoch": 0.42420891459680166, "grad_norm": 0.0385732501745224, "grad_norm_var": 3.2826225458542726e-06, "learning_rate": 0.0032225091140768536, "loss": 2.6681, "step": 4987 }, { "crossentropy": 2.6744062900543213, "epoch": 0.4242939775433821, "grad_norm": 0.04211602732539177, "grad_norm_var": 5.838378449728639e-06, "learning_rate": 0.0032206365691326757, "loss": 2.6744, "step": 4988 }, { "crossentropy": 2.645935535430908, "epoch": 0.42437904048996256, "grad_norm": 0.033713456243276596, "grad_norm_var": 5.96024559695411e-06, "learning_rate": 0.0032187643099029916, "loss": 2.6459, "step": 4989 }, { "crossentropy": 2.713881492614746, "epoch": 0.42446410343654306, "grad_norm": 0.03468805178999901, "grad_norm_var": 5.719111512549e-06, "learning_rate": 0.0032168923366884352, "loss": 2.7139, "step": 4990 }, { "crossentropy": 2.6564459800720215, "epoch": 0.4245491663831235, "grad_norm": 0.03688529133796692, "grad_norm_var": 5.770210943851587e-06, "learning_rate": 0.0032150206497895896, "loss": 2.6564, "step": 4991 }, { "crossentropy": 2.720412254333496, "epoch": 0.42463422932970396, "grad_norm": 0.03876575827598572, "grad_norm_var": 6.0703777456510545e-06, "learning_rate": 0.003213149249506997, "loss": 2.7204, "step": 4992 }, { "crossentropy": 2.8059911727905273, "epoch": 0.42471929227628447, "grad_norm": 0.03850403428077698, "grad_norm_var": 6.426254690325428e-06, "learning_rate": 0.003211278136141146, "loss": 2.806, "step": 4993 }, { "crossentropy": 2.6732025146484375, "epoch": 0.4248043552228649, "grad_norm": 0.032783426344394684, "grad_norm_var": 6.604165556335119e-06, "learning_rate": 0.0032094073099924857, "loss": 2.6732, "step": 4994 }, { "crossentropy": 2.645258903503418, "epoch": 0.42488941816944537, "grad_norm": 0.03522491082549095, "grad_norm_var": 5.634805510549602e-06, "learning_rate": 0.003207536771361416, "loss": 2.6453, "step": 4995 }, { "crossentropy": 2.688507080078125, "epoch": 0.42497448111602587, "grad_norm": 0.03664468601346016, "grad_norm_var": 5.352467182016124e-06, "learning_rate": 0.003205666520548294, "loss": 2.6885, "step": 4996 }, { "crossentropy": 2.6654233932495117, "epoch": 0.4250595440626063, "grad_norm": 0.03172944113612175, "grad_norm_var": 6.524131775776688e-06, "learning_rate": 0.0032037965578534246, "loss": 2.6654, "step": 4997 }, { "crossentropy": 2.709397077560425, "epoch": 0.4251446070091868, "grad_norm": 0.03633667156100273, "grad_norm_var": 6.485512083448155e-06, "learning_rate": 0.003201926883577072, "loss": 2.7094, "step": 4998 }, { "crossentropy": 2.72806978225708, "epoch": 0.4252296699557673, "grad_norm": 0.033235814422369, "grad_norm_var": 6.953188883224564e-06, "learning_rate": 0.003200057498019453, "loss": 2.7281, "step": 4999 }, { "crossentropy": 2.6674885749816895, "epoch": 0.4253147329023477, "grad_norm": 0.034464031457901, "grad_norm_var": 7.0494153652571215e-06, "learning_rate": 0.003198188401480734, "loss": 2.6675, "step": 5000 }, { "crossentropy": 2.5991806983947754, "epoch": 0.42539979584892823, "grad_norm": 0.0334901288151741, "grad_norm_var": 7.434878825931095e-06, "learning_rate": 0.0031963195942610397, "loss": 2.5992, "step": 5001 }, { "crossentropy": 2.6544833183288574, "epoch": 0.4254848587955087, "grad_norm": 0.035505473613739014, "grad_norm_var": 7.416059327066081e-06, "learning_rate": 0.003194451076660447, "loss": 2.6545, "step": 5002 }, { "crossentropy": 2.727184295654297, "epoch": 0.42556992174208913, "grad_norm": 0.034700680524110794, "grad_norm_var": 6.916908437920993e-06, "learning_rate": 0.0031925828489789854, "loss": 2.7272, "step": 5003 }, { "crossentropy": 2.7108981609344482, "epoch": 0.42565498468866964, "grad_norm": 0.035752855241298676, "grad_norm_var": 3.87612016626455e-06, "learning_rate": 0.0031907149115166403, "loss": 2.7109, "step": 5004 }, { "crossentropy": 2.6704373359680176, "epoch": 0.4257400476352501, "grad_norm": 0.03391673043370247, "grad_norm_var": 3.839725861090604e-06, "learning_rate": 0.0031888472645733444, "loss": 2.6704, "step": 5005 }, { "crossentropy": 2.6886279582977295, "epoch": 0.42582511058183053, "grad_norm": 0.036633990705013275, "grad_norm_var": 3.952839794488644e-06, "learning_rate": 0.0031869799084489935, "loss": 2.6886, "step": 5006 }, { "crossentropy": 2.649130344390869, "epoch": 0.42591017352841104, "grad_norm": 0.03345496580004692, "grad_norm_var": 3.95674765322739e-06, "learning_rate": 0.003185112843443426, "loss": 2.6491, "step": 5007 }, { "crossentropy": 2.772925615310669, "epoch": 0.4259952364749915, "grad_norm": 0.038983214646577835, "grad_norm_var": 4.066815833210444e-06, "learning_rate": 0.003183246069856443, "loss": 2.7729, "step": 5008 }, { "crossentropy": 2.642491102218628, "epoch": 0.42608029942157194, "grad_norm": 0.03347526490688324, "grad_norm_var": 3.3549213501135947e-06, "learning_rate": 0.003181379587987791, "loss": 2.6425, "step": 5009 }, { "crossentropy": 2.6401889324188232, "epoch": 0.42616536236815244, "grad_norm": 0.0327175036072731, "grad_norm_var": 3.3726610957527174e-06, "learning_rate": 0.003179513398137176, "loss": 2.6402, "step": 5010 }, { "crossentropy": 2.653043031692505, "epoch": 0.4262504253147329, "grad_norm": 0.03223313018679619, "grad_norm_var": 3.749280037514992e-06, "learning_rate": 0.0031776475006042517, "loss": 2.653, "step": 5011 }, { "crossentropy": 2.6681771278381348, "epoch": 0.42633548826131334, "grad_norm": 0.03357398137450218, "grad_norm_var": 3.4931293195811576e-06, "learning_rate": 0.00317578189568863, "loss": 2.6682, "step": 5012 }, { "crossentropy": 2.767577648162842, "epoch": 0.42642055120789385, "grad_norm": 0.03380807489156723, "grad_norm_var": 3.0264229135323894e-06, "learning_rate": 0.0031739165836898713, "loss": 2.7676, "step": 5013 }, { "crossentropy": 2.6698317527770996, "epoch": 0.4265056141544743, "grad_norm": 0.034995418041944504, "grad_norm_var": 2.813556687460602e-06, "learning_rate": 0.0031720515649074944, "loss": 2.6698, "step": 5014 }, { "crossentropy": 2.624816656112671, "epoch": 0.4265906771010548, "grad_norm": 0.03903429955244064, "grad_norm_var": 3.988736179258376e-06, "learning_rate": 0.0031701868396409624, "loss": 2.6248, "step": 5015 }, { "crossentropy": 2.628143548965454, "epoch": 0.42667574004763525, "grad_norm": 0.034094829112291336, "grad_norm_var": 4.013608904591496e-06, "learning_rate": 0.0031683224081897, "loss": 2.6281, "step": 5016 }, { "crossentropy": 2.7313144207000732, "epoch": 0.4267608029942157, "grad_norm": 0.03407282009720802, "grad_norm_var": 3.9351480871543365e-06, "learning_rate": 0.003166458270853079, "loss": 2.7313, "step": 5017 }, { "crossentropy": 2.685404062271118, "epoch": 0.4268458659407962, "grad_norm": 0.0326489694416523, "grad_norm_var": 4.1800798992501035e-06, "learning_rate": 0.0031645944279304295, "loss": 2.6854, "step": 5018 }, { "crossentropy": 2.861783266067505, "epoch": 0.42693092888737666, "grad_norm": 0.03902754932641983, "grad_norm_var": 5.390365529413186e-06, "learning_rate": 0.003162730879721027, "loss": 2.8618, "step": 5019 }, { "crossentropy": 2.723942279815674, "epoch": 0.4270159918339571, "grad_norm": 0.03454877436161041, "grad_norm_var": 5.344294586462755e-06, "learning_rate": 0.0031608676265241064, "loss": 2.7239, "step": 5020 }, { "crossentropy": 2.7822084426879883, "epoch": 0.4271010547805376, "grad_norm": 0.036468248814344406, "grad_norm_var": 5.441774459715292e-06, "learning_rate": 0.003159004668638853, "loss": 2.7822, "step": 5021 }, { "crossentropy": 2.692650079727173, "epoch": 0.42718611772711806, "grad_norm": 0.034423552453517914, "grad_norm_var": 5.26135610369177e-06, "learning_rate": 0.0031571420063644007, "loss": 2.6927, "step": 5022 }, { "crossentropy": 2.7767410278320312, "epoch": 0.4272711806736985, "grad_norm": 0.037057727575302124, "grad_norm_var": 5.403652268909145e-06, "learning_rate": 0.0031552796399998417, "loss": 2.7767, "step": 5023 }, { "crossentropy": 2.632838726043701, "epoch": 0.427356243620279, "grad_norm": 0.03278481215238571, "grad_norm_var": 4.573063377611742e-06, "learning_rate": 0.003153417569844219, "loss": 2.6328, "step": 5024 }, { "crossentropy": 2.612034559249878, "epoch": 0.42744130656685947, "grad_norm": 0.03663715347647667, "grad_norm_var": 4.687772660503381e-06, "learning_rate": 0.0031515557961965253, "loss": 2.612, "step": 5025 }, { "crossentropy": 2.7012782096862793, "epoch": 0.4275263695134399, "grad_norm": 0.036395639181137085, "grad_norm_var": 4.471352084545188e-06, "learning_rate": 0.00314969431935571, "loss": 2.7013, "step": 5026 }, { "crossentropy": 2.6486716270446777, "epoch": 0.4276114324600204, "grad_norm": 0.034072842448949814, "grad_norm_var": 3.976514634332351e-06, "learning_rate": 0.0031478331396206705, "loss": 2.6487, "step": 5027 }, { "crossentropy": 2.6738054752349854, "epoch": 0.4276964954066009, "grad_norm": 0.0334777794778347, "grad_norm_var": 3.998306371769123e-06, "learning_rate": 0.0031459722572902624, "loss": 2.6738, "step": 5028 }, { "crossentropy": 2.715590000152588, "epoch": 0.4277815583531814, "grad_norm": 0.036104440689086914, "grad_norm_var": 3.895035957066778e-06, "learning_rate": 0.0031441116726632855, "loss": 2.7156, "step": 5029 }, { "crossentropy": 2.686591386795044, "epoch": 0.42786662129976183, "grad_norm": 0.03707768768072128, "grad_norm_var": 4.063332880369028e-06, "learning_rate": 0.003142251386038497, "loss": 2.6866, "step": 5030 }, { "crossentropy": 2.604104518890381, "epoch": 0.4279516842463423, "grad_norm": 0.0341869480907917, "grad_norm_var": 3.244674556731238e-06, "learning_rate": 0.003140391397714606, "loss": 2.6041, "step": 5031 }, { "crossentropy": 2.7156741619110107, "epoch": 0.4280367471929228, "grad_norm": 0.034476716071367264, "grad_norm_var": 3.1978986348546455e-06, "learning_rate": 0.003138531707990274, "loss": 2.7157, "step": 5032 }, { "crossentropy": 2.629866600036621, "epoch": 0.42812181013950323, "grad_norm": 0.03315192833542824, "grad_norm_var": 3.391310676792436e-06, "learning_rate": 0.0031366723171641128, "loss": 2.6299, "step": 5033 }, { "crossentropy": 2.6516947746276855, "epoch": 0.4282068730860837, "grad_norm": 0.03300253674387932, "grad_norm_var": 3.280804678647507e-06, "learning_rate": 0.0031348132255346875, "loss": 2.6517, "step": 5034 }, { "crossentropy": 2.677643060684204, "epoch": 0.4282919360326642, "grad_norm": 0.03380874916911125, "grad_norm_var": 2.3063915781689735e-06, "learning_rate": 0.0031329544334005146, "loss": 2.6776, "step": 5035 }, { "crossentropy": 2.6384706497192383, "epoch": 0.42837699897924464, "grad_norm": 0.03184041008353233, "grad_norm_var": 2.875325903741131e-06, "learning_rate": 0.0031310959410600593, "loss": 2.6385, "step": 5036 }, { "crossentropy": 2.6529905796051025, "epoch": 0.4284620619258251, "grad_norm": 0.03308205306529999, "grad_norm_var": 2.7870494604605975e-06, "learning_rate": 0.003129237748811746, "loss": 2.653, "step": 5037 }, { "crossentropy": 2.628326892852783, "epoch": 0.4285471248724056, "grad_norm": 0.03437286242842674, "grad_norm_var": 2.7875497341002995e-06, "learning_rate": 0.0031273798569539457, "loss": 2.6283, "step": 5038 }, { "crossentropy": 2.602034330368042, "epoch": 0.42863218781898604, "grad_norm": 0.03476504608988762, "grad_norm_var": 2.3252258623493797e-06, "learning_rate": 0.0031255222657849808, "loss": 2.602, "step": 5039 }, { "crossentropy": 2.6396398544311523, "epoch": 0.4287172507655665, "grad_norm": 0.03515492379665375, "grad_norm_var": 2.1888501634337656e-06, "learning_rate": 0.00312366497560313, "loss": 2.6396, "step": 5040 }, { "crossentropy": 2.6709537506103516, "epoch": 0.428802313712147, "grad_norm": 0.03520871698856354, "grad_norm_var": 1.9046690909942098e-06, "learning_rate": 0.003121807986706617, "loss": 2.671, "step": 5041 }, { "crossentropy": 2.680119276046753, "epoch": 0.42888737665872745, "grad_norm": 0.03760175779461861, "grad_norm_var": 2.318738015223178e-06, "learning_rate": 0.0031199512993936246, "loss": 2.6801, "step": 5042 }, { "crossentropy": 2.8344151973724365, "epoch": 0.42897243960530795, "grad_norm": 0.03570952266454697, "grad_norm_var": 2.4013246513000796e-06, "learning_rate": 0.0031180949139622805, "loss": 2.8344, "step": 5043 }, { "crossentropy": 2.737992525100708, "epoch": 0.4290575025518884, "grad_norm": 0.036741893738508224, "grad_norm_var": 2.594539799331136e-06, "learning_rate": 0.003116238830710666, "loss": 2.738, "step": 5044 }, { "crossentropy": 2.723973274230957, "epoch": 0.42914256549846885, "grad_norm": 0.03416772186756134, "grad_norm_var": 2.4838326514358783e-06, "learning_rate": 0.0031143830499368165, "loss": 2.724, "step": 5045 }, { "crossentropy": 2.7661895751953125, "epoch": 0.42922762844504936, "grad_norm": 0.03493094816803932, "grad_norm_var": 2.0760776936264044e-06, "learning_rate": 0.003112527571938717, "loss": 2.7662, "step": 5046 }, { "crossentropy": 2.7062416076660156, "epoch": 0.4293126913916298, "grad_norm": 0.03415597602725029, "grad_norm_var": 2.0774827557735904e-06, "learning_rate": 0.0031106723970143013, "loss": 2.7062, "step": 5047 }, { "crossentropy": 2.6484546661376953, "epoch": 0.42939775433821026, "grad_norm": 0.03434973955154419, "grad_norm_var": 2.0790663961246792e-06, "learning_rate": 0.0031088175254614615, "loss": 2.6485, "step": 5048 }, { "crossentropy": 2.634789228439331, "epoch": 0.42948281728479076, "grad_norm": 0.03343714773654938, "grad_norm_var": 2.032778168900053e-06, "learning_rate": 0.0031069629575780316, "loss": 2.6348, "step": 5049 }, { "crossentropy": 2.730177402496338, "epoch": 0.4295678802313712, "grad_norm": 0.03369505703449249, "grad_norm_var": 1.92257790078675e-06, "learning_rate": 0.003105108693661807, "loss": 2.7302, "step": 5050 }, { "crossentropy": 2.6489980220794678, "epoch": 0.42965294317795166, "grad_norm": 0.03513858839869499, "grad_norm_var": 1.899208799166265e-06, "learning_rate": 0.0031032547340105244, "loss": 2.649, "step": 5051 }, { "crossentropy": 2.6583588123321533, "epoch": 0.42973800612453217, "grad_norm": 0.03664575517177582, "grad_norm_var": 1.544185261449724e-06, "learning_rate": 0.003101401078921878, "loss": 2.6584, "step": 5052 }, { "crossentropy": 2.6588306427001953, "epoch": 0.4298230690711126, "grad_norm": 0.03429270535707474, "grad_norm_var": 1.3346922809603345e-06, "learning_rate": 0.0030995477286935103, "loss": 2.6588, "step": 5053 }, { "crossentropy": 2.740236282348633, "epoch": 0.42990813201769307, "grad_norm": 0.03520604968070984, "grad_norm_var": 1.3058524792791219e-06, "learning_rate": 0.003097694683623019, "loss": 2.7402, "step": 5054 }, { "crossentropy": 2.649181365966797, "epoch": 0.4299931949642736, "grad_norm": 0.0351644903421402, "grad_norm_var": 1.2993116437713554e-06, "learning_rate": 0.0030958419440079467, "loss": 2.6492, "step": 5055 }, { "crossentropy": 2.7442986965179443, "epoch": 0.430078257910854, "grad_norm": 0.03305218368768692, "grad_norm_var": 1.5602750855649045e-06, "learning_rate": 0.0030939895101457916, "loss": 2.7443, "step": 5056 }, { "crossentropy": 2.7206242084503174, "epoch": 0.43016332085743453, "grad_norm": 0.03745918720960617, "grad_norm_var": 1.9488514810330734e-06, "learning_rate": 0.0030921373823340026, "loss": 2.7206, "step": 5057 }, { "crossentropy": 2.6690292358398438, "epoch": 0.430248383804015, "grad_norm": 0.05152489244937897, "grad_norm_var": 1.8691758561883755e-05, "learning_rate": 0.003090285560869973, "loss": 2.669, "step": 5058 }, { "crossentropy": 2.6791372299194336, "epoch": 0.4303334467505954, "grad_norm": 0.03594798594713211, "grad_norm_var": 1.8686728930427988e-05, "learning_rate": 0.003088434046051057, "loss": 2.6791, "step": 5059 }, { "crossentropy": 2.670558452606201, "epoch": 0.43041850969717593, "grad_norm": 0.04126198962330818, "grad_norm_var": 2.041418514628912e-05, "learning_rate": 0.0030865828381745515, "loss": 2.6706, "step": 5060 }, { "crossentropy": 2.658785343170166, "epoch": 0.4305035726437564, "grad_norm": 0.03865784406661987, "grad_norm_var": 2.041153029123018e-05, "learning_rate": 0.003084731937537708, "loss": 2.6588, "step": 5061 }, { "crossentropy": 2.6404716968536377, "epoch": 0.43058863559033683, "grad_norm": 0.036083389073610306, "grad_norm_var": 2.024459862129957e-05, "learning_rate": 0.00308288134443773, "loss": 2.6405, "step": 5062 }, { "crossentropy": 2.738630771636963, "epoch": 0.43067369853691734, "grad_norm": 0.036152951419353485, "grad_norm_var": 1.983521784248565e-05, "learning_rate": 0.003081031059171766, "loss": 2.7386, "step": 5063 }, { "crossentropy": 2.6569983959198, "epoch": 0.4307587614834978, "grad_norm": 0.03482365608215332, "grad_norm_var": 1.9697309119620124e-05, "learning_rate": 0.0030791810820369222, "loss": 2.657, "step": 5064 }, { "crossentropy": 2.741380453109741, "epoch": 0.43084382443007824, "grad_norm": 0.03507736697793007, "grad_norm_var": 1.9133512935098907e-05, "learning_rate": 0.003077331413330248, "loss": 2.7414, "step": 5065 }, { "crossentropy": 2.693185806274414, "epoch": 0.43092888737665874, "grad_norm": 0.03669837862253189, "grad_norm_var": 1.8419266297196175e-05, "learning_rate": 0.003075482053348748, "loss": 2.6932, "step": 5066 }, { "crossentropy": 2.72522234916687, "epoch": 0.4310139503232392, "grad_norm": 0.03376811370253563, "grad_norm_var": 1.8890350527753925e-05, "learning_rate": 0.003073633002389377, "loss": 2.7252, "step": 5067 }, { "crossentropy": 2.6890580654144287, "epoch": 0.43109901326981964, "grad_norm": 0.035235557705163956, "grad_norm_var": 1.907909767513051e-05, "learning_rate": 0.003071784260749038, "loss": 2.6891, "step": 5068 }, { "crossentropy": 2.6650002002716064, "epoch": 0.43118407621640015, "grad_norm": 0.03499731793999672, "grad_norm_var": 1.886513700367053e-05, "learning_rate": 0.003069935828724587, "loss": 2.665, "step": 5069 }, { "crossentropy": 2.6512932777404785, "epoch": 0.4312691391629806, "grad_norm": 0.03389284014701843, "grad_norm_var": 1.927730566607088e-05, "learning_rate": 0.0030680877066128287, "loss": 2.6513, "step": 5070 }, { "crossentropy": 2.6855056285858154, "epoch": 0.4313542021095611, "grad_norm": 0.03560676425695419, "grad_norm_var": 1.918940650754893e-05, "learning_rate": 0.0030662398947105193, "loss": 2.6855, "step": 5071 }, { "crossentropy": 2.6154966354370117, "epoch": 0.43143926505614155, "grad_norm": 0.034676484763622284, "grad_norm_var": 1.852312873927194e-05, "learning_rate": 0.0030643923933143602, "loss": 2.6155, "step": 5072 }, { "crossentropy": 2.668095588684082, "epoch": 0.431524328002722, "grad_norm": 0.050079528242349625, "grad_norm_var": 2.9264598949274522e-05, "learning_rate": 0.003062545202721011, "loss": 2.6681, "step": 5073 }, { "crossentropy": 2.7036495208740234, "epoch": 0.4316093909493025, "grad_norm": 0.03444653004407883, "grad_norm_var": 1.619602253861424e-05, "learning_rate": 0.0030606983232270746, "loss": 2.7036, "step": 5074 }, { "crossentropy": 2.6305179595947266, "epoch": 0.43169445389588296, "grad_norm": 0.0322396419942379, "grad_norm_var": 1.7433728783205805e-05, "learning_rate": 0.003058851755129109, "loss": 2.6305, "step": 5075 }, { "crossentropy": 2.731400728225708, "epoch": 0.4317795168424634, "grad_norm": 0.035709116607904434, "grad_norm_var": 1.5821223894172536e-05, "learning_rate": 0.0030570054987236173, "loss": 2.7314, "step": 5076 }, { "crossentropy": 2.647610664367676, "epoch": 0.4318645797890439, "grad_norm": 0.032235465943813324, "grad_norm_var": 1.6238026199440667e-05, "learning_rate": 0.0030551595543070564, "loss": 2.6476, "step": 5077 }, { "crossentropy": 2.6772592067718506, "epoch": 0.43194964273562436, "grad_norm": 0.03553858399391174, "grad_norm_var": 1.6231102255534885e-05, "learning_rate": 0.0030533139221758333, "loss": 2.6773, "step": 5078 }, { "crossentropy": 2.7021961212158203, "epoch": 0.4320347056822048, "grad_norm": 0.034351423382759094, "grad_norm_var": 1.6324819785855774e-05, "learning_rate": 0.003051468602626302, "loss": 2.7022, "step": 5079 }, { "crossentropy": 2.629467725753784, "epoch": 0.4321197686287853, "grad_norm": 0.033572521060705185, "grad_norm_var": 1.6549834185701032e-05, "learning_rate": 0.003049623595954766, "loss": 2.6295, "step": 5080 }, { "crossentropy": 2.6890928745269775, "epoch": 0.43220483157536577, "grad_norm": 0.03453543782234192, "grad_norm_var": 1.659929530286221e-05, "learning_rate": 0.0030477789024574826, "loss": 2.6891, "step": 5081 }, { "crossentropy": 2.7548828125, "epoch": 0.4322898945219462, "grad_norm": 0.03536780923604965, "grad_norm_var": 1.6492726881267976e-05, "learning_rate": 0.0030459345224306544, "loss": 2.7549, "step": 5082 }, { "crossentropy": 2.7456583976745605, "epoch": 0.4323749574685267, "grad_norm": 0.03510737046599388, "grad_norm_var": 1.6315064517826752e-05, "learning_rate": 0.003044090456170437, "loss": 2.7457, "step": 5083 }, { "crossentropy": 2.6885428428649902, "epoch": 0.43246002041510717, "grad_norm": 0.03566615283489227, "grad_norm_var": 1.631293304735007e-05, "learning_rate": 0.0030422467039729353, "loss": 2.6885, "step": 5084 }, { "crossentropy": 2.7508270740509033, "epoch": 0.4325450833616876, "grad_norm": 0.03527249023318291, "grad_norm_var": 1.6299169595396233e-05, "learning_rate": 0.0030404032661341994, "loss": 2.7508, "step": 5085 }, { "crossentropy": 2.5816338062286377, "epoch": 0.4326301463082681, "grad_norm": 0.05797816067934036, "grad_norm_var": 4.733454727306255e-05, "learning_rate": 0.0030385601429502376, "loss": 2.5816, "step": 5086 }, { "crossentropy": 2.703303098678589, "epoch": 0.4327152092548486, "grad_norm": 0.040895555168390274, "grad_norm_var": 4.8083381818239156e-05, "learning_rate": 0.0030367173347169976, "loss": 2.7033, "step": 5087 }, { "crossentropy": 2.568814277648926, "epoch": 0.4328002722014291, "grad_norm": 0.03341123089194298, "grad_norm_var": 4.863522145396107e-05, "learning_rate": 0.0030348748417303823, "loss": 2.5688, "step": 5088 }, { "crossentropy": 2.7256627082824707, "epoch": 0.43288533514800953, "grad_norm": 0.03283122181892395, "grad_norm_var": 3.778271136950917e-05, "learning_rate": 0.003033032664286245, "loss": 2.7257, "step": 5089 }, { "crossentropy": 2.5893685817718506, "epoch": 0.43297039809459, "grad_norm": 0.03386343643069267, "grad_norm_var": 3.794008558343197e-05, "learning_rate": 0.003031190802680383, "loss": 2.5894, "step": 5090 }, { "crossentropy": 2.7282586097717285, "epoch": 0.4330554610411705, "grad_norm": 0.03261060640215874, "grad_norm_var": 3.7754729774471205e-05, "learning_rate": 0.003029349257208549, "loss": 2.7283, "step": 5091 }, { "crossentropy": 2.792973279953003, "epoch": 0.43314052398775094, "grad_norm": 0.03574763238430023, "grad_norm_var": 3.775238292831971e-05, "learning_rate": 0.0030275080281664413, "loss": 2.793, "step": 5092 }, { "crossentropy": 2.659808874130249, "epoch": 0.4332255869343314, "grad_norm": 0.034514620900154114, "grad_norm_var": 3.687635208676297e-05, "learning_rate": 0.00302566711584971, "loss": 2.6598, "step": 5093 }, { "crossentropy": 2.6392745971679688, "epoch": 0.4333106498809119, "grad_norm": 0.03439854457974434, "grad_norm_var": 3.7077732500404026e-05, "learning_rate": 0.0030238265205539483, "loss": 2.6393, "step": 5094 }, { "crossentropy": 2.741805076599121, "epoch": 0.43339571282749234, "grad_norm": 0.03495563194155693, "grad_norm_var": 3.694697232260498e-05, "learning_rate": 0.003021986242574707, "loss": 2.7418, "step": 5095 }, { "crossentropy": 2.6130921840667725, "epoch": 0.4334807757740728, "grad_norm": 0.034111060202121735, "grad_norm_var": 3.6769572851678284e-05, "learning_rate": 0.0030201462822074788, "loss": 2.6131, "step": 5096 }, { "crossentropy": 2.6225693225860596, "epoch": 0.4335658387206533, "grad_norm": 0.03299726918339729, "grad_norm_var": 3.728532363713958e-05, "learning_rate": 0.0030183066397477098, "loss": 2.6226, "step": 5097 }, { "crossentropy": 2.5093822479248047, "epoch": 0.43365090166723375, "grad_norm": 0.04351166635751724, "grad_norm_var": 4.049095501821119e-05, "learning_rate": 0.003016467315490793, "loss": 2.5094, "step": 5098 }, { "crossentropy": 2.5628395080566406, "epoch": 0.4337359646138142, "grad_norm": 0.034341659396886826, "grad_norm_var": 4.069449094933429e-05, "learning_rate": 0.0030146283097320704, "loss": 2.5628, "step": 5099 }, { "crossentropy": 2.6604065895080566, "epoch": 0.4338210275603947, "grad_norm": 0.035386890172958374, "grad_norm_var": 4.073764392841742e-05, "learning_rate": 0.0030127896227668365, "loss": 2.6604, "step": 5100 }, { "crossentropy": 2.655540704727173, "epoch": 0.43390609050697515, "grad_norm": 0.04101090878248215, "grad_norm_var": 4.172131914617974e-05, "learning_rate": 0.0030109512548903285, "loss": 2.6555, "step": 5101 }, { "crossentropy": 2.703669548034668, "epoch": 0.43399115345355566, "grad_norm": 0.036113083362579346, "grad_norm_var": 1.0546019807636253e-05, "learning_rate": 0.0030091132063977343, "loss": 2.7037, "step": 5102 }, { "crossentropy": 2.680058002471924, "epoch": 0.4340762164001361, "grad_norm": 0.03228556364774704, "grad_norm_var": 9.17897354743933e-06, "learning_rate": 0.0030072754775841938, "loss": 2.6801, "step": 5103 }, { "crossentropy": 2.7901973724365234, "epoch": 0.43416127934671656, "grad_norm": 0.035310979932546616, "grad_norm_var": 8.969000390832091e-06, "learning_rate": 0.003005438068744792, "loss": 2.7902, "step": 5104 }, { "crossentropy": 2.6001813411712646, "epoch": 0.43424634229329706, "grad_norm": 0.0346478633582592, "grad_norm_var": 8.589527970373489e-06, "learning_rate": 0.003003600980174565, "loss": 2.6002, "step": 5105 }, { "crossentropy": 2.705173969268799, "epoch": 0.4343314052398775, "grad_norm": 0.036529332399368286, "grad_norm_var": 8.500704440417226e-06, "learning_rate": 0.003001764212168495, "loss": 2.7052, "step": 5106 }, { "crossentropy": 2.7496793270111084, "epoch": 0.43441646818645796, "grad_norm": 0.03901702165603638, "grad_norm_var": 8.572483189387553e-06, "learning_rate": 0.002999927765021516, "loss": 2.7497, "step": 5107 }, { "crossentropy": 2.614234209060669, "epoch": 0.43450153113303847, "grad_norm": 0.034936320036649704, "grad_norm_var": 8.633348198119374e-06, "learning_rate": 0.0029980916390285064, "loss": 2.6142, "step": 5108 }, { "crossentropy": 2.729468822479248, "epoch": 0.4345865940796189, "grad_norm": 0.03329259529709816, "grad_norm_var": 8.949034820957685e-06, "learning_rate": 0.002996255834484296, "loss": 2.7295, "step": 5109 }, { "crossentropy": 2.653937578201294, "epoch": 0.43467165702619937, "grad_norm": 0.033452894538640976, "grad_norm_var": 9.18199612103913e-06, "learning_rate": 0.002994420351683661, "loss": 2.6539, "step": 5110 }, { "crossentropy": 2.586393117904663, "epoch": 0.43475671997277987, "grad_norm": 0.03621908277273178, "grad_norm_var": 9.148991121667576e-06, "learning_rate": 0.00299258519092133, "loss": 2.5864, "step": 5111 }, { "crossentropy": 2.745901584625244, "epoch": 0.4348417829193603, "grad_norm": 0.034584831446409225, "grad_norm_var": 9.054892479309905e-06, "learning_rate": 0.0029907503524919735, "loss": 2.7459, "step": 5112 }, { "crossentropy": 2.704796075820923, "epoch": 0.43492684586594077, "grad_norm": 0.03436710685491562, "grad_norm_var": 8.650700494922982e-06, "learning_rate": 0.0029889158366902153, "loss": 2.7048, "step": 5113 }, { "crossentropy": 2.5805952548980713, "epoch": 0.4350119088125213, "grad_norm": 0.033553462475538254, "grad_norm_var": 4.792532770706709e-06, "learning_rate": 0.0029870816438106254, "loss": 2.5806, "step": 5114 }, { "crossentropy": 2.6653757095336914, "epoch": 0.4350969717591017, "grad_norm": 0.035956546664237976, "grad_norm_var": 4.745816890949714e-06, "learning_rate": 0.0029852477741477246, "loss": 2.6654, "step": 5115 }, { "crossentropy": 2.645582675933838, "epoch": 0.43518203470568223, "grad_norm": 0.033874813467264175, "grad_norm_var": 4.89469112378386e-06, "learning_rate": 0.002983414227995975, "loss": 2.6456, "step": 5116 }, { "crossentropy": 2.780917167663574, "epoch": 0.4352670976522627, "grad_norm": 0.03679508715867996, "grad_norm_var": 2.807737454571524e-06, "learning_rate": 0.002981581005649795, "loss": 2.7809, "step": 5117 }, { "crossentropy": 2.7551021575927734, "epoch": 0.43535216059884313, "grad_norm": 0.03357479348778725, "grad_norm_var": 2.8535202787028162e-06, "learning_rate": 0.002979748107403546, "loss": 2.7551, "step": 5118 }, { "crossentropy": 2.706402063369751, "epoch": 0.43543722354542364, "grad_norm": 0.03289922699332237, "grad_norm_var": 2.6631475812053293e-06, "learning_rate": 0.0029779155335515383, "loss": 2.7064, "step": 5119 }, { "crossentropy": 2.6895949840545654, "epoch": 0.4355222864920041, "grad_norm": 0.033307574689388275, "grad_norm_var": 2.8144350443870883e-06, "learning_rate": 0.002976083284388031, "loss": 2.6896, "step": 5120 }, { "crossentropy": 2.758417844772339, "epoch": 0.43560734943858453, "grad_norm": 0.03495121747255325, "grad_norm_var": 2.8135058108049358e-06, "learning_rate": 0.0029742513602072297, "loss": 2.7584, "step": 5121 }, { "crossentropy": 2.6576309204101562, "epoch": 0.43569241238516504, "grad_norm": 0.033713746815919876, "grad_norm_var": 2.67177582327077e-06, "learning_rate": 0.0029724197613032917, "loss": 2.6576, "step": 5122 }, { "crossentropy": 2.694754123687744, "epoch": 0.4357774753317455, "grad_norm": 0.033802345395088196, "grad_norm_var": 1.3391673597026288e-06, "learning_rate": 0.0029705884879703153, "loss": 2.6948, "step": 5123 }, { "crossentropy": 2.7082555294036865, "epoch": 0.43586253827832594, "grad_norm": 0.03613756597042084, "grad_norm_var": 1.5264498045210184e-06, "learning_rate": 0.002968757540502351, "loss": 2.7083, "step": 5124 }, { "crossentropy": 2.7634079456329346, "epoch": 0.43594760122490644, "grad_norm": 0.036162909120321274, "grad_norm_var": 1.6155727360532094e-06, "learning_rate": 0.002966926919193397, "loss": 2.7634, "step": 5125 }, { "crossentropy": 2.6794207096099854, "epoch": 0.4360326641714869, "grad_norm": 0.034582626074552536, "grad_norm_var": 1.5248750057872408e-06, "learning_rate": 0.0029650966243373977, "loss": 2.6794, "step": 5126 }, { "crossentropy": 2.589773178100586, "epoch": 0.43611772711806734, "grad_norm": 0.03533749282360077, "grad_norm_var": 1.3896210137936496e-06, "learning_rate": 0.0029632666562282475, "loss": 2.5898, "step": 5127 }, { "crossentropy": 2.743074655532837, "epoch": 0.43620279006464785, "grad_norm": 0.0343831405043602, "grad_norm_var": 1.3925736445281773e-06, "learning_rate": 0.0029614370151597835, "loss": 2.7431, "step": 5128 }, { "crossentropy": 2.766416072845459, "epoch": 0.4362878530112283, "grad_norm": 0.035870689898729324, "grad_norm_var": 1.4896916588476768e-06, "learning_rate": 0.002959607701425797, "loss": 2.7664, "step": 5129 }, { "crossentropy": 2.7158751487731934, "epoch": 0.4363729159578088, "grad_norm": 0.03395025059580803, "grad_norm_var": 1.4398553073576478e-06, "learning_rate": 0.0029577787153200197, "loss": 2.7159, "step": 5130 }, { "crossentropy": 2.6465203762054443, "epoch": 0.43645797890438925, "grad_norm": 0.03809572756290436, "grad_norm_var": 2.082475521588319e-06, "learning_rate": 0.0029559500571361363, "loss": 2.6465, "step": 5131 }, { "crossentropy": 2.6084816455841064, "epoch": 0.4365430418509697, "grad_norm": 0.033209700137376785, "grad_norm_var": 2.1957140730268405e-06, "learning_rate": 0.0029541217271677744, "loss": 2.6085, "step": 5132 }, { "crossentropy": 2.7535181045532227, "epoch": 0.4366281047975502, "grad_norm": 0.03534061089158058, "grad_norm_var": 1.9407113203125915e-06, "learning_rate": 0.0029522937257085136, "loss": 2.7535, "step": 5133 }, { "crossentropy": 2.600318670272827, "epoch": 0.43671316774413066, "grad_norm": 0.033823590725660324, "grad_norm_var": 1.9070056319749871e-06, "learning_rate": 0.0029504660530518764, "loss": 2.6003, "step": 5134 }, { "crossentropy": 2.673912525177002, "epoch": 0.4367982306907111, "grad_norm": 0.035083234310150146, "grad_norm_var": 1.674031585448497e-06, "learning_rate": 0.002948638709491336, "loss": 2.6739, "step": 5135 }, { "crossentropy": 2.619551181793213, "epoch": 0.4368832936372916, "grad_norm": 0.03432353958487511, "grad_norm_var": 1.5283126612743054e-06, "learning_rate": 0.0029468116953203106, "loss": 2.6196, "step": 5136 }, { "crossentropy": 2.7880947589874268, "epoch": 0.43696835658387206, "grad_norm": 0.03301655128598213, "grad_norm_var": 1.7549733911619939e-06, "learning_rate": 0.002944985010832164, "loss": 2.7881, "step": 5137 }, { "crossentropy": 2.5211222171783447, "epoch": 0.4370534195304525, "grad_norm": 0.03879179432988167, "grad_norm_var": 2.629734093486056e-06, "learning_rate": 0.0029431586563202113, "loss": 2.5211, "step": 5138 }, { "crossentropy": 2.705728769302368, "epoch": 0.437138482477033, "grad_norm": 0.0342603474855423, "grad_norm_var": 2.562410735678968e-06, "learning_rate": 0.0029413326320777123, "loss": 2.7057, "step": 5139 }, { "crossentropy": 2.674307107925415, "epoch": 0.43722354542361347, "grad_norm": 0.03499423339962959, "grad_norm_var": 2.493274463711131e-06, "learning_rate": 0.002939506938397872, "loss": 2.6743, "step": 5140 }, { "crossentropy": 2.724306106567383, "epoch": 0.4373086083701939, "grad_norm": 0.0330771766602993, "grad_norm_var": 2.641463840240219e-06, "learning_rate": 0.002937681575573848, "loss": 2.7243, "step": 5141 }, { "crossentropy": 2.8109371662139893, "epoch": 0.4373936713167744, "grad_norm": 0.03389591723680496, "grad_norm_var": 2.698512207615957e-06, "learning_rate": 0.0029358565438987365, "loss": 2.8109, "step": 5142 }, { "crossentropy": 2.611231565475464, "epoch": 0.4374787342633549, "grad_norm": 0.035726819187402725, "grad_norm_var": 2.7337651760688785e-06, "learning_rate": 0.00293403184366559, "loss": 2.6112, "step": 5143 }, { "crossentropy": 2.6177074909210205, "epoch": 0.4375637972099354, "grad_norm": 0.033106252551078796, "grad_norm_var": 2.9177406328864094e-06, "learning_rate": 0.0029322074751673977, "loss": 2.6177, "step": 5144 }, { "crossentropy": 2.6908023357391357, "epoch": 0.43764886015651583, "grad_norm": 0.034415438771247864, "grad_norm_var": 2.8395182279825995e-06, "learning_rate": 0.002930383438697103, "loss": 2.6908, "step": 5145 }, { "crossentropy": 2.679630994796753, "epoch": 0.4377339231030963, "grad_norm": 0.035588689148426056, "grad_norm_var": 2.8447218263759228e-06, "learning_rate": 0.0029285597345475927, "loss": 2.6796, "step": 5146 }, { "crossentropy": 2.7349534034729004, "epoch": 0.4378189860496768, "grad_norm": 0.03200764209032059, "grad_norm_var": 2.4834255764512198e-06, "learning_rate": 0.002926736363011703, "loss": 2.735, "step": 5147 }, { "crossentropy": 2.6264150142669678, "epoch": 0.43790404899625723, "grad_norm": 0.03310989961028099, "grad_norm_var": 2.500104606101688e-06, "learning_rate": 0.0029249133243822124, "loss": 2.6264, "step": 5148 }, { "crossentropy": 2.63303804397583, "epoch": 0.4379891119428377, "grad_norm": 0.03465702384710312, "grad_norm_var": 2.444499732327336e-06, "learning_rate": 0.0029230906189518513, "loss": 2.633, "step": 5149 }, { "crossentropy": 2.6915335655212402, "epoch": 0.4380741748894182, "grad_norm": 0.034078579396009445, "grad_norm_var": 2.430075270613378e-06, "learning_rate": 0.00292126824701329, "loss": 2.6915, "step": 5150 }, { "crossentropy": 2.613745927810669, "epoch": 0.43815923783599864, "grad_norm": 0.0352058932185173, "grad_norm_var": 2.442462340151765e-06, "learning_rate": 0.002919446208859154, "loss": 2.6137, "step": 5151 }, { "crossentropy": 2.5716304779052734, "epoch": 0.4382443007825791, "grad_norm": 0.034760482609272, "grad_norm_var": 2.4504653457573775e-06, "learning_rate": 0.002917624504782006, "loss": 2.5716, "step": 5152 }, { "crossentropy": 2.691408634185791, "epoch": 0.4383293637291596, "grad_norm": 0.035098135471343994, "grad_norm_var": 2.3322306899100273e-06, "learning_rate": 0.0029158031350743573, "loss": 2.6914, "step": 5153 }, { "crossentropy": 2.687028646469116, "epoch": 0.43841442667574004, "grad_norm": 0.03626402094960213, "grad_norm_var": 1.3014029378264718e-06, "learning_rate": 0.002913982100028673, "loss": 2.687, "step": 5154 }, { "crossentropy": 2.7835609912872314, "epoch": 0.4384994896223205, "grad_norm": 0.034787245094776154, "grad_norm_var": 1.309617009005957e-06, "learning_rate": 0.0029121613999373566, "loss": 2.7836, "step": 5155 }, { "crossentropy": 2.6909871101379395, "epoch": 0.438584552568901, "grad_norm": 0.033941518515348434, "grad_norm_var": 1.2987483987573582e-06, "learning_rate": 0.0029103410350927584, "loss": 2.691, "step": 5156 }, { "crossentropy": 2.6462318897247314, "epoch": 0.43866961551548145, "grad_norm": 0.033629752695560455, "grad_norm_var": 1.2234986517038633e-06, "learning_rate": 0.002908521005787177, "loss": 2.6462, "step": 5157 }, { "crossentropy": 2.713135242462158, "epoch": 0.43875467846206195, "grad_norm": 0.03420109301805496, "grad_norm_var": 1.2091304259835452e-06, "learning_rate": 0.002906701312312861, "loss": 2.7131, "step": 5158 }, { "crossentropy": 2.7388532161712646, "epoch": 0.4388397414086424, "grad_norm": 0.03761351853609085, "grad_norm_var": 1.76257584965737e-06, "learning_rate": 0.0029048819549619955, "loss": 2.7389, "step": 5159 }, { "crossentropy": 2.624127149581909, "epoch": 0.43892480435522285, "grad_norm": 0.038762301206588745, "grad_norm_var": 2.6889998650377517e-06, "learning_rate": 0.0029030629340267163, "loss": 2.6241, "step": 5160 }, { "crossentropy": 2.736078977584839, "epoch": 0.43900986730180336, "grad_norm": 0.03627309575676918, "grad_norm_var": 2.788976077707818e-06, "learning_rate": 0.0029012442497991097, "loss": 2.7361, "step": 5161 }, { "crossentropy": 2.752577066421509, "epoch": 0.4390949302483838, "grad_norm": 0.03580394759774208, "grad_norm_var": 2.808805997431274e-06, "learning_rate": 0.002899425902571203, "loss": 2.7526, "step": 5162 }, { "crossentropy": 2.6908650398254395, "epoch": 0.43917999319496426, "grad_norm": 0.03561099246144295, "grad_norm_var": 2.176815925736206e-06, "learning_rate": 0.0028976078926349668, "loss": 2.6909, "step": 5163 }, { "crossentropy": 2.7437922954559326, "epoch": 0.43926505614154476, "grad_norm": 0.034556351602077484, "grad_norm_var": 1.8972804444311135e-06, "learning_rate": 0.002895790220282327, "loss": 2.7438, "step": 5164 }, { "crossentropy": 2.6374611854553223, "epoch": 0.4393501190881252, "grad_norm": 0.03511384502053261, "grad_norm_var": 1.8694698885184912e-06, "learning_rate": 0.002893972885805148, "loss": 2.6375, "step": 5165 }, { "crossentropy": 2.733708620071411, "epoch": 0.43943518203470566, "grad_norm": 0.0369647778570652, "grad_norm_var": 1.8984036828132225e-06, "learning_rate": 0.002892155889495236, "loss": 2.7337, "step": 5166 }, { "crossentropy": 2.529493808746338, "epoch": 0.43952024498128617, "grad_norm": 0.03665236383676529, "grad_norm_var": 1.9653734585923025e-06, "learning_rate": 0.0028903392316443535, "loss": 2.5295, "step": 5167 }, { "crossentropy": 2.7525999546051025, "epoch": 0.4396053079278666, "grad_norm": 0.039746493101119995, "grad_norm_var": 2.9430203493625733e-06, "learning_rate": 0.002888522912544202, "loss": 2.7526, "step": 5168 }, { "crossentropy": 2.6898019313812256, "epoch": 0.43969037087444707, "grad_norm": 0.03868742659687996, "grad_norm_var": 3.345930190527119e-06, "learning_rate": 0.002886706932486428, "loss": 2.6898, "step": 5169 }, { "crossentropy": 2.6934990882873535, "epoch": 0.4397754338210276, "grad_norm": 0.03688132390379906, "grad_norm_var": 3.3780575350343215e-06, "learning_rate": 0.002884891291762629, "loss": 2.6935, "step": 5170 }, { "crossentropy": 2.6527976989746094, "epoch": 0.439860496767608, "grad_norm": 0.036936625838279724, "grad_norm_var": 3.2614577545773032e-06, "learning_rate": 0.0028830759906643445, "loss": 2.6528, "step": 5171 }, { "crossentropy": 2.6688737869262695, "epoch": 0.43994555971418847, "grad_norm": 0.033196546137332916, "grad_norm_var": 3.533983699727181e-06, "learning_rate": 0.0028812610294830566, "loss": 2.6689, "step": 5172 }, { "crossentropy": 2.762042999267578, "epoch": 0.440030622660769, "grad_norm": 0.03500409051775932, "grad_norm_var": 3.164666161455341e-06, "learning_rate": 0.002879446408510198, "loss": 2.762, "step": 5173 }, { "crossentropy": 2.6557679176330566, "epoch": 0.4401156856073494, "grad_norm": 0.03478095680475235, "grad_norm_var": 3.0175821352171803e-06, "learning_rate": 0.0028776321280371436, "loss": 2.6558, "step": 5174 }, { "crossentropy": 2.687476873397827, "epoch": 0.44020074855392993, "grad_norm": 0.03315252065658569, "grad_norm_var": 3.5464276454585017e-06, "learning_rate": 0.002875818188355213, "loss": 2.6875, "step": 5175 }, { "crossentropy": 2.7320027351379395, "epoch": 0.4402858115005104, "grad_norm": 0.03618040308356285, "grad_norm_var": 3.057826566212468e-06, "learning_rate": 0.0028740045897556765, "loss": 2.732, "step": 5176 }, { "crossentropy": 2.6482152938842773, "epoch": 0.44037087444709083, "grad_norm": 0.03216075897216797, "grad_norm_var": 3.94933850015299e-06, "learning_rate": 0.0028721913325297434, "loss": 2.6482, "step": 5177 }, { "crossentropy": 2.8200836181640625, "epoch": 0.44045593739367134, "grad_norm": 0.03604843094944954, "grad_norm_var": 3.9559952983822994e-06, "learning_rate": 0.002870378416968571, "loss": 2.8201, "step": 5178 }, { "crossentropy": 2.6532461643218994, "epoch": 0.4405410003402518, "grad_norm": 0.03665255382657051, "grad_norm_var": 4.0073241566368024e-06, "learning_rate": 0.0028685658433632616, "loss": 2.6532, "step": 5179 }, { "crossentropy": 2.545266628265381, "epoch": 0.44062606328683224, "grad_norm": 0.03922353312373161, "grad_norm_var": 4.598112264257194e-06, "learning_rate": 0.0028667536120048614, "loss": 2.5453, "step": 5180 }, { "crossentropy": 2.6214704513549805, "epoch": 0.44071112623341274, "grad_norm": 0.0337701253592968, "grad_norm_var": 4.885209473879694e-06, "learning_rate": 0.0028649417231843633, "loss": 2.6215, "step": 5181 }, { "crossentropy": 2.758688449859619, "epoch": 0.4407961891799932, "grad_norm": 0.03557766228914261, "grad_norm_var": 4.827480632394604e-06, "learning_rate": 0.002863130177192702, "loss": 2.7587, "step": 5182 }, { "crossentropy": 2.6638917922973633, "epoch": 0.44088125212657364, "grad_norm": 0.032641809433698654, "grad_norm_var": 5.438861297590337e-06, "learning_rate": 0.0028613189743207637, "loss": 2.6639, "step": 5183 }, { "crossentropy": 2.6904211044311523, "epoch": 0.44096631507315415, "grad_norm": 0.03315616399049759, "grad_norm_var": 4.56700685043798e-06, "learning_rate": 0.002859508114859374, "loss": 2.6904, "step": 5184 }, { "crossentropy": 2.67948055267334, "epoch": 0.4410513780197346, "grad_norm": 0.03839525952935219, "grad_norm_var": 4.438558907845074e-06, "learning_rate": 0.0028576975990993016, "loss": 2.6795, "step": 5185 }, { "crossentropy": 2.7021737098693848, "epoch": 0.44113644096631505, "grad_norm": 0.03529901057481766, "grad_norm_var": 4.247691464871287e-06, "learning_rate": 0.0028558874273312673, "loss": 2.7022, "step": 5186 }, { "crossentropy": 2.582329750061035, "epoch": 0.44122150391289555, "grad_norm": 0.03323560580611229, "grad_norm_var": 4.21524856740642e-06, "learning_rate": 0.0028540775998459333, "loss": 2.5823, "step": 5187 }, { "crossentropy": 2.5630035400390625, "epoch": 0.441306566859476, "grad_norm": 0.0347784124314785, "grad_norm_var": 4.011363196299948e-06, "learning_rate": 0.0028522681169339, "loss": 2.563, "step": 5188 }, { "crossentropy": 2.632052421569824, "epoch": 0.4413916298060565, "grad_norm": 0.03223070502281189, "grad_norm_var": 4.491904009562506e-06, "learning_rate": 0.0028504589788857226, "loss": 2.6321, "step": 5189 }, { "crossentropy": 2.6404788494110107, "epoch": 0.44147669275263696, "grad_norm": 0.03648567199707031, "grad_norm_var": 4.662329515184247e-06, "learning_rate": 0.0028486501859918967, "loss": 2.6405, "step": 5190 }, { "crossentropy": 2.719149589538574, "epoch": 0.4415617556992174, "grad_norm": 0.03708827123045921, "grad_norm_var": 4.6941380139662226e-06, "learning_rate": 0.0028468417385428585, "loss": 2.7191, "step": 5191 }, { "crossentropy": 2.7025182247161865, "epoch": 0.4416468186457979, "grad_norm": 0.03654279187321663, "grad_norm_var": 4.750549831560498e-06, "learning_rate": 0.0028450336368289974, "loss": 2.7025, "step": 5192 }, { "crossentropy": 2.5834028720855713, "epoch": 0.44173188159237836, "grad_norm": 0.03342994302511215, "grad_norm_var": 4.33599473763438e-06, "learning_rate": 0.002843225881140641, "loss": 2.5834, "step": 5193 }, { "crossentropy": 2.6633493900299072, "epoch": 0.4418169445389588, "grad_norm": 0.03315875306725502, "grad_norm_var": 4.5636445096843e-06, "learning_rate": 0.002841418471768061, "loss": 2.6633, "step": 5194 }, { "crossentropy": 2.6865622997283936, "epoch": 0.4419020074855393, "grad_norm": 0.033859528601169586, "grad_norm_var": 4.474572620433564e-06, "learning_rate": 0.002839611409001477, "loss": 2.6866, "step": 5195 }, { "crossentropy": 2.655966281890869, "epoch": 0.44198707043211977, "grad_norm": 0.03362758830189705, "grad_norm_var": 3.227903534784524e-06, "learning_rate": 0.0028378046931310497, "loss": 2.656, "step": 5196 }, { "crossentropy": 2.641085147857666, "epoch": 0.4420721333787002, "grad_norm": 0.03665091097354889, "grad_norm_var": 3.435574456110008e-06, "learning_rate": 0.0028359983244468866, "loss": 2.6411, "step": 5197 }, { "crossentropy": 2.62202787399292, "epoch": 0.4421571963252807, "grad_norm": 0.037645917385816574, "grad_norm_var": 3.928446913904239e-06, "learning_rate": 0.002834192303239037, "loss": 2.622, "step": 5198 }, { "crossentropy": 2.6830880641937256, "epoch": 0.44224225927186117, "grad_norm": 0.03712054714560509, "grad_norm_var": 3.840109057667331e-06, "learning_rate": 0.0028323866297974985, "loss": 2.6831, "step": 5199 }, { "crossentropy": 2.634650230407715, "epoch": 0.4423273222184416, "grad_norm": 0.035486508160829544, "grad_norm_var": 3.554081141602522e-06, "learning_rate": 0.00283058130441221, "loss": 2.6347, "step": 5200 }, { "crossentropy": 2.6630656719207764, "epoch": 0.4424123851650221, "grad_norm": 0.03485377877950668, "grad_norm_var": 2.883335851531996e-06, "learning_rate": 0.0028287763273730526, "loss": 2.6631, "step": 5201 }, { "crossentropy": 2.7093112468719482, "epoch": 0.4424974481116026, "grad_norm": 0.03430822119116783, "grad_norm_var": 2.917523828188662e-06, "learning_rate": 0.002826971698969855, "loss": 2.7093, "step": 5202 }, { "crossentropy": 2.581841468811035, "epoch": 0.4425825110581831, "grad_norm": 0.036057036370038986, "grad_norm_var": 2.739474328521581e-06, "learning_rate": 0.0028251674194923883, "loss": 2.5818, "step": 5203 }, { "crossentropy": 2.6545279026031494, "epoch": 0.44266757400476353, "grad_norm": 0.033449821174144745, "grad_norm_var": 2.9258582008014878e-06, "learning_rate": 0.002823363489230365, "loss": 2.6545, "step": 5204 }, { "crossentropy": 2.5824689865112305, "epoch": 0.442752636951344, "grad_norm": 0.03549481928348541, "grad_norm_var": 2.3322285108844906e-06, "learning_rate": 0.00282155990847345, "loss": 2.5825, "step": 5205 }, { "crossentropy": 2.7215914726257324, "epoch": 0.4428376998979245, "grad_norm": 0.03412528708577156, "grad_norm_var": 2.316340040954083e-06, "learning_rate": 0.0028197566775112425, "loss": 2.7216, "step": 5206 }, { "crossentropy": 2.743865966796875, "epoch": 0.44292276284450494, "grad_norm": 0.03453492000699043, "grad_norm_var": 2.0745699829143964e-06, "learning_rate": 0.0028179537966332886, "loss": 2.7439, "step": 5207 }, { "crossentropy": 2.6291568279266357, "epoch": 0.4430078257910854, "grad_norm": 0.03311585262417793, "grad_norm_var": 2.1135156288281756e-06, "learning_rate": 0.0028161512661290843, "loss": 2.6292, "step": 5208 }, { "crossentropy": 2.7657430171966553, "epoch": 0.4430928887376659, "grad_norm": 0.03445659577846527, "grad_norm_var": 1.9908267823443015e-06, "learning_rate": 0.0028143490862880594, "loss": 2.7657, "step": 5209 }, { "crossentropy": 2.666067600250244, "epoch": 0.44317795168424634, "grad_norm": 0.03385358303785324, "grad_norm_var": 1.862313277169123e-06, "learning_rate": 0.0028125472573995902, "loss": 2.6661, "step": 5210 }, { "crossentropy": 2.741497039794922, "epoch": 0.4432630146308268, "grad_norm": 0.03650888428092003, "grad_norm_var": 1.928143348108267e-06, "learning_rate": 0.0028107457797530047, "loss": 2.7415, "step": 5211 }, { "crossentropy": 2.7198967933654785, "epoch": 0.4433480775774073, "grad_norm": 0.037153348326683044, "grad_norm_var": 2.0219973592804896e-06, "learning_rate": 0.002808944653637564, "loss": 2.7199, "step": 5212 }, { "crossentropy": 2.641707420349121, "epoch": 0.44343314052398775, "grad_norm": 0.040471889078617096, "grad_norm_var": 3.6222191610562593e-06, "learning_rate": 0.0028071438793424762, "loss": 2.6417, "step": 5213 }, { "crossentropy": 2.7427473068237305, "epoch": 0.4435182034705682, "grad_norm": 0.03960391879081726, "grad_norm_var": 4.41166377322851e-06, "learning_rate": 0.002805343457156898, "loss": 2.7427, "step": 5214 }, { "crossentropy": 2.805898427963257, "epoch": 0.4436032664171487, "grad_norm": 0.035049423575401306, "grad_norm_var": 4.277035279645567e-06, "learning_rate": 0.0028035433873699247, "loss": 2.8059, "step": 5215 }, { "crossentropy": 2.588853359222412, "epoch": 0.44368832936372915, "grad_norm": 0.034884050488471985, "grad_norm_var": 4.303433918108798e-06, "learning_rate": 0.00280174367027059, "loss": 2.5889, "step": 5216 }, { "crossentropy": 2.7308876514434814, "epoch": 0.44377339231030966, "grad_norm": 0.034242983907461166, "grad_norm_var": 4.3789787123576484e-06, "learning_rate": 0.0027999443061478826, "loss": 2.7309, "step": 5217 }, { "crossentropy": 2.6667683124542236, "epoch": 0.4438584552568901, "grad_norm": 0.034680016338825226, "grad_norm_var": 4.3306743669341285e-06, "learning_rate": 0.002798145295290726, "loss": 2.6668, "step": 5218 }, { "crossentropy": 2.71345853805542, "epoch": 0.44394351820347056, "grad_norm": 0.035021163523197174, "grad_norm_var": 4.3180617049993e-06, "learning_rate": 0.002796346637987991, "loss": 2.7135, "step": 5219 }, { "crossentropy": 2.6061275005340576, "epoch": 0.44402858115005106, "grad_norm": 0.033518269658088684, "grad_norm_var": 4.300415654531706e-06, "learning_rate": 0.002794548334528486, "loss": 2.6061, "step": 5220 }, { "crossentropy": 2.642895460128784, "epoch": 0.4441136440966315, "grad_norm": 0.03410052880644798, "grad_norm_var": 4.4079511886165506e-06, "learning_rate": 0.0027927503852009728, "loss": 2.6429, "step": 5221 }, { "crossentropy": 2.7400636672973633, "epoch": 0.44419870704321196, "grad_norm": 0.04043227434158325, "grad_norm_var": 5.878860579918282e-06, "learning_rate": 0.0027909527902941467, "loss": 2.7401, "step": 5222 }, { "crossentropy": 2.7685630321502686, "epoch": 0.44428376998979247, "grad_norm": 0.038460131734609604, "grad_norm_var": 6.2180677444585605e-06, "learning_rate": 0.0027891555500966503, "loss": 2.7686, "step": 5223 }, { "crossentropy": 2.7167296409606934, "epoch": 0.4443688329363729, "grad_norm": 0.03425775468349457, "grad_norm_var": 5.8646965710747645e-06, "learning_rate": 0.002787358664897068, "loss": 2.7167, "step": 5224 }, { "crossentropy": 2.6852922439575195, "epoch": 0.44445389588295336, "grad_norm": 0.03370843455195427, "grad_norm_var": 6.0579746288738984e-06, "learning_rate": 0.00278556213498393, "loss": 2.6853, "step": 5225 }, { "crossentropy": 2.6571130752563477, "epoch": 0.44453895882953387, "grad_norm": 0.03322694078087807, "grad_norm_var": 6.261576669262109e-06, "learning_rate": 0.002783765960645702, "loss": 2.6571, "step": 5226 }, { "crossentropy": 2.7380428314208984, "epoch": 0.4446240217761143, "grad_norm": 0.03586944565176964, "grad_norm_var": 6.240121660026216e-06, "learning_rate": 0.002781970142170804, "loss": 2.738, "step": 5227 }, { "crossentropy": 2.7112338542938232, "epoch": 0.44470908472269477, "grad_norm": 0.033102452754974365, "grad_norm_var": 6.59824505964482e-06, "learning_rate": 0.0027801746798475904, "loss": 2.7112, "step": 5228 }, { "crossentropy": 2.8134148120880127, "epoch": 0.4447941476692753, "grad_norm": 0.03790632262825966, "grad_norm_var": 5.36508847969689e-06, "learning_rate": 0.0027783795739643575, "loss": 2.8134, "step": 5229 }, { "crossentropy": 2.662668228149414, "epoch": 0.4448792106158557, "grad_norm": 0.033269502222537994, "grad_norm_var": 4.410150679727033e-06, "learning_rate": 0.002776584824809355, "loss": 2.6627, "step": 5230 }, { "crossentropy": 2.6522936820983887, "epoch": 0.44496427356243623, "grad_norm": 0.03619540482759476, "grad_norm_var": 4.483263719697907e-06, "learning_rate": 0.0027747904326707613, "loss": 2.6523, "step": 5231 }, { "crossentropy": 2.7077832221984863, "epoch": 0.4450493365090167, "grad_norm": 0.04991805925965309, "grad_norm_var": 1.8016903058668682e-05, "learning_rate": 0.002772996397836704, "loss": 2.7078, "step": 5232 }, { "crossentropy": 2.716482162475586, "epoch": 0.44513439945559713, "grad_norm": 0.03524819016456604, "grad_norm_var": 1.7828570158986457e-05, "learning_rate": 0.002771202720595258, "loss": 2.7165, "step": 5233 }, { "crossentropy": 2.749486207962036, "epoch": 0.44521946240217763, "grad_norm": 0.036898113787174225, "grad_norm_var": 1.76918078221339e-05, "learning_rate": 0.0027694094012344333, "loss": 2.7495, "step": 5234 }, { "crossentropy": 2.690122604370117, "epoch": 0.4453045253487581, "grad_norm": 0.03314359858632088, "grad_norm_var": 1.8237492499893066e-05, "learning_rate": 0.002767616440042186, "loss": 2.6901, "step": 5235 }, { "crossentropy": 2.618269920349121, "epoch": 0.44538958829533853, "grad_norm": 0.034598834812641144, "grad_norm_var": 1.7923598486549454e-05, "learning_rate": 0.0027658238373064114, "loss": 2.6183, "step": 5236 }, { "crossentropy": 2.750129222869873, "epoch": 0.44547465124191904, "grad_norm": 0.035015206784009933, "grad_norm_var": 1.771118400997196e-05, "learning_rate": 0.0027640315933149568, "loss": 2.7501, "step": 5237 }, { "crossentropy": 2.58543062210083, "epoch": 0.4455597141884995, "grad_norm": 0.036220982670784, "grad_norm_var": 1.651514087545325e-05, "learning_rate": 0.002762239708355596, "loss": 2.5854, "step": 5238 }, { "crossentropy": 2.7163186073303223, "epoch": 0.44564477713507994, "grad_norm": 0.03541925549507141, "grad_norm_var": 1.612195152176036e-05, "learning_rate": 0.0027604481827160606, "loss": 2.7163, "step": 5239 }, { "crossentropy": 2.665463924407959, "epoch": 0.44572984008166044, "grad_norm": 0.036305516958236694, "grad_norm_var": 1.594249574400902e-05, "learning_rate": 0.002758657016684015, "loss": 2.6655, "step": 5240 }, { "crossentropy": 2.7008168697357178, "epoch": 0.4458149030282409, "grad_norm": 0.04127132520079613, "grad_norm_var": 1.72036311265874e-05, "learning_rate": 0.002756866210547071, "loss": 2.7008, "step": 5241 }, { "crossentropy": 2.6062347888946533, "epoch": 0.44589996597482134, "grad_norm": 0.03643998131155968, "grad_norm_var": 1.645712678148414e-05, "learning_rate": 0.0027550757645927767, "loss": 2.6062, "step": 5242 }, { "crossentropy": 2.6750237941741943, "epoch": 0.44598502892140185, "grad_norm": 0.035285886377096176, "grad_norm_var": 1.654119704048573e-05, "learning_rate": 0.0027532856791086303, "loss": 2.675, "step": 5243 }, { "crossentropy": 2.72623348236084, "epoch": 0.4460700918679823, "grad_norm": 0.03718830645084381, "grad_norm_var": 1.5657444351641012e-05, "learning_rate": 0.0027514959543820682, "loss": 2.7262, "step": 5244 }, { "crossentropy": 2.821716070175171, "epoch": 0.44615515481456275, "grad_norm": 0.03699768707156181, "grad_norm_var": 1.5586556332126447e-05, "learning_rate": 0.002749706590700463, "loss": 2.8217, "step": 5245 }, { "crossentropy": 2.6913652420043945, "epoch": 0.44624021776114325, "grad_norm": 0.03669445961713791, "grad_norm_var": 1.468988433507793e-05, "learning_rate": 0.0027479175883511403, "loss": 2.6914, "step": 5246 }, { "crossentropy": 2.6521801948547363, "epoch": 0.4463252807077237, "grad_norm": 0.03615326061844826, "grad_norm_var": 1.4694811840823552e-05, "learning_rate": 0.002746128947621361, "loss": 2.6522, "step": 5247 }, { "crossentropy": 2.848719358444214, "epoch": 0.4464103436543042, "grad_norm": 0.03493814170360565, "grad_norm_var": 3.0178535919494463e-06, "learning_rate": 0.0027443406687983262, "loss": 2.8487, "step": 5248 }, { "crossentropy": 2.6902987957000732, "epoch": 0.44649540660088466, "grad_norm": 0.035544537007808685, "grad_norm_var": 2.9891447355346117e-06, "learning_rate": 0.0027425527521691873, "loss": 2.6903, "step": 5249 }, { "crossentropy": 2.6910922527313232, "epoch": 0.4465804695474651, "grad_norm": 0.03672386705875397, "grad_norm_var": 2.973247806200135e-06, "learning_rate": 0.0027407651980210297, "loss": 2.6911, "step": 5250 }, { "crossentropy": 2.704535484313965, "epoch": 0.4466655324940456, "grad_norm": 0.03781400993466377, "grad_norm_var": 2.4822636039987037e-06, "learning_rate": 0.0027389780066408827, "loss": 2.7045, "step": 5251 }, { "crossentropy": 2.6441009044647217, "epoch": 0.44675059544062606, "grad_norm": 0.0342315174639225, "grad_norm_var": 2.5795561200171735e-06, "learning_rate": 0.0027371911783157176, "loss": 2.6441, "step": 5252 }, { "crossentropy": 2.7445740699768066, "epoch": 0.4468356583872065, "grad_norm": 0.0341838076710701, "grad_norm_var": 2.775185207913531e-06, "learning_rate": 0.0027354047133324485, "loss": 2.7446, "step": 5253 }, { "crossentropy": 2.6755800247192383, "epoch": 0.446920721333787, "grad_norm": 0.03294523432850838, "grad_norm_var": 3.497076404756818e-06, "learning_rate": 0.0027336186119779273, "loss": 2.6756, "step": 5254 }, { "crossentropy": 2.663978099822998, "epoch": 0.44700578428036747, "grad_norm": 0.033089764416217804, "grad_norm_var": 4.058093359996888e-06, "learning_rate": 0.0027318328745389543, "loss": 2.664, "step": 5255 }, { "crossentropy": 2.711671829223633, "epoch": 0.4470908472269479, "grad_norm": 0.036809056997299194, "grad_norm_var": 4.095260986484532e-06, "learning_rate": 0.0027300475013022664, "loss": 2.7117, "step": 5256 }, { "crossentropy": 2.5823814868927, "epoch": 0.4471759101735284, "grad_norm": 0.037530265748500824, "grad_norm_var": 2.3502932464530254e-06, "learning_rate": 0.002728262492554543, "loss": 2.5824, "step": 5257 }, { "crossentropy": 2.690551519393921, "epoch": 0.4472609731201089, "grad_norm": 0.03264191001653671, "grad_norm_var": 2.9204982440094823e-06, "learning_rate": 0.0027264778485824015, "loss": 2.6906, "step": 5258 }, { "crossentropy": 2.7275898456573486, "epoch": 0.4473460360666893, "grad_norm": 0.033470168709754944, "grad_norm_var": 3.1900629110981694e-06, "learning_rate": 0.0027246935696724124, "loss": 2.7276, "step": 5259 }, { "crossentropy": 2.674661159515381, "epoch": 0.44743109901326983, "grad_norm": 0.03939468786120415, "grad_norm_var": 4.010189020125833e-06, "learning_rate": 0.00272290965611107, "loss": 2.6747, "step": 5260 }, { "crossentropy": 2.788796901702881, "epoch": 0.4475161619598503, "grad_norm": 0.0354757234454155, "grad_norm_var": 3.865781436700131e-06, "learning_rate": 0.0027211261081848265, "loss": 2.7888, "step": 5261 }, { "crossentropy": 2.7321507930755615, "epoch": 0.4476012249064308, "grad_norm": 0.03635594621300697, "grad_norm_var": 3.818016939392359e-06, "learning_rate": 0.002719342926180066, "loss": 2.7322, "step": 5262 }, { "crossentropy": 2.642920970916748, "epoch": 0.44768628785301123, "grad_norm": 0.03540275990962982, "grad_norm_var": 3.78348442931168e-06, "learning_rate": 0.002717560110383115, "loss": 2.6429, "step": 5263 }, { "crossentropy": 2.677238941192627, "epoch": 0.4477713507995917, "grad_norm": 0.03503280133008957, "grad_norm_var": 3.7780957849059897e-06, "learning_rate": 0.0027157776610802413, "loss": 2.6772, "step": 5264 }, { "crossentropy": 2.594348430633545, "epoch": 0.4478564137461722, "grad_norm": 0.03594190999865532, "grad_norm_var": 3.7948080724490203e-06, "learning_rate": 0.00271399557855766, "loss": 2.5943, "step": 5265 }, { "crossentropy": 2.731670618057251, "epoch": 0.44794147669275264, "grad_norm": 0.03392606973648071, "grad_norm_var": 3.8051841253584905e-06, "learning_rate": 0.0027122138631015197, "loss": 2.7317, "step": 5266 }, { "crossentropy": 2.665346622467041, "epoch": 0.4480265396393331, "grad_norm": 0.036224476993083954, "grad_norm_var": 3.4229408188599446e-06, "learning_rate": 0.0027104325149979085, "loss": 2.6653, "step": 5267 }, { "crossentropy": 2.6044633388519287, "epoch": 0.4481116025859136, "grad_norm": 0.03301314264535904, "grad_norm_var": 3.667525832980652e-06, "learning_rate": 0.0027086515345328653, "loss": 2.6045, "step": 5268 }, { "crossentropy": 2.6992499828338623, "epoch": 0.44819666553249404, "grad_norm": 0.03242304176092148, "grad_norm_var": 4.0740066784441165e-06, "learning_rate": 0.0027068709219923615, "loss": 2.6992, "step": 5269 }, { "crossentropy": 2.8068995475769043, "epoch": 0.4482817284790745, "grad_norm": 0.03563709557056427, "grad_norm_var": 3.796649599234423e-06, "learning_rate": 0.0027050906776623107, "loss": 2.8069, "step": 5270 }, { "crossentropy": 2.574307680130005, "epoch": 0.448366791425655, "grad_norm": 0.032680850476026535, "grad_norm_var": 3.919321890898571e-06, "learning_rate": 0.0027033108018285736, "loss": 2.5743, "step": 5271 }, { "crossentropy": 2.746063232421875, "epoch": 0.44845185437223545, "grad_norm": 0.0332457534968853, "grad_norm_var": 3.911594621058776e-06, "learning_rate": 0.0027015312947769434, "loss": 2.7461, "step": 5272 }, { "crossentropy": 2.6580307483673096, "epoch": 0.4485369173188159, "grad_norm": 0.03629596531391144, "grad_norm_var": 3.573906532934543e-06, "learning_rate": 0.002699752156793158, "loss": 2.658, "step": 5273 }, { "crossentropy": 2.692957878112793, "epoch": 0.4486219802653964, "grad_norm": 0.03322012722492218, "grad_norm_var": 3.426677426220046e-06, "learning_rate": 0.0026979733881628966, "loss": 2.693, "step": 5274 }, { "crossentropy": 2.4916844367980957, "epoch": 0.44870704321197685, "grad_norm": 0.03311727195978165, "grad_norm_var": 3.4997992413021125e-06, "learning_rate": 0.0026961949891717773, "loss": 2.4917, "step": 5275 }, { "crossentropy": 2.6755785942077637, "epoch": 0.44879210615855736, "grad_norm": 0.034431323409080505, "grad_norm_var": 2.023109453859704e-06, "learning_rate": 0.0026944169601053588, "loss": 2.6756, "step": 5276 }, { "crossentropy": 2.610159397125244, "epoch": 0.4488771691051378, "grad_norm": 0.0332861952483654, "grad_norm_var": 2.045627763068841e-06, "learning_rate": 0.002692639301249144, "loss": 2.6102, "step": 5277 }, { "crossentropy": 2.749126672744751, "epoch": 0.44896223205171826, "grad_norm": 0.03896235674619675, "grad_norm_var": 3.153536580671745e-06, "learning_rate": 0.0026908620128885726, "loss": 2.7491, "step": 5278 }, { "crossentropy": 2.6874213218688965, "epoch": 0.44904729499829876, "grad_norm": 0.03679262846708298, "grad_norm_var": 3.431823370652944e-06, "learning_rate": 0.002689085095309025, "loss": 2.6874, "step": 5279 }, { "crossentropy": 2.6500587463378906, "epoch": 0.4491323579448792, "grad_norm": 0.03421156480908394, "grad_norm_var": 3.43090264819512e-06, "learning_rate": 0.002687308548795825, "loss": 2.6501, "step": 5280 }, { "crossentropy": 2.6438798904418945, "epoch": 0.44921742089145966, "grad_norm": 0.034611042588949203, "grad_norm_var": 3.3013728660962724e-06, "learning_rate": 0.0026855323736342317, "loss": 2.6439, "step": 5281 }, { "crossentropy": 2.7564940452575684, "epoch": 0.44930248383804017, "grad_norm": 0.03523482382297516, "grad_norm_var": 3.3074134749579962e-06, "learning_rate": 0.0026837565701094503, "loss": 2.7565, "step": 5282 }, { "crossentropy": 2.829166889190674, "epoch": 0.4493875467846206, "grad_norm": 0.03651505336165428, "grad_norm_var": 3.376142762737816e-06, "learning_rate": 0.0026819811385066206, "loss": 2.8292, "step": 5283 }, { "crossentropy": 2.6668081283569336, "epoch": 0.44947260973120107, "grad_norm": 0.03219262510538101, "grad_norm_var": 3.59236168930091e-06, "learning_rate": 0.00268020607911083, "loss": 2.6668, "step": 5284 }, { "crossentropy": 2.6309657096862793, "epoch": 0.4495576726777816, "grad_norm": 0.03314204141497612, "grad_norm_var": 3.420421587287982e-06, "learning_rate": 0.0026784313922070995, "loss": 2.631, "step": 5285 }, { "crossentropy": 2.7156848907470703, "epoch": 0.449642735624362, "grad_norm": 0.035512566566467285, "grad_norm_var": 3.404146847552948e-06, "learning_rate": 0.0026766570780803914, "loss": 2.7157, "step": 5286 }, { "crossentropy": 2.72444486618042, "epoch": 0.44972779857094247, "grad_norm": 0.03574514389038086, "grad_norm_var": 3.2106780407483345e-06, "learning_rate": 0.0026748831370156144, "loss": 2.7244, "step": 5287 }, { "crossentropy": 2.643587350845337, "epoch": 0.449812861517523, "grad_norm": 0.034445811063051224, "grad_norm_var": 3.0548306075274535e-06, "learning_rate": 0.0026731095692976072, "loss": 2.6436, "step": 5288 }, { "crossentropy": 2.598607063293457, "epoch": 0.4498979244641034, "grad_norm": 0.03184758499264717, "grad_norm_var": 3.4382790776443214e-06, "learning_rate": 0.0026713363752111532, "loss": 2.5986, "step": 5289 }, { "crossentropy": 2.678804397583008, "epoch": 0.44998298741068393, "grad_norm": 0.03594786301255226, "grad_norm_var": 3.4089989555739313e-06, "learning_rate": 0.0026695635550409804, "loss": 2.6788, "step": 5290 }, { "crossentropy": 2.5830020904541016, "epoch": 0.4500680503572644, "grad_norm": 0.03399628773331642, "grad_norm_var": 3.2659616500206487e-06, "learning_rate": 0.0026677911090717504, "loss": 2.583, "step": 5291 }, { "crossentropy": 2.677914619445801, "epoch": 0.45015311330384483, "grad_norm": 0.03400769084692001, "grad_norm_var": 3.2982670992696856e-06, "learning_rate": 0.0026660190375880654, "loss": 2.6779, "step": 5292 }, { "crossentropy": 2.6441314220428467, "epoch": 0.45023817625042534, "grad_norm": 0.03361063823103905, "grad_norm_var": 3.2403031073583234e-06, "learning_rate": 0.0026642473408744734, "loss": 2.6441, "step": 5293 }, { "crossentropy": 2.724642515182495, "epoch": 0.4503232391970058, "grad_norm": 0.03328299894928932, "grad_norm_var": 2.1031630724209033e-06, "learning_rate": 0.0026624760192154547, "loss": 2.7246, "step": 5294 }, { "crossentropy": 2.7279467582702637, "epoch": 0.45040830214358624, "grad_norm": 0.033923521637916565, "grad_norm_var": 1.7190042669958105e-06, "learning_rate": 0.002660705072895433, "loss": 2.7279, "step": 5295 }, { "crossentropy": 2.6783039569854736, "epoch": 0.45049336509016674, "grad_norm": 0.03434055298566818, "grad_norm_var": 1.7191388341723674e-06, "learning_rate": 0.002658934502198772, "loss": 2.6783, "step": 5296 }, { "crossentropy": 2.706115245819092, "epoch": 0.4505784280367472, "grad_norm": 0.03641313314437866, "grad_norm_var": 2.0035104415977444e-06, "learning_rate": 0.002657164307409773, "loss": 2.7061, "step": 5297 }, { "crossentropy": 2.710664987564087, "epoch": 0.45066349098332764, "grad_norm": 0.03447252884507179, "grad_norm_var": 1.953442703695775e-06, "learning_rate": 0.0026553944888126767, "loss": 2.7107, "step": 5298 }, { "crossentropy": 2.730722427368164, "epoch": 0.45074855392990815, "grad_norm": 0.03379989042878151, "grad_norm_var": 1.6257883801907084e-06, "learning_rate": 0.0026536250466916702, "loss": 2.7307, "step": 5299 }, { "crossentropy": 2.7048466205596924, "epoch": 0.4508336168764886, "grad_norm": 0.03530805930495262, "grad_norm_var": 1.412040499450725e-06, "learning_rate": 0.002651855981330872, "loss": 2.7048, "step": 5300 }, { "crossentropy": 2.7086706161499023, "epoch": 0.45091867982306905, "grad_norm": 0.033730484545230865, "grad_norm_var": 1.3379440987975588e-06, "learning_rate": 0.0026500872930143417, "loss": 2.7087, "step": 5301 }, { "crossentropy": 2.707287311553955, "epoch": 0.45100374276964955, "grad_norm": 0.03516580909490585, "grad_norm_var": 1.2939763267877727e-06, "learning_rate": 0.0026483189820260817, "loss": 2.7073, "step": 5302 }, { "crossentropy": 2.7330117225646973, "epoch": 0.45108880571623, "grad_norm": 0.03612406179308891, "grad_norm_var": 1.3720529536798143e-06, "learning_rate": 0.002646551048650031, "loss": 2.733, "step": 5303 }, { "crossentropy": 2.7186334133148193, "epoch": 0.4511738686628105, "grad_norm": 0.03360234573483467, "grad_norm_var": 1.411484462671142e-06, "learning_rate": 0.0026447834931700685, "loss": 2.7186, "step": 5304 }, { "crossentropy": 2.626460313796997, "epoch": 0.45125893160939096, "grad_norm": 0.03572912514209747, "grad_norm_var": 1.05889375652714e-06, "learning_rate": 0.0026430163158700118, "loss": 2.6265, "step": 5305 }, { "crossentropy": 2.7163658142089844, "epoch": 0.4513439945559714, "grad_norm": 0.03579540178179741, "grad_norm_var": 1.0327627178276118e-06, "learning_rate": 0.0026412495170336217, "loss": 2.7164, "step": 5306 }, { "crossentropy": 2.5614304542541504, "epoch": 0.4514290575025519, "grad_norm": 0.034813277423381805, "grad_norm_var": 1.010741415748371e-06, "learning_rate": 0.0026394830969445927, "loss": 2.5614, "step": 5307 }, { "crossentropy": 2.7226603031158447, "epoch": 0.45151412044913236, "grad_norm": 0.03684556484222412, "grad_norm_var": 1.2776810343469632e-06, "learning_rate": 0.002637717055886559, "loss": 2.7227, "step": 5308 }, { "crossentropy": 2.650047779083252, "epoch": 0.4515991833957128, "grad_norm": 0.03909257799386978, "grad_norm_var": 2.2793851989429933e-06, "learning_rate": 0.0026359513941431037, "loss": 2.65, "step": 5309 }, { "crossentropy": 2.727692127227783, "epoch": 0.4516842463422933, "grad_norm": 0.034196071326732635, "grad_norm_var": 2.103897956212686e-06, "learning_rate": 0.0026341861119977337, "loss": 2.7277, "step": 5310 }, { "crossentropy": 2.697625160217285, "epoch": 0.45176930928887377, "grad_norm": 0.03494790568947792, "grad_norm_var": 1.993834894682631e-06, "learning_rate": 0.002632421209733903, "loss": 2.6976, "step": 5311 }, { "crossentropy": 2.5714311599731445, "epoch": 0.4518543722354542, "grad_norm": 0.032730136066675186, "grad_norm_var": 2.356260150952287e-06, "learning_rate": 0.002630656687635007, "loss": 2.5714, "step": 5312 }, { "crossentropy": 2.6543080806732178, "epoch": 0.4519394351820347, "grad_norm": 0.03326697647571564, "grad_norm_var": 2.4546409382976883e-06, "learning_rate": 0.002628892545984376, "loss": 2.6543, "step": 5313 }, { "crossentropy": 2.6226353645324707, "epoch": 0.45202449812861517, "grad_norm": 0.0324920229613781, "grad_norm_var": 2.8328110949620935e-06, "learning_rate": 0.002627128785065279, "loss": 2.6226, "step": 5314 }, { "crossentropy": 2.5115621089935303, "epoch": 0.4521095610751956, "grad_norm": 0.03447897359728813, "grad_norm_var": 2.766326928804741e-06, "learning_rate": 0.0026253654051609275, "loss": 2.5116, "step": 5315 }, { "crossentropy": 2.6425626277923584, "epoch": 0.4521946240217761, "grad_norm": 0.03573053702712059, "grad_norm_var": 2.8007544201809806e-06, "learning_rate": 0.002623602406554471, "loss": 2.6426, "step": 5316 }, { "crossentropy": 2.683519124984741, "epoch": 0.4522796869683566, "grad_norm": 0.033553801476955414, "grad_norm_var": 2.830759094828624e-06, "learning_rate": 0.002621839789528989, "loss": 2.6835, "step": 5317 }, { "crossentropy": 2.7281835079193115, "epoch": 0.4523647499149371, "grad_norm": 0.03237832337617874, "grad_norm_var": 3.2214202426346836e-06, "learning_rate": 0.0026200775543675147, "loss": 2.7282, "step": 5318 }, { "crossentropy": 2.695158004760742, "epoch": 0.45244981286151753, "grad_norm": 0.036261267960071564, "grad_norm_var": 3.2479889960023687e-06, "learning_rate": 0.002618315701353009, "loss": 2.6952, "step": 5319 }, { "crossentropy": 2.6295247077941895, "epoch": 0.452534875808098, "grad_norm": 0.03435010462999344, "grad_norm_var": 3.169046934123343e-06, "learning_rate": 0.002616554230768374, "loss": 2.6295, "step": 5320 }, { "crossentropy": 2.6776230335235596, "epoch": 0.4526199387546785, "grad_norm": 0.03411344066262245, "grad_norm_var": 3.1301856709750955e-06, "learning_rate": 0.002614793142896451, "loss": 2.6776, "step": 5321 }, { "crossentropy": 2.6746039390563965, "epoch": 0.45270500170125894, "grad_norm": 0.03481215238571167, "grad_norm_var": 3.045743600384408e-06, "learning_rate": 0.0026130324380200234, "loss": 2.6746, "step": 5322 }, { "crossentropy": 2.5867533683776855, "epoch": 0.4527900646478394, "grad_norm": 0.03647932410240173, "grad_norm_var": 3.260172901414782e-06, "learning_rate": 0.0026112721164218066, "loss": 2.5868, "step": 5323 }, { "crossentropy": 2.5923306941986084, "epoch": 0.4528751275944199, "grad_norm": 0.0332878902554512, "grad_norm_var": 3.0491643310516515e-06, "learning_rate": 0.0026095121783844583, "loss": 2.5923, "step": 5324 }, { "crossentropy": 2.627682685852051, "epoch": 0.45296019054100034, "grad_norm": 0.033656612038612366, "grad_norm_var": 1.5751116874324066e-06, "learning_rate": 0.0026077526241905743, "loss": 2.6277, "step": 5325 }, { "crossentropy": 2.756551742553711, "epoch": 0.4530452534875808, "grad_norm": 0.03425237163901329, "grad_norm_var": 1.575498214194335e-06, "learning_rate": 0.002605993454122687, "loss": 2.7566, "step": 5326 }, { "crossentropy": 2.6380910873413086, "epoch": 0.4531303164341613, "grad_norm": 0.032778918743133545, "grad_norm_var": 1.6458592724856909e-06, "learning_rate": 0.002604234668463268, "loss": 2.6381, "step": 5327 }, { "crossentropy": 2.7150254249572754, "epoch": 0.45321537938074175, "grad_norm": 0.03516073524951935, "grad_norm_var": 1.5909442902167202e-06, "learning_rate": 0.002602476267494731, "loss": 2.715, "step": 5328 }, { "crossentropy": 2.6929900646209717, "epoch": 0.4533004423273222, "grad_norm": 0.033564161509275436, "grad_norm_var": 1.5598564047404617e-06, "learning_rate": 0.0026007182514994217, "loss": 2.693, "step": 5329 }, { "crossentropy": 2.7155778408050537, "epoch": 0.4533855052739027, "grad_norm": 0.035947419703006744, "grad_norm_var": 1.5148557746392024e-06, "learning_rate": 0.002598960620759626, "loss": 2.7156, "step": 5330 }, { "crossentropy": 2.6638731956481934, "epoch": 0.45347056822048315, "grad_norm": 0.034823477268218994, "grad_norm_var": 1.5247353381817505e-06, "learning_rate": 0.0025972033755575735, "loss": 2.6639, "step": 5331 }, { "crossentropy": 2.813371181488037, "epoch": 0.4535556311670636, "grad_norm": 0.035394180566072464, "grad_norm_var": 1.4742387566786993e-06, "learning_rate": 0.0025954465161754227, "loss": 2.8134, "step": 5332 }, { "crossentropy": 2.581752061843872, "epoch": 0.4536406941136441, "grad_norm": 0.03590228408575058, "grad_norm_var": 1.5458725965459644e-06, "learning_rate": 0.0025936900428952738, "loss": 2.5818, "step": 5333 }, { "crossentropy": 2.6581170558929443, "epoch": 0.45372575706022455, "grad_norm": 0.036791589111089706, "grad_norm_var": 1.471950500509372e-06, "learning_rate": 0.002591933955999169, "loss": 2.6581, "step": 5334 }, { "crossentropy": 2.6143510341644287, "epoch": 0.45381082000680506, "grad_norm": 0.035821545869112015, "grad_norm_var": 1.4012049259789124e-06, "learning_rate": 0.0025901782557690846, "loss": 2.6144, "step": 5335 }, { "crossentropy": 2.646541118621826, "epoch": 0.4538958829533855, "grad_norm": 0.03544415161013603, "grad_norm_var": 1.407320818284409e-06, "learning_rate": 0.002588422942486932, "loss": 2.6465, "step": 5336 }, { "crossentropy": 2.532463550567627, "epoch": 0.45398094589996596, "grad_norm": 0.03591185435652733, "grad_norm_var": 1.423400113012505e-06, "learning_rate": 0.00258666801643457, "loss": 2.5325, "step": 5337 }, { "crossentropy": 2.690589427947998, "epoch": 0.45406600884654646, "grad_norm": 0.037316206842660904, "grad_norm_var": 1.7519775085535668e-06, "learning_rate": 0.002584913477893788, "loss": 2.6906, "step": 5338 }, { "crossentropy": 2.67104172706604, "epoch": 0.4541510717931269, "grad_norm": 0.032995618879795074, "grad_norm_var": 1.8968800929698716e-06, "learning_rate": 0.0025831593271463072, "loss": 2.671, "step": 5339 }, { "crossentropy": 2.707507371902466, "epoch": 0.45423613473970736, "grad_norm": 0.03374926373362541, "grad_norm_var": 1.8085175654764321e-06, "learning_rate": 0.002581405564473801, "loss": 2.7075, "step": 5340 }, { "crossentropy": 2.7040255069732666, "epoch": 0.45432119768628787, "grad_norm": 0.03514739125967026, "grad_norm_var": 1.686475490670208e-06, "learning_rate": 0.0025796521901578716, "loss": 2.704, "step": 5341 }, { "crossentropy": 2.690786123275757, "epoch": 0.4544062606328683, "grad_norm": 0.03492771089076996, "grad_norm_var": 1.6420259002870862e-06, "learning_rate": 0.00257789920448006, "loss": 2.6908, "step": 5342 }, { "crossentropy": 2.692662477493286, "epoch": 0.45449132357944877, "grad_norm": 0.035824988037347794, "grad_norm_var": 1.2773025067205322e-06, "learning_rate": 0.002576146607721842, "loss": 2.6927, "step": 5343 }, { "crossentropy": 2.735135078430176, "epoch": 0.4545763865260293, "grad_norm": 0.03322406858205795, "grad_norm_var": 1.5464316320253889e-06, "learning_rate": 0.002574394400164639, "loss": 2.7351, "step": 5344 }, { "crossentropy": 2.689549207687378, "epoch": 0.4546614494726097, "grad_norm": 0.032558489590883255, "grad_norm_var": 1.8255212405226516e-06, "learning_rate": 0.0025726425820898034, "loss": 2.6895, "step": 5345 }, { "crossentropy": 2.751176357269287, "epoch": 0.4547465124191902, "grad_norm": 0.03306732699275017, "grad_norm_var": 2.022860884063708e-06, "learning_rate": 0.002570891153778627, "loss": 2.7512, "step": 5346 }, { "crossentropy": 2.731821060180664, "epoch": 0.4548315753657707, "grad_norm": 0.03629259392619133, "grad_norm_var": 2.1366422987134994e-06, "learning_rate": 0.0025691401155123377, "loss": 2.7318, "step": 5347 }, { "crossentropy": 2.700019359588623, "epoch": 0.45491663831235113, "grad_norm": 0.038415201008319855, "grad_norm_var": 2.8565332946243564e-06, "learning_rate": 0.002567389467572102, "loss": 2.7, "step": 5348 }, { "crossentropy": 2.685258388519287, "epoch": 0.45500170125893163, "grad_norm": 0.037603288888931274, "grad_norm_var": 3.1939530644839915e-06, "learning_rate": 0.0025656392102390226, "loss": 2.6853, "step": 5349 }, { "crossentropy": 2.7041516304016113, "epoch": 0.4550867642055121, "grad_norm": 0.03246665000915527, "grad_norm_var": 3.513382508905869e-06, "learning_rate": 0.002563889343794142, "loss": 2.7042, "step": 5350 }, { "crossentropy": 2.669173002243042, "epoch": 0.45517182715209253, "grad_norm": 0.032886847853660583, "grad_norm_var": 3.748937309189006e-06, "learning_rate": 0.002562139868518439, "loss": 2.6692, "step": 5351 }, { "crossentropy": 2.675630569458008, "epoch": 0.45525689009867304, "grad_norm": 0.03318540006875992, "grad_norm_var": 3.893231312988334e-06, "learning_rate": 0.0025603907846928277, "loss": 2.6756, "step": 5352 }, { "crossentropy": 2.6672251224517822, "epoch": 0.4553419530452535, "grad_norm": 0.0344715379178524, "grad_norm_var": 3.794636905798896e-06, "learning_rate": 0.0025586420925981603, "loss": 2.6672, "step": 5353 }, { "crossentropy": 2.6908955574035645, "epoch": 0.45542701599183394, "grad_norm": 0.03334856778383255, "grad_norm_var": 3.3592072973022413e-06, "learning_rate": 0.002556893792515227, "loss": 2.6909, "step": 5354 }, { "crossentropy": 2.6935653686523438, "epoch": 0.45551207893841444, "grad_norm": 0.033186174929142, "grad_norm_var": 3.326168255988332e-06, "learning_rate": 0.0025551458847247518, "loss": 2.6936, "step": 5355 }, { "crossentropy": 2.714397668838501, "epoch": 0.4555971418849949, "grad_norm": 0.0331234484910965, "grad_norm_var": 3.4047127419606706e-06, "learning_rate": 0.0025533983695074027, "loss": 2.7144, "step": 5356 }, { "crossentropy": 2.67497181892395, "epoch": 0.45568220483157534, "grad_norm": 0.03408236801624298, "grad_norm_var": 3.3635238591030067e-06, "learning_rate": 0.002551651247143778, "loss": 2.675, "step": 5357 }, { "crossentropy": 2.6589503288269043, "epoch": 0.45576726777815585, "grad_norm": 0.032242853194475174, "grad_norm_var": 3.586316068195326e-06, "learning_rate": 0.0025499045179144158, "loss": 2.659, "step": 5358 }, { "crossentropy": 2.6786980628967285, "epoch": 0.4558523307247363, "grad_norm": 0.03748631849884987, "grad_norm_var": 4.135662421722355e-06, "learning_rate": 0.002548158182099788, "loss": 2.6787, "step": 5359 }, { "crossentropy": 2.6555562019348145, "epoch": 0.45593739367131675, "grad_norm": 0.034430935978889465, "grad_norm_var": 4.065216234529675e-06, "learning_rate": 0.0025464122399803126, "loss": 2.6556, "step": 5360 }, { "crossentropy": 2.6926753520965576, "epoch": 0.45602245661789725, "grad_norm": 0.03484442085027695, "grad_norm_var": 3.8600979986711316e-06, "learning_rate": 0.0025446666918363284, "loss": 2.6927, "step": 5361 }, { "crossentropy": 2.764008045196533, "epoch": 0.4561075195644777, "grad_norm": 0.03584941849112511, "grad_norm_var": 3.832485362959397e-06, "learning_rate": 0.0025429215379481262, "loss": 2.764, "step": 5362 }, { "crossentropy": 2.693957805633545, "epoch": 0.4561925825110582, "grad_norm": 0.032950084656476974, "grad_norm_var": 3.785226254196007e-06, "learning_rate": 0.002541176778595926, "loss": 2.694, "step": 5363 }, { "crossentropy": 2.7201309204101562, "epoch": 0.45627764545763866, "grad_norm": 0.03627072647213936, "grad_norm_var": 2.9276841180499632e-06, "learning_rate": 0.002539432414059886, "loss": 2.7201, "step": 5364 }, { "crossentropy": 2.715761423110962, "epoch": 0.4563627084042191, "grad_norm": 0.0360538586974144, "grad_norm_var": 2.390511471768673e-06, "learning_rate": 0.0025376884446200987, "loss": 2.7158, "step": 5365 }, { "crossentropy": 2.6927595138549805, "epoch": 0.4564477713507996, "grad_norm": 0.0359216071665287, "grad_norm_var": 2.347294763500353e-06, "learning_rate": 0.0025359448705565995, "loss": 2.6928, "step": 5366 }, { "crossentropy": 2.680243968963623, "epoch": 0.45653283429738006, "grad_norm": 0.036456141620874405, "grad_norm_var": 2.425364136840469e-06, "learning_rate": 0.0025342016921493537, "loss": 2.6802, "step": 5367 }, { "crossentropy": 2.724137783050537, "epoch": 0.4566178972439605, "grad_norm": 0.03718215599656105, "grad_norm_var": 2.6597807905467372e-06, "learning_rate": 0.002532458909678266, "loss": 2.7241, "step": 5368 }, { "crossentropy": 2.594961643218994, "epoch": 0.456702960190541, "grad_norm": 0.033539965748786926, "grad_norm_var": 2.763362325985952e-06, "learning_rate": 0.0025307165234231765, "loss": 2.595, "step": 5369 }, { "crossentropy": 2.574014186859131, "epoch": 0.45678802313712147, "grad_norm": 0.036455947905778885, "grad_norm_var": 2.7611195449458426e-06, "learning_rate": 0.002528974533663863, "loss": 2.574, "step": 5370 }, { "crossentropy": 2.7492401599884033, "epoch": 0.4568730860837019, "grad_norm": 0.03540291637182236, "grad_norm_var": 2.5307249794668044e-06, "learning_rate": 0.0025272329406800364, "loss": 2.7492, "step": 5371 }, { "crossentropy": 2.6026499271392822, "epoch": 0.4569581490302824, "grad_norm": 0.03333034738898277, "grad_norm_var": 2.477679118349091e-06, "learning_rate": 0.0025254917447513504, "loss": 2.6026, "step": 5372 }, { "crossentropy": 2.723568916320801, "epoch": 0.4570432119768629, "grad_norm": 0.037525538355112076, "grad_norm_var": 2.7256332557095674e-06, "learning_rate": 0.002523750946157388, "loss": 2.7236, "step": 5373 }, { "crossentropy": 2.7336273193359375, "epoch": 0.4571282749234433, "grad_norm": 0.034492578357458115, "grad_norm_var": 2.1034971170368877e-06, "learning_rate": 0.0025220105451776733, "loss": 2.7336, "step": 5374 }, { "crossentropy": 2.7106776237487793, "epoch": 0.45721333787002383, "grad_norm": 0.0338401198387146, "grad_norm_var": 1.974614827454646e-06, "learning_rate": 0.0025202705420916626, "loss": 2.7107, "step": 5375 }, { "crossentropy": 2.6577014923095703, "epoch": 0.4572984008166043, "grad_norm": 0.03382125496864319, "grad_norm_var": 2.067207061200634e-06, "learning_rate": 0.0025185309371787514, "loss": 2.6577, "step": 5376 }, { "crossentropy": 2.5953431129455566, "epoch": 0.4573834637631848, "grad_norm": 0.03203536197543144, "grad_norm_var": 2.710816075382425e-06, "learning_rate": 0.0025167917307182676, "loss": 2.5953, "step": 5377 }, { "crossentropy": 2.6695051193237305, "epoch": 0.45746852670976523, "grad_norm": 0.03345968946814537, "grad_norm_var": 2.819554637131736e-06, "learning_rate": 0.002515052922989482, "loss": 2.6695, "step": 5378 }, { "crossentropy": 2.6139938831329346, "epoch": 0.4575535896563457, "grad_norm": 0.03276979178190231, "grad_norm_var": 2.8689686126320836e-06, "learning_rate": 0.0025133145142715945, "loss": 2.614, "step": 5379 }, { "crossentropy": 2.618393659591675, "epoch": 0.4576386526029262, "grad_norm": 0.0324660986661911, "grad_norm_var": 3.083330445939556e-06, "learning_rate": 0.0025115765048437442, "loss": 2.6184, "step": 5380 }, { "crossentropy": 2.626271963119507, "epoch": 0.45772371554950664, "grad_norm": 0.035663995891809464, "grad_norm_var": 3.0210030993236177e-06, "learning_rate": 0.002509838894985005, "loss": 2.6263, "step": 5381 }, { "crossentropy": 2.729520797729492, "epoch": 0.4578087784960871, "grad_norm": 0.03367888554930687, "grad_norm_var": 2.954435551450844e-06, "learning_rate": 0.0025081016849743867, "loss": 2.7295, "step": 5382 }, { "crossentropy": 2.70963454246521, "epoch": 0.4578938414426676, "grad_norm": 0.035488203167915344, "grad_norm_var": 2.7615097883829445e-06, "learning_rate": 0.002506364875090833, "loss": 2.7096, "step": 5383 }, { "crossentropy": 2.6978344917297363, "epoch": 0.45797890438924804, "grad_norm": 0.0323532335460186, "grad_norm_var": 2.457902309978682e-06, "learning_rate": 0.00250462846561323, "loss": 2.6978, "step": 5384 }, { "crossentropy": 2.620760917663574, "epoch": 0.4580639673358285, "grad_norm": 0.034488726407289505, "grad_norm_var": 2.4375927276977516e-06, "learning_rate": 0.0025028924568203936, "loss": 2.6208, "step": 5385 }, { "crossentropy": 2.5996484756469727, "epoch": 0.458149030282409, "grad_norm": 0.032721854746341705, "grad_norm_var": 2.188131090417357e-06, "learning_rate": 0.002501156848991076, "loss": 2.5996, "step": 5386 }, { "crossentropy": 2.609254837036133, "epoch": 0.45823409322898945, "grad_norm": 0.034955210983753204, "grad_norm_var": 2.115191394738182e-06, "learning_rate": 0.0024994216424039633, "loss": 2.6093, "step": 5387 }, { "crossentropy": 2.6916940212249756, "epoch": 0.4583191561755699, "grad_norm": 0.03373943269252777, "grad_norm_var": 2.0922240057267655e-06, "learning_rate": 0.0024976868373376864, "loss": 2.6917, "step": 5388 }, { "crossentropy": 2.752450466156006, "epoch": 0.4584042191221504, "grad_norm": 0.03488243743777275, "grad_norm_var": 1.2753873143198553e-06, "learning_rate": 0.0024959524340707993, "loss": 2.7525, "step": 5389 }, { "crossentropy": 2.5953822135925293, "epoch": 0.45848928206873085, "grad_norm": 0.034725964069366455, "grad_norm_var": 1.3002327230101617e-06, "learning_rate": 0.0024942184328817967, "loss": 2.5954, "step": 5390 }, { "crossentropy": 2.6341867446899414, "epoch": 0.45857434501531136, "grad_norm": 0.03220151737332344, "grad_norm_var": 1.4632444774620445e-06, "learning_rate": 0.0024924848340491137, "loss": 2.6342, "step": 5391 }, { "crossentropy": 2.7277700901031494, "epoch": 0.4586594079618918, "grad_norm": 0.036872245371341705, "grad_norm_var": 2.087956342903402e-06, "learning_rate": 0.0024907516378511138, "loss": 2.7278, "step": 5392 }, { "crossentropy": 2.537667751312256, "epoch": 0.45874447090847226, "grad_norm": 0.03410561382770538, "grad_norm_var": 1.8393541408501593e-06, "learning_rate": 0.0024890188445660963, "loss": 2.5377, "step": 5393 }, { "crossentropy": 2.64969801902771, "epoch": 0.45882953385505276, "grad_norm": 0.03555845841765404, "grad_norm_var": 1.9534379325886546e-06, "learning_rate": 0.002487286454472302, "loss": 2.6497, "step": 5394 }, { "crossentropy": 2.6909677982330322, "epoch": 0.4589145968016332, "grad_norm": 0.033419135957956314, "grad_norm_var": 1.8588235072944845e-06, "learning_rate": 0.0024855544678479002, "loss": 2.691, "step": 5395 }, { "crossentropy": 2.652632474899292, "epoch": 0.45899965974821366, "grad_norm": 0.03386829420924187, "grad_norm_var": 1.6561248202858736e-06, "learning_rate": 0.002483822884971, "loss": 2.6526, "step": 5396 }, { "crossentropy": 2.7333526611328125, "epoch": 0.45908472269479417, "grad_norm": 0.04053833708167076, "grad_norm_var": 4.030671648328337e-06, "learning_rate": 0.002482091706119642, "loss": 2.7334, "step": 5397 }, { "crossentropy": 2.6189029216766357, "epoch": 0.4591697856413746, "grad_norm": 0.03661080822348595, "grad_norm_var": 4.207907302572883e-06, "learning_rate": 0.002480360931571804, "loss": 2.6189, "step": 5398 }, { "crossentropy": 2.7348735332489014, "epoch": 0.45925484858795507, "grad_norm": 0.03360415995121002, "grad_norm_var": 4.252630515592879e-06, "learning_rate": 0.0024786305616053966, "loss": 2.7349, "step": 5399 }, { "crossentropy": 2.6490285396575928, "epoch": 0.45933991153453557, "grad_norm": 0.03515268862247467, "grad_norm_var": 3.879421617291231e-06, "learning_rate": 0.0024769005964982715, "loss": 2.649, "step": 5400 }, { "crossentropy": 2.6301841735839844, "epoch": 0.459424974481116, "grad_norm": 0.034307293593883514, "grad_norm_var": 3.889984044454173e-06, "learning_rate": 0.0024751710365282092, "loss": 2.6302, "step": 5401 }, { "crossentropy": 2.641733169555664, "epoch": 0.45951003742769647, "grad_norm": 0.03809010609984398, "grad_norm_var": 4.1829164819149245e-06, "learning_rate": 0.0024734418819729273, "loss": 2.6417, "step": 5402 }, { "crossentropy": 2.7010059356689453, "epoch": 0.459595100374277, "grad_norm": 0.03884467855095863, "grad_norm_var": 5.019887072237455e-06, "learning_rate": 0.002471713133110078, "loss": 2.701, "step": 5403 }, { "crossentropy": 2.641247272491455, "epoch": 0.4596801633208574, "grad_norm": 0.0341714583337307, "grad_norm_var": 4.935461859579641e-06, "learning_rate": 0.0024699847902172486, "loss": 2.6412, "step": 5404 }, { "crossentropy": 2.696115732192993, "epoch": 0.4597652262674379, "grad_norm": 0.03565293177962303, "grad_norm_var": 4.915843208055012e-06, "learning_rate": 0.002468256853571959, "loss": 2.6961, "step": 5405 }, { "crossentropy": 2.5860707759857178, "epoch": 0.4598502892140184, "grad_norm": 0.03284072503447533, "grad_norm_var": 5.328200899844665e-06, "learning_rate": 0.00246652932345167, "loss": 2.5861, "step": 5406 }, { "crossentropy": 2.662212371826172, "epoch": 0.45993535216059883, "grad_norm": 0.034278303384780884, "grad_norm_var": 4.7218091626774945e-06, "learning_rate": 0.002464802200133772, "loss": 2.6622, "step": 5407 }, { "crossentropy": 2.722910165786743, "epoch": 0.46002041510717934, "grad_norm": 0.03417893126606941, "grad_norm_var": 4.680493012551983e-06, "learning_rate": 0.0024630754838955897, "loss": 2.7229, "step": 5408 }, { "crossentropy": 2.7001423835754395, "epoch": 0.4601054780537598, "grad_norm": 0.03370051085948944, "grad_norm_var": 4.756687396296049e-06, "learning_rate": 0.0024613491750143835, "loss": 2.7001, "step": 5409 }, { "crossentropy": 2.6688008308410645, "epoch": 0.46019054100034024, "grad_norm": 0.03336353227496147, "grad_norm_var": 4.982461756653824e-06, "learning_rate": 0.002459623273767354, "loss": 2.6688, "step": 5410 }, { "crossentropy": 2.6489858627319336, "epoch": 0.46027560394692074, "grad_norm": 0.033501725643873215, "grad_norm_var": 4.963675151514226e-06, "learning_rate": 0.0024578977804316257, "loss": 2.649, "step": 5411 }, { "crossentropy": 2.5433433055877686, "epoch": 0.4603606668935012, "grad_norm": 0.04776589199900627, "grad_norm_var": 1.4624845723181825e-05, "learning_rate": 0.002456172695284263, "loss": 2.5433, "step": 5412 }, { "crossentropy": 2.7124688625335693, "epoch": 0.46044572984008164, "grad_norm": 0.03175681084394455, "grad_norm_var": 1.4174802468756158e-05, "learning_rate": 0.0024544480186022683, "loss": 2.7125, "step": 5413 }, { "crossentropy": 2.7077507972717285, "epoch": 0.46053079278666215, "grad_norm": 0.03613394871354103, "grad_norm_var": 1.4117674982443614e-05, "learning_rate": 0.0024527237506625733, "loss": 2.7078, "step": 5414 }, { "crossentropy": 2.697953939437866, "epoch": 0.4606158557332426, "grad_norm": 0.036220233887434006, "grad_norm_var": 1.3898435272232703e-05, "learning_rate": 0.002450999891742044, "loss": 2.698, "step": 5415 }, { "crossentropy": 2.7327332496643066, "epoch": 0.46070091867982305, "grad_norm": 0.03432075306773186, "grad_norm_var": 1.3993804686997467e-05, "learning_rate": 0.002449276442117486, "loss": 2.7327, "step": 5416 }, { "crossentropy": 2.751312255859375, "epoch": 0.46078598162640355, "grad_norm": 0.03649932146072388, "grad_norm_var": 1.3924921511338876e-05, "learning_rate": 0.0024475534020656363, "loss": 2.7513, "step": 5417 }, { "crossentropy": 2.724799156188965, "epoch": 0.460871044572984, "grad_norm": 0.03434034436941147, "grad_norm_var": 1.361248444505288e-05, "learning_rate": 0.0024458307718631593, "loss": 2.7248, "step": 5418 }, { "crossentropy": 2.5440163612365723, "epoch": 0.46095610751956445, "grad_norm": 0.03409885615110397, "grad_norm_var": 1.2886726100123461e-05, "learning_rate": 0.002444108551786666, "loss": 2.544, "step": 5419 }, { "crossentropy": 2.700737476348877, "epoch": 0.46104117046614496, "grad_norm": 0.03304177522659302, "grad_norm_var": 1.3117874034768105e-05, "learning_rate": 0.002442386742112692, "loss": 2.7007, "step": 5420 }, { "crossentropy": 2.694749116897583, "epoch": 0.4611262334127254, "grad_norm": 0.03446049615740776, "grad_norm_var": 1.3119771544212582e-05, "learning_rate": 0.00244066534311771, "loss": 2.6947, "step": 5421 }, { "crossentropy": 2.612457275390625, "epoch": 0.4612112963593059, "grad_norm": 0.03289534151554108, "grad_norm_var": 1.3104005160587361e-05, "learning_rate": 0.0024389443550781303, "loss": 2.6125, "step": 5422 }, { "crossentropy": 2.7491281032562256, "epoch": 0.46129635930588636, "grad_norm": 0.03436676785349846, "grad_norm_var": 1.3095571224160049e-05, "learning_rate": 0.0024372237782702916, "loss": 2.7491, "step": 5423 }, { "crossentropy": 2.60199236869812, "epoch": 0.4613814222524668, "grad_norm": 0.03355756029486656, "grad_norm_var": 1.319106881384474e-05, "learning_rate": 0.00243550361297047, "loss": 2.602, "step": 5424 }, { "crossentropy": 2.666376829147339, "epoch": 0.4614664851990473, "grad_norm": 0.034405242651700974, "grad_norm_var": 1.3099863552129323e-05, "learning_rate": 0.0024337838594548733, "loss": 2.6664, "step": 5425 }, { "crossentropy": 2.722196102142334, "epoch": 0.46155154814562777, "grad_norm": 0.033154211938381195, "grad_norm_var": 1.3149545713496819e-05, "learning_rate": 0.0024320645179996448, "loss": 2.7222, "step": 5426 }, { "crossentropy": 2.7030797004699707, "epoch": 0.4616366110922082, "grad_norm": 0.033967092633247375, "grad_norm_var": 1.3068100987188163e-05, "learning_rate": 0.0024303455888808616, "loss": 2.7031, "step": 5427 }, { "crossentropy": 2.7258152961730957, "epoch": 0.4617216740387887, "grad_norm": 0.034068334847688675, "grad_norm_var": 1.5920656386031462e-06, "learning_rate": 0.002428627072374532, "loss": 2.7258, "step": 5428 }, { "crossentropy": 2.6325247287750244, "epoch": 0.46180673698536917, "grad_norm": 0.03496849909424782, "grad_norm_var": 1.1881836304914253e-06, "learning_rate": 0.002426908968756604, "loss": 2.6325, "step": 5429 }, { "crossentropy": 2.7170236110687256, "epoch": 0.4618917999319496, "grad_norm": 0.03787592425942421, "grad_norm_var": 1.7791374556689124e-06, "learning_rate": 0.0024251912783029538, "loss": 2.717, "step": 5430 }, { "crossentropy": 2.680778741836548, "epoch": 0.4619768628785301, "grad_norm": 0.03534070774912834, "grad_norm_var": 1.6275178448099092e-06, "learning_rate": 0.0024234740012893915, "loss": 2.6808, "step": 5431 }, { "crossentropy": 2.6265766620635986, "epoch": 0.4620619258251106, "grad_norm": 0.03332441672682762, "grad_norm_var": 1.708069169912135e-06, "learning_rate": 0.0024217571379916668, "loss": 2.6266, "step": 5432 }, { "crossentropy": 2.5822043418884277, "epoch": 0.462146988771691, "grad_norm": 0.03463100269436836, "grad_norm_var": 1.402725790664953e-06, "learning_rate": 0.0024200406886854538, "loss": 2.5822, "step": 5433 }, { "crossentropy": 2.729511260986328, "epoch": 0.46223205171827153, "grad_norm": 0.03731502220034599, "grad_norm_var": 1.9792932041142717e-06, "learning_rate": 0.002418324653646364, "loss": 2.7295, "step": 5434 }, { "crossentropy": 2.768660068511963, "epoch": 0.462317114664852, "grad_norm": 0.03580851852893829, "grad_norm_var": 2.078067726397845e-06, "learning_rate": 0.0024166090331499477, "loss": 2.7687, "step": 5435 }, { "crossentropy": 2.650067090988159, "epoch": 0.4624021776114325, "grad_norm": 0.03700026869773865, "grad_norm_var": 2.248817021864522e-06, "learning_rate": 0.002414893827471682, "loss": 2.6501, "step": 5436 }, { "crossentropy": 2.6827023029327393, "epoch": 0.46248724055801294, "grad_norm": 0.03355274721980095, "grad_norm_var": 2.3439762397894337e-06, "learning_rate": 0.0024131790368869775, "loss": 2.6827, "step": 5437 }, { "crossentropy": 2.6797242164611816, "epoch": 0.4625723035045934, "grad_norm": 0.03529660031199455, "grad_norm_var": 2.1059163695211675e-06, "learning_rate": 0.0024114646616711843, "loss": 2.6797, "step": 5438 }, { "crossentropy": 2.6486308574676514, "epoch": 0.4626573664511739, "grad_norm": 0.03355040028691292, "grad_norm_var": 2.207196214220238e-06, "learning_rate": 0.002409750702099581, "loss": 2.6486, "step": 5439 }, { "crossentropy": 2.7001848220825195, "epoch": 0.46274242939775434, "grad_norm": 0.03295910358428955, "grad_norm_var": 2.333789810661455e-06, "learning_rate": 0.0024080371584473746, "loss": 2.7002, "step": 5440 }, { "crossentropy": 2.6936256885528564, "epoch": 0.4628274923443348, "grad_norm": 0.032090503722429276, "grad_norm_var": 2.7985653330341842e-06, "learning_rate": 0.002406324030989718, "loss": 2.6936, "step": 5441 }, { "crossentropy": 2.655439615249634, "epoch": 0.4629125552909153, "grad_norm": 0.034926749765872955, "grad_norm_var": 2.633986233048444e-06, "learning_rate": 0.0024046113200016854, "loss": 2.6554, "step": 5442 }, { "crossentropy": 2.6199445724487305, "epoch": 0.46299761823749574, "grad_norm": 0.032301370054483414, "grad_norm_var": 2.9906636544674293e-06, "learning_rate": 0.0024028990257582896, "loss": 2.6199, "step": 5443 }, { "crossentropy": 2.5740511417388916, "epoch": 0.4630826811840762, "grad_norm": 0.03281832113862038, "grad_norm_var": 3.191623306330234e-06, "learning_rate": 0.0024011871485344776, "loss": 2.5741, "step": 5444 }, { "crossentropy": 2.5488808155059814, "epoch": 0.4631677441306567, "grad_norm": 0.03299577161669731, "grad_norm_var": 3.340558092564348e-06, "learning_rate": 0.002399475688605127, "loss": 2.5489, "step": 5445 }, { "crossentropy": 2.59773325920105, "epoch": 0.46325280707723715, "grad_norm": 0.037254538387060165, "grad_norm_var": 3.0838896581573497e-06, "learning_rate": 0.002397764646245048, "loss": 2.5977, "step": 5446 }, { "crossentropy": 2.6412241458892822, "epoch": 0.4633378700238176, "grad_norm": 0.03427669778466225, "grad_norm_var": 3.0279829696960396e-06, "learning_rate": 0.0023960540217289857, "loss": 2.6412, "step": 5447 }, { "crossentropy": 2.6965527534484863, "epoch": 0.4634229329703981, "grad_norm": 0.03439762443304062, "grad_norm_var": 2.9487238323290417e-06, "learning_rate": 0.0023943438153316153, "loss": 2.6966, "step": 5448 }, { "crossentropy": 2.6715011596679688, "epoch": 0.46350799591697855, "grad_norm": 0.036155931651592255, "grad_norm_var": 3.13117865752439e-06, "learning_rate": 0.0023926340273275493, "loss": 2.6715, "step": 5449 }, { "crossentropy": 2.6216001510620117, "epoch": 0.46359305886355906, "grad_norm": 0.033570580184459686, "grad_norm_var": 2.6239044099988947e-06, "learning_rate": 0.0023909246579913264, "loss": 2.6216, "step": 5450 }, { "crossentropy": 2.653871774673462, "epoch": 0.4636781218101395, "grad_norm": 0.033993471413850784, "grad_norm_var": 2.46708863893159e-06, "learning_rate": 0.0023892157075974265, "loss": 2.6539, "step": 5451 }, { "crossentropy": 2.71638822555542, "epoch": 0.46376318475671996, "grad_norm": 0.03618515655398369, "grad_norm_var": 2.2038734553077196e-06, "learning_rate": 0.002387507176420256, "loss": 2.7164, "step": 5452 }, { "crossentropy": 2.5887022018432617, "epoch": 0.46384824770330046, "grad_norm": 0.034391626715660095, "grad_norm_var": 2.181573127577746e-06, "learning_rate": 0.002385799064734156, "loss": 2.5887, "step": 5453 }, { "crossentropy": 2.689314365386963, "epoch": 0.4639333106498809, "grad_norm": 0.0329839251935482, "grad_norm_var": 2.1770230549553643e-06, "learning_rate": 0.0023840913728133996, "loss": 2.6893, "step": 5454 }, { "crossentropy": 2.7283477783203125, "epoch": 0.46401837359646136, "grad_norm": 0.03315850347280502, "grad_norm_var": 2.2128966165703306e-06, "learning_rate": 0.0023823841009321923, "loss": 2.7283, "step": 5455 }, { "crossentropy": 2.724111795425415, "epoch": 0.46410343654304187, "grad_norm": 0.03561058267951012, "grad_norm_var": 2.2741430192760775e-06, "learning_rate": 0.002380677249364672, "loss": 2.7241, "step": 5456 }, { "crossentropy": 2.723611354827881, "epoch": 0.4641884994896223, "grad_norm": 0.03595539927482605, "grad_norm_var": 2.123522288427435e-06, "learning_rate": 0.002378970818384914, "loss": 2.7236, "step": 5457 }, { "crossentropy": 2.699476718902588, "epoch": 0.46427356243620277, "grad_norm": 0.03503267094492912, "grad_norm_var": 2.131154045995759e-06, "learning_rate": 0.0023772648082669187, "loss": 2.6995, "step": 5458 }, { "crossentropy": 2.679241180419922, "epoch": 0.4643586253827833, "grad_norm": 0.03329053521156311, "grad_norm_var": 1.909898309460957e-06, "learning_rate": 0.002375559219284621, "loss": 2.6792, "step": 5459 }, { "crossentropy": 2.539923906326294, "epoch": 0.4644436883293637, "grad_norm": 0.03482067957520485, "grad_norm_var": 1.7103214058464614e-06, "learning_rate": 0.002373854051711895, "loss": 2.5399, "step": 5460 }, { "crossentropy": 2.725311517715454, "epoch": 0.4645287512759442, "grad_norm": 0.03272896632552147, "grad_norm_var": 1.7728925580758868e-06, "learning_rate": 0.0023721493058225357, "loss": 2.7253, "step": 5461 }, { "crossentropy": 2.653674840927124, "epoch": 0.4646138142225247, "grad_norm": 0.03484184294939041, "grad_norm_var": 1.286925259365764e-06, "learning_rate": 0.0023704449818902763, "loss": 2.6537, "step": 5462 }, { "crossentropy": 2.7005343437194824, "epoch": 0.46469887716910513, "grad_norm": 0.03682282194495201, "grad_norm_var": 1.6291434719796493e-06, "learning_rate": 0.0023687410801887855, "loss": 2.7005, "step": 5463 }, { "crossentropy": 2.6361770629882812, "epoch": 0.46478394011568563, "grad_norm": 0.03278438374400139, "grad_norm_var": 1.8399084310018998e-06, "learning_rate": 0.0023670376009916594, "loss": 2.6362, "step": 5464 }, { "crossentropy": 2.6700947284698486, "epoch": 0.4648690030622661, "grad_norm": 0.036531511694192886, "grad_norm_var": 1.9306256554975363e-06, "learning_rate": 0.0023653345445724273, "loss": 2.6701, "step": 5465 }, { "crossentropy": 2.6900107860565186, "epoch": 0.46495406600884653, "grad_norm": 0.034009940922260284, "grad_norm_var": 1.8856711091343182e-06, "learning_rate": 0.0023636319112045495, "loss": 2.69, "step": 5466 }, { "crossentropy": 2.7085747718811035, "epoch": 0.46503912895542704, "grad_norm": 0.03584010526537895, "grad_norm_var": 1.9565091823270376e-06, "learning_rate": 0.0023619297011614232, "loss": 2.7086, "step": 5467 }, { "crossentropy": 2.5582075119018555, "epoch": 0.4651241919020075, "grad_norm": 0.03350837901234627, "grad_norm_var": 1.8695580604556616e-06, "learning_rate": 0.002360227914716373, "loss": 2.5582, "step": 5468 }, { "crossentropy": 2.70432448387146, "epoch": 0.46520925484858794, "grad_norm": 0.03826862573623657, "grad_norm_var": 2.7429052428119825e-06, "learning_rate": 0.0023585265521426564, "loss": 2.7043, "step": 5469 }, { "crossentropy": 2.744662046432495, "epoch": 0.46529431779516844, "grad_norm": 0.031884413212537766, "grad_norm_var": 3.0791031193538415e-06, "learning_rate": 0.002356825613713463, "loss": 2.7447, "step": 5470 }, { "crossentropy": 2.537182331085205, "epoch": 0.4653793807417489, "grad_norm": 0.03818539157509804, "grad_norm_var": 3.6298973370979883e-06, "learning_rate": 0.0023551250997019147, "loss": 2.5372, "step": 5471 }, { "crossentropy": 2.613664388656616, "epoch": 0.46546444368832934, "grad_norm": 0.03373032063245773, "grad_norm_var": 3.6996063688538028e-06, "learning_rate": 0.002353425010381063, "loss": 2.6137, "step": 5472 }, { "crossentropy": 2.796649217605591, "epoch": 0.46554950663490985, "grad_norm": 0.035984378308057785, "grad_norm_var": 3.703776389702741e-06, "learning_rate": 0.0023517253460238973, "loss": 2.7966, "step": 5473 }, { "crossentropy": 2.556440830230713, "epoch": 0.4656345695814903, "grad_norm": 0.033665504306554794, "grad_norm_var": 3.7948750518096776e-06, "learning_rate": 0.002350026106903333, "loss": 2.5564, "step": 5474 }, { "crossentropy": 2.767420530319214, "epoch": 0.46571963252807075, "grad_norm": 0.03376416116952896, "grad_norm_var": 3.713186247574808e-06, "learning_rate": 0.0023483272932922185, "loss": 2.7674, "step": 5475 }, { "crossentropy": 2.684784173965454, "epoch": 0.46580469547465125, "grad_norm": 0.0347391702234745, "grad_norm_var": 3.7137648779175225e-06, "learning_rate": 0.0023466289054633344, "loss": 2.6848, "step": 5476 }, { "crossentropy": 2.723344326019287, "epoch": 0.4658897584212317, "grad_norm": 0.0346003919839859, "grad_norm_var": 3.4082427341743105e-06, "learning_rate": 0.002344930943689393, "loss": 2.7233, "step": 5477 }, { "crossentropy": 2.589855194091797, "epoch": 0.4659748213678122, "grad_norm": 0.032995134592056274, "grad_norm_var": 3.6474248329328627e-06, "learning_rate": 0.0023432334082430363, "loss": 2.5899, "step": 5478 }, { "crossentropy": 2.595926523208618, "epoch": 0.46605988431439266, "grad_norm": 0.03392161801457405, "grad_norm_var": 3.403445977339982e-06, "learning_rate": 0.0023415362993968424, "loss": 2.5959, "step": 5479 }, { "crossentropy": 2.7189579010009766, "epoch": 0.4661449472609731, "grad_norm": 0.03502815589308739, "grad_norm_var": 3.159716239414569e-06, "learning_rate": 0.0023398396174233176, "loss": 2.719, "step": 5480 }, { "crossentropy": 2.6906661987304688, "epoch": 0.4662300102075536, "grad_norm": 0.03427089750766754, "grad_norm_var": 2.954520701016425e-06, "learning_rate": 0.0023381433625948972, "loss": 2.6907, "step": 5481 }, { "crossentropy": 2.581169843673706, "epoch": 0.46631507315413406, "grad_norm": 0.033599939197301865, "grad_norm_var": 3.0000054263787843e-06, "learning_rate": 0.0023364475351839572, "loss": 2.5812, "step": 5482 }, { "crossentropy": 2.717519998550415, "epoch": 0.4664001361007145, "grad_norm": 0.03269753232598305, "grad_norm_var": 3.1077485246722157e-06, "learning_rate": 0.0023347521354627927, "loss": 2.7175, "step": 5483 }, { "crossentropy": 2.605254650115967, "epoch": 0.466485199047295, "grad_norm": 0.04556997865438461, "grad_norm_var": 1.0721839132105982e-05, "learning_rate": 0.0023330571637036356, "loss": 2.6053, "step": 5484 }, { "crossentropy": 2.6658897399902344, "epoch": 0.46657026199387547, "grad_norm": 0.04238659888505936, "grad_norm_var": 1.3476666981839836e-05, "learning_rate": 0.002331362620178654, "loss": 2.6659, "step": 5485 }, { "crossentropy": 2.65944242477417, "epoch": 0.4666553249404559, "grad_norm": 0.03333776816725731, "grad_norm_var": 1.291987685316607e-05, "learning_rate": 0.0023296685051599404, "loss": 2.6594, "step": 5486 }, { "crossentropy": 2.599073648452759, "epoch": 0.4667403878870364, "grad_norm": 0.035500217229127884, "grad_norm_var": 1.2419751581169553e-05, "learning_rate": 0.002327974818919521, "loss": 2.5991, "step": 5487 }, { "crossentropy": 2.6597423553466797, "epoch": 0.4668254508336169, "grad_norm": 0.034875787794589996, "grad_norm_var": 1.2252555058520943e-05, "learning_rate": 0.0023262815617293513, "loss": 2.6597, "step": 5488 }, { "crossentropy": 2.6942896842956543, "epoch": 0.4669105137801973, "grad_norm": 0.03585681691765785, "grad_norm_var": 1.2244203924426968e-05, "learning_rate": 0.0023245887338613253, "loss": 2.6943, "step": 5489 }, { "crossentropy": 2.6504874229431152, "epoch": 0.46699557672677783, "grad_norm": 0.032963939011096954, "grad_norm_var": 1.2439609407588481e-05, "learning_rate": 0.0023228963355872547, "loss": 2.6505, "step": 5490 }, { "crossentropy": 2.647839307785034, "epoch": 0.4670806396733583, "grad_norm": 0.03264807537198067, "grad_norm_var": 1.2758179099916535e-05, "learning_rate": 0.0023212043671788953, "loss": 2.6478, "step": 5491 }, { "crossentropy": 2.6498911380767822, "epoch": 0.4671657026199387, "grad_norm": 0.034433405846357346, "grad_norm_var": 1.278737585492188e-05, "learning_rate": 0.0023195128289079264, "loss": 2.6499, "step": 5492 }, { "crossentropy": 2.6186976432800293, "epoch": 0.46725076556651923, "grad_norm": 0.03740677610039711, "grad_norm_var": 1.3020490378527067e-05, "learning_rate": 0.0023178217210459606, "loss": 2.6187, "step": 5493 }, { "crossentropy": 2.6736531257629395, "epoch": 0.4673358285130997, "grad_norm": 0.03666006028652191, "grad_norm_var": 1.2651446247278348e-05, "learning_rate": 0.00231613104386454, "loss": 2.6737, "step": 5494 }, { "crossentropy": 2.747009754180908, "epoch": 0.4674208914596802, "grad_norm": 0.034478966146707535, "grad_norm_var": 1.2538901087680992e-05, "learning_rate": 0.002314440797635141, "loss": 2.747, "step": 5495 }, { "crossentropy": 2.578782320022583, "epoch": 0.46750595440626064, "grad_norm": 0.03512928634881973, "grad_norm_var": 1.2530047164632275e-05, "learning_rate": 0.00231275098262917, "loss": 2.5788, "step": 5496 }, { "crossentropy": 2.6452622413635254, "epoch": 0.4675910173528411, "grad_norm": 0.03711400926113129, "grad_norm_var": 1.2478910304679288e-05, "learning_rate": 0.002311061599117956, "loss": 2.6453, "step": 5497 }, { "crossentropy": 2.5564498901367188, "epoch": 0.4676760802994216, "grad_norm": 0.03331843391060829, "grad_norm_var": 1.2570801653245291e-05, "learning_rate": 0.0023093726473727705, "loss": 2.5564, "step": 5498 }, { "crossentropy": 2.6327505111694336, "epoch": 0.46776114324600204, "grad_norm": 0.03579020872712135, "grad_norm_var": 1.1848608570228827e-05, "learning_rate": 0.0023076841276648092, "loss": 2.6328, "step": 5499 }, { "crossentropy": 2.6769258975982666, "epoch": 0.4678462061925825, "grad_norm": 0.03259780630469322, "grad_norm_var": 5.972426678544732e-06, "learning_rate": 0.002305996040265198, "loss": 2.6769, "step": 5500 }, { "crossentropy": 2.644836187362671, "epoch": 0.467931269139163, "grad_norm": 0.034586019814014435, "grad_norm_var": 2.3852600042654585e-06, "learning_rate": 0.0023043083854449985, "loss": 2.6448, "step": 5501 }, { "crossentropy": 2.633335590362549, "epoch": 0.46801633208574345, "grad_norm": 0.03273644670844078, "grad_norm_var": 2.524582164951476e-06, "learning_rate": 0.002302621163475198, "loss": 2.6333, "step": 5502 }, { "crossentropy": 2.6525893211364746, "epoch": 0.4681013950323239, "grad_norm": 0.034604962915182114, "grad_norm_var": 2.4858414331102124e-06, "learning_rate": 0.002300934374626715, "loss": 2.6526, "step": 5503 }, { "crossentropy": 2.720801830291748, "epoch": 0.4681864579789044, "grad_norm": 0.034267861396074295, "grad_norm_var": 2.4946961077636103e-06, "learning_rate": 0.0022992480191704003, "loss": 2.7208, "step": 5504 }, { "crossentropy": 2.6666812896728516, "epoch": 0.46827152092548485, "grad_norm": 0.03531841188669205, "grad_norm_var": 2.4270457045873216e-06, "learning_rate": 0.0022975620973770326, "loss": 2.6667, "step": 5505 }, { "crossentropy": 2.6652188301086426, "epoch": 0.4683565838720653, "grad_norm": 0.03857951611280441, "grad_norm_var": 3.151697668975449e-06, "learning_rate": 0.002295876609517322, "loss": 2.6652, "step": 5506 }, { "crossentropy": 2.674591302871704, "epoch": 0.4684416468186458, "grad_norm": 0.03256865590810776, "grad_norm_var": 3.1767787904520815e-06, "learning_rate": 0.002294191555861912, "loss": 2.6746, "step": 5507 }, { "crossentropy": 2.6719396114349365, "epoch": 0.46852670976522626, "grad_norm": 0.03352578356862068, "grad_norm_var": 3.293737275406455e-06, "learning_rate": 0.0022925069366813715, "loss": 2.6719, "step": 5508 }, { "crossentropy": 2.6962530612945557, "epoch": 0.46861177271180676, "grad_norm": 0.033447328954935074, "grad_norm_var": 2.959515108146883e-06, "learning_rate": 0.002290822752246203, "loss": 2.6963, "step": 5509 }, { "crossentropy": 2.566923141479492, "epoch": 0.4686968356583872, "grad_norm": 0.03380366414785385, "grad_norm_var": 2.711621929142073e-06, "learning_rate": 0.002289139002826835, "loss": 2.5669, "step": 5510 }, { "crossentropy": 2.598792552947998, "epoch": 0.46878189860496766, "grad_norm": 0.0325484499335289, "grad_norm_var": 2.9478330615867464e-06, "learning_rate": 0.0022874556886936354, "loss": 2.5988, "step": 5511 }, { "crossentropy": 2.7214431762695312, "epoch": 0.46886696155154817, "grad_norm": 0.03319678455591202, "grad_norm_var": 2.985871585665471e-06, "learning_rate": 0.00228577281011689, "loss": 2.7214, "step": 5512 }, { "crossentropy": 2.725839614868164, "epoch": 0.4689520244981286, "grad_norm": 0.03432571887969971, "grad_norm_var": 2.4071241307469885e-06, "learning_rate": 0.0022840903673668205, "loss": 2.7258, "step": 5513 }, { "crossentropy": 2.5987706184387207, "epoch": 0.46903708744470907, "grad_norm": 0.035716019570827484, "grad_norm_var": 2.5242218407101405e-06, "learning_rate": 0.0022824083607135837, "loss": 2.5988, "step": 5514 }, { "crossentropy": 2.661034345626831, "epoch": 0.46912215039128957, "grad_norm": 0.03317400813102722, "grad_norm_var": 2.406314132209301e-06, "learning_rate": 0.0022807267904272578, "loss": 2.661, "step": 5515 }, { "crossentropy": 2.788386821746826, "epoch": 0.46920721333787, "grad_norm": 0.03654943406581879, "grad_norm_var": 2.610635360432839e-06, "learning_rate": 0.0022790456567778534, "loss": 2.7884, "step": 5516 }, { "crossentropy": 2.6855006217956543, "epoch": 0.46929227628445047, "grad_norm": 0.03346623107790947, "grad_norm_var": 2.647692576164771e-06, "learning_rate": 0.0022773649600353156, "loss": 2.6855, "step": 5517 }, { "crossentropy": 2.571274757385254, "epoch": 0.469377339231031, "grad_norm": 0.03732055798172951, "grad_norm_var": 3.0424878548253784e-06, "learning_rate": 0.002275684700469517, "loss": 2.5713, "step": 5518 }, { "crossentropy": 2.59114408493042, "epoch": 0.4694624021776114, "grad_norm": 0.03321969509124756, "grad_norm_var": 3.1478085106177605e-06, "learning_rate": 0.0022740048783502515, "loss": 2.5911, "step": 5519 }, { "crossentropy": 2.6626789569854736, "epoch": 0.4695474651241919, "grad_norm": 0.03599447011947632, "grad_norm_var": 3.294674229921875e-06, "learning_rate": 0.002272325493947257, "loss": 2.6627, "step": 5520 }, { "crossentropy": 2.6928186416625977, "epoch": 0.4696325280707724, "grad_norm": 0.03507087752223015, "grad_norm_var": 3.2730493133347727e-06, "learning_rate": 0.0022706465475301925, "loss": 2.6928, "step": 5521 }, { "crossentropy": 2.6761474609375, "epoch": 0.46971759101735283, "grad_norm": 0.0323701873421669, "grad_norm_var": 2.3315548501522006e-06, "learning_rate": 0.0022689680393686456, "loss": 2.6761, "step": 5522 }, { "crossentropy": 2.7055628299713135, "epoch": 0.46980265396393334, "grad_norm": 0.0332007110118866, "grad_norm_var": 2.2237949378093107e-06, "learning_rate": 0.002267289969732141, "loss": 2.7056, "step": 5523 }, { "crossentropy": 2.7095491886138916, "epoch": 0.4698877169105138, "grad_norm": 0.03511323034763336, "grad_norm_var": 2.24216256994846e-06, "learning_rate": 0.002265612338890125, "loss": 2.7095, "step": 5524 }, { "crossentropy": 2.749526262283325, "epoch": 0.46997277985709424, "grad_norm": 0.03463592007756233, "grad_norm_var": 2.1981285173220307e-06, "learning_rate": 0.0022639351471119783, "loss": 2.7495, "step": 5525 }, { "crossentropy": 2.7435128688812256, "epoch": 0.47005784280367474, "grad_norm": 0.035568054765462875, "grad_norm_var": 2.2626110563569423e-06, "learning_rate": 0.0022622583946670096, "loss": 2.7435, "step": 5526 }, { "crossentropy": 2.6420488357543945, "epoch": 0.4701429057502552, "grad_norm": 0.03482896089553833, "grad_norm_var": 2.0043181171625414e-06, "learning_rate": 0.002260582081824456, "loss": 2.642, "step": 5527 }, { "crossentropy": 2.642834424972534, "epoch": 0.47022796869683564, "grad_norm": 0.034661516547203064, "grad_norm_var": 1.8625220684085134e-06, "learning_rate": 0.0022589062088534833, "loss": 2.6428, "step": 5528 }, { "crossentropy": 2.676668405532837, "epoch": 0.47031303164341615, "grad_norm": 0.032315582036972046, "grad_norm_var": 2.2156380782659483e-06, "learning_rate": 0.002257230776023193, "loss": 2.6767, "step": 5529 }, { "crossentropy": 2.5592942237854004, "epoch": 0.4703980945899966, "grad_norm": 0.03337420895695686, "grad_norm_var": 2.202225807904773e-06, "learning_rate": 0.0022555557836026092, "loss": 2.5593, "step": 5530 }, { "crossentropy": 2.6493747234344482, "epoch": 0.47048315753657705, "grad_norm": 0.032518040388822556, "grad_norm_var": 2.3388817852696212e-06, "learning_rate": 0.0022538812318606866, "loss": 2.6494, "step": 5531 }, { "crossentropy": 2.6927809715270996, "epoch": 0.47056822048315755, "grad_norm": 0.034755025058984756, "grad_norm_var": 2.0229880532109113e-06, "learning_rate": 0.002252207121066311, "loss": 2.6928, "step": 5532 }, { "crossentropy": 2.6320621967315674, "epoch": 0.470653283429738, "grad_norm": 0.03591206669807434, "grad_norm_var": 2.1328508145535344e-06, "learning_rate": 0.002250533451488296, "loss": 2.6321, "step": 5533 }, { "crossentropy": 2.6600372791290283, "epoch": 0.47073834637631845, "grad_norm": 0.036530427634716034, "grad_norm_var": 1.8672100107401204e-06, "learning_rate": 0.0022488602233953843, "loss": 2.66, "step": 5534 }, { "crossentropy": 2.633601427078247, "epoch": 0.47082340932289896, "grad_norm": 0.03428904339671135, "grad_norm_var": 1.7733413622835736e-06, "learning_rate": 0.0022471874370562465, "loss": 2.6336, "step": 5535 }, { "crossentropy": 2.7335140705108643, "epoch": 0.4709084722694794, "grad_norm": 0.035402920097112656, "grad_norm_var": 1.6730905225246137e-06, "learning_rate": 0.002245515092739488, "loss": 2.7335, "step": 5536 }, { "crossentropy": 2.6754941940307617, "epoch": 0.4709935352160599, "grad_norm": 0.03420873358845711, "grad_norm_var": 1.6434817117059556e-06, "learning_rate": 0.0022438431907136364, "loss": 2.6755, "step": 5537 }, { "crossentropy": 2.6609652042388916, "epoch": 0.47107859816264036, "grad_norm": 0.034879185259342194, "grad_norm_var": 1.372841396377056e-06, "learning_rate": 0.0022421717312471503, "loss": 2.661, "step": 5538 }, { "crossentropy": 2.814230442047119, "epoch": 0.4711636611092208, "grad_norm": 0.03326704353094101, "grad_norm_var": 1.3615180171046294e-06, "learning_rate": 0.0022405007146084205, "loss": 2.8142, "step": 5539 }, { "crossentropy": 2.678745985031128, "epoch": 0.4712487240558013, "grad_norm": 0.03449020907282829, "grad_norm_var": 1.3361866640368772e-06, "learning_rate": 0.002238830141065765, "loss": 2.6787, "step": 5540 }, { "crossentropy": 2.64699125289917, "epoch": 0.47133378700238177, "grad_norm": 0.05912176892161369, "grad_norm_var": 3.932631773738513e-05, "learning_rate": 0.0022371600108874235, "loss": 2.647, "step": 5541 }, { "crossentropy": 2.7037220001220703, "epoch": 0.4714188499489622, "grad_norm": 0.038076043128967285, "grad_norm_var": 3.957243503984742e-05, "learning_rate": 0.002235490324341577, "loss": 2.7037, "step": 5542 }, { "crossentropy": 2.5920119285583496, "epoch": 0.4715039128955427, "grad_norm": 0.03496236354112625, "grad_norm_var": 3.9549793408068476e-05, "learning_rate": 0.0022338210816963268, "loss": 2.592, "step": 5543 }, { "crossentropy": 2.7160162925720215, "epoch": 0.47158897584212317, "grad_norm": 0.034715473651885986, "grad_norm_var": 3.953910305111881e-05, "learning_rate": 0.002232152283219703, "loss": 2.716, "step": 5544 }, { "crossentropy": 2.678893804550171, "epoch": 0.4716740387887036, "grad_norm": 0.03313035890460014, "grad_norm_var": 3.9161195988375994e-05, "learning_rate": 0.002230483929179671, "loss": 2.6789, "step": 5545 }, { "crossentropy": 2.762101173400879, "epoch": 0.4717591017352841, "grad_norm": 0.03449872136116028, "grad_norm_var": 3.881248726426423e-05, "learning_rate": 0.0022288160198441183, "loss": 2.7621, "step": 5546 }, { "crossentropy": 2.708714246749878, "epoch": 0.4718441646818646, "grad_norm": 0.036052316427230835, "grad_norm_var": 3.7812236848932106e-05, "learning_rate": 0.002227148555480863, "loss": 2.7087, "step": 5547 }, { "crossentropy": 2.668454647064209, "epoch": 0.471929227628445, "grad_norm": 0.03613511100411415, "grad_norm_var": 3.7606826527435536e-05, "learning_rate": 0.0022254815363576515, "loss": 2.6685, "step": 5548 }, { "crossentropy": 2.6861836910247803, "epoch": 0.47201429057502553, "grad_norm": 0.03406037017703056, "grad_norm_var": 3.799207882382953e-05, "learning_rate": 0.0022238149627421596, "loss": 2.6862, "step": 5549 }, { "crossentropy": 2.6465065479278564, "epoch": 0.472099353521606, "grad_norm": 0.03443225100636482, "grad_norm_var": 3.825556735970222e-05, "learning_rate": 0.00222214883490199, "loss": 2.6465, "step": 5550 }, { "crossentropy": 2.746853828430176, "epoch": 0.4721844164681865, "grad_norm": 0.0348297543823719, "grad_norm_var": 3.812470680151266e-05, "learning_rate": 0.0022204831531046744, "loss": 2.7469, "step": 5551 }, { "crossentropy": 2.6596121788024902, "epoch": 0.47226947941476694, "grad_norm": 0.033858396112918854, "grad_norm_var": 3.847737095383034e-05, "learning_rate": 0.0022188179176176764, "loss": 2.6596, "step": 5552 }, { "crossentropy": 2.678321123123169, "epoch": 0.4723545423613474, "grad_norm": 0.03203689306974411, "grad_norm_var": 3.937628066011569e-05, "learning_rate": 0.0022171531287083824, "loss": 2.6783, "step": 5553 }, { "crossentropy": 2.742800712585449, "epoch": 0.4724396053079279, "grad_norm": 0.03450361639261246, "grad_norm_var": 3.944919128985123e-05, "learning_rate": 0.0022154887866441104, "loss": 2.7428, "step": 5554 }, { "crossentropy": 2.704702138900757, "epoch": 0.47252466825450834, "grad_norm": 0.0322578065097332, "grad_norm_var": 3.989886754129715e-05, "learning_rate": 0.0022138248916921057, "loss": 2.7047, "step": 5555 }, { "crossentropy": 2.76897931098938, "epoch": 0.4726097312010888, "grad_norm": 0.035220567137002945, "grad_norm_var": 3.9778112433400126e-05, "learning_rate": 0.002212161444119541, "loss": 2.769, "step": 5556 }, { "crossentropy": 2.674036979675293, "epoch": 0.4726947941476693, "grad_norm": 0.0353982150554657, "grad_norm_var": 2.1901505156073895e-06, "learning_rate": 0.0022104984441935166, "loss": 2.674, "step": 5557 }, { "crossentropy": 2.8207905292510986, "epoch": 0.47277985709424974, "grad_norm": 0.03415866196155548, "grad_norm_var": 1.3522202862792723e-06, "learning_rate": 0.002208835892181067, "loss": 2.8208, "step": 5558 }, { "crossentropy": 2.6925315856933594, "epoch": 0.4728649200408302, "grad_norm": 0.03519264608621597, "grad_norm_var": 1.373087836227469e-06, "learning_rate": 0.0022071737883491466, "loss": 2.6925, "step": 5559 }, { "crossentropy": 2.6791906356811523, "epoch": 0.4729499829874107, "grad_norm": 0.03771933168172836, "grad_norm_var": 2.0613556790592564e-06, "learning_rate": 0.0022055121329646417, "loss": 2.6792, "step": 5560 }, { "crossentropy": 2.6001009941101074, "epoch": 0.47303504593399115, "grad_norm": 0.03182293474674225, "grad_norm_var": 2.423130359135069e-06, "learning_rate": 0.0022038509262943696, "loss": 2.6001, "step": 5561 }, { "crossentropy": 2.593454360961914, "epoch": 0.4731201088805716, "grad_norm": 0.03350607678294182, "grad_norm_var": 2.4863526009878564e-06, "learning_rate": 0.0022021901686050684, "loss": 2.5935, "step": 5562 }, { "crossentropy": 2.6446566581726074, "epoch": 0.4732051718271521, "grad_norm": 0.03310198336839676, "grad_norm_var": 2.3996959995830603e-06, "learning_rate": 0.0022005298601634068, "loss": 2.6447, "step": 5563 }, { "crossentropy": 2.6989052295684814, "epoch": 0.47329023477373255, "grad_norm": 0.0346786230802536, "grad_norm_var": 2.1690429166378494e-06, "learning_rate": 0.0021988700012359865, "loss": 2.6989, "step": 5564 }, { "crossentropy": 2.5557239055633545, "epoch": 0.473375297720313, "grad_norm": 0.03220498561859131, "grad_norm_var": 2.4122156299654615e-06, "learning_rate": 0.0021972105920893313, "loss": 2.5557, "step": 5565 }, { "crossentropy": 2.7001781463623047, "epoch": 0.4734603606668935, "grad_norm": 0.03264088183641434, "grad_norm_var": 2.5233103099484474e-06, "learning_rate": 0.0021955516329898927, "loss": 2.7002, "step": 5566 }, { "crossentropy": 2.7065038681030273, "epoch": 0.47354542361347396, "grad_norm": 0.03182070702314377, "grad_norm_var": 2.734524343625475e-06, "learning_rate": 0.0021938931242040556, "loss": 2.7065, "step": 5567 }, { "crossentropy": 2.578599452972412, "epoch": 0.47363048656005446, "grad_norm": 0.033755138516426086, "grad_norm_var": 2.733803623354049e-06, "learning_rate": 0.002192235065998126, "loss": 2.5786, "step": 5568 }, { "crossentropy": 2.7347865104675293, "epoch": 0.4737155495066349, "grad_norm": 0.035261861979961395, "grad_norm_var": 2.6466888033260227e-06, "learning_rate": 0.0021905774586383413, "loss": 2.7348, "step": 5569 }, { "crossentropy": 2.6369264125823975, "epoch": 0.47380061245321536, "grad_norm": 0.03314691036939621, "grad_norm_var": 2.662081432140637e-06, "learning_rate": 0.0021889203023908654, "loss": 2.6369, "step": 5570 }, { "crossentropy": 2.701328754425049, "epoch": 0.47388567539979587, "grad_norm": 0.03487461060285568, "grad_norm_var": 2.528266861103375e-06, "learning_rate": 0.002187263597521789, "loss": 2.7013, "step": 5571 }, { "crossentropy": 2.596003770828247, "epoch": 0.4739707383463763, "grad_norm": 0.03522032871842384, "grad_norm_var": 2.528229065499691e-06, "learning_rate": 0.002185607344297132, "loss": 2.596, "step": 5572 }, { "crossentropy": 2.6687135696411133, "epoch": 0.47405580129295677, "grad_norm": 0.033395688980817795, "grad_norm_var": 2.413941599935648e-06, "learning_rate": 0.0021839515429828388, "loss": 2.6687, "step": 5573 }, { "crossentropy": 2.645583152770996, "epoch": 0.4741408642395373, "grad_norm": 0.03371230885386467, "grad_norm_var": 2.4113766625882366e-06, "learning_rate": 0.0021822961938447872, "loss": 2.6456, "step": 5574 }, { "crossentropy": 2.676872968673706, "epoch": 0.4742259271861177, "grad_norm": 0.03698529303073883, "grad_norm_var": 2.9263469326181365e-06, "learning_rate": 0.002180641297148776, "loss": 2.6769, "step": 5575 }, { "crossentropy": 2.6995534896850586, "epoch": 0.4743109901326982, "grad_norm": 0.03577326238155365, "grad_norm_var": 2.1954986289908495e-06, "learning_rate": 0.0021789868531605352, "loss": 2.6996, "step": 5576 }, { "crossentropy": 2.664966583251953, "epoch": 0.4743960530792787, "grad_norm": 0.034119799733161926, "grad_norm_var": 1.898664190794902e-06, "learning_rate": 0.0021773328621457195, "loss": 2.665, "step": 5577 }, { "crossentropy": 2.595700979232788, "epoch": 0.47448111602585913, "grad_norm": 0.034677423536777496, "grad_norm_var": 1.905339581776723e-06, "learning_rate": 0.002175679324369913, "loss": 2.5957, "step": 5578 }, { "crossentropy": 2.7106189727783203, "epoch": 0.4745661789724396, "grad_norm": 0.035036373883485794, "grad_norm_var": 1.8855097436515368e-06, "learning_rate": 0.0021740262400986245, "loss": 2.7106, "step": 5579 }, { "crossentropy": 2.6675522327423096, "epoch": 0.4746512419190201, "grad_norm": 0.03197895735502243, "grad_norm_var": 2.1710831242506557e-06, "learning_rate": 0.0021723736095972945, "loss": 2.6676, "step": 5580 }, { "crossentropy": 2.6495747566223145, "epoch": 0.47473630486560053, "grad_norm": 0.033554673194885254, "grad_norm_var": 1.9551094283899727e-06, "learning_rate": 0.0021707214331312874, "loss": 2.6496, "step": 5581 }, { "crossentropy": 2.619251012802124, "epoch": 0.47482136781218104, "grad_norm": 0.03530590608716011, "grad_norm_var": 1.8726618233525296e-06, "learning_rate": 0.002169069710965892, "loss": 2.6193, "step": 5582 }, { "crossentropy": 2.642845869064331, "epoch": 0.4749064307587615, "grad_norm": 0.035670481622219086, "grad_norm_var": 1.5321292930700962e-06, "learning_rate": 0.0021674184433663336, "loss": 2.6428, "step": 5583 }, { "crossentropy": 2.6134185791015625, "epoch": 0.47499149370534194, "grad_norm": 0.03374249115586281, "grad_norm_var": 1.533444793290715e-06, "learning_rate": 0.002165767630597752, "loss": 2.6134, "step": 5584 }, { "crossentropy": 2.6212973594665527, "epoch": 0.47507655665192244, "grad_norm": 0.03433844819664955, "grad_norm_var": 1.4964480906431408e-06, "learning_rate": 0.0021641172729252207, "loss": 2.6213, "step": 5585 }, { "crossentropy": 2.6199729442596436, "epoch": 0.4751616195985029, "grad_norm": 0.03475327789783478, "grad_norm_var": 1.3741682441678962e-06, "learning_rate": 0.002162467370613743, "loss": 2.62, "step": 5586 }, { "crossentropy": 2.69551682472229, "epoch": 0.47524668254508334, "grad_norm": 0.033025819808244705, "grad_norm_var": 1.5130045222556954e-06, "learning_rate": 0.002160817923928244, "loss": 2.6955, "step": 5587 }, { "crossentropy": 2.61513090133667, "epoch": 0.47533174549166385, "grad_norm": 0.0356777124106884, "grad_norm_var": 1.572712543385091e-06, "learning_rate": 0.0021591689331335756, "loss": 2.6151, "step": 5588 }, { "crossentropy": 2.6079061031341553, "epoch": 0.4754168084382443, "grad_norm": 0.034262094646692276, "grad_norm_var": 1.4938779168247504e-06, "learning_rate": 0.0021575203984945212, "loss": 2.6079, "step": 5589 }, { "crossentropy": 2.5957536697387695, "epoch": 0.47550187138482475, "grad_norm": 0.03881896659731865, "grad_norm_var": 2.5612779418091497e-06, "learning_rate": 0.002155872320275789, "loss": 2.5958, "step": 5590 }, { "crossentropy": 2.61834979057312, "epoch": 0.47558693433140525, "grad_norm": 0.033150751143693924, "grad_norm_var": 2.3924097403315557e-06, "learning_rate": 0.0021542246987420067, "loss": 2.6183, "step": 5591 }, { "crossentropy": 2.571821451187134, "epoch": 0.4756719972779857, "grad_norm": 0.0360654816031456, "grad_norm_var": 2.442762527805377e-06, "learning_rate": 0.0021525775341577403, "loss": 2.5718, "step": 5592 }, { "crossentropy": 2.658342123031616, "epoch": 0.47575706022456615, "grad_norm": 0.03460698574781418, "grad_norm_var": 2.4240547146276227e-06, "learning_rate": 0.0021509308267874754, "loss": 2.6583, "step": 5593 }, { "crossentropy": 2.6727662086486816, "epoch": 0.47584212317114666, "grad_norm": 0.03443542867898941, "grad_norm_var": 2.4273660708908883e-06, "learning_rate": 0.002149284576895626, "loss": 2.6728, "step": 5594 }, { "crossentropy": 2.661036729812622, "epoch": 0.4759271861177271, "grad_norm": 0.03446703031659126, "grad_norm_var": 2.4184081361413565e-06, "learning_rate": 0.0021476387847465294, "loss": 2.661, "step": 5595 }, { "crossentropy": 2.5759429931640625, "epoch": 0.4760122490643076, "grad_norm": 0.03808107599616051, "grad_norm_var": 2.6001852175392785e-06, "learning_rate": 0.0021459934506044575, "loss": 2.5759, "step": 5596 }, { "crossentropy": 2.747062921524048, "epoch": 0.47609731201088806, "grad_norm": 0.034314144402742386, "grad_norm_var": 2.4901516370439666e-06, "learning_rate": 0.0021443485747336013, "loss": 2.7471, "step": 5597 }, { "crossentropy": 2.678269863128662, "epoch": 0.4761823749574685, "grad_norm": 0.03456559777259827, "grad_norm_var": 2.498627619889245e-06, "learning_rate": 0.00214270415739808, "loss": 2.6783, "step": 5598 }, { "crossentropy": 2.6707637310028076, "epoch": 0.476267437904049, "grad_norm": 0.03406408429145813, "grad_norm_var": 2.5159775413339155e-06, "learning_rate": 0.0021410601988619393, "loss": 2.6708, "step": 5599 }, { "crossentropy": 2.6609013080596924, "epoch": 0.47635250085062947, "grad_norm": 0.03586582466959953, "grad_norm_var": 2.4705995893640567e-06, "learning_rate": 0.002139416699389153, "loss": 2.6609, "step": 5600 }, { "crossentropy": 2.733996629714966, "epoch": 0.4764375637972099, "grad_norm": 0.037449389696121216, "grad_norm_var": 2.788291740638861e-06, "learning_rate": 0.002137773659243617, "loss": 2.734, "step": 5601 }, { "crossentropy": 2.6653590202331543, "epoch": 0.4765226267437904, "grad_norm": 0.03438631445169449, "grad_norm_var": 2.819799971831976e-06, "learning_rate": 0.0021361310786891597, "loss": 2.6654, "step": 5602 }, { "crossentropy": 2.6937577724456787, "epoch": 0.4766076896903709, "grad_norm": 0.03474687412381172, "grad_norm_var": 2.5054827044285537e-06, "learning_rate": 0.0021344889579895305, "loss": 2.6938, "step": 5603 }, { "crossentropy": 2.6360769271850586, "epoch": 0.4766927526369513, "grad_norm": 0.03507276251912117, "grad_norm_var": 2.4986844901106824e-06, "learning_rate": 0.002132847297408406, "loss": 2.6361, "step": 5604 }, { "crossentropy": 2.635579824447632, "epoch": 0.4767778155835318, "grad_norm": 0.032877881079912186, "grad_norm_var": 2.8048367037286983e-06, "learning_rate": 0.002131206097209393, "loss": 2.6356, "step": 5605 }, { "crossentropy": 2.6521053314208984, "epoch": 0.4768628785301123, "grad_norm": 0.03412933275103569, "grad_norm_var": 1.9074510914294958e-06, "learning_rate": 0.0021295653576560163, "loss": 2.6521, "step": 5606 }, { "crossentropy": 2.720229148864746, "epoch": 0.4769479414766927, "grad_norm": 0.033872488886117935, "grad_norm_var": 1.772402484930396e-06, "learning_rate": 0.0021279250790117326, "loss": 2.7202, "step": 5607 }, { "crossentropy": 2.662998914718628, "epoch": 0.47703300442327323, "grad_norm": 0.034515950828790665, "grad_norm_var": 1.6894312086473498e-06, "learning_rate": 0.002126285261539926, "loss": 2.663, "step": 5608 }, { "crossentropy": 2.643972873687744, "epoch": 0.4771180673698537, "grad_norm": 0.0358586348593235, "grad_norm_var": 1.7483418898978434e-06, "learning_rate": 0.002124645905503902, "loss": 2.644, "step": 5609 }, { "crossentropy": 2.7381227016448975, "epoch": 0.4772031303164342, "grad_norm": 0.03443599492311478, "grad_norm_var": 1.7483054062715382e-06, "learning_rate": 0.0021230070111668935, "loss": 2.7381, "step": 5610 }, { "crossentropy": 2.6952879428863525, "epoch": 0.47728819326301464, "grad_norm": 0.035063374787569046, "grad_norm_var": 1.7345978671955027e-06, "learning_rate": 0.0021213685787920596, "loss": 2.6953, "step": 5611 }, { "crossentropy": 2.693657875061035, "epoch": 0.4773732562095951, "grad_norm": 0.035969287157058716, "grad_norm_var": 1.1334582805664259e-06, "learning_rate": 0.002119730608642489, "loss": 2.6937, "step": 5612 }, { "crossentropy": 2.5863494873046875, "epoch": 0.4774583191561756, "grad_norm": 0.031552623957395554, "grad_norm_var": 1.7979038049833104e-06, "learning_rate": 0.0021180931009811862, "loss": 2.5863, "step": 5613 }, { "crossentropy": 2.5203776359558105, "epoch": 0.47754338210275604, "grad_norm": 0.03636844828724861, "grad_norm_var": 1.9803601865726768e-06, "learning_rate": 0.0021164560560710927, "loss": 2.5204, "step": 5614 }, { "crossentropy": 2.663654327392578, "epoch": 0.4776284450493365, "grad_norm": 0.03629741817712784, "grad_norm_var": 2.083579026251371e-06, "learning_rate": 0.0021148194741750694, "loss": 2.6637, "step": 5615 }, { "crossentropy": 2.7349693775177, "epoch": 0.477713507995917, "grad_norm": 0.03408994898200035, "grad_norm_var": 2.0529225888779938e-06, "learning_rate": 0.002113183355555904, "loss": 2.735, "step": 5616 }, { "crossentropy": 2.7256553173065186, "epoch": 0.47779857094249745, "grad_norm": 0.03465531766414642, "grad_norm_var": 1.5512010994436197e-06, "learning_rate": 0.0021115477004763077, "loss": 2.7257, "step": 5617 }, { "crossentropy": 2.721715211868286, "epoch": 0.4778836338890779, "grad_norm": 0.037237320095300674, "grad_norm_var": 1.971033540994436e-06, "learning_rate": 0.002109912509198924, "loss": 2.7217, "step": 5618 }, { "crossentropy": 2.58598256111145, "epoch": 0.4779686968356584, "grad_norm": 0.03474818542599678, "grad_norm_var": 1.971024975586794e-06, "learning_rate": 0.002108277781986317, "loss": 2.586, "step": 5619 }, { "crossentropy": 2.665569305419922, "epoch": 0.47805375978223885, "grad_norm": 0.036073219031095505, "grad_norm_var": 2.070425780575924e-06, "learning_rate": 0.0021066435191009715, "loss": 2.6656, "step": 5620 }, { "crossentropy": 2.768110990524292, "epoch": 0.4781388227288193, "grad_norm": 0.03494758531451225, "grad_norm_var": 1.791420200600996e-06, "learning_rate": 0.002105009720805309, "loss": 2.7681, "step": 5621 }, { "crossentropy": 2.664430618286133, "epoch": 0.4782238856753998, "grad_norm": 0.031760670244693756, "grad_norm_var": 2.413406812609188e-06, "learning_rate": 0.002103376387361667, "loss": 2.6644, "step": 5622 }, { "crossentropy": 2.675522565841675, "epoch": 0.47830894862198026, "grad_norm": 0.033879686146974564, "grad_norm_var": 2.4124812048590195e-06, "learning_rate": 0.0021017435190323125, "loss": 2.6755, "step": 5623 }, { "crossentropy": 2.6490445137023926, "epoch": 0.47839401156856076, "grad_norm": 0.033310867846012115, "grad_norm_var": 2.555449990330333e-06, "learning_rate": 0.0021001111160794386, "loss": 2.649, "step": 5624 }, { "crossentropy": 2.6416101455688477, "epoch": 0.4784790745151412, "grad_norm": 0.03429171070456505, "grad_norm_var": 2.4805295529339854e-06, "learning_rate": 0.002098479178765161, "loss": 2.6416, "step": 5625 }, { "crossentropy": 2.629542112350464, "epoch": 0.47856413746172166, "grad_norm": 0.03818892315030098, "grad_norm_var": 3.24491414404552e-06, "learning_rate": 0.0020968477073515217, "loss": 2.6295, "step": 5626 }, { "crossentropy": 2.5923469066619873, "epoch": 0.47864920040830217, "grad_norm": 0.03355918452143669, "grad_norm_var": 3.3539932258703936e-06, "learning_rate": 0.002095216702100489, "loss": 2.5923, "step": 5627 }, { "crossentropy": 2.6757748126983643, "epoch": 0.4787342633548826, "grad_norm": 0.03519347682595253, "grad_norm_var": 3.2715011819689035e-06, "learning_rate": 0.0020935861632739535, "loss": 2.6758, "step": 5628 }, { "crossentropy": 2.625915765762329, "epoch": 0.47881932630146307, "grad_norm": 0.036211155354976654, "grad_norm_var": 2.6358594189428036e-06, "learning_rate": 0.002091956091133732, "loss": 2.6259, "step": 5629 }, { "crossentropy": 2.7433624267578125, "epoch": 0.47890438924804357, "grad_norm": 0.0344754122197628, "grad_norm_var": 2.5272577646774536e-06, "learning_rate": 0.0020903264859415713, "loss": 2.7434, "step": 5630 }, { "crossentropy": 2.5915632247924805, "epoch": 0.478989452194624, "grad_norm": 0.032656773924827576, "grad_norm_var": 2.6930958687118934e-06, "learning_rate": 0.0020886973479591364, "loss": 2.5916, "step": 5631 }, { "crossentropy": 2.7056005001068115, "epoch": 0.47907451514120447, "grad_norm": 0.03749348223209381, "grad_norm_var": 3.1380013569483335e-06, "learning_rate": 0.0020870686774480196, "loss": 2.7056, "step": 5632 }, { "crossentropy": 2.714966058731079, "epoch": 0.479159578087785, "grad_norm": 0.034822169691324234, "grad_norm_var": 3.133904447422525e-06, "learning_rate": 0.0020854404746697397, "loss": 2.715, "step": 5633 }, { "crossentropy": 2.780625343322754, "epoch": 0.4792446410343654, "grad_norm": 0.03285294026136398, "grad_norm_var": 2.9854036741439048e-06, "learning_rate": 0.002083812739885738, "loss": 2.7806, "step": 5634 }, { "crossentropy": 2.6944823265075684, "epoch": 0.4793297039809459, "grad_norm": 0.03414400666952133, "grad_norm_var": 3.0006381258393735e-06, "learning_rate": 0.002082185473357381, "loss": 2.6945, "step": 5635 }, { "crossentropy": 2.55926513671875, "epoch": 0.4794147669275264, "grad_norm": 0.03352511301636696, "grad_norm_var": 2.9114661950555356e-06, "learning_rate": 0.0020805586753459638, "loss": 2.5593, "step": 5636 }, { "crossentropy": 2.7189857959747314, "epoch": 0.47949982987410683, "grad_norm": 0.03546390309929848, "grad_norm_var": 2.9618957718611945e-06, "learning_rate": 0.0020789323461127013, "loss": 2.719, "step": 5637 }, { "crossentropy": 2.701988935470581, "epoch": 0.47958489282068734, "grad_norm": 0.033110108226537704, "grad_norm_var": 2.5847507161963026e-06, "learning_rate": 0.002077306485918735, "loss": 2.702, "step": 5638 }, { "crossentropy": 2.5656802654266357, "epoch": 0.4796699557672678, "grad_norm": 0.03216642141342163, "grad_norm_var": 2.9267386455139946e-06, "learning_rate": 0.0020756810950251304, "loss": 2.5657, "step": 5639 }, { "crossentropy": 2.593684196472168, "epoch": 0.47975501871384824, "grad_norm": 0.034641530364751816, "grad_norm_var": 2.8323525937158645e-06, "learning_rate": 0.0020740561736928807, "loss": 2.5937, "step": 5640 }, { "crossentropy": 2.6640231609344482, "epoch": 0.47984008166042874, "grad_norm": 0.03491412103176117, "grad_norm_var": 2.835148964282624e-06, "learning_rate": 0.002072431722182903, "loss": 2.664, "step": 5641 }, { "crossentropy": 2.748044967651367, "epoch": 0.4799251446070092, "grad_norm": 0.038511957973241806, "grad_norm_var": 2.9967385459673976e-06, "learning_rate": 0.0020708077407560303, "loss": 2.748, "step": 5642 }, { "crossentropy": 2.666003942489624, "epoch": 0.48001020755358964, "grad_norm": 0.03389519080519676, "grad_norm_var": 2.956768477879214e-06, "learning_rate": 0.0020691842296730345, "loss": 2.666, "step": 5643 }, { "crossentropy": 2.654064178466797, "epoch": 0.48009527050017015, "grad_norm": 0.03494594991207123, "grad_norm_var": 2.9419964548083045e-06, "learning_rate": 0.0020675611891946013, "loss": 2.6541, "step": 5644 }, { "crossentropy": 2.6423447132110596, "epoch": 0.4801803334467506, "grad_norm": 0.03201501816511154, "grad_norm_var": 3.1491030804686737e-06, "learning_rate": 0.002065938619581343, "loss": 2.6423, "step": 5645 }, { "crossentropy": 2.6786980628967285, "epoch": 0.48026539639333105, "grad_norm": 0.03632241487503052, "grad_norm_var": 3.392676808426122e-06, "learning_rate": 0.002064316521093801, "loss": 2.6787, "step": 5646 }, { "crossentropy": 2.5715768337249756, "epoch": 0.48035045933991155, "grad_norm": 0.03643329069018364, "grad_norm_var": 3.3722570839445075e-06, "learning_rate": 0.002062694893992436, "loss": 2.5716, "step": 5647 }, { "crossentropy": 2.777766704559326, "epoch": 0.480435522286492, "grad_norm": 0.03596516698598862, "grad_norm_var": 2.9497322634979933e-06, "learning_rate": 0.0020610737385376348, "loss": 2.7778, "step": 5648 }, { "crossentropy": 2.7460827827453613, "epoch": 0.48052058523307245, "grad_norm": 0.0384359136223793, "grad_norm_var": 3.869083516723435e-06, "learning_rate": 0.002059453054989708, "loss": 2.7461, "step": 5649 }, { "crossentropy": 2.7025949954986572, "epoch": 0.48060564817965296, "grad_norm": 0.03446153551340103, "grad_norm_var": 3.6059235193415414e-06, "learning_rate": 0.00205783284360889, "loss": 2.7026, "step": 5650 }, { "crossentropy": 2.6429262161254883, "epoch": 0.4806907111262334, "grad_norm": 0.0348178930580616, "grad_norm_var": 3.5632811806793803e-06, "learning_rate": 0.0020562131046553394, "loss": 2.6429, "step": 5651 }, { "crossentropy": 2.7327840328216553, "epoch": 0.48077577407281386, "grad_norm": 0.03460649400949478, "grad_norm_var": 3.42708698454295e-06, "learning_rate": 0.002054593838389143, "loss": 2.7328, "step": 5652 }, { "crossentropy": 2.680426597595215, "epoch": 0.48086083701939436, "grad_norm": 0.03369509428739548, "grad_norm_var": 3.523642222436988e-06, "learning_rate": 0.0020529750450703054, "loss": 2.6804, "step": 5653 }, { "crossentropy": 2.689897060394287, "epoch": 0.4809458999659748, "grad_norm": 0.035361893475055695, "grad_norm_var": 3.2930597893082145e-06, "learning_rate": 0.0020513567249587595, "loss": 2.6899, "step": 5654 }, { "crossentropy": 2.663792133331299, "epoch": 0.4810309629125553, "grad_norm": 0.03201594576239586, "grad_norm_var": 3.3528183237279778e-06, "learning_rate": 0.00204973887831436, "loss": 2.6638, "step": 5655 }, { "crossentropy": 2.6159791946411133, "epoch": 0.48111602585913577, "grad_norm": 0.03416690230369568, "grad_norm_var": 3.393694222432869e-06, "learning_rate": 0.002048121505396887, "loss": 2.616, "step": 5656 }, { "crossentropy": 2.6755526065826416, "epoch": 0.4812010888057162, "grad_norm": 0.0344989150762558, "grad_norm_var": 3.41117747477872e-06, "learning_rate": 0.002046504606466044, "loss": 2.6756, "step": 5657 }, { "crossentropy": 2.6809866428375244, "epoch": 0.4812861517522967, "grad_norm": 0.03575585037469864, "grad_norm_var": 2.5987930771703233e-06, "learning_rate": 0.0020448881817814557, "loss": 2.681, "step": 5658 }, { "crossentropy": 2.6598994731903076, "epoch": 0.48137121469887717, "grad_norm": 0.03514954820275307, "grad_norm_var": 2.5396006252574136e-06, "learning_rate": 0.002043272231602678, "loss": 2.6599, "step": 5659 }, { "crossentropy": 2.698667287826538, "epoch": 0.4814562776454576, "grad_norm": 0.03554218262434006, "grad_norm_var": 2.564240526505233e-06, "learning_rate": 0.002041656756189184, "loss": 2.6987, "step": 5660 }, { "crossentropy": 2.661349058151245, "epoch": 0.4815413405920381, "grad_norm": 0.035392437130212784, "grad_norm_var": 1.954246965845246e-06, "learning_rate": 0.0020400417558003705, "loss": 2.6613, "step": 5661 }, { "crossentropy": 2.7169125080108643, "epoch": 0.4816264035386186, "grad_norm": 0.03337708115577698, "grad_norm_var": 2.041449532088196e-06, "learning_rate": 0.002038427230695565, "loss": 2.7169, "step": 5662 }, { "crossentropy": 2.6818134784698486, "epoch": 0.481711466485199, "grad_norm": 0.03288649767637253, "grad_norm_var": 2.1402998602602458e-06, "learning_rate": 0.0020368131811340086, "loss": 2.6818, "step": 5663 }, { "crossentropy": 2.712531566619873, "epoch": 0.48179652943177953, "grad_norm": 0.0338078998029232, "grad_norm_var": 2.083962528551658e-06, "learning_rate": 0.0020351996073748714, "loss": 2.7125, "step": 5664 }, { "crossentropy": 2.6120145320892334, "epoch": 0.48188159237836, "grad_norm": 0.03219117969274521, "grad_norm_var": 1.3467181181974741e-06, "learning_rate": 0.0020335865096772505, "loss": 2.612, "step": 5665 }, { "crossentropy": 2.7028894424438477, "epoch": 0.48196665532494043, "grad_norm": 0.03279700130224228, "grad_norm_var": 1.4691555470111006e-06, "learning_rate": 0.0020319738883001603, "loss": 2.7029, "step": 5666 }, { "crossentropy": 2.6052942276000977, "epoch": 0.48205171827152093, "grad_norm": 0.03340775519609451, "grad_norm_var": 1.4638976243705934e-06, "learning_rate": 0.002030361743502541, "loss": 2.6053, "step": 5667 }, { "crossentropy": 2.698721408843994, "epoch": 0.4821367812181014, "grad_norm": 0.03460065275430679, "grad_norm_var": 1.4634591692095264e-06, "learning_rate": 0.0020287500755432585, "loss": 2.6987, "step": 5668 }, { "crossentropy": 2.583077907562256, "epoch": 0.4822218441646819, "grad_norm": 0.03417547792196274, "grad_norm_var": 1.455763219382113e-06, "learning_rate": 0.0020271388846811013, "loss": 2.5831, "step": 5669 }, { "crossentropy": 2.582915782928467, "epoch": 0.48230690711126234, "grad_norm": 0.03570505604147911, "grad_norm_var": 1.522213202722183e-06, "learning_rate": 0.002025528171174775, "loss": 2.5829, "step": 5670 }, { "crossentropy": 2.6831774711608887, "epoch": 0.4823919700578428, "grad_norm": 0.0373135469853878, "grad_norm_var": 1.809906427874206e-06, "learning_rate": 0.002023917935282918, "loss": 2.6832, "step": 5671 }, { "crossentropy": 2.6909608840942383, "epoch": 0.4824770330044233, "grad_norm": 0.03541715815663338, "grad_norm_var": 1.8649111373438462e-06, "learning_rate": 0.0020223081772640867, "loss": 2.691, "step": 5672 }, { "crossentropy": 2.7306597232818604, "epoch": 0.48256209595100374, "grad_norm": 0.03352281078696251, "grad_norm_var": 1.9247494297648543e-06, "learning_rate": 0.002020698897376761, "loss": 2.7307, "step": 5673 }, { "crossentropy": 2.5397000312805176, "epoch": 0.4826471588975842, "grad_norm": 0.03643666207790375, "grad_norm_var": 2.073152526343008e-06, "learning_rate": 0.002019090095879347, "loss": 2.5397, "step": 5674 }, { "crossentropy": 2.6939949989318848, "epoch": 0.4827322218441647, "grad_norm": 0.036908093839883804, "grad_norm_var": 2.4227941318750887e-06, "learning_rate": 0.0020174817730301705, "loss": 2.694, "step": 5675 }, { "crossentropy": 2.689905881881714, "epoch": 0.48281728479074515, "grad_norm": 0.03311121463775635, "grad_norm_var": 2.484354973639596e-06, "learning_rate": 0.0020158739290874824, "loss": 2.6899, "step": 5676 }, { "crossentropy": 2.738715887069702, "epoch": 0.4829023477373256, "grad_norm": 0.03406193479895592, "grad_norm_var": 2.4261488069332014e-06, "learning_rate": 0.002014266564309455, "loss": 2.7387, "step": 5677 }, { "crossentropy": 2.6161751747131348, "epoch": 0.4829874106839061, "grad_norm": 0.0329195000231266, "grad_norm_var": 2.49905133014941e-06, "learning_rate": 0.002012659678954186, "loss": 2.6162, "step": 5678 }, { "crossentropy": 2.659275770187378, "epoch": 0.48307247363048655, "grad_norm": 0.034194715321063995, "grad_norm_var": 2.354418640733667e-06, "learning_rate": 0.0020110532732796936, "loss": 2.6593, "step": 5679 }, { "crossentropy": 2.649834156036377, "epoch": 0.483157536577067, "grad_norm": 0.03625205159187317, "grad_norm_var": 2.531352335160125e-06, "learning_rate": 0.00200944734754392, "loss": 2.6498, "step": 5680 }, { "crossentropy": 2.599376678466797, "epoch": 0.4832425995236475, "grad_norm": 0.03920940309762955, "grad_norm_var": 3.3899582477019015e-06, "learning_rate": 0.0020078419020047333, "loss": 2.5994, "step": 5681 }, { "crossentropy": 2.7255823612213135, "epoch": 0.48332766247022796, "grad_norm": 0.033681441098451614, "grad_norm_var": 3.178815085900372e-06, "learning_rate": 0.0020062369369199195, "loss": 2.7256, "step": 5682 }, { "crossentropy": 2.760878086090088, "epoch": 0.48341272541680846, "grad_norm": 0.03583785891532898, "grad_norm_var": 3.0134138571336067e-06, "learning_rate": 0.0020046324525471886, "loss": 2.7609, "step": 5683 }, { "crossentropy": 2.679182767868042, "epoch": 0.4834977883633889, "grad_norm": 0.03657888248562813, "grad_norm_var": 3.097481845362217e-06, "learning_rate": 0.00200302844914418, "loss": 2.6792, "step": 5684 }, { "crossentropy": 2.6106278896331787, "epoch": 0.48358285130996936, "grad_norm": 0.03363605588674545, "grad_norm_var": 3.198910387871602e-06, "learning_rate": 0.0020014249269684455, "loss": 2.6106, "step": 5685 }, { "crossentropy": 2.651529550552368, "epoch": 0.48366791425654987, "grad_norm": 0.03569701313972473, "grad_norm_var": 3.198479141611199e-06, "learning_rate": 0.0019998218862774635, "loss": 2.6515, "step": 5686 }, { "crossentropy": 2.697744369506836, "epoch": 0.4837529772031303, "grad_norm": 0.039094191044569016, "grad_norm_var": 3.8750235650919505e-06, "learning_rate": 0.00199821932732864, "loss": 2.6977, "step": 5687 }, { "crossentropy": 2.6595330238342285, "epoch": 0.48383804014971077, "grad_norm": 0.03282034024596214, "grad_norm_var": 4.2939896380820205e-06, "learning_rate": 0.0019966172503792985, "loss": 2.6595, "step": 5688 }, { "crossentropy": 2.6747684478759766, "epoch": 0.4839231030962913, "grad_norm": 0.03377945348620415, "grad_norm_var": 4.239084405477147e-06, "learning_rate": 0.001995015655686684, "loss": 2.6748, "step": 5689 }, { "crossentropy": 2.6527671813964844, "epoch": 0.4840081660428717, "grad_norm": 0.034307029098272324, "grad_norm_var": 4.1894722234277084e-06, "learning_rate": 0.00199341454350797, "loss": 2.6528, "step": 5690 }, { "crossentropy": 2.6507511138916016, "epoch": 0.4840932289894522, "grad_norm": 0.03382939100265503, "grad_norm_var": 4.052212068665166e-06, "learning_rate": 0.00199181391410025, "loss": 2.6508, "step": 5691 }, { "crossentropy": 2.645226240158081, "epoch": 0.4841782919360327, "grad_norm": 0.034549832344055176, "grad_norm_var": 3.831127590888291e-06, "learning_rate": 0.0019902137677205328, "loss": 2.6452, "step": 5692 }, { "crossentropy": 2.702859878540039, "epoch": 0.48426335488261313, "grad_norm": 0.03255711868405342, "grad_norm_var": 4.1665041734378445e-06, "learning_rate": 0.0019886141046257613, "loss": 2.7029, "step": 5693 }, { "crossentropy": 2.703730583190918, "epoch": 0.4843484178291936, "grad_norm": 0.034582898020744324, "grad_norm_var": 3.892642436434332e-06, "learning_rate": 0.001987014925072793, "loss": 2.7037, "step": 5694 }, { "crossentropy": 2.659515380859375, "epoch": 0.4844334807757741, "grad_norm": 0.03234285116195679, "grad_norm_var": 4.315194805146136e-06, "learning_rate": 0.0019854162293184113, "loss": 2.6595, "step": 5695 }, { "crossentropy": 2.644117832183838, "epoch": 0.48451854372235453, "grad_norm": 0.03359050303697586, "grad_norm_var": 4.286019796972904e-06, "learning_rate": 0.0019838180176193175, "loss": 2.6441, "step": 5696 }, { "crossentropy": 2.535541534423828, "epoch": 0.48460360666893504, "grad_norm": 0.03384040668606758, "grad_norm_var": 2.899534023673156e-06, "learning_rate": 0.001982220290232143, "loss": 2.5355, "step": 5697 }, { "crossentropy": 2.7177927494049072, "epoch": 0.4846886696155155, "grad_norm": 0.03195330873131752, "grad_norm_var": 3.2564394628435225e-06, "learning_rate": 0.0019806230474134342, "loss": 2.7178, "step": 5698 }, { "crossentropy": 2.608494520187378, "epoch": 0.48477373256209594, "grad_norm": 0.0343545638024807, "grad_norm_var": 3.092239987672179e-06, "learning_rate": 0.0019790262894196628, "loss": 2.6085, "step": 5699 }, { "crossentropy": 2.606105327606201, "epoch": 0.48485879550867644, "grad_norm": 0.033461399376392365, "grad_norm_var": 2.718995228296698e-06, "learning_rate": 0.001977430016507222, "loss": 2.6061, "step": 5700 }, { "crossentropy": 2.5036070346832275, "epoch": 0.4849438584552569, "grad_norm": 0.032213158905506134, "grad_norm_var": 2.919282075989719e-06, "learning_rate": 0.0019758342289324265, "loss": 2.5036, "step": 5701 }, { "crossentropy": 2.643486976623535, "epoch": 0.48502892140183734, "grad_norm": 0.032683081924915314, "grad_norm_var": 2.77927825608366e-06, "learning_rate": 0.001974238926951514, "loss": 2.6435, "step": 5702 }, { "crossentropy": 2.7034263610839844, "epoch": 0.48511398434841785, "grad_norm": 0.03352200612425804, "grad_norm_var": 7.474672888786727e-07, "learning_rate": 0.001972644110820645, "loss": 2.7034, "step": 5703 }, { "crossentropy": 2.728455066680908, "epoch": 0.4851990472949983, "grad_norm": 0.03447691723704338, "grad_norm_var": 7.911240112214168e-07, "learning_rate": 0.001971049780795901, "loss": 2.7285, "step": 5704 }, { "crossentropy": 2.700937509536743, "epoch": 0.48528411024157875, "grad_norm": 0.034007053822278976, "grad_norm_var": 8.027588240919666e-07, "learning_rate": 0.0019694559371332848, "loss": 2.7009, "step": 5705 }, { "crossentropy": 2.6531379222869873, "epoch": 0.48536917318815925, "grad_norm": 0.03438825532793999, "grad_norm_var": 8.117276499628364e-07, "learning_rate": 0.001967862580088722, "loss": 2.6531, "step": 5706 }, { "crossentropy": 2.691739320755005, "epoch": 0.4854542361347397, "grad_norm": 0.033534273505210876, "grad_norm_var": 8.050773526962577e-07, "learning_rate": 0.0019662697099180606, "loss": 2.6917, "step": 5707 }, { "crossentropy": 2.5614736080169678, "epoch": 0.48553929908132015, "grad_norm": 0.03320819512009621, "grad_norm_var": 7.304218416580183e-07, "learning_rate": 0.0019646773268770664, "loss": 2.5615, "step": 5708 }, { "crossentropy": 2.6519057750701904, "epoch": 0.48562436202790066, "grad_norm": 0.032415393739938736, "grad_norm_var": 7.479780540728745e-07, "learning_rate": 0.0019630854312214347, "loss": 2.6519, "step": 5709 }, { "crossentropy": 2.668131113052368, "epoch": 0.4857094249744811, "grad_norm": 0.035452838987112045, "grad_norm_var": 9.312213974578542e-07, "learning_rate": 0.0019614940232067757, "loss": 2.6681, "step": 5710 }, { "crossentropy": 2.571953058242798, "epoch": 0.4857944879210616, "grad_norm": 0.0344812348484993, "grad_norm_var": 8.9699458411067e-07, "learning_rate": 0.001959903103088622, "loss": 2.572, "step": 5711 }, { "crossentropy": 2.6623032093048096, "epoch": 0.48587955086764206, "grad_norm": 0.033548492938280106, "grad_norm_var": 8.97151988362282e-07, "learning_rate": 0.0019583126711224343, "loss": 2.6623, "step": 5712 }, { "crossentropy": 2.6332013607025146, "epoch": 0.4859646138142225, "grad_norm": 0.03307347372174263, "grad_norm_var": 9.089504296917623e-07, "learning_rate": 0.0019567227275635885, "loss": 2.6332, "step": 5713 }, { "crossentropy": 2.659876823425293, "epoch": 0.486049676760803, "grad_norm": 0.034045469015836716, "grad_norm_var": 7.375762409305685e-07, "learning_rate": 0.0019551332726673787, "loss": 2.6599, "step": 5714 }, { "crossentropy": 2.717668056488037, "epoch": 0.48613473970738347, "grad_norm": 0.03390971198678017, "grad_norm_var": 7.098811614050214e-07, "learning_rate": 0.001953544306689032, "loss": 2.7177, "step": 5715 }, { "crossentropy": 2.577401876449585, "epoch": 0.4862198026539639, "grad_norm": 0.03278132528066635, "grad_norm_var": 7.560078799289637e-07, "learning_rate": 0.0019519558298836876, "loss": 2.5774, "step": 5716 }, { "crossentropy": 2.658740997314453, "epoch": 0.4863048656005444, "grad_norm": 0.03359091281890869, "grad_norm_var": 6.182648009652713e-07, "learning_rate": 0.0019503678425064098, "loss": 2.6587, "step": 5717 }, { "crossentropy": 2.5884757041931152, "epoch": 0.4863899285471249, "grad_norm": 0.034921545535326004, "grad_norm_var": 6.294412907892344e-07, "learning_rate": 0.001948780344812181, "loss": 2.5885, "step": 5718 }, { "crossentropy": 2.6699581146240234, "epoch": 0.4864749914937053, "grad_norm": 0.035498715937137604, "grad_norm_var": 7.912073409771861e-07, "learning_rate": 0.0019471933370559119, "loss": 2.67, "step": 5719 }, { "crossentropy": 2.7332212924957275, "epoch": 0.4865600544402858, "grad_norm": 0.03376762941479683, "grad_norm_var": 7.736098669408955e-07, "learning_rate": 0.0019456068194924287, "loss": 2.7332, "step": 5720 }, { "crossentropy": 2.6576802730560303, "epoch": 0.4866451173868663, "grad_norm": 0.032508037984371185, "grad_norm_var": 8.954583640271204e-07, "learning_rate": 0.0019440207923764796, "loss": 2.6577, "step": 5721 }, { "crossentropy": 2.7132604122161865, "epoch": 0.4867301803334467, "grad_norm": 0.03215115889906883, "grad_norm_var": 1.0388496108617214e-06, "learning_rate": 0.0019424352559627356, "loss": 2.7133, "step": 5722 }, { "crossentropy": 2.595327138900757, "epoch": 0.48681524328002723, "grad_norm": 0.03605533018708229, "grad_norm_var": 1.3869212231860824e-06, "learning_rate": 0.0019408502105057873, "loss": 2.5953, "step": 5723 }, { "crossentropy": 2.6528284549713135, "epoch": 0.4869003062266077, "grad_norm": 0.03414066508412361, "grad_norm_var": 1.3629503750671341e-06, "learning_rate": 0.0019392656562601462, "loss": 2.6528, "step": 5724 }, { "crossentropy": 2.6080381870269775, "epoch": 0.48698536917318813, "grad_norm": 0.034070517867803574, "grad_norm_var": 1.2073382861192578e-06, "learning_rate": 0.0019376815934802494, "loss": 2.608, "step": 5725 }, { "crossentropy": 2.7419445514678955, "epoch": 0.48707043211976864, "grad_norm": 0.03408954292535782, "grad_norm_var": 1.059379278032374e-06, "learning_rate": 0.00193609802242045, "loss": 2.7419, "step": 5726 }, { "crossentropy": 2.700629234313965, "epoch": 0.4871554950663491, "grad_norm": 0.03799506649374962, "grad_norm_var": 2.096537359617898e-06, "learning_rate": 0.001934514943335024, "loss": 2.7006, "step": 5727 }, { "crossentropy": 2.7260541915893555, "epoch": 0.4872405580129296, "grad_norm": 0.032750967890024185, "grad_norm_var": 2.198575020072718e-06, "learning_rate": 0.0019329323564781682, "loss": 2.7261, "step": 5728 }, { "crossentropy": 2.6416730880737305, "epoch": 0.48732562095951004, "grad_norm": 0.03774932026863098, "grad_norm_var": 2.93480099316674e-06, "learning_rate": 0.0019313502621039996, "loss": 2.6417, "step": 5729 }, { "crossentropy": 2.702151298522949, "epoch": 0.4874106839060905, "grad_norm": 0.03412530571222305, "grad_norm_var": 2.931674296656877e-06, "learning_rate": 0.001929768660466557, "loss": 2.7022, "step": 5730 }, { "crossentropy": 2.694568634033203, "epoch": 0.487495746852671, "grad_norm": 0.03425572067499161, "grad_norm_var": 2.917386163899203e-06, "learning_rate": 0.0019281875518198021, "loss": 2.6946, "step": 5731 }, { "crossentropy": 2.6796886920928955, "epoch": 0.48758080979925145, "grad_norm": 0.034386441111564636, "grad_norm_var": 2.7312971971227286e-06, "learning_rate": 0.0019266069364176142, "loss": 2.6797, "step": 5732 }, { "crossentropy": 2.6304662227630615, "epoch": 0.4876658727458319, "grad_norm": 0.03512843698263168, "grad_norm_var": 2.691951409557423e-06, "learning_rate": 0.0019250268145137946, "loss": 2.6305, "step": 5733 }, { "crossentropy": 2.6617205142974854, "epoch": 0.4877509356924124, "grad_norm": 0.03626001626253128, "grad_norm_var": 2.861366736250405e-06, "learning_rate": 0.001923447186362065, "loss": 2.6617, "step": 5734 }, { "crossentropy": 2.5368425846099854, "epoch": 0.48783599863899285, "grad_norm": 0.03207942843437195, "grad_norm_var": 3.2203370721905465e-06, "learning_rate": 0.0019218680522160687, "loss": 2.5368, "step": 5735 }, { "crossentropy": 2.7472445964813232, "epoch": 0.4879210615855733, "grad_norm": 0.03339045122265816, "grad_norm_var": 3.264530890084658e-06, "learning_rate": 0.0019202894123293674, "loss": 2.7472, "step": 5736 }, { "crossentropy": 2.696315288543701, "epoch": 0.4880061245321538, "grad_norm": 0.03358609601855278, "grad_norm_var": 3.0586005588748486e-06, "learning_rate": 0.001918711266955448, "loss": 2.6963, "step": 5737 }, { "crossentropy": 2.441997528076172, "epoch": 0.48809118747873426, "grad_norm": 0.03221145272254944, "grad_norm_var": 3.0398372617062553e-06, "learning_rate": 0.0019171336163477139, "loss": 2.442, "step": 5738 }, { "crossentropy": 2.6741180419921875, "epoch": 0.4881762504253147, "grad_norm": 0.0343795046210289, "grad_norm_var": 2.8716705208403636e-06, "learning_rate": 0.00191555646075949, "loss": 2.6741, "step": 5739 }, { "crossentropy": 2.618011236190796, "epoch": 0.4882613133718952, "grad_norm": 0.03519897907972336, "grad_norm_var": 2.903323475565653e-06, "learning_rate": 0.0019139798004440206, "loss": 2.618, "step": 5740 }, { "crossentropy": 2.5121536254882812, "epoch": 0.48834637631847566, "grad_norm": 0.03338776156306267, "grad_norm_var": 2.969605654998206e-06, "learning_rate": 0.0019124036356544772, "loss": 2.5122, "step": 5741 }, { "crossentropy": 2.728379011154175, "epoch": 0.48843143926505617, "grad_norm": 0.03519332408905029, "grad_norm_var": 2.9947769571787738e-06, "learning_rate": 0.0019108279666439393, "loss": 2.7284, "step": 5742 }, { "crossentropy": 2.57922625541687, "epoch": 0.4885165022116366, "grad_norm": 0.03377426788210869, "grad_norm_var": 2.1440468173561723e-06, "learning_rate": 0.0019092527936654191, "loss": 2.5792, "step": 5743 }, { "crossentropy": 2.6068403720855713, "epoch": 0.48860156515821707, "grad_norm": 0.035319529473781586, "grad_norm_var": 2.0460609786554505e-06, "learning_rate": 0.0019076781169718427, "loss": 2.6068, "step": 5744 }, { "crossentropy": 2.6808183193206787, "epoch": 0.48868662810479757, "grad_norm": 0.03421839699149132, "grad_norm_var": 1.2492150542589911e-06, "learning_rate": 0.0019061039368160576, "loss": 2.6808, "step": 5745 }, { "crossentropy": 2.646428346633911, "epoch": 0.488771691051378, "grad_norm": 0.03293328359723091, "grad_norm_var": 1.3468653798547984e-06, "learning_rate": 0.0019045302534508297, "loss": 2.6464, "step": 5746 }, { "crossentropy": 2.635592460632324, "epoch": 0.48885675399795847, "grad_norm": 0.03542277216911316, "grad_norm_var": 1.4552195588527865e-06, "learning_rate": 0.001902957067128851, "loss": 2.6356, "step": 5747 }, { "crossentropy": 2.7545969486236572, "epoch": 0.488941816944539, "grad_norm": 0.03258570283651352, "grad_norm_var": 1.60817158932694e-06, "learning_rate": 0.0019013843781027278, "loss": 2.7546, "step": 5748 }, { "crossentropy": 2.635190010070801, "epoch": 0.4890268798911194, "grad_norm": 0.03444211557507515, "grad_norm_var": 1.540464968894685e-06, "learning_rate": 0.001899812186624989, "loss": 2.6352, "step": 5749 }, { "crossentropy": 2.664706230163574, "epoch": 0.4891119428376999, "grad_norm": 0.03343944624066353, "grad_norm_var": 1.196757298748315e-06, "learning_rate": 0.0018982404929480828, "loss": 2.6647, "step": 5750 }, { "crossentropy": 2.7280433177948, "epoch": 0.4891970057842804, "grad_norm": 0.03352148458361626, "grad_norm_var": 9.867430134563014e-07, "learning_rate": 0.0018966692973243788, "loss": 2.728, "step": 5751 }, { "crossentropy": 2.692521333694458, "epoch": 0.48928206873086083, "grad_norm": 0.0327427051961422, "grad_norm_var": 1.0602375963415673e-06, "learning_rate": 0.0018950986000061637, "loss": 2.6925, "step": 5752 }, { "crossentropy": 2.615715980529785, "epoch": 0.4893671316774413, "grad_norm": 0.03567582741379738, "grad_norm_var": 1.2464622528869535e-06, "learning_rate": 0.001893528401245649, "loss": 2.6157, "step": 5753 }, { "crossentropy": 2.5953004360198975, "epoch": 0.4894521946240218, "grad_norm": 0.03291575238108635, "grad_norm_var": 1.1068872980928937e-06, "learning_rate": 0.0018919587012949618, "loss": 2.5953, "step": 5754 }, { "crossentropy": 2.6368210315704346, "epoch": 0.48953725757060224, "grad_norm": 0.03301408141851425, "grad_norm_var": 1.1674148448453472e-06, "learning_rate": 0.0018903895004061518, "loss": 2.6368, "step": 5755 }, { "crossentropy": 2.6712687015533447, "epoch": 0.48962232051718274, "grad_norm": 0.03346005454659462, "grad_norm_var": 1.0753054500373463e-06, "learning_rate": 0.0018888207988311861, "loss": 2.6713, "step": 5756 }, { "crossentropy": 2.738325834274292, "epoch": 0.4897073834637632, "grad_norm": 0.033900484442710876, "grad_norm_var": 1.058227935494399e-06, "learning_rate": 0.001887252596821954, "loss": 2.7383, "step": 5757 }, { "crossentropy": 2.5976648330688477, "epoch": 0.48979244641034364, "grad_norm": 0.037496864795684814, "grad_norm_var": 1.7840450460824183e-06, "learning_rate": 0.0018856848946302608, "loss": 2.5977, "step": 5758 }, { "crossentropy": 2.6331892013549805, "epoch": 0.48987750935692415, "grad_norm": 0.03238576650619507, "grad_norm_var": 1.9563146038444867e-06, "learning_rate": 0.0018841176925078384, "loss": 2.6332, "step": 5759 }, { "crossentropy": 2.736379861831665, "epoch": 0.4899625723035046, "grad_norm": 0.03361845389008522, "grad_norm_var": 1.8304330751874042e-06, "learning_rate": 0.0018825509907063327, "loss": 2.7364, "step": 5760 }, { "crossentropy": 2.7045509815216064, "epoch": 0.49004763525008505, "grad_norm": 0.03513534367084503, "grad_norm_var": 1.9266991760131037e-06, "learning_rate": 0.0018809847894773108, "loss": 2.7046, "step": 5761 }, { "crossentropy": 2.5901267528533936, "epoch": 0.49013269819666555, "grad_norm": 0.031936801970005035, "grad_norm_var": 2.119611491361676e-06, "learning_rate": 0.0018794190890722574, "loss": 2.5901, "step": 5762 }, { "crossentropy": 2.7915709018707275, "epoch": 0.490217761143246, "grad_norm": 0.033826012164354324, "grad_norm_var": 1.9453650953159147e-06, "learning_rate": 0.001877853889742584, "loss": 2.7916, "step": 5763 }, { "crossentropy": 2.7687795162200928, "epoch": 0.49030282408982645, "grad_norm": 0.03525953367352486, "grad_norm_var": 1.9749572508507657e-06, "learning_rate": 0.001876289191739612, "loss": 2.7688, "step": 5764 }, { "crossentropy": 2.6636910438537598, "epoch": 0.49038788703640696, "grad_norm": 0.03518882393836975, "grad_norm_var": 2.0614723345246364e-06, "learning_rate": 0.001874724995314586, "loss": 2.6637, "step": 5765 }, { "crossentropy": 2.7951409816741943, "epoch": 0.4904729499829874, "grad_norm": 0.03336618095636368, "grad_norm_var": 2.066989080150757e-06, "learning_rate": 0.001873161300718675, "loss": 2.7951, "step": 5766 }, { "crossentropy": 2.60333251953125, "epoch": 0.49055801292956785, "grad_norm": 0.03565077856183052, "grad_norm_var": 2.224366735192653e-06, "learning_rate": 0.0018715981082029608, "loss": 2.6033, "step": 5767 }, { "crossentropy": 2.617116689682007, "epoch": 0.49064307587614836, "grad_norm": 0.03147765249013901, "grad_norm_var": 2.553049331364907e-06, "learning_rate": 0.0018700354180184464, "loss": 2.6171, "step": 5768 }, { "crossentropy": 2.7033443450927734, "epoch": 0.4907281388227288, "grad_norm": 0.03476904705166817, "grad_norm_var": 2.4041561992889257e-06, "learning_rate": 0.0018684732304160578, "loss": 2.7033, "step": 5769 }, { "crossentropy": 2.7044320106506348, "epoch": 0.4908132017693093, "grad_norm": 0.035591643303632736, "grad_norm_var": 2.4781799844090026e-06, "learning_rate": 0.0018669115456466368, "loss": 2.7044, "step": 5770 }, { "crossentropy": 2.6271607875823975, "epoch": 0.49089826471588976, "grad_norm": 0.03463011607527733, "grad_norm_var": 2.4009879241002525e-06, "learning_rate": 0.0018653503639609404, "loss": 2.6272, "step": 5771 }, { "crossentropy": 2.7121620178222656, "epoch": 0.4909833276624702, "grad_norm": 0.03383856639266014, "grad_norm_var": 2.371041812894899e-06, "learning_rate": 0.0018637896856096548, "loss": 2.7122, "step": 5772 }, { "crossentropy": 2.581305980682373, "epoch": 0.4910683906090507, "grad_norm": 0.03338770195841789, "grad_norm_var": 2.4116806257078994e-06, "learning_rate": 0.0018622295108433773, "loss": 2.5813, "step": 5773 }, { "crossentropy": 2.709916353225708, "epoch": 0.49115345355563117, "grad_norm": 0.035049669444561005, "grad_norm_var": 1.717562537219217e-06, "learning_rate": 0.001860669839912626, "loss": 2.7099, "step": 5774 }, { "crossentropy": 2.5775809288024902, "epoch": 0.4912385165022116, "grad_norm": 0.035009659826755524, "grad_norm_var": 1.5588032586416033e-06, "learning_rate": 0.0018591106730678425, "loss": 2.5776, "step": 5775 }, { "crossentropy": 2.7596654891967773, "epoch": 0.4913235794487921, "grad_norm": 0.034655194729566574, "grad_norm_var": 1.540961235357794e-06, "learning_rate": 0.0018575520105593819, "loss": 2.7597, "step": 5776 }, { "crossentropy": 2.723615884780884, "epoch": 0.4914086423953726, "grad_norm": 0.033155571669340134, "grad_norm_var": 1.5649745084074981e-06, "learning_rate": 0.0018559938526375214, "loss": 2.7236, "step": 5777 }, { "crossentropy": 2.6465251445770264, "epoch": 0.491493705341953, "grad_norm": 0.033237818628549576, "grad_norm_var": 1.2825834461909745e-06, "learning_rate": 0.001854436199552455, "loss": 2.6465, "step": 5778 }, { "crossentropy": 2.6879420280456543, "epoch": 0.49157876828853353, "grad_norm": 0.03279342129826546, "grad_norm_var": 1.4084064307334197e-06, "learning_rate": 0.0018528790515542981, "loss": 2.6879, "step": 5779 }, { "crossentropy": 2.7299258708953857, "epoch": 0.491663831235114, "grad_norm": 0.035214804112911224, "grad_norm_var": 1.4021608097288436e-06, "learning_rate": 0.0018513224088930814, "loss": 2.7299, "step": 5780 }, { "crossentropy": 2.618581533432007, "epoch": 0.49174889418169443, "grad_norm": 0.036494795233011246, "grad_norm_var": 1.6829372080297161e-06, "learning_rate": 0.0018497662718187607, "loss": 2.6186, "step": 5781 }, { "crossentropy": 2.618443727493286, "epoch": 0.49183395712827493, "grad_norm": 0.03522361442446709, "grad_norm_var": 1.6746874881676718e-06, "learning_rate": 0.0018482106405812043, "loss": 2.6184, "step": 5782 }, { "crossentropy": 2.677687644958496, "epoch": 0.4919190200748554, "grad_norm": 0.03294145315885544, "grad_norm_var": 1.6766639124675293e-06, "learning_rate": 0.0018466555154302029, "loss": 2.6777, "step": 5783 }, { "crossentropy": 2.6697585582733154, "epoch": 0.4920040830214359, "grad_norm": 0.036154650151729584, "grad_norm_var": 1.3356013609165547e-06, "learning_rate": 0.001845100896615462, "loss": 2.6698, "step": 5784 }, { "crossentropy": 2.6997766494750977, "epoch": 0.49208914596801634, "grad_norm": 0.037372712045907974, "grad_norm_var": 1.84948916879159e-06, "learning_rate": 0.0018435467843866144, "loss": 2.6998, "step": 5785 }, { "crossentropy": 2.636021614074707, "epoch": 0.4921742089145968, "grad_norm": 0.03242005407810211, "grad_norm_var": 2.0892618107874275e-06, "learning_rate": 0.0018419931789932004, "loss": 2.636, "step": 5786 }, { "crossentropy": 2.6627697944641113, "epoch": 0.4922592718611773, "grad_norm": 0.033089529722929, "grad_norm_var": 2.205477786699587e-06, "learning_rate": 0.0018404400806846838, "loss": 2.6628, "step": 5787 }, { "crossentropy": 2.7787132263183594, "epoch": 0.49234433480775774, "grad_norm": 0.03452425077557564, "grad_norm_var": 2.1855956724952757e-06, "learning_rate": 0.0018388874897104519, "loss": 2.7787, "step": 5788 }, { "crossentropy": 2.647371768951416, "epoch": 0.4924293977543382, "grad_norm": 0.031536925584077835, "grad_norm_var": 2.6544974664042635e-06, "learning_rate": 0.0018373354063198034, "loss": 2.6474, "step": 5789 }, { "crossentropy": 2.6028549671173096, "epoch": 0.4925144607009187, "grad_norm": 0.03240010887384415, "grad_norm_var": 2.830055540435176e-06, "learning_rate": 0.0018357838307619574, "loss": 2.6029, "step": 5790 }, { "crossentropy": 2.771883726119995, "epoch": 0.49259952364749915, "grad_norm": 0.03270180895924568, "grad_norm_var": 2.8950386318397797e-06, "learning_rate": 0.0018342327632860545, "loss": 2.7719, "step": 5791 }, { "crossentropy": 2.675577163696289, "epoch": 0.4926845865940796, "grad_norm": 0.036749593913555145, "grad_norm_var": 3.3536142465519226e-06, "learning_rate": 0.0018326822041411523, "loss": 2.6756, "step": 5792 }, { "crossentropy": 2.590073347091675, "epoch": 0.4927696495406601, "grad_norm": 0.03350791335105896, "grad_norm_var": 3.3157979913761916e-06, "learning_rate": 0.00183113215357622, "loss": 2.5901, "step": 5793 }, { "crossentropy": 2.6442248821258545, "epoch": 0.49285471248724055, "grad_norm": 0.03461698442697525, "grad_norm_var": 3.267359243192917e-06, "learning_rate": 0.001829582611840157, "loss": 2.6442, "step": 5794 }, { "crossentropy": 2.6388752460479736, "epoch": 0.492939775433821, "grad_norm": 0.034572165459394455, "grad_norm_var": 3.12346924428696e-06, "learning_rate": 0.0018280335791817732, "loss": 2.6389, "step": 5795 }, { "crossentropy": 2.621823310852051, "epoch": 0.4930248383804015, "grad_norm": 0.037757180631160736, "grad_norm_var": 3.822269511338562e-06, "learning_rate": 0.0018264850558497969, "loss": 2.6218, "step": 5796 }, { "crossentropy": 2.6992666721343994, "epoch": 0.49310990132698196, "grad_norm": 0.03271262347698212, "grad_norm_var": 3.712375524057785e-06, "learning_rate": 0.001824937042092879, "loss": 2.6993, "step": 5797 }, { "crossentropy": 2.7438554763793945, "epoch": 0.49319496427356246, "grad_norm": 0.03627513349056244, "grad_norm_var": 3.9155172196897e-06, "learning_rate": 0.0018233895381595856, "loss": 2.7439, "step": 5798 }, { "crossentropy": 2.7921926975250244, "epoch": 0.4932800272201429, "grad_norm": 0.03504255414009094, "grad_norm_var": 3.8015048234770385e-06, "learning_rate": 0.0018218425442984, "loss": 2.7922, "step": 5799 }, { "crossentropy": 2.6838765144348145, "epoch": 0.49336509016672336, "grad_norm": 0.033001337200403214, "grad_norm_var": 3.7124141094389715e-06, "learning_rate": 0.0018202960607577245, "loss": 2.6839, "step": 5800 }, { "crossentropy": 2.652512311935425, "epoch": 0.49345015311330387, "grad_norm": 0.03336500748991966, "grad_norm_var": 3.056996362865658e-06, "learning_rate": 0.0018187500877858815, "loss": 2.6525, "step": 5801 }, { "crossentropy": 2.64717698097229, "epoch": 0.4935352160598843, "grad_norm": 0.03298209607601166, "grad_norm_var": 2.957060652515088e-06, "learning_rate": 0.0018172046256311088, "loss": 2.6472, "step": 5802 }, { "crossentropy": 2.680750608444214, "epoch": 0.49362027900646477, "grad_norm": 0.03298125043511391, "grad_norm_var": 2.9716917402888263e-06, "learning_rate": 0.001815659674541561, "loss": 2.6808, "step": 5803 }, { "crossentropy": 2.6676931381225586, "epoch": 0.4937053419530453, "grad_norm": 0.03590347617864609, "grad_norm_var": 3.178636106195472e-06, "learning_rate": 0.0018141152347653166, "loss": 2.6677, "step": 5804 }, { "crossentropy": 2.672769546508789, "epoch": 0.4937904048996257, "grad_norm": 0.034382838755846024, "grad_norm_var": 2.7002617811207792e-06, "learning_rate": 0.0018125713065503663, "loss": 2.6728, "step": 5805 }, { "crossentropy": 2.643216609954834, "epoch": 0.4938754678462062, "grad_norm": 0.05889992415904999, "grad_norm_var": 3.984379053924361e-05, "learning_rate": 0.0018110278901446203, "loss": 2.6432, "step": 5806 }, { "crossentropy": 2.675339698791504, "epoch": 0.4939605307927867, "grad_norm": 0.03509055823087692, "grad_norm_var": 3.916086055404614e-05, "learning_rate": 0.0018094849857959072, "loss": 2.6753, "step": 5807 }, { "crossentropy": 2.6490750312805176, "epoch": 0.49404559373936713, "grad_norm": 0.03389937803149223, "grad_norm_var": 3.942744490123551e-05, "learning_rate": 0.0018079425937519728, "loss": 2.6491, "step": 5808 }, { "crossentropy": 2.695781707763672, "epoch": 0.4941306566859476, "grad_norm": 0.03466102108359337, "grad_norm_var": 3.9137097179542335e-05, "learning_rate": 0.0018064007142604794, "loss": 2.6958, "step": 5809 }, { "crossentropy": 2.6308040618896484, "epoch": 0.4942157196325281, "grad_norm": 0.032804910093545914, "grad_norm_var": 3.9678640666917835e-05, "learning_rate": 0.0018048593475690111, "loss": 2.6308, "step": 5810 }, { "crossentropy": 2.725311756134033, "epoch": 0.49430078257910853, "grad_norm": 0.03128854185342789, "grad_norm_var": 4.093199937665922e-05, "learning_rate": 0.0018033184939250657, "loss": 2.7253, "step": 5811 }, { "crossentropy": 2.6480607986450195, "epoch": 0.494385845525689, "grad_norm": 0.034266095608472824, "grad_norm_var": 4.073172976119694e-05, "learning_rate": 0.001801778153576058, "loss": 2.6481, "step": 5812 }, { "crossentropy": 2.654019355773926, "epoch": 0.4944709084722695, "grad_norm": 0.03501097112894058, "grad_norm_var": 4.021618810263009e-05, "learning_rate": 0.0018002383267693262, "loss": 2.654, "step": 5813 }, { "crossentropy": 2.6147725582122803, "epoch": 0.49455597141884994, "grad_norm": 0.03303935006260872, "grad_norm_var": 4.05861819740745e-05, "learning_rate": 0.001798699013752122, "loss": 2.6148, "step": 5814 }, { "crossentropy": 2.677626609802246, "epoch": 0.49464103436543044, "grad_norm": 0.03406527638435364, "grad_norm_var": 4.069423654152373e-05, "learning_rate": 0.0017971602147716093, "loss": 2.6776, "step": 5815 }, { "crossentropy": 2.725837469100952, "epoch": 0.4947260973120109, "grad_norm": 0.03362775221467018, "grad_norm_var": 4.052237684200871e-05, "learning_rate": 0.0017956219300748794, "loss": 2.7258, "step": 5816 }, { "crossentropy": 2.66263747215271, "epoch": 0.49481116025859134, "grad_norm": 0.03379637375473976, "grad_norm_var": 4.0417435922900034e-05, "learning_rate": 0.0017940841599089364, "loss": 2.6626, "step": 5817 }, { "crossentropy": 2.6384599208831787, "epoch": 0.49489622320517185, "grad_norm": 0.03351389244198799, "grad_norm_var": 4.026233838096323e-05, "learning_rate": 0.0017925469045206987, "loss": 2.6385, "step": 5818 }, { "crossentropy": 2.648789882659912, "epoch": 0.4949812861517523, "grad_norm": 0.033877864480018616, "grad_norm_var": 4.001721158692622e-05, "learning_rate": 0.0017910101641570092, "loss": 2.6488, "step": 5819 }, { "crossentropy": 2.701417922973633, "epoch": 0.49506634909833275, "grad_norm": 0.035208456218242645, "grad_norm_var": 4.001075517182275e-05, "learning_rate": 0.0017894739390646225, "loss": 2.7014, "step": 5820 }, { "crossentropy": 2.629293203353882, "epoch": 0.49515141204491325, "grad_norm": 0.03372930362820625, "grad_norm_var": 4.013170980778445e-05, "learning_rate": 0.001787938229490212, "loss": 2.6293, "step": 5821 }, { "crossentropy": 2.6888701915740967, "epoch": 0.4952364749914937, "grad_norm": 0.03419927507638931, "grad_norm_var": 9.473721937551412e-07, "learning_rate": 0.0017864030356803684, "loss": 2.6889, "step": 5822 }, { "crossentropy": 2.6977999210357666, "epoch": 0.49532153793807415, "grad_norm": 0.03329244628548622, "grad_norm_var": 8.59203695315521e-07, "learning_rate": 0.0017848683578816, "loss": 2.6978, "step": 5823 }, { "crossentropy": 2.586406946182251, "epoch": 0.49540660088465466, "grad_norm": 0.03420882672071457, "grad_norm_var": 8.706275231625792e-07, "learning_rate": 0.001783334196340331, "loss": 2.5864, "step": 5824 }, { "crossentropy": 2.619234800338745, "epoch": 0.4954916638312351, "grad_norm": 0.03234798088669777, "grad_norm_var": 9.354277524265333e-07, "learning_rate": 0.0017818005513029018, "loss": 2.6192, "step": 5825 }, { "crossentropy": 2.6292765140533447, "epoch": 0.49557672677781556, "grad_norm": 0.034167539328336716, "grad_norm_var": 8.993290232415032e-07, "learning_rate": 0.0017802674230155762, "loss": 2.6293, "step": 5826 }, { "crossentropy": 2.6047956943511963, "epoch": 0.49566178972439606, "grad_norm": 0.03274673596024513, "grad_norm_var": 5.580287207131163e-07, "learning_rate": 0.0017787348117245278, "loss": 2.6048, "step": 5827 }, { "crossentropy": 2.5741491317749023, "epoch": 0.4957468526709765, "grad_norm": 0.03484668955206871, "grad_norm_var": 6.137359554697168e-07, "learning_rate": 0.0017772027176758498, "loss": 2.5741, "step": 5828 }, { "crossentropy": 2.6485073566436768, "epoch": 0.495831915617557, "grad_norm": 0.034892257302999496, "grad_norm_var": 5.963182132213857e-07, "learning_rate": 0.0017756711411155523, "loss": 2.6485, "step": 5829 }, { "crossentropy": 2.589656352996826, "epoch": 0.49591697856413747, "grad_norm": 0.03286809101700783, "grad_norm_var": 6.166050775806171e-07, "learning_rate": 0.0017741400822895632, "loss": 2.5897, "step": 5830 }, { "crossentropy": 2.6835005283355713, "epoch": 0.4960020415107179, "grad_norm": 0.03404917195439339, "grad_norm_var": 6.161306843314544e-07, "learning_rate": 0.0017726095414437237, "loss": 2.6835, "step": 5831 }, { "crossentropy": 2.6484360694885254, "epoch": 0.4960871044572984, "grad_norm": 0.03197697177529335, "grad_norm_var": 8.322381303253283e-07, "learning_rate": 0.0017710795188237989, "loss": 2.6484, "step": 5832 }, { "crossentropy": 2.577941656112671, "epoch": 0.49617216740387887, "grad_norm": 0.03507380560040474, "grad_norm_var": 9.450869123730129e-07, "learning_rate": 0.0017695500146754635, "loss": 2.5779, "step": 5833 }, { "crossentropy": 2.758760452270508, "epoch": 0.4962572303504593, "grad_norm": 0.03443141281604767, "grad_norm_var": 9.611769598511302e-07, "learning_rate": 0.001768021029244311, "loss": 2.7588, "step": 5834 }, { "crossentropy": 2.728304624557495, "epoch": 0.4963422932970398, "grad_norm": 0.033494796603918076, "grad_norm_var": 9.699364644157049e-07, "learning_rate": 0.0017664925627758577, "loss": 2.7283, "step": 5835 }, { "crossentropy": 2.5570356845855713, "epoch": 0.4964273562436203, "grad_norm": 0.03418572247028351, "grad_norm_var": 8.49500738926238e-07, "learning_rate": 0.0017649646155155252, "loss": 2.557, "step": 5836 }, { "crossentropy": 2.618865728378296, "epoch": 0.4965124191902007, "grad_norm": 0.037132903933525085, "grad_norm_var": 1.549644958676449e-06, "learning_rate": 0.0017634371877086586, "loss": 2.6189, "step": 5837 }, { "crossentropy": 2.6846044063568115, "epoch": 0.49659748213678123, "grad_norm": 0.03322508931159973, "grad_norm_var": 1.582382627963487e-06, "learning_rate": 0.0017619102796005216, "loss": 2.6846, "step": 5838 }, { "crossentropy": 2.6243631839752197, "epoch": 0.4966825450833617, "grad_norm": 0.03373090177774429, "grad_norm_var": 1.5569051311334811e-06, "learning_rate": 0.0017603838914362901, "loss": 2.6244, "step": 5839 }, { "crossentropy": 2.7216856479644775, "epoch": 0.49676760802994213, "grad_norm": 0.031721893697977066, "grad_norm_var": 1.8613402838776305e-06, "learning_rate": 0.0017588580234610591, "loss": 2.7217, "step": 5840 }, { "crossentropy": 2.7410430908203125, "epoch": 0.49685267097652264, "grad_norm": 0.034881606698036194, "grad_norm_var": 1.7700859183752046e-06, "learning_rate": 0.0017573326759198354, "loss": 2.741, "step": 5841 }, { "crossentropy": 2.615631341934204, "epoch": 0.4969377339231031, "grad_norm": 0.0329466313123703, "grad_norm_var": 1.8301319109986203e-06, "learning_rate": 0.001755807849057553, "loss": 2.6156, "step": 5842 }, { "crossentropy": 2.703361988067627, "epoch": 0.4970227968696836, "grad_norm": 0.0322885625064373, "grad_norm_var": 1.9129590086952935e-06, "learning_rate": 0.0017542835431190468, "loss": 2.7034, "step": 5843 }, { "crossentropy": 2.6217892169952393, "epoch": 0.49710785981626404, "grad_norm": 0.03608610853552818, "grad_norm_var": 2.1721645557104887e-06, "learning_rate": 0.0017527597583490823, "loss": 2.6218, "step": 5844 }, { "crossentropy": 2.6417722702026367, "epoch": 0.4971929227628445, "grad_norm": 0.03135741129517555, "grad_norm_var": 2.5027066957778244e-06, "learning_rate": 0.0017512364949923331, "loss": 2.6418, "step": 5845 }, { "crossentropy": 2.6005136966705322, "epoch": 0.497277985709425, "grad_norm": 0.033558592200279236, "grad_norm_var": 2.454470204094061e-06, "learning_rate": 0.001749713753293392, "loss": 2.6005, "step": 5846 }, { "crossentropy": 2.7096328735351562, "epoch": 0.49736304865600545, "grad_norm": 0.03450234979391098, "grad_norm_var": 2.48484823500422e-06, "learning_rate": 0.0017481915334967657, "loss": 2.7096, "step": 5847 }, { "crossentropy": 2.5848758220672607, "epoch": 0.4974481116025859, "grad_norm": 0.033154863864183426, "grad_norm_var": 2.287266444204958e-06, "learning_rate": 0.0017466698358468823, "loss": 2.5849, "step": 5848 }, { "crossentropy": 2.5796711444854736, "epoch": 0.4975331745491664, "grad_norm": 0.0330854132771492, "grad_norm_var": 2.2127796975318174e-06, "learning_rate": 0.0017451486605880807, "loss": 2.5797, "step": 5849 }, { "crossentropy": 2.6360785961151123, "epoch": 0.49761823749574685, "grad_norm": 0.03226854279637337, "grad_norm_var": 2.3047589927572066e-06, "learning_rate": 0.0017436280079646171, "loss": 2.6361, "step": 5850 }, { "crossentropy": 2.669978141784668, "epoch": 0.4977033004423273, "grad_norm": 0.03489600867033005, "grad_norm_var": 2.407566460208704e-06, "learning_rate": 0.0017421078782206656, "loss": 2.67, "step": 5851 }, { "crossentropy": 2.6910741329193115, "epoch": 0.4977883633889078, "grad_norm": 0.03298094496130943, "grad_norm_var": 2.4184784707951986e-06, "learning_rate": 0.001740588271600315, "loss": 2.6911, "step": 5852 }, { "crossentropy": 2.608250141143799, "epoch": 0.49787342633548826, "grad_norm": 0.03630320727825165, "grad_norm_var": 2.0721775139957595e-06, "learning_rate": 0.0017390691883475684, "loss": 2.6083, "step": 5853 }, { "crossentropy": 2.6396982669830322, "epoch": 0.4979584892820687, "grad_norm": 0.03221172094345093, "grad_norm_var": 2.1818489826458745e-06, "learning_rate": 0.0017375506287063503, "loss": 2.6397, "step": 5854 }, { "crossentropy": 2.6488852500915527, "epoch": 0.4980435522286492, "grad_norm": 0.03343171998858452, "grad_norm_var": 2.1781695317707e-06, "learning_rate": 0.0017360325929204957, "loss": 2.6489, "step": 5855 }, { "crossentropy": 2.704681634902954, "epoch": 0.49812861517522966, "grad_norm": 0.03271884471178055, "grad_norm_var": 2.00662628063817e-06, "learning_rate": 0.0017345150812337563, "loss": 2.7047, "step": 5856 }, { "crossentropy": 2.640418291091919, "epoch": 0.49821367812181017, "grad_norm": 0.0347190722823143, "grad_norm_var": 1.9792471277581152e-06, "learning_rate": 0.001732998093889805, "loss": 2.6404, "step": 5857 }, { "crossentropy": 2.6251797676086426, "epoch": 0.4982987410683906, "grad_norm": 0.0331483893096447, "grad_norm_var": 1.966047600991209e-06, "learning_rate": 0.0017314816311322218, "loss": 2.6252, "step": 5858 }, { "crossentropy": 2.6896705627441406, "epoch": 0.49838380401497107, "grad_norm": 0.03305461257696152, "grad_norm_var": 1.874444758521015e-06, "learning_rate": 0.0017299656932045066, "loss": 2.6897, "step": 5859 }, { "crossentropy": 2.63399338722229, "epoch": 0.49846886696155157, "grad_norm": 0.032756414264440536, "grad_norm_var": 1.4602522385891044e-06, "learning_rate": 0.001728450280350079, "loss": 2.634, "step": 5860 }, { "crossentropy": 2.707913398742676, "epoch": 0.498553929908132, "grad_norm": 0.03486485034227371, "grad_norm_var": 1.281263688724758e-06, "learning_rate": 0.0017269353928122684, "loss": 2.7079, "step": 5861 }, { "crossentropy": 2.7557923793792725, "epoch": 0.49863899285471247, "grad_norm": 0.034033678472042084, "grad_norm_var": 1.2925274892732856e-06, "learning_rate": 0.0017254210308343231, "loss": 2.7558, "step": 5862 }, { "crossentropy": 2.6167356967926025, "epoch": 0.498724055801293, "grad_norm": 0.03371783718466759, "grad_norm_var": 1.2400755083786958e-06, "learning_rate": 0.0017239071946594032, "loss": 2.6167, "step": 5863 }, { "crossentropy": 2.6486458778381348, "epoch": 0.4988091187478734, "grad_norm": 0.03447316586971283, "grad_norm_var": 1.2732414155408137e-06, "learning_rate": 0.001722393884530593, "loss": 2.6486, "step": 5864 }, { "crossentropy": 2.6497280597686768, "epoch": 0.4988941816944539, "grad_norm": 0.033723343163728714, "grad_norm_var": 1.249248147682363e-06, "learning_rate": 0.0017208811006908797, "loss": 2.6497, "step": 5865 }, { "crossentropy": 2.749753713607788, "epoch": 0.4989792446410344, "grad_norm": 0.03271535411477089, "grad_norm_var": 1.1760657291336069e-06, "learning_rate": 0.0017193688433831773, "loss": 2.7498, "step": 5866 }, { "crossentropy": 2.7020442485809326, "epoch": 0.49906430758761483, "grad_norm": 0.03342300280928612, "grad_norm_var": 1.083518849870115e-06, "learning_rate": 0.0017178571128503095, "loss": 2.702, "step": 5867 }, { "crossentropy": 2.6172659397125244, "epoch": 0.4991493705341953, "grad_norm": 0.033206429332494736, "grad_norm_var": 1.0668143924639125e-06, "learning_rate": 0.001716345909335017, "loss": 2.6173, "step": 5868 }, { "crossentropy": 2.639425039291382, "epoch": 0.4992344334807758, "grad_norm": 0.032451700419187546, "grad_norm_var": 6.346954673926962e-07, "learning_rate": 0.0017148352330799532, "loss": 2.6394, "step": 5869 }, { "crossentropy": 2.7369086742401123, "epoch": 0.49931949642735624, "grad_norm": 0.03307719901204109, "grad_norm_var": 5.425832483324049e-07, "learning_rate": 0.0017133250843276925, "loss": 2.7369, "step": 5870 }, { "crossentropy": 2.785675525665283, "epoch": 0.49940455937393674, "grad_norm": 0.03605644777417183, "grad_norm_var": 9.59857317492374e-07, "learning_rate": 0.0017118154633207216, "loss": 2.7857, "step": 5871 }, { "crossentropy": 2.7327232360839844, "epoch": 0.4994896223205172, "grad_norm": 0.03424190357327461, "grad_norm_var": 9.190408022699894e-07, "learning_rate": 0.001710306370301437, "loss": 2.7327, "step": 5872 }, { "crossentropy": 2.752927541732788, "epoch": 0.49957468526709764, "grad_norm": 0.034003693610429764, "grad_norm_var": 8.565857643728913e-07, "learning_rate": 0.001708797805512161, "loss": 2.7529, "step": 5873 }, { "crossentropy": 2.5793726444244385, "epoch": 0.49965974821367815, "grad_norm": 0.032114285975694656, "grad_norm_var": 9.973062692569813e-07, "learning_rate": 0.0017072897691951232, "loss": 2.5794, "step": 5874 }, { "crossentropy": 2.7025253772735596, "epoch": 0.4997448111602586, "grad_norm": 0.03505430370569229, "grad_norm_var": 1.0965837049431436e-06, "learning_rate": 0.00170578226159247, "loss": 2.7025, "step": 5875 }, { "crossentropy": 2.613774538040161, "epoch": 0.49982987410683904, "grad_norm": 0.03451536223292351, "grad_norm_var": 1.0581965471767596e-06, "learning_rate": 0.0017042752829462666, "loss": 2.6138, "step": 5876 }, { "crossentropy": 2.684968948364258, "epoch": 0.49991493705341955, "grad_norm": 0.03233487531542778, "grad_norm_var": 1.1174351706702904e-06, "learning_rate": 0.001702768833498488, "loss": 2.685, "step": 5877 }, { "crossentropy": 2.6470530033111572, "epoch": 0.5, "grad_norm": 0.03384105861186981, "grad_norm_var": 1.111092160093324e-06, "learning_rate": 0.0017012629134910274, "loss": 2.6471, "step": 5878 }, { "crossentropy": 2.6177704334259033, "epoch": 0.5000850629465805, "grad_norm": 0.03427048772573471, "grad_norm_var": 1.1326469648962242e-06, "learning_rate": 0.001699757523165692, "loss": 2.6178, "step": 5879 }, { "crossentropy": 2.6765692234039307, "epoch": 0.5001701258931609, "grad_norm": 0.03398333489894867, "grad_norm_var": 1.0983820296737076e-06, "learning_rate": 0.0016982526627642041, "loss": 2.6766, "step": 5880 }, { "crossentropy": 2.6399667263031006, "epoch": 0.5002551888397414, "grad_norm": 0.03552765026688576, "grad_norm_var": 1.310283037728871e-06, "learning_rate": 0.001696748332528199, "loss": 2.64, "step": 5881 }, { "crossentropy": 2.5131516456604004, "epoch": 0.5003402517863219, "grad_norm": 0.031438007950782776, "grad_norm_var": 1.597169874353356e-06, "learning_rate": 0.0016952445326992326, "loss": 2.5132, "step": 5882 }, { "crossentropy": 2.591388702392578, "epoch": 0.5004253147329023, "grad_norm": 0.03449865058064461, "grad_norm_var": 1.626711285640919e-06, "learning_rate": 0.0016937412635187699, "loss": 2.5914, "step": 5883 }, { "crossentropy": 2.664351224899292, "epoch": 0.5005103776794828, "grad_norm": 0.03313368558883667, "grad_norm_var": 1.6326872446468614e-06, "learning_rate": 0.0016922385252281925, "loss": 2.6644, "step": 5884 }, { "crossentropy": 2.6329023838043213, "epoch": 0.5005954406260633, "grad_norm": 0.032585714012384415, "grad_norm_var": 1.610005065871602e-06, "learning_rate": 0.0016907363180687957, "loss": 2.6329, "step": 5885 }, { "crossentropy": 2.584141969680786, "epoch": 0.5006805035726437, "grad_norm": 0.03391663730144501, "grad_norm_var": 1.5740093261328834e-06, "learning_rate": 0.0016892346422817946, "loss": 2.5841, "step": 5886 }, { "crossentropy": 2.606114625930786, "epoch": 0.5007655665192242, "grad_norm": 0.03790152445435524, "grad_norm_var": 2.3308773141535985e-06, "learning_rate": 0.001687733498108311, "loss": 2.6061, "step": 5887 }, { "crossentropy": 2.6437230110168457, "epoch": 0.5008506294658047, "grad_norm": 0.037133898586034775, "grad_norm_var": 2.962278004806952e-06, "learning_rate": 0.0016862328857893856, "loss": 2.6437, "step": 5888 }, { "crossentropy": 2.6166608333587646, "epoch": 0.5009356924123851, "grad_norm": 0.03313738480210304, "grad_norm_var": 3.0250232293549743e-06, "learning_rate": 0.0016847328055659755, "loss": 2.6167, "step": 5889 }, { "crossentropy": 2.651118755340576, "epoch": 0.5010207553589656, "grad_norm": 0.03387526795268059, "grad_norm_var": 2.755726211454458e-06, "learning_rate": 0.00168323325767895, "loss": 2.6511, "step": 5890 }, { "crossentropy": 2.7276010513305664, "epoch": 0.5011058183055461, "grad_norm": 0.03351666033267975, "grad_norm_var": 2.7276810057508606e-06, "learning_rate": 0.0016817342423690912, "loss": 2.7276, "step": 5891 }, { "crossentropy": 2.644843816757202, "epoch": 0.5011908812521265, "grad_norm": 0.03495633602142334, "grad_norm_var": 2.7642189860177523e-06, "learning_rate": 0.001680235759877101, "loss": 2.6448, "step": 5892 }, { "crossentropy": 2.5807037353515625, "epoch": 0.501275944198707, "grad_norm": 0.03497572988271713, "grad_norm_var": 2.568646977681108e-06, "learning_rate": 0.001678737810443593, "loss": 2.5807, "step": 5893 }, { "crossentropy": 2.6566760540008545, "epoch": 0.5013610071452875, "grad_norm": 0.038077108561992645, "grad_norm_var": 3.434752696172769e-06, "learning_rate": 0.001677240394309089, "loss": 2.6567, "step": 5894 }, { "crossentropy": 2.6618893146514893, "epoch": 0.501446070091868, "grad_norm": 0.03343883901834488, "grad_norm_var": 3.5098619445662776e-06, "learning_rate": 0.0016757435117140363, "loss": 2.6619, "step": 5895 }, { "crossentropy": 2.6242306232452393, "epoch": 0.5015311330384484, "grad_norm": 0.03300222381949425, "grad_norm_var": 3.6383989825370757e-06, "learning_rate": 0.0016742471628987893, "loss": 2.6242, "step": 5896 }, { "crossentropy": 2.692753791809082, "epoch": 0.5016161959850289, "grad_norm": 0.03296880051493645, "grad_norm_var": 3.67815258997146e-06, "learning_rate": 0.0016727513481036166, "loss": 2.6928, "step": 5897 }, { "crossentropy": 2.60913348197937, "epoch": 0.5017012589316094, "grad_norm": 0.033309079706668854, "grad_norm_var": 3.1867576300461294e-06, "learning_rate": 0.0016712560675687072, "loss": 2.6091, "step": 5898 }, { "crossentropy": 2.6449496746063232, "epoch": 0.5017863218781898, "grad_norm": 0.03523602336645126, "grad_norm_var": 3.2302697772142982e-06, "learning_rate": 0.0016697613215341572, "loss": 2.6449, "step": 5899 }, { "crossentropy": 2.6877760887145996, "epoch": 0.5018713848247703, "grad_norm": 0.03298449143767357, "grad_norm_var": 3.257802190987739e-06, "learning_rate": 0.0016682671102399805, "loss": 2.6878, "step": 5900 }, { "crossentropy": 2.682969331741333, "epoch": 0.5019564477713508, "grad_norm": 0.033772606402635574, "grad_norm_var": 3.052641880696427e-06, "learning_rate": 0.0016667734339261043, "loss": 2.683, "step": 5901 }, { "crossentropy": 2.638148069381714, "epoch": 0.5020415107179312, "grad_norm": 0.034084614366292953, "grad_norm_var": 3.041056240276653e-06, "learning_rate": 0.0016652802928323696, "loss": 2.6381, "step": 5902 }, { "crossentropy": 2.614928722381592, "epoch": 0.5021265736645117, "grad_norm": 0.03544953092932701, "grad_norm_var": 2.3123268172580677e-06, "learning_rate": 0.0016637876871985308, "loss": 2.6149, "step": 5903 }, { "crossentropy": 2.6990761756896973, "epoch": 0.5022116366110922, "grad_norm": 0.03315862640738487, "grad_norm_var": 1.8349880487572e-06, "learning_rate": 0.00166229561726426, "loss": 2.6991, "step": 5904 }, { "crossentropy": 2.692216396331787, "epoch": 0.5022966995576726, "grad_norm": 0.033058784902095795, "grad_norm_var": 1.8456872413361488e-06, "learning_rate": 0.0016608040832691395, "loss": 2.6922, "step": 5905 }, { "crossentropy": 2.580974817276001, "epoch": 0.5023817625042531, "grad_norm": 0.033861711621284485, "grad_norm_var": 1.8461348385095366e-06, "learning_rate": 0.0016593130854526666, "loss": 2.581, "step": 5906 }, { "crossentropy": 2.616157293319702, "epoch": 0.5024668254508337, "grad_norm": 0.03520866483449936, "grad_norm_var": 1.8899215433215132e-06, "learning_rate": 0.0016578226240542527, "loss": 2.6162, "step": 5907 }, { "crossentropy": 2.5918850898742676, "epoch": 0.502551888397414, "grad_norm": 0.03562029078602791, "grad_norm_var": 1.9825314216938284e-06, "learning_rate": 0.001656332699313222, "loss": 2.5919, "step": 5908 }, { "crossentropy": 2.681281805038452, "epoch": 0.5026369513439946, "grad_norm": 0.03237545117735863, "grad_norm_var": 2.1579969009541747e-06, "learning_rate": 0.001654843311468815, "loss": 2.6813, "step": 5909 }, { "crossentropy": 2.644534111022949, "epoch": 0.5027220142905751, "grad_norm": 0.03469432517886162, "grad_norm_var": 1.079565170352311e-06, "learning_rate": 0.0016533544607601814, "loss": 2.6445, "step": 5910 }, { "crossentropy": 2.6413891315460205, "epoch": 0.5028070772371555, "grad_norm": 0.03351445123553276, "grad_norm_var": 1.07538409920323e-06, "learning_rate": 0.001651866147426392, "loss": 2.6414, "step": 5911 }, { "crossentropy": 2.6134164333343506, "epoch": 0.502892140183736, "grad_norm": 0.033970776945352554, "grad_norm_var": 1.018885593584371e-06, "learning_rate": 0.0016503783717064246, "loss": 2.6134, "step": 5912 }, { "crossentropy": 2.647747755050659, "epoch": 0.5029772031303165, "grad_norm": 0.0344226211309433, "grad_norm_var": 9.599602213491063e-07, "learning_rate": 0.0016488911338391722, "loss": 2.6477, "step": 5913 }, { "crossentropy": 2.6078667640686035, "epoch": 0.5030622660768969, "grad_norm": 0.03479979187250137, "grad_norm_var": 9.525509610327128e-07, "learning_rate": 0.001647404434063447, "loss": 2.6079, "step": 5914 }, { "crossentropy": 2.627962350845337, "epoch": 0.5031473290234774, "grad_norm": 0.03292199969291687, "grad_norm_var": 9.485316230607721e-07, "learning_rate": 0.001645918272617965, "loss": 2.628, "step": 5915 }, { "crossentropy": 2.6504323482513428, "epoch": 0.5032323919700579, "grad_norm": 0.04239385947585106, "grad_norm_var": 5.215945401441241e-06, "learning_rate": 0.0016444326497413613, "loss": 2.6504, "step": 5916 }, { "crossentropy": 2.655667781829834, "epoch": 0.5033174549166383, "grad_norm": 0.03565661236643791, "grad_norm_var": 5.234528611591793e-06, "learning_rate": 0.0016429475656721875, "loss": 2.6557, "step": 5917 }, { "crossentropy": 2.63443660736084, "epoch": 0.5034025178632188, "grad_norm": 0.03117596171796322, "grad_norm_var": 6.00176277657526e-06, "learning_rate": 0.0016414630206489028, "loss": 2.6344, "step": 5918 }, { "crossentropy": 2.6373672485351562, "epoch": 0.5034875808097993, "grad_norm": 0.03338276594877243, "grad_norm_var": 6.0119536977564956e-06, "learning_rate": 0.001639979014909882, "loss": 2.6374, "step": 5919 }, { "crossentropy": 2.6690175533294678, "epoch": 0.5035726437563797, "grad_norm": 0.03356047719717026, "grad_norm_var": 5.956147366361246e-06, "learning_rate": 0.0016384955486934156, "loss": 2.669, "step": 5920 }, { "crossentropy": 2.5325305461883545, "epoch": 0.5036577067029602, "grad_norm": 0.033342182636260986, "grad_norm_var": 5.90997123708369e-06, "learning_rate": 0.0016370126222377047, "loss": 2.5325, "step": 5921 }, { "crossentropy": 2.706686496734619, "epoch": 0.5037427696495407, "grad_norm": 0.03965980187058449, "grad_norm_var": 7.570694986153134e-06, "learning_rate": 0.001635530235780865, "loss": 2.7067, "step": 5922 }, { "crossentropy": 2.6035423278808594, "epoch": 0.5038278325961212, "grad_norm": 0.032654259353876114, "grad_norm_var": 7.837192657661574e-06, "learning_rate": 0.0016340483895609238, "loss": 2.6035, "step": 5923 }, { "crossentropy": 2.6917757987976074, "epoch": 0.5039128955427016, "grad_norm": 0.0369078665971756, "grad_norm_var": 8.1101141212781e-06, "learning_rate": 0.0016325670838158236, "loss": 2.6918, "step": 5924 }, { "crossentropy": 2.669483184814453, "epoch": 0.5039979584892821, "grad_norm": 0.03761310875415802, "grad_norm_var": 8.191142703377674e-06, "learning_rate": 0.001631086318783419, "loss": 2.6695, "step": 5925 }, { "crossentropy": 2.6040091514587402, "epoch": 0.5040830214358626, "grad_norm": 0.03277522325515747, "grad_norm_var": 8.510272137991524e-06, "learning_rate": 0.001629606094701478, "loss": 2.604, "step": 5926 }, { "crossentropy": 2.629269599914551, "epoch": 0.504168084382443, "grad_norm": 0.034576356410980225, "grad_norm_var": 8.38146081493193e-06, "learning_rate": 0.0016281264118076834, "loss": 2.6293, "step": 5927 }, { "crossentropy": 2.6658167839050293, "epoch": 0.5042531473290235, "grad_norm": 0.037050455808639526, "grad_norm_var": 8.55639574601449e-06, "learning_rate": 0.0016266472703396284, "loss": 2.6658, "step": 5928 }, { "crossentropy": 2.5877368450164795, "epoch": 0.504338210275604, "grad_norm": 0.03192899376153946, "grad_norm_var": 9.197125220577204e-06, "learning_rate": 0.0016251686705348212, "loss": 2.5877, "step": 5929 }, { "crossentropy": 2.730349063873291, "epoch": 0.5044232732221844, "grad_norm": 0.03457886353135109, "grad_norm_var": 9.206809263353798e-06, "learning_rate": 0.0016236906126306822, "loss": 2.7303, "step": 5930 }, { "crossentropy": 2.6379213333129883, "epoch": 0.5045083361687649, "grad_norm": 0.03401314839720726, "grad_norm_var": 8.97727542187239e-06, "learning_rate": 0.0016222130968645443, "loss": 2.6379, "step": 5931 }, { "crossentropy": 2.728255033493042, "epoch": 0.5045933991153454, "grad_norm": 0.03432144224643707, "grad_norm_var": 5.1772734505184236e-06, "learning_rate": 0.0016207361234736534, "loss": 2.7283, "step": 5932 }, { "crossentropy": 2.575709104537964, "epoch": 0.5046784620619258, "grad_norm": 0.03348008170723915, "grad_norm_var": 5.159420492877375e-06, "learning_rate": 0.0016192596926951708, "loss": 2.5757, "step": 5933 }, { "crossentropy": 2.603327512741089, "epoch": 0.5047635250085063, "grad_norm": 0.03356055170297623, "grad_norm_var": 4.4774043718969736e-06, "learning_rate": 0.001617783804766168, "loss": 2.6033, "step": 5934 }, { "crossentropy": 2.588299036026001, "epoch": 0.5048485879550868, "grad_norm": 0.03373364731669426, "grad_norm_var": 4.428720419126045e-06, "learning_rate": 0.0016163084599236277, "loss": 2.5883, "step": 5935 }, { "crossentropy": 2.583521842956543, "epoch": 0.5049336509016672, "grad_norm": 0.034727681428194046, "grad_norm_var": 4.350568376164318e-06, "learning_rate": 0.0016148336584044538, "loss": 2.5835, "step": 5936 }, { "crossentropy": 2.7955093383789062, "epoch": 0.5050187138482477, "grad_norm": 0.0356283038854599, "grad_norm_var": 4.268595055520406e-06, "learning_rate": 0.001613359400445451, "loss": 2.7955, "step": 5937 }, { "crossentropy": 2.689483165740967, "epoch": 0.5051037767948282, "grad_norm": 0.03292831778526306, "grad_norm_var": 2.761813304363638e-06, "learning_rate": 0.001611885686283343, "loss": 2.6895, "step": 5938 }, { "crossentropy": 2.6147196292877197, "epoch": 0.5051888397414086, "grad_norm": 0.03302840143442154, "grad_norm_var": 2.6832307258028407e-06, "learning_rate": 0.001610412516154769, "loss": 2.6147, "step": 5939 }, { "crossentropy": 2.582901954650879, "epoch": 0.5052739026879891, "grad_norm": 0.0359988696873188, "grad_norm_var": 2.434347790719604e-06, "learning_rate": 0.0016089398902962766, "loss": 2.5829, "step": 5940 }, { "crossentropy": 2.609760284423828, "epoch": 0.5053589656345696, "grad_norm": 0.032059527933597565, "grad_norm_var": 1.9616252882083407e-06, "learning_rate": 0.0016074678089443245, "loss": 2.6098, "step": 5941 }, { "crossentropy": 2.606443405151367, "epoch": 0.50544402858115, "grad_norm": 0.033197712153196335, "grad_norm_var": 1.9024147807133439e-06, "learning_rate": 0.0016059962723352912, "loss": 2.6064, "step": 5942 }, { "crossentropy": 2.632350444793701, "epoch": 0.5055290915277305, "grad_norm": 0.03378414362668991, "grad_norm_var": 1.88612324497067e-06, "learning_rate": 0.0016045252807054611, "loss": 2.6324, "step": 5943 }, { "crossentropy": 2.620476722717285, "epoch": 0.505614154474311, "grad_norm": 0.03320266678929329, "grad_norm_var": 1.247110248539518e-06, "learning_rate": 0.00160305483429103, "loss": 2.6205, "step": 5944 }, { "crossentropy": 2.5658676624298096, "epoch": 0.5056992174208914, "grad_norm": 0.03295557573437691, "grad_norm_var": 1.0622477434810497e-06, "learning_rate": 0.001601584933328113, "loss": 2.5659, "step": 5945 }, { "crossentropy": 2.6856026649475098, "epoch": 0.5057842803674719, "grad_norm": 0.03290063515305519, "grad_norm_var": 1.0695736536188322e-06, "learning_rate": 0.0016001155780527327, "loss": 2.6856, "step": 5946 }, { "crossentropy": 2.717601776123047, "epoch": 0.5058693433140524, "grad_norm": 0.03313286975026131, "grad_norm_var": 1.083602525881316e-06, "learning_rate": 0.0015986467687008254, "loss": 2.7176, "step": 5947 }, { "crossentropy": 2.617827892303467, "epoch": 0.5059544062606328, "grad_norm": 0.03804510831832886, "grad_norm_var": 2.276111027356825e-06, "learning_rate": 0.0015971785055082378, "loss": 2.6178, "step": 5948 }, { "crossentropy": 2.7057137489318848, "epoch": 0.5060394692072133, "grad_norm": 0.035298388451337814, "grad_norm_var": 2.381489683866457e-06, "learning_rate": 0.001595710788710733, "loss": 2.7057, "step": 5949 }, { "crossentropy": 2.6837260723114014, "epoch": 0.5061245321537938, "grad_norm": 0.03458529710769653, "grad_norm_var": 2.3855204911594927e-06, "learning_rate": 0.001594243618543984, "loss": 2.6837, "step": 5950 }, { "crossentropy": 2.6729085445404053, "epoch": 0.5062095951003743, "grad_norm": 0.032253868877887726, "grad_norm_var": 2.5898178118303567e-06, "learning_rate": 0.0015927769952435744, "loss": 2.6729, "step": 5951 }, { "crossentropy": 2.7101073265075684, "epoch": 0.5062946580469547, "grad_norm": 0.037428371608257294, "grad_norm_var": 3.313843841109126e-06, "learning_rate": 0.0015913109190450032, "loss": 2.7101, "step": 5952 }, { "crossentropy": 2.6211767196655273, "epoch": 0.5063797209935352, "grad_norm": 0.03162108734250069, "grad_norm_var": 3.5285411600303094e-06, "learning_rate": 0.001589845390183679, "loss": 2.6212, "step": 5953 }, { "crossentropy": 2.561690092086792, "epoch": 0.5064647839401157, "grad_norm": 0.033267516642808914, "grad_norm_var": 3.4917274394807273e-06, "learning_rate": 0.0015883804088949222, "loss": 2.5617, "step": 5954 }, { "crossentropy": 2.7035024166107178, "epoch": 0.5065498468866961, "grad_norm": 0.0325453095138073, "grad_norm_var": 3.5639046214748967e-06, "learning_rate": 0.0015869159754139696, "loss": 2.7035, "step": 5955 }, { "crossentropy": 2.603541851043701, "epoch": 0.5066349098332766, "grad_norm": 0.03324569761753082, "grad_norm_var": 3.26435546726857e-06, "learning_rate": 0.0015854520899759655, "loss": 2.6035, "step": 5956 }, { "crossentropy": 2.7111291885375977, "epoch": 0.5067199727798571, "grad_norm": 0.03457111120223999, "grad_norm_var": 3.1024743989966143e-06, "learning_rate": 0.0015839887528159664, "loss": 2.7111, "step": 5957 }, { "crossentropy": 2.5455482006073, "epoch": 0.5068050357264375, "grad_norm": 0.032837651669979095, "grad_norm_var": 3.143198497046473e-06, "learning_rate": 0.0015825259641689465, "loss": 2.5455, "step": 5958 }, { "crossentropy": 2.6406874656677246, "epoch": 0.506890098673018, "grad_norm": 0.03478284180164337, "grad_norm_var": 3.196139778031265e-06, "learning_rate": 0.0015810637242697828, "loss": 2.6407, "step": 5959 }, { "crossentropy": 2.665534496307373, "epoch": 0.5069751616195985, "grad_norm": 0.03347475081682205, "grad_norm_var": 3.174847618563938e-06, "learning_rate": 0.0015796020333532696, "loss": 2.6655, "step": 5960 }, { "crossentropy": 2.692365884780884, "epoch": 0.5070602245661789, "grad_norm": 0.03605034574866295, "grad_norm_var": 3.369660947349397e-06, "learning_rate": 0.0015781408916541145, "loss": 2.6924, "step": 5961 }, { "crossentropy": 2.6771442890167236, "epoch": 0.5071452875127594, "grad_norm": 0.03331746906042099, "grad_norm_var": 3.3123308800562153e-06, "learning_rate": 0.0015766802994069335, "loss": 2.6771, "step": 5962 }, { "crossentropy": 2.6603245735168457, "epoch": 0.50723035045934, "grad_norm": 0.03406640887260437, "grad_norm_var": 3.2397464542851505e-06, "learning_rate": 0.0015752202568462554, "loss": 2.6603, "step": 5963 }, { "crossentropy": 2.6363158226013184, "epoch": 0.5073154134059203, "grad_norm": 0.034650396555662155, "grad_norm_var": 2.225005714475333e-06, "learning_rate": 0.0015737607642065206, "loss": 2.6363, "step": 5964 }, { "crossentropy": 2.575810432434082, "epoch": 0.5074004763525009, "grad_norm": 0.03219239413738251, "grad_norm_var": 2.2901605460469315e-06, "learning_rate": 0.0015723018217220846, "loss": 2.5758, "step": 5965 }, { "crossentropy": 2.7461886405944824, "epoch": 0.5074855392990814, "grad_norm": 0.036475930362939835, "grad_norm_var": 2.7101014593538624e-06, "learning_rate": 0.0015708434296272061, "loss": 2.7462, "step": 5966 }, { "crossentropy": 2.5922064781188965, "epoch": 0.5075706022456618, "grad_norm": 0.03542559966444969, "grad_norm_var": 2.6326248279491423e-06, "learning_rate": 0.0015693855881560653, "loss": 2.5922, "step": 5967 }, { "crossentropy": 2.706321954727173, "epoch": 0.5076556651922423, "grad_norm": 0.03325668349862099, "grad_norm_var": 1.881255050025186e-06, "learning_rate": 0.0015679282975427488, "loss": 2.7063, "step": 5968 }, { "crossentropy": 2.6961209774017334, "epoch": 0.5077407281388228, "grad_norm": 0.03393213078379631, "grad_norm_var": 1.524757894807068e-06, "learning_rate": 0.0015664715580212547, "loss": 2.6961, "step": 5969 }, { "crossentropy": 2.7025697231292725, "epoch": 0.5078257910854032, "grad_norm": 0.03474990651011467, "grad_norm_var": 1.516184151323143e-06, "learning_rate": 0.0015650153698254915, "loss": 2.7026, "step": 5970 }, { "crossentropy": 2.6264216899871826, "epoch": 0.5079108540319837, "grad_norm": 0.03368023410439491, "grad_norm_var": 1.361666616286894e-06, "learning_rate": 0.0015635597331892848, "loss": 2.6264, "step": 5971 }, { "crossentropy": 2.7371339797973633, "epoch": 0.5079959169785642, "grad_norm": 0.036175794899463654, "grad_norm_var": 1.5374074804376525e-06, "learning_rate": 0.0015621046483463663, "loss": 2.7371, "step": 5972 }, { "crossentropy": 2.5472300052642822, "epoch": 0.5080809799251446, "grad_norm": 0.0335761196911335, "grad_norm_var": 1.5702779104388604e-06, "learning_rate": 0.0015606501155303804, "loss": 2.5472, "step": 5973 }, { "crossentropy": 2.6265974044799805, "epoch": 0.5081660428717251, "grad_norm": 0.034393247216939926, "grad_norm_var": 1.420224323228636e-06, "learning_rate": 0.0015591961349748834, "loss": 2.6266, "step": 5974 }, { "crossentropy": 2.6028778553009033, "epoch": 0.5082511058183056, "grad_norm": 0.034632548689842224, "grad_norm_var": 1.4137141056080883e-06, "learning_rate": 0.0015577427069133427, "loss": 2.6029, "step": 5975 }, { "crossentropy": 2.7222812175750732, "epoch": 0.508336168764886, "grad_norm": 0.037183165550231934, "grad_norm_var": 1.826558411723254e-06, "learning_rate": 0.0015562898315791352, "loss": 2.7223, "step": 5976 }, { "crossentropy": 2.6550135612487793, "epoch": 0.5084212317114665, "grad_norm": 0.033966705203056335, "grad_norm_var": 1.697722531768658e-06, "learning_rate": 0.001554837509205554, "loss": 2.655, "step": 5977 }, { "crossentropy": 2.6361029148101807, "epoch": 0.508506294658047, "grad_norm": 0.03270835056900978, "grad_norm_var": 1.8153007681725544e-06, "learning_rate": 0.001553385740025799, "loss": 2.6361, "step": 5978 }, { "crossentropy": 2.6172873973846436, "epoch": 0.5085913576046274, "grad_norm": 0.03378340229392052, "grad_norm_var": 1.8344641428362391e-06, "learning_rate": 0.001551934524272982, "loss": 2.6173, "step": 5979 }, { "crossentropy": 2.6049766540527344, "epoch": 0.5086764205512079, "grad_norm": 0.03393993526697159, "grad_norm_var": 1.8445569825784971e-06, "learning_rate": 0.001550483862180127, "loss": 2.605, "step": 5980 }, { "crossentropy": 2.5655291080474854, "epoch": 0.5087614834977884, "grad_norm": 0.032354846596717834, "grad_norm_var": 1.7988327765570503e-06, "learning_rate": 0.0015490337539801674, "loss": 2.5655, "step": 5981 }, { "crossentropy": 2.6593284606933594, "epoch": 0.5088465464443689, "grad_norm": 0.032190002501010895, "grad_norm_var": 1.7546939362830226e-06, "learning_rate": 0.0015475841999059486, "loss": 2.6593, "step": 5982 }, { "crossentropy": 2.7643485069274902, "epoch": 0.5089316093909493, "grad_norm": 0.03339212015271187, "grad_norm_var": 1.659631730304735e-06, "learning_rate": 0.0015461352001902302, "loss": 2.7643, "step": 5983 }, { "crossentropy": 2.5960493087768555, "epoch": 0.5090166723375298, "grad_norm": 0.03639506921172142, "grad_norm_var": 1.966399395637343e-06, "learning_rate": 0.0015446867550656768, "loss": 2.596, "step": 5984 }, { "crossentropy": 2.665850877761841, "epoch": 0.5091017352841103, "grad_norm": 0.03396235406398773, "grad_norm_var": 1.965413912167341e-06, "learning_rate": 0.0015432388647648688, "loss": 2.6659, "step": 5985 }, { "crossentropy": 2.6471621990203857, "epoch": 0.5091867982306907, "grad_norm": 0.034600913524627686, "grad_norm_var": 1.9557327772571147e-06, "learning_rate": 0.0015417915295202944, "loss": 2.6472, "step": 5986 }, { "crossentropy": 2.6702966690063477, "epoch": 0.5092718611772712, "grad_norm": 0.03220078721642494, "grad_norm_var": 2.191789815490265e-06, "learning_rate": 0.001540344749564354, "loss": 2.6703, "step": 5987 }, { "crossentropy": 2.607395648956299, "epoch": 0.5093569241238517, "grad_norm": 0.037940312176942825, "grad_norm_var": 2.876881813070285e-06, "learning_rate": 0.001538898525129358, "loss": 2.6074, "step": 5988 }, { "crossentropy": 2.627530336380005, "epoch": 0.5094419870704321, "grad_norm": 0.03215041756629944, "grad_norm_var": 3.1227528270639028e-06, "learning_rate": 0.0015374528564475309, "loss": 2.6275, "step": 5989 }, { "crossentropy": 2.636983633041382, "epoch": 0.5095270500170126, "grad_norm": 0.03390134498476982, "grad_norm_var": 3.1194385909732423e-06, "learning_rate": 0.0015360077437510044, "loss": 2.637, "step": 5990 }, { "crossentropy": 2.662182331085205, "epoch": 0.5096121129635931, "grad_norm": 0.03654490038752556, "grad_norm_var": 3.488540655120313e-06, "learning_rate": 0.0015345631872718214, "loss": 2.6622, "step": 5991 }, { "crossentropy": 2.7385506629943848, "epoch": 0.5096971759101735, "grad_norm": 0.0338984839618206, "grad_norm_var": 2.856761997720112e-06, "learning_rate": 0.0015331191872419347, "loss": 2.7386, "step": 5992 }, { "crossentropy": 2.6661951541900635, "epoch": 0.509782238856754, "grad_norm": 0.033413954079151154, "grad_norm_var": 2.877989003676323e-06, "learning_rate": 0.0015316757438932121, "loss": 2.6662, "step": 5993 }, { "crossentropy": 2.7183213233947754, "epoch": 0.5098673018033345, "grad_norm": 0.035354696214199066, "grad_norm_var": 2.8736667910550055e-06, "learning_rate": 0.0015302328574574291, "loss": 2.7183, "step": 5994 }, { "crossentropy": 2.7158901691436768, "epoch": 0.5099523647499149, "grad_norm": 0.032097384333610535, "grad_norm_var": 3.1284555518803468e-06, "learning_rate": 0.0015287905281662667, "loss": 2.7159, "step": 5995 }, { "crossentropy": 2.667097806930542, "epoch": 0.5100374276964954, "grad_norm": 0.03272628411650658, "grad_norm_var": 3.233648174657174e-06, "learning_rate": 0.0015273487562513267, "loss": 2.6671, "step": 5996 }, { "crossentropy": 2.6688551902770996, "epoch": 0.5101224906430759, "grad_norm": 0.03379617631435394, "grad_norm_var": 3.057849757622429e-06, "learning_rate": 0.0015259075419441143, "loss": 2.6689, "step": 5997 }, { "crossentropy": 2.6537363529205322, "epoch": 0.5102075535896563, "grad_norm": 0.03545406088232994, "grad_norm_var": 2.9206308396242364e-06, "learning_rate": 0.0015244668854760457, "loss": 2.6537, "step": 5998 }, { "crossentropy": 2.6714653968811035, "epoch": 0.5102926165362368, "grad_norm": 0.034441087394952774, "grad_norm_var": 2.870909071817027e-06, "learning_rate": 0.0015230267870784515, "loss": 2.6715, "step": 5999 }, { "crossentropy": 2.657930374145508, "epoch": 0.5103776794828173, "grad_norm": 0.031629640609025955, "grad_norm_var": 2.9621604199845177e-06, "learning_rate": 0.001521587246982568, "loss": 2.6579, "step": 6000 }, { "crossentropy": 2.6540169715881348, "epoch": 0.5104627424293977, "grad_norm": 0.03836583346128464, "grad_norm_var": 4.147832573776246e-06, "learning_rate": 0.0015201482654195447, "loss": 2.654, "step": 6001 }, { "crossentropy": 2.6827595233917236, "epoch": 0.5105478053759782, "grad_norm": 0.03306078910827637, "grad_norm_var": 4.23064755290057e-06, "learning_rate": 0.001518709842620441, "loss": 2.6828, "step": 6002 }, { "crossentropy": 2.6769330501556396, "epoch": 0.5106328683225587, "grad_norm": 0.03372317552566528, "grad_norm_var": 3.972531118479062e-06, "learning_rate": 0.0015172719788162248, "loss": 2.6769, "step": 6003 }, { "crossentropy": 2.585408926010132, "epoch": 0.5107179312691391, "grad_norm": 0.033348482102155685, "grad_norm_var": 3.0500428587300342e-06, "learning_rate": 0.001515834674237775, "loss": 2.5854, "step": 6004 }, { "crossentropy": 2.534888982772827, "epoch": 0.5108029942157196, "grad_norm": 0.03343212231993675, "grad_norm_var": 2.837629728788017e-06, "learning_rate": 0.0015143979291158838, "loss": 2.5349, "step": 6005 }, { "crossentropy": 2.6780381202697754, "epoch": 0.5108880571623001, "grad_norm": 0.03494136780500412, "grad_norm_var": 2.8812524036007463e-06, "learning_rate": 0.0015129617436812504, "loss": 2.678, "step": 6006 }, { "crossentropy": 2.61399507522583, "epoch": 0.5109731201088805, "grad_norm": 0.033547982573509216, "grad_norm_var": 2.4813365983052002e-06, "learning_rate": 0.001511526118164484, "loss": 2.614, "step": 6007 }, { "crossentropy": 2.658296585083008, "epoch": 0.511058183055461, "grad_norm": 0.034448157995939255, "grad_norm_var": 2.4963004562690457e-06, "learning_rate": 0.0015100910527961048, "loss": 2.6583, "step": 6008 }, { "crossentropy": 2.5656795501708984, "epoch": 0.5111432460020415, "grad_norm": 0.034920476377010345, "grad_norm_var": 2.5231791700686472e-06, "learning_rate": 0.001508656547806543, "loss": 2.5657, "step": 6009 }, { "crossentropy": 2.650505542755127, "epoch": 0.511228308948622, "grad_norm": 0.03706337511539459, "grad_norm_var": 2.995949463635052e-06, "learning_rate": 0.001507222603426137, "loss": 2.6505, "step": 6010 }, { "crossentropy": 2.5747361183166504, "epoch": 0.5113133718952024, "grad_norm": 0.03361815586686134, "grad_norm_var": 2.7167299779991736e-06, "learning_rate": 0.0015057892198851413, "loss": 2.5747, "step": 6011 }, { "crossentropy": 2.5817008018493652, "epoch": 0.5113984348417829, "grad_norm": 0.03228943049907684, "grad_norm_var": 2.81929237280065e-06, "learning_rate": 0.0015043563974137131, "loss": 2.5817, "step": 6012 }, { "crossentropy": 2.582552671432495, "epoch": 0.5114834977883634, "grad_norm": 0.032568663358688354, "grad_norm_var": 2.988564773609368e-06, "learning_rate": 0.0015029241362419228, "loss": 2.5826, "step": 6013 }, { "crossentropy": 2.651463508605957, "epoch": 0.5115685607349438, "grad_norm": 0.03281895071268082, "grad_norm_var": 2.9743165825056133e-06, "learning_rate": 0.0015014924365997484, "loss": 2.6515, "step": 6014 }, { "crossentropy": 2.573753595352173, "epoch": 0.5116536236815243, "grad_norm": 0.033634502440690994, "grad_norm_var": 2.9690044117329045e-06, "learning_rate": 0.0015000612987170847, "loss": 2.5738, "step": 6015 }, { "crossentropy": 2.646009683609009, "epoch": 0.5117386866281048, "grad_norm": 0.033702149987220764, "grad_norm_var": 2.5926188196097207e-06, "learning_rate": 0.0014986307228237268, "loss": 2.646, "step": 6016 }, { "crossentropy": 2.633864164352417, "epoch": 0.5118237495746852, "grad_norm": 0.03322219103574753, "grad_norm_var": 1.3156066314819077e-06, "learning_rate": 0.0014972007091493828, "loss": 2.6339, "step": 6017 }, { "crossentropy": 2.6740899085998535, "epoch": 0.5119088125212657, "grad_norm": 0.03552287444472313, "grad_norm_var": 1.4612449893274908e-06, "learning_rate": 0.0014957712579236765, "loss": 2.6741, "step": 6018 }, { "crossentropy": 2.677530288696289, "epoch": 0.5119938754678462, "grad_norm": 0.03245972841978073, "grad_norm_var": 1.5950345940346499e-06, "learning_rate": 0.0014943423693761337, "loss": 2.6775, "step": 6019 }, { "crossentropy": 2.6681323051452637, "epoch": 0.5120789384144266, "grad_norm": 0.03358956798911095, "grad_norm_var": 1.5826693927800672e-06, "learning_rate": 0.0014929140437361915, "loss": 2.6681, "step": 6020 }, { "crossentropy": 2.6150248050689697, "epoch": 0.5121640013610071, "grad_norm": 0.03417094424366951, "grad_norm_var": 1.5745141816049524e-06, "learning_rate": 0.0014914862812332014, "loss": 2.615, "step": 6021 }, { "crossentropy": 2.61008882522583, "epoch": 0.5122490643075877, "grad_norm": 0.0357821024954319, "grad_norm_var": 1.734596205378985e-06, "learning_rate": 0.00149005908209642, "loss": 2.6101, "step": 6022 }, { "crossentropy": 2.704962730407715, "epoch": 0.512334127254168, "grad_norm": 0.03443659469485283, "grad_norm_var": 1.7351372111784613e-06, "learning_rate": 0.0014886324465550106, "loss": 2.705, "step": 6023 }, { "crossentropy": 2.724924325942993, "epoch": 0.5124191902007486, "grad_norm": 0.03528290241956711, "grad_norm_var": 1.8268425496158087e-06, "learning_rate": 0.0014872063748380543, "loss": 2.7249, "step": 6024 }, { "crossentropy": 2.6556224822998047, "epoch": 0.5125042531473291, "grad_norm": 0.034574851393699646, "grad_norm_var": 1.795008113757562e-06, "learning_rate": 0.0014857808671745348, "loss": 2.6556, "step": 6025 }, { "crossentropy": 2.682772636413574, "epoch": 0.5125893160939095, "grad_norm": 0.037816524505615234, "grad_norm_var": 2.1334586199336348e-06, "learning_rate": 0.0014843559237933473, "loss": 2.6828, "step": 6026 }, { "crossentropy": 2.64939022064209, "epoch": 0.51267437904049, "grad_norm": 0.033130913972854614, "grad_norm_var": 2.179153605538775e-06, "learning_rate": 0.0014829315449232984, "loss": 2.6494, "step": 6027 }, { "crossentropy": 2.5983073711395264, "epoch": 0.5127594419870705, "grad_norm": 0.031706802546978, "grad_norm_var": 2.338122254978009e-06, "learning_rate": 0.0014815077307931018, "loss": 2.5983, "step": 6028 }, { "crossentropy": 2.6981656551361084, "epoch": 0.5128445049336509, "grad_norm": 0.033635083585977554, "grad_norm_var": 2.20194484232359e-06, "learning_rate": 0.0014800844816313802, "loss": 2.6982, "step": 6029 }, { "crossentropy": 2.5807578563690186, "epoch": 0.5129295678802314, "grad_norm": 0.03193942457437515, "grad_norm_var": 2.399691045905777e-06, "learning_rate": 0.0014786617976666671, "loss": 2.5808, "step": 6030 }, { "crossentropy": 2.6093649864196777, "epoch": 0.5130146308268119, "grad_norm": 0.03411436825990677, "grad_norm_var": 2.388269735328907e-06, "learning_rate": 0.001477239679127404, "loss": 2.6094, "step": 6031 }, { "crossentropy": 2.696103572845459, "epoch": 0.5130996937733923, "grad_norm": 0.03628917783498764, "grad_norm_var": 2.680390109208869e-06, "learning_rate": 0.0014758181262419423, "loss": 2.6961, "step": 6032 }, { "crossentropy": 2.6537373065948486, "epoch": 0.5131847567199728, "grad_norm": 0.03592820093035698, "grad_norm_var": 2.7745610206616455e-06, "learning_rate": 0.0014743971392385414, "loss": 2.6537, "step": 6033 }, { "crossentropy": 2.672783851623535, "epoch": 0.5132698196665533, "grad_norm": 0.03237379714846611, "grad_norm_var": 2.9223616755428077e-06, "learning_rate": 0.0014729767183453736, "loss": 2.6728, "step": 6034 }, { "crossentropy": 2.654615640640259, "epoch": 0.5133548826131337, "grad_norm": 0.030549172312021255, "grad_norm_var": 3.5943122369224794e-06, "learning_rate": 0.0014715568637905158, "loss": 2.6546, "step": 6035 }, { "crossentropy": 2.641935348510742, "epoch": 0.5134399455597142, "grad_norm": 0.035608239471912384, "grad_norm_var": 3.7163189756600837e-06, "learning_rate": 0.0014701375758019541, "loss": 2.6419, "step": 6036 }, { "crossentropy": 2.6970834732055664, "epoch": 0.5135250085062947, "grad_norm": 0.03392995893955231, "grad_norm_var": 3.7211615397464427e-06, "learning_rate": 0.0014687188546075904, "loss": 2.6971, "step": 6037 }, { "crossentropy": 2.577504873275757, "epoch": 0.5136100714528751, "grad_norm": 0.03622176870703697, "grad_norm_var": 3.826362751035752e-06, "learning_rate": 0.0014673007004352252, "loss": 2.5775, "step": 6038 }, { "crossentropy": 2.678476095199585, "epoch": 0.5136951343994556, "grad_norm": 0.03316280245780945, "grad_norm_var": 3.891174446027321e-06, "learning_rate": 0.0014658831135125738, "loss": 2.6785, "step": 6039 }, { "crossentropy": 2.680135726928711, "epoch": 0.5137801973460361, "grad_norm": 0.03289876878261566, "grad_norm_var": 3.883595933385347e-06, "learning_rate": 0.0014644660940672626, "loss": 2.6801, "step": 6040 }, { "crossentropy": 2.6771538257598877, "epoch": 0.5138652602926166, "grad_norm": 0.034377895295619965, "grad_norm_var": 3.870727156533339e-06, "learning_rate": 0.001463049642326823, "loss": 2.6772, "step": 6041 }, { "crossentropy": 2.6964006423950195, "epoch": 0.513950323239197, "grad_norm": 0.035733845084905624, "grad_norm_var": 3.0765078230278464e-06, "learning_rate": 0.001461633758518694, "loss": 2.6964, "step": 6042 }, { "crossentropy": 2.6276211738586426, "epoch": 0.5140353861857775, "grad_norm": 0.03388948738574982, "grad_norm_var": 3.0397404287597907e-06, "learning_rate": 0.0014602184428702293, "loss": 2.6276, "step": 6043 }, { "crossentropy": 2.647878646850586, "epoch": 0.514120449132358, "grad_norm": 0.03171778470277786, "grad_norm_var": 3.0365402664117116e-06, "learning_rate": 0.0014588036956086875, "loss": 2.6479, "step": 6044 }, { "crossentropy": 2.744504690170288, "epoch": 0.5142055120789384, "grad_norm": 0.03262325003743172, "grad_norm_var": 3.136013535825812e-06, "learning_rate": 0.0014573895169612322, "loss": 2.7445, "step": 6045 }, { "crossentropy": 2.5834434032440186, "epoch": 0.5142905750255189, "grad_norm": 0.033297888934612274, "grad_norm_var": 2.9080330352461436e-06, "learning_rate": 0.0014559759071549438, "loss": 2.5834, "step": 6046 }, { "crossentropy": 2.57480525970459, "epoch": 0.5143756379720994, "grad_norm": 0.03138326480984688, "grad_norm_var": 3.3033554802728084e-06, "learning_rate": 0.0014545628664168059, "loss": 2.5748, "step": 6047 }, { "crossentropy": 2.6352593898773193, "epoch": 0.5144607009186798, "grad_norm": 0.03463798761367798, "grad_norm_var": 2.914532988667652e-06, "learning_rate": 0.0014531503949737107, "loss": 2.6353, "step": 6048 }, { "crossentropy": 2.6752121448516846, "epoch": 0.5145457638652603, "grad_norm": 0.03293781355023384, "grad_norm_var": 2.5634316831108298e-06, "learning_rate": 0.001451738493052464, "loss": 2.6752, "step": 6049 }, { "crossentropy": 2.6602253913879395, "epoch": 0.5146308268118408, "grad_norm": 0.03292699530720711, "grad_norm_var": 2.5025154165432435e-06, "learning_rate": 0.001450327160879774, "loss": 2.6602, "step": 6050 }, { "crossentropy": 2.6474831104278564, "epoch": 0.5147158897584212, "grad_norm": 0.03364307060837746, "grad_norm_var": 1.8861611369047048e-06, "learning_rate": 0.0014489163986822606, "loss": 2.6475, "step": 6051 }, { "crossentropy": 2.7843570709228516, "epoch": 0.5148009527050017, "grad_norm": 0.0340818390250206, "grad_norm_var": 1.6407540222542737e-06, "learning_rate": 0.0014475062066864515, "loss": 2.7844, "step": 6052 }, { "crossentropy": 2.7293951511383057, "epoch": 0.5148860156515822, "grad_norm": 0.0334780178964138, "grad_norm_var": 1.6331261463884324e-06, "learning_rate": 0.0014460965851187824, "loss": 2.7294, "step": 6053 }, { "crossentropy": 2.625761032104492, "epoch": 0.5149710785981626, "grad_norm": 0.03489154949784279, "grad_norm_var": 1.2722024790597549e-06, "learning_rate": 0.0014446875342055986, "loss": 2.6258, "step": 6054 }, { "crossentropy": 2.6136388778686523, "epoch": 0.5150561415447431, "grad_norm": 0.03162604942917824, "grad_norm_var": 1.4848259553445463e-06, "learning_rate": 0.0014432790541731516, "loss": 2.6136, "step": 6055 }, { "crossentropy": 2.683331251144409, "epoch": 0.5151412044913236, "grad_norm": 0.033015668392181396, "grad_norm_var": 1.4781154692818908e-06, "learning_rate": 0.0014418711452476047, "loss": 2.6833, "step": 6056 }, { "crossentropy": 2.661759376525879, "epoch": 0.515226267437904, "grad_norm": 0.03316623345017433, "grad_norm_var": 1.4105001610716557e-06, "learning_rate": 0.0014404638076550274, "loss": 2.6618, "step": 6057 }, { "crossentropy": 2.631967067718506, "epoch": 0.5153113303844845, "grad_norm": 0.034962013363838196, "grad_norm_var": 1.1988765243733355e-06, "learning_rate": 0.001439057041621395, "loss": 2.632, "step": 6058 }, { "crossentropy": 2.685725450515747, "epoch": 0.515396393331065, "grad_norm": 0.032099343836307526, "grad_norm_var": 1.2506891332599834e-06, "learning_rate": 0.0014376508473725985, "loss": 2.6857, "step": 6059 }, { "crossentropy": 2.6441617012023926, "epoch": 0.5154814562776454, "grad_norm": 0.032843902707099915, "grad_norm_var": 1.1140691522049453e-06, "learning_rate": 0.0014362452251344283, "loss": 2.6442, "step": 6060 }, { "crossentropy": 2.629481792449951, "epoch": 0.5155665192242259, "grad_norm": 0.03240654245018959, "grad_norm_var": 1.1344183463721583e-06, "learning_rate": 0.0014348401751325857, "loss": 2.6295, "step": 6061 }, { "crossentropy": 2.6436820030212402, "epoch": 0.5156515821708064, "grad_norm": 0.03403715789318085, "grad_norm_var": 1.1770036838155856e-06, "learning_rate": 0.0014334356975926844, "loss": 2.6437, "step": 6062 }, { "crossentropy": 2.675954580307007, "epoch": 0.5157366451173868, "grad_norm": 0.034551989287137985, "grad_norm_var": 1.012235838979541e-06, "learning_rate": 0.001432031792740242, "loss": 2.676, "step": 6063 }, { "crossentropy": 2.678463935852051, "epoch": 0.5158217080639673, "grad_norm": 0.03454044461250305, "grad_norm_var": 9.974661581761575e-07, "learning_rate": 0.0014306284608006836, "loss": 2.6785, "step": 6064 }, { "crossentropy": 2.7164359092712402, "epoch": 0.5159067710105478, "grad_norm": 0.03415559232234955, "grad_norm_var": 1.006901165985045e-06, "learning_rate": 0.001429225701999347, "loss": 2.7164, "step": 6065 }, { "crossentropy": 2.6641175746917725, "epoch": 0.5159918339571282, "grad_norm": 0.033093590289354324, "grad_norm_var": 9.953158416185308e-07, "learning_rate": 0.0014278235165614745, "loss": 2.6641, "step": 6066 }, { "crossentropy": 2.6962087154388428, "epoch": 0.5160768969037087, "grad_norm": 0.03885012865066528, "grad_norm_var": 2.76350517942562e-06, "learning_rate": 0.0014264219047122122, "loss": 2.6962, "step": 6067 }, { "crossentropy": 2.5838844776153564, "epoch": 0.5161619598502892, "grad_norm": 0.03138380125164986, "grad_norm_var": 3.1395649240889095e-06, "learning_rate": 0.0014250208666766234, "loss": 2.5839, "step": 6068 }, { "crossentropy": 2.6602423191070557, "epoch": 0.5162470227968697, "grad_norm": 0.03372347354888916, "grad_norm_var": 3.13626595595014e-06, "learning_rate": 0.0014236204026796728, "loss": 2.6602, "step": 6069 }, { "crossentropy": 2.6392576694488525, "epoch": 0.5163320857434501, "grad_norm": 0.03282712772488594, "grad_norm_var": 3.0771865454997636e-06, "learning_rate": 0.0014222205129462346, "loss": 2.6393, "step": 6070 }, { "crossentropy": 2.622258424758911, "epoch": 0.5164171486900306, "grad_norm": 0.033463090658187866, "grad_norm_var": 2.8094613751266743e-06, "learning_rate": 0.0014208211977010889, "loss": 2.6223, "step": 6071 }, { "crossentropy": 2.5970699787139893, "epoch": 0.5165022116366111, "grad_norm": 0.03640362620353699, "grad_norm_var": 3.219976762371355e-06, "learning_rate": 0.0014194224571689284, "loss": 2.5971, "step": 6072 }, { "crossentropy": 2.7093350887298584, "epoch": 0.5165872745831915, "grad_norm": 0.03448140248656273, "grad_norm_var": 3.198226548640585e-06, "learning_rate": 0.0014180242915743497, "loss": 2.7093, "step": 6073 }, { "crossentropy": 2.735389471054077, "epoch": 0.516672337529772, "grad_norm": 0.03481026366353035, "grad_norm_var": 3.1799775548554374e-06, "learning_rate": 0.001416626701141857, "loss": 2.7354, "step": 6074 }, { "crossentropy": 2.7212541103363037, "epoch": 0.5167574004763525, "grad_norm": 0.034922223538160324, "grad_norm_var": 2.9703698345426762e-06, "learning_rate": 0.0014152296860958642, "loss": 2.7213, "step": 6075 }, { "crossentropy": 2.6433873176574707, "epoch": 0.5168424634229329, "grad_norm": 0.03628220781683922, "grad_norm_var": 3.1077695022631187e-06, "learning_rate": 0.0014138332466606908, "loss": 2.6434, "step": 6076 }, { "crossentropy": 2.668470859527588, "epoch": 0.5169275263695134, "grad_norm": 0.034354932606220245, "grad_norm_var": 2.8347504057684807e-06, "learning_rate": 0.001412437383060563, "loss": 2.6685, "step": 6077 }, { "crossentropy": 2.606717586517334, "epoch": 0.517012589316094, "grad_norm": 0.03192936256527901, "grad_norm_var": 3.2404130273994483e-06, "learning_rate": 0.0014110420955196201, "loss": 2.6067, "step": 6078 }, { "crossentropy": 2.6754519939422607, "epoch": 0.5170976522626743, "grad_norm": 0.03370743244886398, "grad_norm_var": 3.2634666485118386e-06, "learning_rate": 0.0014096473842619033, "loss": 2.6755, "step": 6079 }, { "crossentropy": 2.5679450035095215, "epoch": 0.5171827152092549, "grad_norm": 0.033651672303676605, "grad_norm_var": 3.285296221454866e-06, "learning_rate": 0.0014082532495113627, "loss": 2.5679, "step": 6080 }, { "crossentropy": 2.67228364944458, "epoch": 0.5172677781558354, "grad_norm": 0.03377600386738777, "grad_norm_var": 3.299206127564622e-06, "learning_rate": 0.0014068596914918563, "loss": 2.6723, "step": 6081 }, { "crossentropy": 2.756300210952759, "epoch": 0.5173528411024158, "grad_norm": 0.0338650718331337, "grad_norm_var": 3.219635627760055e-06, "learning_rate": 0.0014054667104271496, "loss": 2.7563, "step": 6082 }, { "crossentropy": 2.5245282649993896, "epoch": 0.5174379040489963, "grad_norm": 0.03386177122592926, "grad_norm_var": 1.7332066882200698e-06, "learning_rate": 0.001404074306540914, "loss": 2.5245, "step": 6083 }, { "crossentropy": 2.5933265686035156, "epoch": 0.5175229669955768, "grad_norm": 0.03238792344927788, "grad_norm_var": 1.450615513486538e-06, "learning_rate": 0.001402682480056731, "loss": 2.5933, "step": 6084 }, { "crossentropy": 2.5971086025238037, "epoch": 0.5176080299421572, "grad_norm": 0.03351125866174698, "grad_norm_var": 1.4620461514955792e-06, "learning_rate": 0.0014012912311980885, "loss": 2.5971, "step": 6085 }, { "crossentropy": 2.5612621307373047, "epoch": 0.5176930928887377, "grad_norm": 0.03606092557311058, "grad_norm_var": 1.6035829188817038e-06, "learning_rate": 0.0013999005601883774, "loss": 2.5613, "step": 6086 }, { "crossentropy": 2.6727523803710938, "epoch": 0.5177781558353182, "grad_norm": 0.03270956128835678, "grad_norm_var": 1.714798759031716e-06, "learning_rate": 0.0013985104672509059, "loss": 2.6728, "step": 6087 }, { "crossentropy": 2.7489569187164307, "epoch": 0.5178632187818986, "grad_norm": 0.035399872809648514, "grad_norm_var": 1.478797689143975e-06, "learning_rate": 0.0013971209526088763, "loss": 2.749, "step": 6088 }, { "crossentropy": 2.6236770153045654, "epoch": 0.5179482817284791, "grad_norm": 0.03359975293278694, "grad_norm_var": 1.4833662355402561e-06, "learning_rate": 0.001395732016485406, "loss": 2.6237, "step": 6089 }, { "crossentropy": 2.608079433441162, "epoch": 0.5180333446750596, "grad_norm": 0.03344820812344551, "grad_norm_var": 1.4615896293432642e-06, "learning_rate": 0.0013943436591035208, "loss": 2.6081, "step": 6090 }, { "crossentropy": 2.7307016849517822, "epoch": 0.51811840762164, "grad_norm": 0.03270531818270683, "grad_norm_var": 1.486333888176851e-06, "learning_rate": 0.0013929558806861492, "loss": 2.7307, "step": 6091 }, { "crossentropy": 2.654041290283203, "epoch": 0.5182034705682205, "grad_norm": 0.03374214842915535, "grad_norm_var": 1.058469267080769e-06, "learning_rate": 0.0013915686814561284, "loss": 2.654, "step": 6092 }, { "crossentropy": 2.628603458404541, "epoch": 0.518288533514801, "grad_norm": 0.03345973417162895, "grad_norm_var": 1.0267365927111674e-06, "learning_rate": 0.0013901820616362016, "loss": 2.6286, "step": 6093 }, { "crossentropy": 2.7616584300994873, "epoch": 0.5183735964613814, "grad_norm": 0.03645521402359009, "grad_norm_var": 1.2906566841436665e-06, "learning_rate": 0.0013887960214490225, "loss": 2.7617, "step": 6094 }, { "crossentropy": 2.552341938018799, "epoch": 0.5184586594079619, "grad_norm": 0.035583991557359695, "grad_norm_var": 1.4634759427513411e-06, "learning_rate": 0.0013874105611171477, "loss": 2.5523, "step": 6095 }, { "crossentropy": 2.7255420684814453, "epoch": 0.5185437223545424, "grad_norm": 0.03353939205408096, "grad_norm_var": 1.4696829572370674e-06, "learning_rate": 0.0013860256808630427, "loss": 2.7255, "step": 6096 }, { "crossentropy": 2.6370198726654053, "epoch": 0.5186287853011229, "grad_norm": 0.03193072974681854, "grad_norm_var": 1.7392412498359299e-06, "learning_rate": 0.001384641380909079, "loss": 2.637, "step": 6097 }, { "crossentropy": 2.6430392265319824, "epoch": 0.5187138482477033, "grad_norm": 0.034477878361940384, "grad_norm_var": 1.760568572808004e-06, "learning_rate": 0.001383257661477535, "loss": 2.643, "step": 6098 }, { "crossentropy": 2.6258177757263184, "epoch": 0.5187989111942838, "grad_norm": 0.03449271619319916, "grad_norm_var": 1.7797427078987868e-06, "learning_rate": 0.0013818745227905955, "loss": 2.6258, "step": 6099 }, { "crossentropy": 2.7653040885925293, "epoch": 0.5188839741408643, "grad_norm": 0.031073464080691338, "grad_norm_var": 2.1648387259151956e-06, "learning_rate": 0.0013804919650703552, "loss": 2.7653, "step": 6100 }, { "crossentropy": 2.678380250930786, "epoch": 0.5189690370874447, "grad_norm": 0.0327812060713768, "grad_norm_var": 2.2347134101198858e-06, "learning_rate": 0.0013791099885388108, "loss": 2.6784, "step": 6101 }, { "crossentropy": 2.5611989498138428, "epoch": 0.5190541000340252, "grad_norm": 0.036583077162504196, "grad_norm_var": 2.406287342252691e-06, "learning_rate": 0.0013777285934178697, "loss": 2.5612, "step": 6102 }, { "crossentropy": 2.641306161880493, "epoch": 0.5191391629806057, "grad_norm": 0.03396473452448845, "grad_norm_var": 2.3098954213169313e-06, "learning_rate": 0.0013763477799293429, "loss": 2.6413, "step": 6103 }, { "crossentropy": 2.626114845275879, "epoch": 0.5192242259271861, "grad_norm": 0.03162793070077896, "grad_norm_var": 2.471115748540438e-06, "learning_rate": 0.0013749675482949487, "loss": 2.6261, "step": 6104 }, { "crossentropy": 2.662285566329956, "epoch": 0.5193092888737666, "grad_norm": 0.03361370787024498, "grad_norm_var": 2.4709105193966893e-06, "learning_rate": 0.0013735878987363127, "loss": 2.6623, "step": 6105 }, { "crossentropy": 2.7726900577545166, "epoch": 0.5193943518203471, "grad_norm": 0.033469751477241516, "grad_norm_var": 2.4701660984782625e-06, "learning_rate": 0.0013722088314749687, "loss": 2.7727, "step": 6106 }, { "crossentropy": 2.564293622970581, "epoch": 0.5194794147669275, "grad_norm": 0.033511485904455185, "grad_norm_var": 2.401845763790724e-06, "learning_rate": 0.0013708303467323535, "loss": 2.5643, "step": 6107 }, { "crossentropy": 2.6090776920318604, "epoch": 0.519564477713508, "grad_norm": 0.033590465784072876, "grad_norm_var": 2.4038307932348987e-06, "learning_rate": 0.0013694524447298128, "loss": 2.6091, "step": 6108 }, { "crossentropy": 2.57314395904541, "epoch": 0.5196495406600885, "grad_norm": 0.03312969580292702, "grad_norm_var": 2.423839426279175e-06, "learning_rate": 0.0013680751256885976, "loss": 2.5731, "step": 6109 }, { "crossentropy": 2.6450159549713135, "epoch": 0.5197346036066689, "grad_norm": 0.03523964434862137, "grad_norm_var": 2.0759716422392156e-06, "learning_rate": 0.0013666983898298656, "loss": 2.645, "step": 6110 }, { "crossentropy": 2.5946686267852783, "epoch": 0.5198196665532494, "grad_norm": 0.031921789050102234, "grad_norm_var": 1.976253698858611e-06, "learning_rate": 0.0013653222373746798, "loss": 2.5947, "step": 6111 }, { "crossentropy": 2.7000019550323486, "epoch": 0.5199047294998299, "grad_norm": 0.034157294780015945, "grad_norm_var": 2.008780480299459e-06, "learning_rate": 0.0013639466685440133, "loss": 2.7, "step": 6112 }, { "crossentropy": 2.632253408432007, "epoch": 0.5199897924464103, "grad_norm": 0.033752333372831345, "grad_norm_var": 1.8416199804582723e-06, "learning_rate": 0.001362571683558741, "loss": 2.6323, "step": 6113 }, { "crossentropy": 2.6273019313812256, "epoch": 0.5200748553929908, "grad_norm": 0.033580318093299866, "grad_norm_var": 1.785319194011351e-06, "learning_rate": 0.0013611972826396463, "loss": 2.6273, "step": 6114 }, { "crossentropy": 2.783473253250122, "epoch": 0.5201599183395713, "grad_norm": 0.0367283932864666, "grad_norm_var": 2.384507149156287e-06, "learning_rate": 0.0013598234660074171, "loss": 2.7835, "step": 6115 }, { "crossentropy": 2.6754672527313232, "epoch": 0.5202449812861517, "grad_norm": 0.03433382883667946, "grad_norm_var": 1.91998311182915e-06, "learning_rate": 0.0013584502338826526, "loss": 2.6755, "step": 6116 }, { "crossentropy": 2.5827674865722656, "epoch": 0.5203300442327322, "grad_norm": 0.035806238651275635, "grad_norm_var": 2.0511027563849608e-06, "learning_rate": 0.0013570775864858488, "loss": 2.5828, "step": 6117 }, { "crossentropy": 2.596609115600586, "epoch": 0.5204151071793127, "grad_norm": 0.035337768495082855, "grad_norm_var": 1.7296187487978823e-06, "learning_rate": 0.0013557055240374179, "loss": 2.5966, "step": 6118 }, { "crossentropy": 2.6733736991882324, "epoch": 0.5205001701258931, "grad_norm": 0.03560372442007065, "grad_norm_var": 1.8930095918032533e-06, "learning_rate": 0.0013543340467576715, "loss": 2.6734, "step": 6119 }, { "crossentropy": 2.705183744430542, "epoch": 0.5205852330724736, "grad_norm": 0.03490171208977699, "grad_norm_var": 1.489130841284686e-06, "learning_rate": 0.0013529631548668296, "loss": 2.7052, "step": 6120 }, { "crossentropy": 2.625105857849121, "epoch": 0.5206702960190541, "grad_norm": 0.03579191118478775, "grad_norm_var": 1.58856035021761e-06, "learning_rate": 0.0013515928485850171, "loss": 2.6251, "step": 6121 }, { "crossentropy": 2.6446056365966797, "epoch": 0.5207553589656345, "grad_norm": 0.03790559619665146, "grad_norm_var": 2.2512941616522597e-06, "learning_rate": 0.001350223128132268, "loss": 2.6446, "step": 6122 }, { "crossentropy": 2.6340949535369873, "epoch": 0.520840421912215, "grad_norm": 0.032793428748846054, "grad_norm_var": 2.3978607240445736e-06, "learning_rate": 0.00134885399372852, "loss": 2.6341, "step": 6123 }, { "crossentropy": 2.7399961948394775, "epoch": 0.5209254848587955, "grad_norm": 0.03668801859021187, "grad_norm_var": 2.555447573082681e-06, "learning_rate": 0.0013474854455936126, "loss": 2.74, "step": 6124 }, { "crossentropy": 2.709298610687256, "epoch": 0.5210105478053759, "grad_norm": 0.03282201662659645, "grad_norm_var": 2.632121626174507e-06, "learning_rate": 0.0013461174839472984, "loss": 2.7093, "step": 6125 }, { "crossentropy": 2.723947048187256, "epoch": 0.5210956107519564, "grad_norm": 0.033006757497787476, "grad_norm_var": 2.823337842305697e-06, "learning_rate": 0.0013447501090092329, "loss": 2.7239, "step": 6126 }, { "crossentropy": 2.7156944274902344, "epoch": 0.5211806736985369, "grad_norm": 0.035061467438936234, "grad_norm_var": 2.278213348354668e-06, "learning_rate": 0.0013433833209989742, "loss": 2.7157, "step": 6127 }, { "crossentropy": 2.6324589252471924, "epoch": 0.5212657366451174, "grad_norm": 0.03266359493136406, "grad_norm_var": 2.5639685939386644e-06, "learning_rate": 0.0013420171201359932, "loss": 2.6325, "step": 6128 }, { "crossentropy": 2.641467809677124, "epoch": 0.5213507995916978, "grad_norm": 0.03310419246554375, "grad_norm_var": 2.6806384472466764e-06, "learning_rate": 0.0013406515066396597, "loss": 2.6415, "step": 6129 }, { "crossentropy": 2.667461633682251, "epoch": 0.5214358625382783, "grad_norm": 0.03363281860947609, "grad_norm_var": 2.6725664385512474e-06, "learning_rate": 0.0013392864807292531, "loss": 2.6675, "step": 6130 }, { "crossentropy": 2.621424674987793, "epoch": 0.5215209254848588, "grad_norm": 0.035881467163562775, "grad_norm_var": 2.4952703693014697e-06, "learning_rate": 0.0013379220426239563, "loss": 2.6214, "step": 6131 }, { "crossentropy": 2.6553568840026855, "epoch": 0.5216059884314392, "grad_norm": 0.03447450324892998, "grad_norm_var": 2.489481356441186e-06, "learning_rate": 0.0013365581925428593, "loss": 2.6554, "step": 6132 }, { "crossentropy": 2.701343059539795, "epoch": 0.5216910513780197, "grad_norm": 0.033333778381347656, "grad_norm_var": 2.51253395711955e-06, "learning_rate": 0.001335194930704955, "loss": 2.7013, "step": 6133 }, { "crossentropy": 2.539618968963623, "epoch": 0.5217761143246002, "grad_norm": 0.03330746665596962, "grad_norm_var": 2.5603428950765156e-06, "learning_rate": 0.0013338322573291479, "loss": 2.5396, "step": 6134 }, { "crossentropy": 2.568453311920166, "epoch": 0.5218611772711806, "grad_norm": 0.034320756793022156, "grad_norm_var": 2.4634266730392634e-06, "learning_rate": 0.0013324701726342414, "loss": 2.5685, "step": 6135 }, { "crossentropy": 2.7201664447784424, "epoch": 0.5219462402177611, "grad_norm": 0.03543546423316002, "grad_norm_var": 2.520098019870067e-06, "learning_rate": 0.001331108676838948, "loss": 2.7202, "step": 6136 }, { "crossentropy": 2.6988260746002197, "epoch": 0.5220313031643417, "grad_norm": 0.03533632680773735, "grad_norm_var": 2.447848194349287e-06, "learning_rate": 0.001329747770161883, "loss": 2.6988, "step": 6137 }, { "crossentropy": 2.5883123874664307, "epoch": 0.522116366110922, "grad_norm": 0.03508549928665161, "grad_norm_var": 1.6118972562664234e-06, "learning_rate": 0.0013283874528215733, "loss": 2.5883, "step": 6138 }, { "crossentropy": 2.705235481262207, "epoch": 0.5222014290575026, "grad_norm": 0.03340288624167442, "grad_norm_var": 1.5220948816095532e-06, "learning_rate": 0.001327027725036442, "loss": 2.7052, "step": 6139 }, { "crossentropy": 2.5733819007873535, "epoch": 0.5222864920040831, "grad_norm": 0.033720552921295166, "grad_norm_var": 1.0968746098830953e-06, "learning_rate": 0.0013256685870248225, "loss": 2.5734, "step": 6140 }, { "crossentropy": 2.6148228645324707, "epoch": 0.5223715549506635, "grad_norm": 0.03418313339352608, "grad_norm_var": 9.921944367709731e-07, "learning_rate": 0.0013243100390049557, "loss": 2.6148, "step": 6141 }, { "crossentropy": 2.675534248352051, "epoch": 0.522456617897244, "grad_norm": 0.03437790647149086, "grad_norm_var": 9.058242911876823e-07, "learning_rate": 0.0013229520811949835, "loss": 2.6755, "step": 6142 }, { "crossentropy": 2.7624738216400146, "epoch": 0.5225416808438245, "grad_norm": 0.03478764742612839, "grad_norm_var": 8.793367369361778e-07, "learning_rate": 0.0013215947138129546, "loss": 2.7625, "step": 6143 }, { "crossentropy": 2.645291566848755, "epoch": 0.5226267437904049, "grad_norm": 0.03267425298690796, "grad_norm_var": 8.771739918075315e-07, "learning_rate": 0.0013202379370768254, "loss": 2.6453, "step": 6144 }, { "crossentropy": 2.643639087677002, "epoch": 0.5227118067369854, "grad_norm": 0.03419769927859306, "grad_norm_var": 7.934271115066111e-07, "learning_rate": 0.0013188817512044544, "loss": 2.6436, "step": 6145 }, { "crossentropy": 2.639113426208496, "epoch": 0.5227968696835659, "grad_norm": 0.032763417810201645, "grad_norm_var": 9.133143529154707e-07, "learning_rate": 0.0013175261564136026, "loss": 2.6391, "step": 6146 }, { "crossentropy": 2.6714205741882324, "epoch": 0.5228819326301463, "grad_norm": 0.03354165703058243, "grad_norm_var": 7.32522302898606e-07, "learning_rate": 0.0013161711529219434, "loss": 2.6714, "step": 6147 }, { "crossentropy": 2.6370489597320557, "epoch": 0.5229669955767268, "grad_norm": 0.032865073531866074, "grad_norm_var": 8.052366039072607e-07, "learning_rate": 0.0013148167409470501, "loss": 2.637, "step": 6148 }, { "crossentropy": 2.653937578201294, "epoch": 0.5230520585233073, "grad_norm": 0.033543288707733154, "grad_norm_var": 7.905329287872034e-07, "learning_rate": 0.0013134629207064003, "loss": 2.6539, "step": 6149 }, { "crossentropy": 2.658944606781006, "epoch": 0.5231371214698877, "grad_norm": 0.0339207723736763, "grad_norm_var": 7.597461562478443e-07, "learning_rate": 0.001312109692417382, "loss": 2.6589, "step": 6150 }, { "crossentropy": 2.6188066005706787, "epoch": 0.5232221844164682, "grad_norm": 0.03327440097928047, "grad_norm_var": 7.847880249840673e-07, "learning_rate": 0.0013107570562972827, "loss": 2.6188, "step": 6151 }, { "crossentropy": 2.58035945892334, "epoch": 0.5233072473630487, "grad_norm": 0.03301572427153587, "grad_norm_var": 6.696608599929613e-07, "learning_rate": 0.0013094050125632972, "loss": 2.5804, "step": 6152 }, { "crossentropy": 2.7357981204986572, "epoch": 0.5233923103096291, "grad_norm": 0.034636784344911575, "grad_norm_var": 5.563092090913116e-07, "learning_rate": 0.0013080535614325246, "loss": 2.7358, "step": 6153 }, { "crossentropy": 2.714587926864624, "epoch": 0.5234773732562096, "grad_norm": 0.037284594029188156, "grad_norm_var": 1.2503160596246307e-06, "learning_rate": 0.0013067027031219691, "loss": 2.7146, "step": 6154 }, { "crossentropy": 2.653430938720703, "epoch": 0.5235624362027901, "grad_norm": 0.03376210108399391, "grad_norm_var": 1.2352005965663346e-06, "learning_rate": 0.001305352437848538, "loss": 2.6534, "step": 6155 }, { "crossentropy": 2.626595973968506, "epoch": 0.5236474991493706, "grad_norm": 0.03328968957066536, "grad_norm_var": 1.2576472666988557e-06, "learning_rate": 0.0013040027658290476, "loss": 2.6266, "step": 6156 }, { "crossentropy": 2.7440996170043945, "epoch": 0.523732562095951, "grad_norm": 0.03308398649096489, "grad_norm_var": 1.2890793028740831e-06, "learning_rate": 0.0013026536872802158, "loss": 2.7441, "step": 6157 }, { "crossentropy": 2.6882593631744385, "epoch": 0.5238176250425315, "grad_norm": 0.0338745042681694, "grad_norm_var": 1.26704710574424e-06, "learning_rate": 0.001301305202418665, "loss": 2.6883, "step": 6158 }, { "crossentropy": 2.7487528324127197, "epoch": 0.523902687989112, "grad_norm": 0.033974096179008484, "grad_norm_var": 1.1993519791429301e-06, "learning_rate": 0.0012999573114609237, "loss": 2.7488, "step": 6159 }, { "crossentropy": 2.6578891277313232, "epoch": 0.5239877509356924, "grad_norm": 0.03263769671320915, "grad_norm_var": 1.2045881070139759e-06, "learning_rate": 0.001298610014623423, "loss": 2.6579, "step": 6160 }, { "crossentropy": 2.6127190589904785, "epoch": 0.5240728138822729, "grad_norm": 0.033483851701021194, "grad_norm_var": 1.1918349525731505e-06, "learning_rate": 0.0012972633121225013, "loss": 2.6127, "step": 6161 }, { "crossentropy": 2.6406490802764893, "epoch": 0.5241578768288534, "grad_norm": 0.0357639454305172, "grad_norm_var": 1.3860441785510173e-06, "learning_rate": 0.0012959172041743983, "loss": 2.6406, "step": 6162 }, { "crossentropy": 2.682062864303589, "epoch": 0.5242429397754338, "grad_norm": 0.03525234013795853, "grad_norm_var": 1.4935958104135444e-06, "learning_rate": 0.0012945716909952632, "loss": 2.6821, "step": 6163 }, { "crossentropy": 2.67688250541687, "epoch": 0.5243280027220143, "grad_norm": 0.03301868215203285, "grad_norm_var": 1.4722575138815044e-06, "learning_rate": 0.0012932267728011449, "loss": 2.6769, "step": 6164 }, { "crossentropy": 2.6241841316223145, "epoch": 0.5244130656685948, "grad_norm": 0.03391595557332039, "grad_norm_var": 1.4588140609913166e-06, "learning_rate": 0.0012918824498079973, "loss": 2.6242, "step": 6165 }, { "crossentropy": 2.6689352989196777, "epoch": 0.5244981286151752, "grad_norm": 0.03283901512622833, "grad_norm_var": 1.5450837207828713e-06, "learning_rate": 0.0012905387222316822, "loss": 2.6689, "step": 6166 }, { "crossentropy": 2.7585744857788086, "epoch": 0.5245831915617557, "grad_norm": 0.03469737246632576, "grad_norm_var": 1.5445540643437931e-06, "learning_rate": 0.001289195590287965, "loss": 2.7586, "step": 6167 }, { "crossentropy": 2.6678807735443115, "epoch": 0.5246682545083362, "grad_norm": 0.03398313373327255, "grad_norm_var": 1.4718114815866096e-06, "learning_rate": 0.0012878530541925077, "loss": 2.6679, "step": 6168 }, { "crossentropy": 2.6450648307800293, "epoch": 0.5247533174549166, "grad_norm": 0.03229093179106712, "grad_norm_var": 1.645856022780036e-06, "learning_rate": 0.0012865111141608888, "loss": 2.6451, "step": 6169 }, { "crossentropy": 2.612433433532715, "epoch": 0.5248383804014971, "grad_norm": 0.034712035208940506, "grad_norm_var": 9.146615552785634e-07, "learning_rate": 0.0012851697704085836, "loss": 2.6124, "step": 6170 }, { "crossentropy": 2.5593297481536865, "epoch": 0.5249234433480776, "grad_norm": 0.03408603370189667, "grad_norm_var": 9.201785984125355e-07, "learning_rate": 0.0012838290231509707, "loss": 2.5593, "step": 6171 }, { "crossentropy": 2.692495584487915, "epoch": 0.525008506294658, "grad_norm": 0.03347385674715042, "grad_norm_var": 9.0960896285356e-07, "learning_rate": 0.001282488872603339, "loss": 2.6925, "step": 6172 }, { "crossentropy": 2.604419231414795, "epoch": 0.5250935692412385, "grad_norm": 0.035089265555143356, "grad_norm_var": 9.646862820793128e-07, "learning_rate": 0.0012811493189808776, "loss": 2.6044, "step": 6173 }, { "crossentropy": 2.7237331867218018, "epoch": 0.525178632187819, "grad_norm": 0.0338580459356308, "grad_norm_var": 9.648541688236343e-07, "learning_rate": 0.0012798103624986784, "loss": 2.7237, "step": 6174 }, { "crossentropy": 2.59305739402771, "epoch": 0.5252636951343994, "grad_norm": 0.03252142667770386, "grad_norm_var": 1.0905795733493275e-06, "learning_rate": 0.0012784720033717407, "loss": 2.5931, "step": 6175 }, { "crossentropy": 2.5919241905212402, "epoch": 0.5253487580809799, "grad_norm": 0.03213052079081535, "grad_norm_var": 1.188736121858896e-06, "learning_rate": 0.0012771342418149657, "loss": 2.5919, "step": 6176 }, { "crossentropy": 2.5772364139556885, "epoch": 0.5254338210275604, "grad_norm": 0.03330404311418533, "grad_norm_var": 1.1988104222491562e-06, "learning_rate": 0.0012757970780431593, "loss": 2.5772, "step": 6177 }, { "crossentropy": 2.6819052696228027, "epoch": 0.5255188839741408, "grad_norm": 0.032531484961509705, "grad_norm_var": 1.009090065068272e-06, "learning_rate": 0.0012744605122710308, "loss": 2.6819, "step": 6178 }, { "crossentropy": 2.764441728591919, "epoch": 0.5256039469207213, "grad_norm": 0.03647865727543831, "grad_norm_var": 1.372189049996438e-06, "learning_rate": 0.0012731245447131963, "loss": 2.7644, "step": 6179 }, { "crossentropy": 2.5852138996124268, "epoch": 0.5256890098673018, "grad_norm": 0.03347420692443848, "grad_norm_var": 1.344800203270137e-06, "learning_rate": 0.0012717891755841722, "loss": 2.5852, "step": 6180 }, { "crossentropy": 2.6225876808166504, "epoch": 0.5257740728138822, "grad_norm": 0.0323140025138855, "grad_norm_var": 1.4615471294903796e-06, "learning_rate": 0.0012704544050983807, "loss": 2.6226, "step": 6181 }, { "crossentropy": 2.715859889984131, "epoch": 0.5258591357604627, "grad_norm": 0.034882765263319016, "grad_norm_var": 1.5121015972780274e-06, "learning_rate": 0.0012691202334701474, "loss": 2.7159, "step": 6182 }, { "crossentropy": 2.656804323196411, "epoch": 0.5259441987070432, "grad_norm": 0.03284590318799019, "grad_norm_var": 1.489819824670481e-06, "learning_rate": 0.0012677866609137017, "loss": 2.6568, "step": 6183 }, { "crossentropy": 2.684997797012329, "epoch": 0.5260292616536237, "grad_norm": 0.03382832556962967, "grad_norm_var": 1.4838948452148073e-06, "learning_rate": 0.0012664536876431753, "loss": 2.685, "step": 6184 }, { "crossentropy": 2.5604336261749268, "epoch": 0.5261143246002041, "grad_norm": 0.032954782247543335, "grad_norm_var": 1.394342977920765e-06, "learning_rate": 0.0012651213138726086, "loss": 2.5604, "step": 6185 }, { "crossentropy": 2.576594591140747, "epoch": 0.5261993875467846, "grad_norm": 0.03809792175889015, "grad_norm_var": 2.5879062696555764e-06, "learning_rate": 0.001263789539815941, "loss": 2.5766, "step": 6186 }, { "crossentropy": 2.7493646144866943, "epoch": 0.5262844504933651, "grad_norm": 0.033559493720531464, "grad_norm_var": 2.5898533831558714e-06, "learning_rate": 0.0012624583656870153, "loss": 2.7494, "step": 6187 }, { "crossentropy": 2.646770477294922, "epoch": 0.5263695134399455, "grad_norm": 0.03325009346008301, "grad_norm_var": 2.603728979908709e-06, "learning_rate": 0.0012611277916995838, "loss": 2.6468, "step": 6188 }, { "crossentropy": 2.6565394401550293, "epoch": 0.526454576386526, "grad_norm": 0.037319209426641464, "grad_norm_var": 3.29188759524493e-06, "learning_rate": 0.0012597978180672953, "loss": 2.6565, "step": 6189 }, { "crossentropy": 2.625171184539795, "epoch": 0.5265396393331065, "grad_norm": 0.035106610506772995, "grad_norm_var": 3.3724417187241374e-06, "learning_rate": 0.0012584684450037032, "loss": 2.6252, "step": 6190 }, { "crossentropy": 2.6063594818115234, "epoch": 0.5266247022796869, "grad_norm": 0.03406732529401779, "grad_norm_var": 3.2093187423512874e-06, "learning_rate": 0.0012571396727222711, "loss": 2.6064, "step": 6191 }, { "crossentropy": 2.6289329528808594, "epoch": 0.5267097652262674, "grad_norm": 0.03362048789858818, "grad_norm_var": 2.9500364038767894e-06, "learning_rate": 0.0012558115014363591, "loss": 2.6289, "step": 6192 }, { "crossentropy": 2.7001798152923584, "epoch": 0.526794828172848, "grad_norm": 0.033246029168367386, "grad_norm_var": 2.9573876061596284e-06, "learning_rate": 0.0012544839313592326, "loss": 2.7002, "step": 6193 }, { "crossentropy": 2.59281849861145, "epoch": 0.5268798911194283, "grad_norm": 0.03286401927471161, "grad_norm_var": 2.889274789990894e-06, "learning_rate": 0.0012531569627040634, "loss": 2.5928, "step": 6194 }, { "crossentropy": 2.5499513149261475, "epoch": 0.5269649540660089, "grad_norm": 0.032860167324543, "grad_norm_var": 2.629647886204176e-06, "learning_rate": 0.001251830595683925, "loss": 2.55, "step": 6195 }, { "crossentropy": 2.6325879096984863, "epoch": 0.5270500170125894, "grad_norm": 0.03464258089661598, "grad_norm_var": 2.6302201172427133e-06, "learning_rate": 0.0012505048305117884, "loss": 2.6326, "step": 6196 }, { "crossentropy": 2.709472179412842, "epoch": 0.5271350799591698, "grad_norm": 0.03235551714897156, "grad_norm_var": 2.620490360749628e-06, "learning_rate": 0.0012491796674005385, "loss": 2.7095, "step": 6197 }, { "crossentropy": 2.6312406063079834, "epoch": 0.5272201429057503, "grad_norm": 0.032386355102062225, "grad_norm_var": 2.7473925450183583e-06, "learning_rate": 0.0012478551065629561, "loss": 2.6312, "step": 6198 }, { "crossentropy": 2.60087251663208, "epoch": 0.5273052058523308, "grad_norm": 0.033439792692661285, "grad_norm_var": 2.682974341781478e-06, "learning_rate": 0.0012465311482117286, "loss": 2.6009, "step": 6199 }, { "crossentropy": 2.690898895263672, "epoch": 0.5273902687989112, "grad_norm": 0.03349475562572479, "grad_norm_var": 2.6964485576335777e-06, "learning_rate": 0.0012452077925594435, "loss": 2.6909, "step": 6200 }, { "crossentropy": 2.6641268730163574, "epoch": 0.5274753317454917, "grad_norm": 0.03494589030742645, "grad_norm_var": 2.678938170343435e-06, "learning_rate": 0.0012438850398185962, "loss": 2.6641, "step": 6201 }, { "crossentropy": 2.546504020690918, "epoch": 0.5275603946920722, "grad_norm": 0.03169413283467293, "grad_norm_var": 1.8100463883524245e-06, "learning_rate": 0.0012425628902015817, "loss": 2.5465, "step": 6202 }, { "crossentropy": 2.597769260406494, "epoch": 0.5276454576386526, "grad_norm": 0.033165279775857925, "grad_norm_var": 1.8260027422347647e-06, "learning_rate": 0.0012412413439206987, "loss": 2.5978, "step": 6203 }, { "crossentropy": 2.6750295162200928, "epoch": 0.5277305205852331, "grad_norm": 0.03172098845243454, "grad_norm_var": 2.0544132988672393e-06, "learning_rate": 0.0012399204011881498, "loss": 2.675, "step": 6204 }, { "crossentropy": 2.6403183937072754, "epoch": 0.5278155835318136, "grad_norm": 0.034233901649713516, "grad_norm_var": 1.102122749381482e-06, "learning_rate": 0.0012386000622160403, "loss": 2.6403, "step": 6205 }, { "crossentropy": 2.6269845962524414, "epoch": 0.527900646478394, "grad_norm": 0.03309950605034828, "grad_norm_var": 8.878869163392658e-07, "learning_rate": 0.0012372803272163774, "loss": 2.627, "step": 6206 }, { "crossentropy": 2.751549243927002, "epoch": 0.5279857094249745, "grad_norm": 0.036518413573503494, "grad_norm_var": 1.5338229725878524e-06, "learning_rate": 0.0012359611964010747, "loss": 2.7515, "step": 6207 }, { "crossentropy": 2.742541790008545, "epoch": 0.528070772371555, "grad_norm": 0.03508546203374863, "grad_norm_var": 1.7123947082396557e-06, "learning_rate": 0.0012346426699819458, "loss": 2.7425, "step": 6208 }, { "crossentropy": 2.530209541320801, "epoch": 0.5281558353181354, "grad_norm": 0.03271503373980522, "grad_norm_var": 1.746904061015036e-06, "learning_rate": 0.001233324748170706, "loss": 2.5302, "step": 6209 }, { "crossentropy": 2.714186429977417, "epoch": 0.5282408982647159, "grad_norm": 0.032540395855903625, "grad_norm_var": 1.778793545198903e-06, "learning_rate": 0.0012320074311789803, "loss": 2.7142, "step": 6210 }, { "crossentropy": 2.653412103652954, "epoch": 0.5283259612112964, "grad_norm": 0.034115202724933624, "grad_norm_var": 1.7816934052250835e-06, "learning_rate": 0.0012306907192182876, "loss": 2.6534, "step": 6211 }, { "crossentropy": 2.634037971496582, "epoch": 0.5284110241578768, "grad_norm": 0.03404334932565689, "grad_norm_var": 1.7136114651085851e-06, "learning_rate": 0.0012293746125000538, "loss": 2.634, "step": 6212 }, { "crossentropy": 2.702202320098877, "epoch": 0.5284960871044573, "grad_norm": 0.03274957463145256, "grad_norm_var": 1.664648931712307e-06, "learning_rate": 0.0012280591112356109, "loss": 2.7022, "step": 6213 }, { "crossentropy": 2.635443687438965, "epoch": 0.5285811500510378, "grad_norm": 0.03272438794374466, "grad_norm_var": 1.6217438134739798e-06, "learning_rate": 0.0012267442156361885, "loss": 2.6354, "step": 6214 }, { "crossentropy": 2.5478672981262207, "epoch": 0.5286662129976183, "grad_norm": 0.036067381501197815, "grad_norm_var": 2.0259005162230497e-06, "learning_rate": 0.0012254299259129209, "loss": 2.5479, "step": 6215 }, { "crossentropy": 2.7000231742858887, "epoch": 0.5287512759441987, "grad_norm": 0.034144263714551926, "grad_norm_var": 2.036042285453547e-06, "learning_rate": 0.0012241162422768443, "loss": 2.7, "step": 6216 }, { "crossentropy": 2.652250289916992, "epoch": 0.5288363388907792, "grad_norm": 0.033208396285772324, "grad_norm_var": 1.9413506618751534e-06, "learning_rate": 0.0012228031649389021, "loss": 2.6523, "step": 6217 }, { "crossentropy": 2.6726815700531006, "epoch": 0.5289214018373597, "grad_norm": 0.03313099220395088, "grad_norm_var": 1.7025554175051708e-06, "learning_rate": 0.0012214906941099312, "loss": 2.6727, "step": 6218 }, { "crossentropy": 2.6210031509399414, "epoch": 0.5290064647839401, "grad_norm": 0.03760891035199165, "grad_norm_var": 2.617542510788678e-06, "learning_rate": 0.0012201788300006816, "loss": 2.621, "step": 6219 }, { "crossentropy": 2.5972695350646973, "epoch": 0.5290915277305206, "grad_norm": 0.0326148122549057, "grad_norm_var": 2.39805911327508e-06, "learning_rate": 0.0012188675728217985, "loss": 2.5973, "step": 6220 }, { "crossentropy": 2.606154203414917, "epoch": 0.5291765906771011, "grad_norm": 0.031419701874256134, "grad_norm_var": 2.8193461273656277e-06, "learning_rate": 0.0012175569227838323, "loss": 2.6062, "step": 6221 }, { "crossentropy": 2.706146478652954, "epoch": 0.5292616536236815, "grad_norm": 0.033681295812129974, "grad_norm_var": 2.7813830652450556e-06, "learning_rate": 0.0012162468800972343, "loss": 2.7061, "step": 6222 }, { "crossentropy": 2.609072208404541, "epoch": 0.529346716570262, "grad_norm": 0.032873786985874176, "grad_norm_var": 2.33818602604167e-06, "learning_rate": 0.0012149374449723627, "loss": 2.6091, "step": 6223 }, { "crossentropy": 2.8626887798309326, "epoch": 0.5294317795168425, "grad_norm": 0.03509876877069473, "grad_norm_var": 2.3407081235105044e-06, "learning_rate": 0.0012136286176194745, "loss": 2.8627, "step": 6224 }, { "crossentropy": 2.5653622150421143, "epoch": 0.5295168424634229, "grad_norm": 0.033579666167497635, "grad_norm_var": 2.2772226969482167e-06, "learning_rate": 0.0012123203982487257, "loss": 2.5654, "step": 6225 }, { "crossentropy": 2.595513343811035, "epoch": 0.5296019054100034, "grad_norm": 0.03414857015013695, "grad_norm_var": 2.184843195096077e-06, "learning_rate": 0.0012110127870701825, "loss": 2.5955, "step": 6226 }, { "crossentropy": 2.57431697845459, "epoch": 0.5296869683565839, "grad_norm": 0.03400195762515068, "grad_norm_var": 2.1812714025105927e-06, "learning_rate": 0.001209705784293809, "loss": 2.5743, "step": 6227 }, { "crossentropy": 2.6028387546539307, "epoch": 0.5297720313031643, "grad_norm": 0.030492370948195457, "grad_norm_var": 2.8628983981555106e-06, "learning_rate": 0.0012083993901294704, "loss": 2.6028, "step": 6228 }, { "crossentropy": 2.7151260375976562, "epoch": 0.5298570942497448, "grad_norm": 0.036675821989774704, "grad_norm_var": 3.3829695181262213e-06, "learning_rate": 0.0012070936047869384, "loss": 2.7151, "step": 6229 }, { "crossentropy": 2.7094812393188477, "epoch": 0.5299421571963253, "grad_norm": 0.03395494818687439, "grad_norm_var": 3.2942494506323403e-06, "learning_rate": 0.001205788428475883, "loss": 2.7095, "step": 6230 }, { "crossentropy": 2.5881099700927734, "epoch": 0.5300272201429057, "grad_norm": 0.03580794855952263, "grad_norm_var": 3.2241361585535655e-06, "learning_rate": 0.001204483861405879, "loss": 2.5881, "step": 6231 }, { "crossentropy": 2.715574264526367, "epoch": 0.5301122830894862, "grad_norm": 0.033273108303546906, "grad_norm_var": 3.2435023646236086e-06, "learning_rate": 0.0012031799037864012, "loss": 2.7156, "step": 6232 }, { "crossentropy": 2.791379451751709, "epoch": 0.5301973460360667, "grad_norm": 0.033132970333099365, "grad_norm_var": 3.2502922155100047e-06, "learning_rate": 0.001201876555826828, "loss": 2.7914, "step": 6233 }, { "crossentropy": 2.6580395698547363, "epoch": 0.5302824089826471, "grad_norm": 0.03228885307908058, "grad_norm_var": 3.374618608892885e-06, "learning_rate": 0.0012005738177364377, "loss": 2.658, "step": 6234 }, { "crossentropy": 2.5900232791900635, "epoch": 0.5303674719292276, "grad_norm": 0.03219590336084366, "grad_norm_var": 2.450279018954823e-06, "learning_rate": 0.001199271689724416, "loss": 2.59, "step": 6235 }, { "crossentropy": 2.739088296890259, "epoch": 0.5304525348758081, "grad_norm": 0.03646604344248772, "grad_norm_var": 2.9471117706152215e-06, "learning_rate": 0.0011979701719998454, "loss": 2.7391, "step": 6236 }, { "crossentropy": 2.644866943359375, "epoch": 0.5305375978223885, "grad_norm": 0.035925861448049545, "grad_norm_var": 2.8502184950441085e-06, "learning_rate": 0.0011966692647717126, "loss": 2.6449, "step": 6237 }, { "crossentropy": 2.6753389835357666, "epoch": 0.530622660768969, "grad_norm": 0.032046739012002945, "grad_norm_var": 3.081185704790915e-06, "learning_rate": 0.0011953689682489032, "loss": 2.6753, "step": 6238 }, { "crossentropy": 2.701463460922241, "epoch": 0.5307077237155495, "grad_norm": 0.03263511881232262, "grad_norm_var": 3.1165339288853476e-06, "learning_rate": 0.0011940692826402134, "loss": 2.7015, "step": 6239 }, { "crossentropy": 2.5542030334472656, "epoch": 0.5307927866621299, "grad_norm": 0.0327647365629673, "grad_norm_var": 3.070817851084567e-06, "learning_rate": 0.0011927702081543279, "loss": 2.5542, "step": 6240 }, { "crossentropy": 2.616241455078125, "epoch": 0.5308778496087104, "grad_norm": 0.033057164400815964, "grad_norm_var": 3.0970941277070565e-06, "learning_rate": 0.0011914717449998452, "loss": 2.6162, "step": 6241 }, { "crossentropy": 2.648080348968506, "epoch": 0.5309629125552909, "grad_norm": 0.03358052298426628, "grad_norm_var": 3.081715895929477e-06, "learning_rate": 0.0011901738933852606, "loss": 2.6481, "step": 6242 }, { "crossentropy": 2.730283498764038, "epoch": 0.5310479755018714, "grad_norm": 0.03427097573876381, "grad_norm_var": 3.0990874913382755e-06, "learning_rate": 0.001188876653518971, "loss": 2.7303, "step": 6243 }, { "crossentropy": 2.6763837337493896, "epoch": 0.5311330384484518, "grad_norm": 0.03379359841346741, "grad_norm_var": 2.385693721591806e-06, "learning_rate": 0.0011875800256092744, "loss": 2.6764, "step": 6244 }, { "crossentropy": 2.6543500423431396, "epoch": 0.5312181013950323, "grad_norm": 0.033800363540649414, "grad_norm_var": 1.8255328775582577e-06, "learning_rate": 0.0011862840098643746, "loss": 2.6544, "step": 6245 }, { "crossentropy": 2.6093990802764893, "epoch": 0.5313031643416128, "grad_norm": 0.033571477979421616, "grad_norm_var": 1.8210325726463728e-06, "learning_rate": 0.0011849886064923749, "loss": 2.6094, "step": 6246 }, { "crossentropy": 2.5277884006500244, "epoch": 0.5313882272881932, "grad_norm": 0.03064492903649807, "grad_norm_var": 2.010638244982057e-06, "learning_rate": 0.0011836938157012745, "loss": 2.5278, "step": 6247 }, { "crossentropy": 2.652977705001831, "epoch": 0.5314732902347737, "grad_norm": 0.0345914401113987, "grad_norm_var": 2.107413192433876e-06, "learning_rate": 0.0011823996376989848, "loss": 2.653, "step": 6248 }, { "crossentropy": 2.640092372894287, "epoch": 0.5315583531813542, "grad_norm": 0.03435196727514267, "grad_norm_var": 2.153159142219404e-06, "learning_rate": 0.0011811060726933116, "loss": 2.6401, "step": 6249 }, { "crossentropy": 2.6057910919189453, "epoch": 0.5316434161279346, "grad_norm": 0.03269604966044426, "grad_norm_var": 2.097814097334532e-06, "learning_rate": 0.0011798131208919627, "loss": 2.6058, "step": 6250 }, { "crossentropy": 2.691227912902832, "epoch": 0.5317284790745151, "grad_norm": 0.032937679439783096, "grad_norm_var": 2.000795241885788e-06, "learning_rate": 0.0011785207825025512, "loss": 2.6912, "step": 6251 }, { "crossentropy": 2.5621542930603027, "epoch": 0.5318135420210957, "grad_norm": 0.032459378242492676, "grad_norm_var": 1.4574901645982106e-06, "learning_rate": 0.0011772290577325894, "loss": 2.5622, "step": 6252 }, { "crossentropy": 2.6146106719970703, "epoch": 0.531898604967676, "grad_norm": 0.032158464193344116, "grad_norm_var": 1.0358461688143253e-06, "learning_rate": 0.0011759379467894905, "loss": 2.6146, "step": 6253 }, { "crossentropy": 2.662235736846924, "epoch": 0.5319836679142566, "grad_norm": 0.03571488708257675, "grad_norm_var": 1.3689852159703368e-06, "learning_rate": 0.0011746474498805688, "loss": 2.6622, "step": 6254 }, { "crossentropy": 2.728480339050293, "epoch": 0.5320687308608371, "grad_norm": 0.03410111740231514, "grad_norm_var": 1.3705506208430782e-06, "learning_rate": 0.001173357567213042, "loss": 2.7285, "step": 6255 }, { "crossentropy": 2.7198736667633057, "epoch": 0.5321537938074175, "grad_norm": 0.0349796898663044, "grad_norm_var": 1.4878173129121426e-06, "learning_rate": 0.0011720682989940262, "loss": 2.7199, "step": 6256 }, { "crossentropy": 2.6088764667510986, "epoch": 0.532238856753998, "grad_norm": 0.032139282673597336, "grad_norm_var": 1.6000986325110373e-06, "learning_rate": 0.0011707796454305437, "loss": 2.6089, "step": 6257 }, { "crossentropy": 2.6043174266815186, "epoch": 0.5323239197005785, "grad_norm": 0.03617970645427704, "grad_norm_var": 2.0547482411606275e-06, "learning_rate": 0.0011694916067295136, "loss": 2.6043, "step": 6258 }, { "crossentropy": 2.569206714630127, "epoch": 0.5324089826471589, "grad_norm": 0.033290229737758636, "grad_norm_var": 2.033588566893946e-06, "learning_rate": 0.001168204183097758, "loss": 2.5692, "step": 6259 }, { "crossentropy": 2.669839859008789, "epoch": 0.5324940455937394, "grad_norm": 0.035673364996910095, "grad_norm_var": 2.3059285680572845e-06, "learning_rate": 0.0011669173747419998, "loss": 2.6698, "step": 6260 }, { "crossentropy": 2.618250608444214, "epoch": 0.5325791085403199, "grad_norm": 0.036615852266550064, "grad_norm_var": 2.836928671917056e-06, "learning_rate": 0.0011656311818688632, "loss": 2.6183, "step": 6261 }, { "crossentropy": 2.60578989982605, "epoch": 0.5326641714869003, "grad_norm": 0.03519432619214058, "grad_norm_var": 2.9344279434327975e-06, "learning_rate": 0.0011643456046848744, "loss": 2.6058, "step": 6262 }, { "crossentropy": 2.6822495460510254, "epoch": 0.5327492344334808, "grad_norm": 0.033543068915605545, "grad_norm_var": 2.1694771824370383e-06, "learning_rate": 0.001163060643396458, "loss": 2.6822, "step": 6263 }, { "crossentropy": 2.5993523597717285, "epoch": 0.5328342973800613, "grad_norm": 0.03712562471628189, "grad_norm_var": 2.715233304036631e-06, "learning_rate": 0.0011617762982099444, "loss": 2.5994, "step": 6264 }, { "crossentropy": 2.7060587406158447, "epoch": 0.5329193603266417, "grad_norm": 0.03457419574260712, "grad_norm_var": 2.7191917500814232e-06, "learning_rate": 0.001160492569331562, "loss": 2.7061, "step": 6265 }, { "crossentropy": 2.6578402519226074, "epoch": 0.5330044232732222, "grad_norm": 0.03378378227353096, "grad_norm_var": 2.5552330235008766e-06, "learning_rate": 0.001159209456967439, "loss": 2.6578, "step": 6266 }, { "crossentropy": 2.636181592941284, "epoch": 0.5330894862198027, "grad_norm": 0.03352287411689758, "grad_norm_var": 2.462192828734701e-06, "learning_rate": 0.0011579269613236094, "loss": 2.6362, "step": 6267 }, { "crossentropy": 2.661364793777466, "epoch": 0.5331745491663831, "grad_norm": 0.0331282764673233, "grad_norm_var": 2.3134239755162945e-06, "learning_rate": 0.0011566450826060014, "loss": 2.6614, "step": 6268 }, { "crossentropy": 2.5684804916381836, "epoch": 0.5332596121129636, "grad_norm": 0.03203253820538521, "grad_norm_var": 2.3534409048305466e-06, "learning_rate": 0.0011553638210204486, "loss": 2.5685, "step": 6269 }, { "crossentropy": 2.705650568008423, "epoch": 0.5333446750595441, "grad_norm": 0.033495884388685226, "grad_norm_var": 2.294325637401615e-06, "learning_rate": 0.0011540831767726867, "loss": 2.7057, "step": 6270 }, { "crossentropy": 2.6787965297698975, "epoch": 0.5334297380061246, "grad_norm": 0.035002727061510086, "grad_norm_var": 2.316866904198261e-06, "learning_rate": 0.001152803150068349, "loss": 2.6788, "step": 6271 }, { "crossentropy": 2.611488103866577, "epoch": 0.533514800952705, "grad_norm": 0.03176741302013397, "grad_norm_var": 2.7103296621838747e-06, "learning_rate": 0.0011515237411129698, "loss": 2.6115, "step": 6272 }, { "crossentropy": 2.705883026123047, "epoch": 0.5335998638992855, "grad_norm": 0.032883912324905396, "grad_norm_var": 2.5412000550829756e-06, "learning_rate": 0.0011502449501119877, "loss": 2.7059, "step": 6273 }, { "crossentropy": 2.658759355545044, "epoch": 0.533684926845866, "grad_norm": 0.034915804862976074, "grad_norm_var": 2.313884568530949e-06, "learning_rate": 0.0011489667772707384, "loss": 2.6588, "step": 6274 }, { "crossentropy": 2.7118992805480957, "epoch": 0.5337699897924464, "grad_norm": 0.03144015371799469, "grad_norm_var": 2.7422044500739646e-06, "learning_rate": 0.0011476892227944597, "loss": 2.7119, "step": 6275 }, { "crossentropy": 2.733752727508545, "epoch": 0.5338550527390269, "grad_norm": 0.03449967876076698, "grad_norm_var": 2.573277805813697e-06, "learning_rate": 0.00114641228688829, "loss": 2.7338, "step": 6276 }, { "crossentropy": 2.600822925567627, "epoch": 0.5339401156856074, "grad_norm": 0.033081889152526855, "grad_norm_var": 2.1073012131176196e-06, "learning_rate": 0.0011451359697572682, "loss": 2.6008, "step": 6277 }, { "crossentropy": 2.566161870956421, "epoch": 0.5340251786321878, "grad_norm": 0.032157864421606064, "grad_norm_var": 2.0986066622300308e-06, "learning_rate": 0.001143860271606333, "loss": 2.5662, "step": 6278 }, { "crossentropy": 2.5852653980255127, "epoch": 0.5341102415787683, "grad_norm": 0.03435678407549858, "grad_norm_var": 2.1381822293034884e-06, "learning_rate": 0.0011425851926403275, "loss": 2.5853, "step": 6279 }, { "crossentropy": 2.654550075531006, "epoch": 0.5341953045253488, "grad_norm": 0.031829845160245895, "grad_norm_var": 1.4090308139591556e-06, "learning_rate": 0.001141310733063991, "loss": 2.6546, "step": 6280 }, { "crossentropy": 2.743372678756714, "epoch": 0.5342803674719292, "grad_norm": 0.03760338947176933, "grad_norm_var": 2.5054086372895574e-06, "learning_rate": 0.0011400368930819655, "loss": 2.7434, "step": 6281 }, { "crossentropy": 2.737813711166382, "epoch": 0.5343654304185097, "grad_norm": 0.03582633659243584, "grad_norm_var": 2.8519085100176326e-06, "learning_rate": 0.0011387636728987928, "loss": 2.7378, "step": 6282 }, { "crossentropy": 2.584623336791992, "epoch": 0.5344504933650902, "grad_norm": 0.03167169541120529, "grad_norm_var": 3.0842812068762027e-06, "learning_rate": 0.0011374910727189147, "loss": 2.5846, "step": 6283 }, { "crossentropy": 2.6565067768096924, "epoch": 0.5345355563116706, "grad_norm": 0.03538915514945984, "grad_norm_var": 3.297459844014914e-06, "learning_rate": 0.0011362190927466749, "loss": 2.6565, "step": 6284 }, { "crossentropy": 2.539848804473877, "epoch": 0.5346206192582511, "grad_norm": 0.03261981159448624, "grad_norm_var": 3.1945406226742876e-06, "learning_rate": 0.0011349477331863151, "loss": 2.5398, "step": 6285 }, { "crossentropy": 2.5900607109069824, "epoch": 0.5347056822048316, "grad_norm": 0.03237456828355789, "grad_norm_var": 3.297496743980867e-06, "learning_rate": 0.0011336769942419816, "loss": 2.5901, "step": 6286 }, { "crossentropy": 2.4802308082580566, "epoch": 0.534790745151412, "grad_norm": 0.03359458968043327, "grad_norm_var": 3.1559604722184232e-06, "learning_rate": 0.0011324068761177175, "loss": 2.4802, "step": 6287 }, { "crossentropy": 2.605239152908325, "epoch": 0.5348758080979925, "grad_norm": 0.03330675885081291, "grad_norm_var": 2.948287459255313e-06, "learning_rate": 0.0011311373790174656, "loss": 2.6052, "step": 6288 }, { "crossentropy": 2.586318016052246, "epoch": 0.534960871044573, "grad_norm": 0.0326274037361145, "grad_norm_var": 2.9767886730554713e-06, "learning_rate": 0.0011298685031450745, "loss": 2.5863, "step": 6289 }, { "crossentropy": 2.5831897258758545, "epoch": 0.5350459339911534, "grad_norm": 0.032572921365499496, "grad_norm_var": 2.9028800184145603e-06, "learning_rate": 0.0011286002487042857, "loss": 2.5832, "step": 6290 }, { "crossentropy": 2.7149670124053955, "epoch": 0.5351309969377339, "grad_norm": 0.03370329737663269, "grad_norm_var": 2.6211788231676633e-06, "learning_rate": 0.001127332615898743, "loss": 2.715, "step": 6291 }, { "crossentropy": 2.5505614280700684, "epoch": 0.5352160598843144, "grad_norm": 0.030491771176457405, "grad_norm_var": 3.1315336107237314e-06, "learning_rate": 0.0011260656049319957, "loss": 2.5506, "step": 6292 }, { "crossentropy": 2.6081156730651855, "epoch": 0.5353011228308948, "grad_norm": 0.03265444189310074, "grad_norm_var": 3.1568374537174613e-06, "learning_rate": 0.0011247992160074865, "loss": 2.6081, "step": 6293 }, { "crossentropy": 2.6123273372650146, "epoch": 0.5353861857774753, "grad_norm": 0.03440900146961212, "grad_norm_var": 3.13111304220275e-06, "learning_rate": 0.0011235334493285615, "loss": 2.6123, "step": 6294 }, { "crossentropy": 2.6985344886779785, "epoch": 0.5354712487240558, "grad_norm": 0.03371526673436165, "grad_norm_var": 3.078372865205241e-06, "learning_rate": 0.0011222683050984672, "loss": 2.6985, "step": 6295 }, { "crossentropy": 2.6294267177581787, "epoch": 0.5355563116706362, "grad_norm": 0.032016005367040634, "grad_norm_var": 3.041580581056519e-06, "learning_rate": 0.0011210037835203508, "loss": 2.6294, "step": 6296 }, { "crossentropy": 2.617305278778076, "epoch": 0.5356413746172167, "grad_norm": 0.03276919573545456, "grad_norm_var": 1.7999435260858485e-06, "learning_rate": 0.0011197398847972528, "loss": 2.6173, "step": 6297 }, { "crossentropy": 2.7032017707824707, "epoch": 0.5357264375637972, "grad_norm": 0.03376561030745506, "grad_norm_var": 1.318700091662211e-06, "learning_rate": 0.001118476609132124, "loss": 2.7032, "step": 6298 }, { "crossentropy": 2.6401286125183105, "epoch": 0.5358115005103776, "grad_norm": 0.035295769572257996, "grad_norm_var": 1.5073388492575634e-06, "learning_rate": 0.0011172139567278078, "loss": 2.6401, "step": 6299 }, { "crossentropy": 2.682426691055298, "epoch": 0.5358965634569581, "grad_norm": 0.03260432556271553, "grad_norm_var": 1.1816369653080134e-06, "learning_rate": 0.0011159519277870505, "loss": 2.6824, "step": 6300 }, { "crossentropy": 2.642458915710449, "epoch": 0.5359816264035386, "grad_norm": 0.0321107879281044, "grad_norm_var": 1.2258432558065554e-06, "learning_rate": 0.001114690522512496, "loss": 2.6425, "step": 6301 }, { "crossentropy": 2.542909860610962, "epoch": 0.5360666893501191, "grad_norm": 0.037034772336483, "grad_norm_var": 2.1941135759733032e-06, "learning_rate": 0.0011134297411066923, "loss": 2.5429, "step": 6302 }, { "crossentropy": 2.6403088569641113, "epoch": 0.5361517522966995, "grad_norm": 0.03277040272951126, "grad_norm_var": 2.2033162162658164e-06, "learning_rate": 0.001112169583772083, "loss": 2.6403, "step": 6303 }, { "crossentropy": 2.6312551498413086, "epoch": 0.53623681524328, "grad_norm": 0.03338233754038811, "grad_norm_var": 2.20434109456286e-06, "learning_rate": 0.0011109100507110132, "loss": 2.6313, "step": 6304 }, { "crossentropy": 2.598944664001465, "epoch": 0.5363218781898605, "grad_norm": 0.03738771378993988, "grad_norm_var": 3.2285009505151103e-06, "learning_rate": 0.0011096511421257278, "loss": 2.5989, "step": 6305 }, { "crossentropy": 2.546753406524658, "epoch": 0.5364069411364409, "grad_norm": 0.033232785761356354, "grad_norm_var": 3.1703894642280078e-06, "learning_rate": 0.001108392858218371, "loss": 2.5468, "step": 6306 }, { "crossentropy": 2.742781162261963, "epoch": 0.5364920040830214, "grad_norm": 0.033042747527360916, "grad_norm_var": 3.1871500971546956e-06, "learning_rate": 0.0011071351991909862, "loss": 2.7428, "step": 6307 }, { "crossentropy": 2.6344351768493652, "epoch": 0.536577067029602, "grad_norm": 0.03268510103225708, "grad_norm_var": 2.5955976536780796e-06, "learning_rate": 0.0011058781652455191, "loss": 2.6344, "step": 6308 }, { "crossentropy": 2.60461163520813, "epoch": 0.5366621299761823, "grad_norm": 0.03306962922215462, "grad_norm_var": 2.549611193618182e-06, "learning_rate": 0.0011046217565838117, "loss": 2.6046, "step": 6309 }, { "crossentropy": 2.6136956214904785, "epoch": 0.5367471929227629, "grad_norm": 0.03323763608932495, "grad_norm_var": 2.525526647098539e-06, "learning_rate": 0.0011033659734076067, "loss": 2.6137, "step": 6310 }, { "crossentropy": 2.668560743331909, "epoch": 0.5368322558693434, "grad_norm": 0.033400192856788635, "grad_norm_var": 2.52825432979382e-06, "learning_rate": 0.00110211081591855, "loss": 2.6686, "step": 6311 }, { "crossentropy": 2.4801831245422363, "epoch": 0.5369173188159237, "grad_norm": 0.03489316999912262, "grad_norm_var": 2.433063511882103e-06, "learning_rate": 0.0011008562843181796, "loss": 2.4802, "step": 6312 }, { "crossentropy": 2.6739869117736816, "epoch": 0.5370023817625043, "grad_norm": 0.033660490065813065, "grad_norm_var": 2.3610889786878735e-06, "learning_rate": 0.0010996023788079374, "loss": 2.674, "step": 6313 }, { "crossentropy": 2.62410306930542, "epoch": 0.5370874447090848, "grad_norm": 0.03316356986761093, "grad_norm_var": 2.390383316774822e-06, "learning_rate": 0.0010983490995891665, "loss": 2.6241, "step": 6314 }, { "crossentropy": 2.650587320327759, "epoch": 0.5371725076556652, "grad_norm": 0.033842310309410095, "grad_norm_var": 2.234621697884814e-06, "learning_rate": 0.0010970964468631074, "loss": 2.6506, "step": 6315 }, { "crossentropy": 2.60687518119812, "epoch": 0.5372575706022457, "grad_norm": 0.03305108845233917, "grad_norm_var": 2.1806451386163774e-06, "learning_rate": 0.0010958444208308976, "loss": 2.6069, "step": 6316 }, { "crossentropy": 2.610740900039673, "epoch": 0.5373426335488262, "grad_norm": 0.03283305838704109, "grad_norm_var": 2.0556014622192044e-06, "learning_rate": 0.0010945930216935801, "loss": 2.6107, "step": 6317 }, { "crossentropy": 2.6051530838012695, "epoch": 0.5374276964954066, "grad_norm": 0.032716576009988785, "grad_norm_var": 1.3545106589719913e-06, "learning_rate": 0.0010933422496520924, "loss": 2.6052, "step": 6318 }, { "crossentropy": 2.6791059970855713, "epoch": 0.5375127594419871, "grad_norm": 0.03373891860246658, "grad_norm_var": 1.3159435746643485e-06, "learning_rate": 0.001092092104907269, "loss": 2.6791, "step": 6319 }, { "crossentropy": 2.6375625133514404, "epoch": 0.5375978223885676, "grad_norm": 0.03435637056827545, "grad_norm_var": 1.3491039028117857e-06, "learning_rate": 0.001090842587659851, "loss": 2.6376, "step": 6320 }, { "crossentropy": 2.724663257598877, "epoch": 0.537682885335148, "grad_norm": 0.03498651832342148, "grad_norm_var": 5.110247321748451e-07, "learning_rate": 0.0010895936981104742, "loss": 2.7247, "step": 6321 }, { "crossentropy": 2.6622514724731445, "epoch": 0.5377679482817285, "grad_norm": 0.03461765497922897, "grad_norm_var": 5.825870208683281e-07, "learning_rate": 0.001088345436459673, "loss": 2.6623, "step": 6322 }, { "crossentropy": 2.6832215785980225, "epoch": 0.537853011228309, "grad_norm": 0.0334087498486042, "grad_norm_var": 5.646954413383749e-07, "learning_rate": 0.001087097802907882, "loss": 2.6832, "step": 6323 }, { "crossentropy": 2.606480360031128, "epoch": 0.5379380741748894, "grad_norm": 0.034237127751111984, "grad_norm_var": 5.251288778466516e-07, "learning_rate": 0.0010858507976554371, "loss": 2.6065, "step": 6324 }, { "crossentropy": 2.73858380317688, "epoch": 0.5380231371214699, "grad_norm": 0.034801263362169266, "grad_norm_var": 5.668073241408648e-07, "learning_rate": 0.0010846044209025707, "loss": 2.7386, "step": 6325 }, { "crossentropy": 2.6873631477355957, "epoch": 0.5381082000680504, "grad_norm": 0.03511890769004822, "grad_norm_var": 6.446765884182692e-07, "learning_rate": 0.0010833586728494148, "loss": 2.6874, "step": 6326 }, { "crossentropy": 2.6180310249328613, "epoch": 0.5381932630146308, "grad_norm": 0.0337604321539402, "grad_norm_var": 6.275019198452464e-07, "learning_rate": 0.001082113553696001, "loss": 2.618, "step": 6327 }, { "crossentropy": 2.572145462036133, "epoch": 0.5382783259612113, "grad_norm": 0.03134645149111748, "grad_norm_var": 9.67273864167321e-07, "learning_rate": 0.0010808690636422585, "loss": 2.5721, "step": 6328 }, { "crossentropy": 2.6920058727264404, "epoch": 0.5383633889077918, "grad_norm": 0.0338621512055397, "grad_norm_var": 9.680146524803622e-07, "learning_rate": 0.001079625202888016, "loss": 2.692, "step": 6329 }, { "crossentropy": 2.6524746417999268, "epoch": 0.5384484518543723, "grad_norm": 0.033121272921562195, "grad_norm_var": 9.71377703180322e-07, "learning_rate": 0.001078381971633004, "loss": 2.6525, "step": 6330 }, { "crossentropy": 2.668701410293579, "epoch": 0.5385335148009527, "grad_norm": 0.032754864543676376, "grad_norm_var": 1.030079206840867e-06, "learning_rate": 0.001077139370076849, "loss": 2.6687, "step": 6331 }, { "crossentropy": 2.5928008556365967, "epoch": 0.5386185777475332, "grad_norm": 0.035262580960989, "grad_norm_var": 1.1534105091243543e-06, "learning_rate": 0.0010758973984190763, "loss": 2.5928, "step": 6332 }, { "crossentropy": 2.649627923965454, "epoch": 0.5387036406941137, "grad_norm": 0.03153429552912712, "grad_norm_var": 1.42760841951439e-06, "learning_rate": 0.0010746560568591107, "loss": 2.6496, "step": 6333 }, { "crossentropy": 2.5932209491729736, "epoch": 0.5387887036406941, "grad_norm": 0.03299906477332115, "grad_norm_var": 1.394556636137058e-06, "learning_rate": 0.0010734153455962765, "loss": 2.5932, "step": 6334 }, { "crossentropy": 2.622197389602661, "epoch": 0.5388737665872746, "grad_norm": 0.03417905792593956, "grad_norm_var": 1.4063564701273513e-06, "learning_rate": 0.0010721752648297945, "loss": 2.6222, "step": 6335 }, { "crossentropy": 2.63195538520813, "epoch": 0.5389588295338551, "grad_norm": 0.03448984771966934, "grad_norm_var": 1.4178758193622576e-06, "learning_rate": 0.0010709358147587883, "loss": 2.632, "step": 6336 }, { "crossentropy": 2.6011929512023926, "epoch": 0.5390438924804355, "grad_norm": 0.032392192631959915, "grad_norm_var": 1.4211920311095356e-06, "learning_rate": 0.0010696969955822772, "loss": 2.6012, "step": 6337 }, { "crossentropy": 2.6550099849700928, "epoch": 0.539128955427016, "grad_norm": 0.03393223136663437, "grad_norm_var": 1.3591846865773984e-06, "learning_rate": 0.001068458807499179, "loss": 2.655, "step": 6338 }, { "crossentropy": 2.6098527908325195, "epoch": 0.5392140183735965, "grad_norm": 0.031466782093048096, "grad_norm_var": 1.6379420604707707e-06, "learning_rate": 0.0010672212507083106, "loss": 2.6099, "step": 6339 }, { "crossentropy": 2.6215009689331055, "epoch": 0.5392990813201769, "grad_norm": 0.03254879638552666, "grad_norm_var": 1.6397283902419105e-06, "learning_rate": 0.0010659843254083917, "loss": 2.6215, "step": 6340 }, { "crossentropy": 2.577411651611328, "epoch": 0.5393841442667574, "grad_norm": 0.034521348774433136, "grad_norm_var": 1.5903919008153703e-06, "learning_rate": 0.0010647480317980307, "loss": 2.5774, "step": 6341 }, { "crossentropy": 2.66015887260437, "epoch": 0.5394692072133379, "grad_norm": 0.03303281590342522, "grad_norm_var": 1.3649800094622336e-06, "learning_rate": 0.001063512370075746, "loss": 2.6602, "step": 6342 }, { "crossentropy": 2.539057970046997, "epoch": 0.5395542701599183, "grad_norm": 0.03152447193861008, "grad_norm_var": 1.5104473612900215e-06, "learning_rate": 0.0010622773404399472, "loss": 2.5391, "step": 6343 }, { "crossentropy": 2.6039960384368896, "epoch": 0.5396393331064988, "grad_norm": 0.03286386653780937, "grad_norm_var": 1.3075640448209058e-06, "learning_rate": 0.0010610429430889452, "loss": 2.604, "step": 6344 }, { "crossentropy": 2.647601366043091, "epoch": 0.5397243960530793, "grad_norm": 0.033407505601644516, "grad_norm_var": 1.2776372402397337e-06, "learning_rate": 0.0010598091782209457, "loss": 2.6476, "step": 6345 }, { "crossentropy": 2.5720129013061523, "epoch": 0.5398094589996597, "grad_norm": 0.034100260585546494, "grad_norm_var": 1.3367989219770574e-06, "learning_rate": 0.0010585760460340603, "loss": 2.572, "step": 6346 }, { "crossentropy": 2.6598381996154785, "epoch": 0.5398945219462402, "grad_norm": 0.032066814601421356, "grad_norm_var": 1.4061344321763628e-06, "learning_rate": 0.0010573435467262921, "loss": 2.6598, "step": 6347 }, { "crossentropy": 2.647047281265259, "epoch": 0.5399795848928207, "grad_norm": 0.03324178606271744, "grad_norm_var": 1.0908331669755183e-06, "learning_rate": 0.001056111680495545, "loss": 2.647, "step": 6348 }, { "crossentropy": 2.6684131622314453, "epoch": 0.5400646478394011, "grad_norm": 0.0326329804956913, "grad_norm_var": 9.488073267268614e-07, "learning_rate": 0.0010548804475396217, "loss": 2.6684, "step": 6349 }, { "crossentropy": 2.672600507736206, "epoch": 0.5401497107859816, "grad_norm": 0.03351902589201927, "grad_norm_var": 9.595745141712774e-07, "learning_rate": 0.0010536498480562223, "loss": 2.6726, "step": 6350 }, { "crossentropy": 2.6673386096954346, "epoch": 0.5402347737325621, "grad_norm": 0.031659673899412155, "grad_norm_var": 1.0005194992815727e-06, "learning_rate": 0.001052419882242945, "loss": 2.6673, "step": 6351 }, { "crossentropy": 2.591660261154175, "epoch": 0.5403198366791425, "grad_norm": 0.03350856900215149, "grad_norm_var": 8.608706777633752e-07, "learning_rate": 0.0010511905502972884, "loss": 2.5917, "step": 6352 }, { "crossentropy": 2.6661183834075928, "epoch": 0.540404899625723, "grad_norm": 0.03178881108760834, "grad_norm_var": 9.245747018317892e-07, "learning_rate": 0.0010499618524166478, "loss": 2.6661, "step": 6353 }, { "crossentropy": 2.642479658126831, "epoch": 0.5404899625723035, "grad_norm": 0.03266259282827377, "grad_norm_var": 8.444004921235686e-07, "learning_rate": 0.0010487337887983151, "loss": 2.6425, "step": 6354 }, { "crossentropy": 2.7020506858825684, "epoch": 0.5405750255188839, "grad_norm": 0.0330655612051487, "grad_norm_var": 7.233363431330756e-07, "learning_rate": 0.001047506359639483, "loss": 2.7021, "step": 6355 }, { "crossentropy": 2.653993606567383, "epoch": 0.5406600884654644, "grad_norm": 0.03342410549521446, "grad_norm_var": 7.320943920511095e-07, "learning_rate": 0.0010462795651372415, "loss": 2.654, "step": 6356 }, { "crossentropy": 2.6573305130004883, "epoch": 0.5407451514120449, "grad_norm": 0.03384152799844742, "grad_norm_var": 6.175290989756642e-07, "learning_rate": 0.0010450534054885763, "loss": 2.6573, "step": 6357 }, { "crossentropy": 2.5970048904418945, "epoch": 0.5408302143586253, "grad_norm": 0.03521963581442833, "grad_norm_var": 9.562280429757537e-07, "learning_rate": 0.0010438278808903763, "loss": 2.597, "step": 6358 }, { "crossentropy": 2.6564059257507324, "epoch": 0.5409152773052058, "grad_norm": 0.03279915824532509, "grad_norm_var": 8.01401555936924e-07, "learning_rate": 0.0010426029915394242, "loss": 2.6564, "step": 6359 }, { "crossentropy": 2.7242283821105957, "epoch": 0.5410003402517863, "grad_norm": 0.032147426158189774, "grad_norm_var": 8.572439854597861e-07, "learning_rate": 0.001041378737632402, "loss": 2.7242, "step": 6360 }, { "crossentropy": 2.582273483276367, "epoch": 0.5410854031983668, "grad_norm": 0.03294864669442177, "grad_norm_var": 8.496222898788744e-07, "learning_rate": 0.0010401551193658888, "loss": 2.5823, "step": 6361 }, { "crossentropy": 2.6049489974975586, "epoch": 0.5411704661449472, "grad_norm": 0.036270786076784134, "grad_norm_var": 1.451156921997848e-06, "learning_rate": 0.0010389321369363635, "loss": 2.6049, "step": 6362 }, { "crossentropy": 2.5574686527252197, "epoch": 0.5412555290915277, "grad_norm": 0.030651455745100975, "grad_norm_var": 1.7854559482294858e-06, "learning_rate": 0.0010377097905402005, "loss": 2.5575, "step": 6363 }, { "crossentropy": 2.552901029586792, "epoch": 0.5413405920381082, "grad_norm": 0.030399493873119354, "grad_norm_var": 2.231467422962235e-06, "learning_rate": 0.0010364880803736753, "loss": 2.5529, "step": 6364 }, { "crossentropy": 2.5853264331817627, "epoch": 0.5414256549846886, "grad_norm": 0.03323426842689514, "grad_norm_var": 2.2319579581849773e-06, "learning_rate": 0.0010352670066329584, "loss": 2.5853, "step": 6365 }, { "crossentropy": 2.703407049179077, "epoch": 0.5415107179312691, "grad_norm": 0.03517565876245499, "grad_norm_var": 2.5299920423447425e-06, "learning_rate": 0.0010340465695141189, "loss": 2.7034, "step": 6366 }, { "crossentropy": 2.6435494422912598, "epoch": 0.5415957808778497, "grad_norm": 0.03356938436627388, "grad_norm_var": 2.4039549642541987e-06, "learning_rate": 0.0010328267692131226, "loss": 2.6435, "step": 6367 }, { "crossentropy": 2.7572684288024902, "epoch": 0.54168084382443, "grad_norm": 0.034650854766368866, "grad_norm_var": 2.537194654873365e-06, "learning_rate": 0.001031607605925839, "loss": 2.7573, "step": 6368 }, { "crossentropy": 2.740003824234009, "epoch": 0.5417659067710106, "grad_norm": 0.0332176499068737, "grad_norm_var": 2.388213215218426e-06, "learning_rate": 0.0010303890798480254, "loss": 2.74, "step": 6369 }, { "crossentropy": 2.661806106567383, "epoch": 0.5418509697175911, "grad_norm": 0.03302142024040222, "grad_norm_var": 2.3643346996472197e-06, "learning_rate": 0.0010291711911753426, "loss": 2.6618, "step": 6370 }, { "crossentropy": 2.6721158027648926, "epoch": 0.5419360326641715, "grad_norm": 0.03473620489239693, "grad_norm_var": 2.4749003096261365e-06, "learning_rate": 0.0010279539401033511, "loss": 2.6721, "step": 6371 }, { "crossentropy": 2.6161930561065674, "epoch": 0.542021095610752, "grad_norm": 0.03288942947983742, "grad_norm_var": 2.4950935062725456e-06, "learning_rate": 0.001026737326827505, "loss": 2.6162, "step": 6372 }, { "crossentropy": 2.6600818634033203, "epoch": 0.5421061585573325, "grad_norm": 0.03280949592590332, "grad_norm_var": 2.5041134151246633e-06, "learning_rate": 0.0010255213515431554, "loss": 2.6601, "step": 6373 }, { "crossentropy": 2.627702236175537, "epoch": 0.5421912215039129, "grad_norm": 0.03193775936961174, "grad_norm_var": 2.3630165128513908e-06, "learning_rate": 0.0010243060144455563, "loss": 2.6277, "step": 6374 }, { "crossentropy": 2.512251615524292, "epoch": 0.5422762844504934, "grad_norm": 0.03217533603310585, "grad_norm_var": 2.4168275603378276e-06, "learning_rate": 0.0010230913157298544, "loss": 2.5123, "step": 6375 }, { "crossentropy": 2.6360604763031006, "epoch": 0.5423613473970739, "grad_norm": 0.03520653769373894, "grad_norm_var": 2.607177826906614e-06, "learning_rate": 0.0010218772555910954, "loss": 2.6361, "step": 6376 }, { "crossentropy": 2.63969349861145, "epoch": 0.5424464103436543, "grad_norm": 0.03380904346704483, "grad_norm_var": 2.612461671562524e-06, "learning_rate": 0.001020663834224222, "loss": 2.6397, "step": 6377 }, { "crossentropy": 2.5573015213012695, "epoch": 0.5425314732902348, "grad_norm": 0.031743165105581284, "grad_norm_var": 2.136282684119768e-06, "learning_rate": 0.0010194510518240745, "loss": 2.5573, "step": 6378 }, { "crossentropy": 2.5946106910705566, "epoch": 0.5426165362368153, "grad_norm": 0.03495458513498306, "grad_norm_var": 1.9021065365959027e-06, "learning_rate": 0.00101823890858539, "loss": 2.5946, "step": 6379 }, { "crossentropy": 2.595705509185791, "epoch": 0.5427015991833957, "grad_norm": 0.03314729407429695, "grad_norm_var": 1.2946163657743976e-06, "learning_rate": 0.0010170274047028066, "loss": 2.5957, "step": 6380 }, { "crossentropy": 2.621067762374878, "epoch": 0.5427866621299762, "grad_norm": 0.034284092485904694, "grad_norm_var": 1.323870479787564e-06, "learning_rate": 0.0010158165403708547, "loss": 2.6211, "step": 6381 }, { "crossentropy": 2.683509588241577, "epoch": 0.5428717250765567, "grad_norm": 0.034375954419374466, "grad_norm_var": 1.1940195138909297e-06, "learning_rate": 0.0010146063157839653, "loss": 2.6835, "step": 6382 }, { "crossentropy": 2.602479934692383, "epoch": 0.5429567880231371, "grad_norm": 0.03409677371382713, "grad_norm_var": 1.2139608197579236e-06, "learning_rate": 0.001013396731136465, "loss": 2.6025, "step": 6383 }, { "crossentropy": 2.70573091506958, "epoch": 0.5430418509697176, "grad_norm": 0.033221859484910965, "grad_norm_var": 1.1348824246104964e-06, "learning_rate": 0.0010121877866225782, "loss": 2.7057, "step": 6384 }, { "crossentropy": 2.663010835647583, "epoch": 0.5431269139162981, "grad_norm": 0.0330817848443985, "grad_norm_var": 1.1407282350467397e-06, "learning_rate": 0.0010109794824364254, "loss": 2.663, "step": 6385 }, { "crossentropy": 2.6387832164764404, "epoch": 0.5432119768628785, "grad_norm": 0.03137795627117157, "grad_norm_var": 1.407434940568312e-06, "learning_rate": 0.0010097718187720278, "loss": 2.6388, "step": 6386 }, { "crossentropy": 2.654203414916992, "epoch": 0.543297039809459, "grad_norm": 0.03705637529492378, "grad_norm_var": 2.1679342834165136e-06, "learning_rate": 0.0010085647958233007, "loss": 2.6542, "step": 6387 }, { "crossentropy": 2.715782642364502, "epoch": 0.5433821027560395, "grad_norm": 0.034266356378793716, "grad_norm_var": 2.1724136632317315e-06, "learning_rate": 0.0010073584137840564, "loss": 2.7158, "step": 6388 }, { "crossentropy": 2.636119842529297, "epoch": 0.54346716570262, "grad_norm": 0.033316995948553085, "grad_norm_var": 2.1352554220569265e-06, "learning_rate": 0.0010061526728480047, "loss": 2.6361, "step": 6389 }, { "crossentropy": 2.699888229370117, "epoch": 0.5435522286492004, "grad_norm": 0.03363809362053871, "grad_norm_var": 1.9327000873920265e-06, "learning_rate": 0.001004947573208756, "loss": 2.6999, "step": 6390 }, { "crossentropy": 2.5677995681762695, "epoch": 0.5436372915957809, "grad_norm": 0.03198877349495888, "grad_norm_var": 1.973659964900285e-06, "learning_rate": 0.001003743115059811, "loss": 2.5678, "step": 6391 }, { "crossentropy": 2.683326482772827, "epoch": 0.5437223545423614, "grad_norm": 0.032898787409067154, "grad_norm_var": 1.84998696305527e-06, "learning_rate": 0.0010025392985945703, "loss": 2.6833, "step": 6392 }, { "crossentropy": 2.740708112716675, "epoch": 0.5438074174889418, "grad_norm": 0.03553672879934311, "grad_norm_var": 2.089623315639349e-06, "learning_rate": 0.0010013361240063356, "loss": 2.7407, "step": 6393 }, { "crossentropy": 2.6642255783081055, "epoch": 0.5438924804355223, "grad_norm": 0.033517636358737946, "grad_norm_var": 1.8266111806902278e-06, "learning_rate": 0.0010001335914883009, "loss": 2.6642, "step": 6394 }, { "crossentropy": 2.63344407081604, "epoch": 0.5439775433821028, "grad_norm": 0.03178257495164871, "grad_norm_var": 1.9660940360023692e-06, "learning_rate": 0.0009989317012335565, "loss": 2.6334, "step": 6395 }, { "crossentropy": 2.717475175857544, "epoch": 0.5440626063286832, "grad_norm": 0.03752782568335533, "grad_norm_var": 2.9014344430746316e-06, "learning_rate": 0.000997730453435094, "loss": 2.7175, "step": 6396 }, { "crossentropy": 2.618453025817871, "epoch": 0.5441476692752637, "grad_norm": 0.033468909561634064, "grad_norm_var": 2.898288932844569e-06, "learning_rate": 0.0009965298482857994, "loss": 2.6185, "step": 6397 }, { "crossentropy": 2.661897659301758, "epoch": 0.5442327322218442, "grad_norm": 0.034392815083265305, "grad_norm_var": 2.899551844265149e-06, "learning_rate": 0.0009953298859784504, "loss": 2.6619, "step": 6398 }, { "crossentropy": 2.6469435691833496, "epoch": 0.5443177951684246, "grad_norm": 0.03412381932139397, "grad_norm_var": 2.900584304604594e-06, "learning_rate": 0.0009941305667057316, "loss": 2.6469, "step": 6399 }, { "crossentropy": 2.5893092155456543, "epoch": 0.5444028581150051, "grad_norm": 0.031135758385062218, "grad_norm_var": 3.3402874617413734e-06, "learning_rate": 0.0009929318906602175, "loss": 2.5893, "step": 6400 }, { "crossentropy": 2.647921085357666, "epoch": 0.5444879210615856, "grad_norm": 0.035310931503772736, "grad_norm_var": 3.4687599835256233e-06, "learning_rate": 0.0009917338580343793, "loss": 2.6479, "step": 6401 }, { "crossentropy": 2.6404929161071777, "epoch": 0.544572984008166, "grad_norm": 0.032468944787979126, "grad_norm_var": 3.1859155387538162e-06, "learning_rate": 0.0009905364690205886, "loss": 2.6405, "step": 6402 }, { "crossentropy": 2.5855281352996826, "epoch": 0.5446580469547465, "grad_norm": 0.032602034509181976, "grad_norm_var": 2.5525408609395795e-06, "learning_rate": 0.0009893397238111118, "loss": 2.5855, "step": 6403 }, { "crossentropy": 2.511463165283203, "epoch": 0.544743109901327, "grad_norm": 0.034124862402677536, "grad_norm_var": 2.54166526670826e-06, "learning_rate": 0.0009881436225981106, "loss": 2.5115, "step": 6404 }, { "crossentropy": 2.565964937210083, "epoch": 0.5448281728479074, "grad_norm": 0.03422736003994942, "grad_norm_var": 2.5573248494204035e-06, "learning_rate": 0.000986948165573644, "loss": 2.566, "step": 6405 }, { "crossentropy": 2.6316936016082764, "epoch": 0.5449132357944879, "grad_norm": 0.03203863278031349, "grad_norm_var": 2.7243660655643185e-06, "learning_rate": 0.0009857533529296692, "loss": 2.6317, "step": 6406 }, { "crossentropy": 2.565516710281372, "epoch": 0.5449982987410684, "grad_norm": 0.03351115435361862, "grad_norm_var": 2.5479200597853704e-06, "learning_rate": 0.0009845591848580376, "loss": 2.5655, "step": 6407 }, { "crossentropy": 2.645059823989868, "epoch": 0.5450833616876488, "grad_norm": 0.03438770771026611, "grad_norm_var": 2.5340076489543575e-06, "learning_rate": 0.0009833656615504977, "loss": 2.6451, "step": 6408 }, { "crossentropy": 2.5852584838867188, "epoch": 0.5451684246342293, "grad_norm": 0.03863156959414482, "grad_norm_var": 3.865853571416314e-06, "learning_rate": 0.000982172783198697, "loss": 2.5853, "step": 6409 }, { "crossentropy": 2.631026268005371, "epoch": 0.5452534875808098, "grad_norm": 0.03395485877990723, "grad_norm_var": 3.852404656638992e-06, "learning_rate": 0.0009809805499941765, "loss": 2.631, "step": 6410 }, { "crossentropy": 2.6900994777679443, "epoch": 0.5453385505273902, "grad_norm": 0.03514387831091881, "grad_norm_var": 3.573450510252355e-06, "learning_rate": 0.0009797889621283723, "loss": 2.6901, "step": 6411 }, { "crossentropy": 2.66642689704895, "epoch": 0.5454236134739707, "grad_norm": 0.03396204859018326, "grad_norm_var": 2.7815264834892776e-06, "learning_rate": 0.0009785980197926241, "loss": 2.6664, "step": 6412 }, { "crossentropy": 2.676732063293457, "epoch": 0.5455086764205512, "grad_norm": 0.03332974761724472, "grad_norm_var": 2.7919942998120508e-06, "learning_rate": 0.0009774077231781586, "loss": 2.6767, "step": 6413 }, { "crossentropy": 2.721311569213867, "epoch": 0.5455937393671316, "grad_norm": 0.03513762727379799, "grad_norm_var": 2.8697341280061918e-06, "learning_rate": 0.0009762180724761033, "loss": 2.7213, "step": 6414 }, { "crossentropy": 2.6096293926239014, "epoch": 0.5456788023137121, "grad_norm": 0.03162357583642006, "grad_norm_var": 3.2210527775700422e-06, "learning_rate": 0.0009750290678774837, "loss": 2.6096, "step": 6415 }, { "crossentropy": 2.6604151725769043, "epoch": 0.5457638652602926, "grad_norm": 0.0355551540851593, "grad_norm_var": 2.8427123420368104e-06, "learning_rate": 0.0009738407095732193, "loss": 2.6604, "step": 6416 }, { "crossentropy": 2.664748191833496, "epoch": 0.5458489282068731, "grad_norm": 0.03479090705513954, "grad_norm_var": 2.777429198535843e-06, "learning_rate": 0.0009726529977541243, "loss": 2.6647, "step": 6417 }, { "crossentropy": 2.571831464767456, "epoch": 0.5459339911534535, "grad_norm": 0.03411303460597992, "grad_norm_var": 2.5903280303087027e-06, "learning_rate": 0.0009714659326109138, "loss": 2.5718, "step": 6418 }, { "crossentropy": 2.751417636871338, "epoch": 0.546019054100034, "grad_norm": 0.03538772463798523, "grad_norm_var": 2.4833360463068683e-06, "learning_rate": 0.0009702795143341963, "loss": 2.7514, "step": 6419 }, { "crossentropy": 2.623070001602173, "epoch": 0.5461041170466145, "grad_norm": 0.03514283895492554, "grad_norm_var": 2.5148320619324793e-06, "learning_rate": 0.0009690937431144725, "loss": 2.6231, "step": 6420 }, { "crossentropy": 2.567033290863037, "epoch": 0.5461891799931949, "grad_norm": 0.03286737576127052, "grad_norm_var": 2.6678296322958534e-06, "learning_rate": 0.0009679086191421466, "loss": 2.567, "step": 6421 }, { "crossentropy": 2.7167835235595703, "epoch": 0.5462742429397754, "grad_norm": 0.03714428097009659, "grad_norm_var": 2.724529981142319e-06, "learning_rate": 0.0009667241426075152, "loss": 2.7168, "step": 6422 }, { "crossentropy": 2.6053221225738525, "epoch": 0.546359305886356, "grad_norm": 0.03274574875831604, "grad_norm_var": 2.879177352504055e-06, "learning_rate": 0.0009655403137007679, "loss": 2.6053, "step": 6423 }, { "crossentropy": 2.6995866298675537, "epoch": 0.5464443688329363, "grad_norm": 0.03407391160726547, "grad_norm_var": 2.895045566297707e-06, "learning_rate": 0.0009643571326119982, "loss": 2.6996, "step": 6424 }, { "crossentropy": 2.6100194454193115, "epoch": 0.5465294317795168, "grad_norm": 0.034214623272418976, "grad_norm_var": 1.7402447540929358e-06, "learning_rate": 0.0009631745995311881, "loss": 2.61, "step": 6425 }, { "crossentropy": 2.6519711017608643, "epoch": 0.5466144947260974, "grad_norm": 0.03403982147574425, "grad_norm_var": 1.7365117938502224e-06, "learning_rate": 0.0009619927146482188, "loss": 2.652, "step": 6426 }, { "crossentropy": 2.6203110218048096, "epoch": 0.5466995576726777, "grad_norm": 0.035383813083171844, "grad_norm_var": 1.766162263094837e-06, "learning_rate": 0.000960811478152867, "loss": 2.6203, "step": 6427 }, { "crossentropy": 2.630204916000366, "epoch": 0.5467846206192583, "grad_norm": 0.03282913565635681, "grad_norm_var": 1.9041539204173522e-06, "learning_rate": 0.0009596308902348055, "loss": 2.6302, "step": 6428 }, { "crossentropy": 2.5510756969451904, "epoch": 0.5468696835658388, "grad_norm": 0.032300811260938644, "grad_norm_var": 2.099826588219713e-06, "learning_rate": 0.0009584509510836021, "loss": 2.5511, "step": 6429 }, { "crossentropy": 2.644680976867676, "epoch": 0.5469547465124192, "grad_norm": 0.03405624255537987, "grad_norm_var": 2.039077398306259e-06, "learning_rate": 0.0009572716608887206, "loss": 2.6447, "step": 6430 }, { "crossentropy": 2.707132339477539, "epoch": 0.5470398094589997, "grad_norm": 0.035048708319664, "grad_norm_var": 1.6222589671214978e-06, "learning_rate": 0.0009560930198395224, "loss": 2.7071, "step": 6431 }, { "crossentropy": 2.561842203140259, "epoch": 0.5471248724055802, "grad_norm": 0.03797736018896103, "grad_norm_var": 2.3762691014329758e-06, "learning_rate": 0.0009549150281252633, "loss": 2.5618, "step": 6432 }, { "crossentropy": 2.6898887157440186, "epoch": 0.5472099353521606, "grad_norm": 0.035715989768505096, "grad_norm_var": 2.4647401196136824e-06, "learning_rate": 0.0009537376859350938, "loss": 2.6899, "step": 6433 }, { "crossentropy": 2.6525566577911377, "epoch": 0.5472949982987411, "grad_norm": 0.034484487026929855, "grad_norm_var": 2.450974792387167e-06, "learning_rate": 0.0009525609934580614, "loss": 2.6526, "step": 6434 }, { "crossentropy": 2.628769874572754, "epoch": 0.5473800612453216, "grad_norm": 0.031135914847254753, "grad_norm_var": 3.127645095130531e-06, "learning_rate": 0.0009513849508831097, "loss": 2.6288, "step": 6435 }, { "crossentropy": 2.557417869567871, "epoch": 0.547465124191902, "grad_norm": 0.03341416269540787, "grad_norm_var": 3.1253504527813663e-06, "learning_rate": 0.000950209558399075, "loss": 2.5574, "step": 6436 }, { "crossentropy": 2.6989104747772217, "epoch": 0.5475501871384825, "grad_norm": 0.033490389585494995, "grad_norm_var": 3.037703978192455e-06, "learning_rate": 0.0009490348161946949, "loss": 2.6989, "step": 6437 }, { "crossentropy": 2.608612060546875, "epoch": 0.547635250085063, "grad_norm": 0.033297087997198105, "grad_norm_var": 2.4798883393502366e-06, "learning_rate": 0.0009478607244585969, "loss": 2.6086, "step": 6438 }, { "crossentropy": 2.670478582382202, "epoch": 0.5477203130316434, "grad_norm": 0.03385046496987343, "grad_norm_var": 2.369500894078266e-06, "learning_rate": 0.0009466872833793056, "loss": 2.6705, "step": 6439 }, { "crossentropy": 2.696265459060669, "epoch": 0.5478053759782239, "grad_norm": 0.0372958667576313, "grad_norm_var": 3.014813538470051e-06, "learning_rate": 0.0009455144931452458, "loss": 2.6963, "step": 6440 }, { "crossentropy": 2.5973401069641113, "epoch": 0.5478904389248044, "grad_norm": 0.03349175676703453, "grad_norm_var": 3.0541037780891367e-06, "learning_rate": 0.0009443423539447299, "loss": 2.5973, "step": 6441 }, { "crossentropy": 2.710493564605713, "epoch": 0.5479755018713848, "grad_norm": 0.03461890295147896, "grad_norm_var": 3.059741337603838e-06, "learning_rate": 0.0009431708659659693, "loss": 2.7105, "step": 6442 }, { "crossentropy": 2.6390955448150635, "epoch": 0.5480605648179653, "grad_norm": 0.032195307314395905, "grad_norm_var": 3.223521018680036e-06, "learning_rate": 0.0009420000293970743, "loss": 2.6391, "step": 6443 }, { "crossentropy": 2.6913180351257324, "epoch": 0.5481456277645458, "grad_norm": 0.03372698277235031, "grad_norm_var": 3.1247386460035577e-06, "learning_rate": 0.0009408298444260455, "loss": 2.6913, "step": 6444 }, { "crossentropy": 2.6880569458007812, "epoch": 0.5482306907111262, "grad_norm": 0.038167521357536316, "grad_norm_var": 3.844039843410339e-06, "learning_rate": 0.000939660311240782, "loss": 2.6881, "step": 6445 }, { "crossentropy": 2.6901071071624756, "epoch": 0.5483157536577067, "grad_norm": 0.03321350738406181, "grad_norm_var": 3.938059443731957e-06, "learning_rate": 0.0009384914300290748, "loss": 2.6901, "step": 6446 }, { "crossentropy": 2.6172189712524414, "epoch": 0.5484008166042872, "grad_norm": 0.03192717209458351, "grad_norm_var": 4.295907135214262e-06, "learning_rate": 0.0009373232009786154, "loss": 2.6172, "step": 6447 }, { "crossentropy": 2.7821896076202393, "epoch": 0.5484858795508677, "grad_norm": 0.03575091436505318, "grad_norm_var": 3.4992747101431393e-06, "learning_rate": 0.000936155624276987, "loss": 2.7822, "step": 6448 }, { "crossentropy": 2.5883779525756836, "epoch": 0.5485709424974481, "grad_norm": 0.03359402343630791, "grad_norm_var": 3.3266057094493886e-06, "learning_rate": 0.0009349887001116681, "loss": 2.5884, "step": 6449 }, { "crossentropy": 2.645979642868042, "epoch": 0.5486560054440286, "grad_norm": 0.03333237022161484, "grad_norm_var": 3.3318242794679824e-06, "learning_rate": 0.0009338224286700331, "loss": 2.646, "step": 6450 }, { "crossentropy": 2.656475782394409, "epoch": 0.5487410683906091, "grad_norm": 0.034491125494241714, "grad_norm_var": 2.7960075372551277e-06, "learning_rate": 0.0009326568101393518, "loss": 2.6565, "step": 6451 }, { "crossentropy": 2.589230537414551, "epoch": 0.5488261313371895, "grad_norm": 0.0319024994969368, "grad_norm_var": 3.0803063754467526e-06, "learning_rate": 0.0009314918447067877, "loss": 2.5892, "step": 6452 }, { "crossentropy": 2.594298839569092, "epoch": 0.54891119428377, "grad_norm": 0.035229019820690155, "grad_norm_var": 3.1460855258199268e-06, "learning_rate": 0.0009303275325594035, "loss": 2.5943, "step": 6453 }, { "crossentropy": 2.6392383575439453, "epoch": 0.5489962572303505, "grad_norm": 0.033580977469682693, "grad_norm_var": 3.1195845755669266e-06, "learning_rate": 0.0009291638738841523, "loss": 2.6392, "step": 6454 }, { "crossentropy": 2.6379854679107666, "epoch": 0.5490813201769309, "grad_norm": 0.03423817455768585, "grad_norm_var": 3.113597204682829e-06, "learning_rate": 0.0009280008688678848, "loss": 2.638, "step": 6455 }, { "crossentropy": 2.6509788036346436, "epoch": 0.5491663831235114, "grad_norm": 0.03633053973317146, "grad_norm_var": 2.769797648683771e-06, "learning_rate": 0.000926838517697346, "loss": 2.651, "step": 6456 }, { "crossentropy": 2.6717851161956787, "epoch": 0.5492514460700919, "grad_norm": 0.035783037543296814, "grad_norm_var": 2.908456774328855e-06, "learning_rate": 0.0009256768205591754, "loss": 2.6718, "step": 6457 }, { "crossentropy": 2.5973453521728516, "epoch": 0.5493365090166723, "grad_norm": 0.03338000178337097, "grad_norm_var": 2.9442959917533667e-06, "learning_rate": 0.0009245157776399071, "loss": 2.5973, "step": 6458 }, { "crossentropy": 2.638702154159546, "epoch": 0.5494215719632528, "grad_norm": 0.03232302516698837, "grad_norm_var": 2.91155725090312e-06, "learning_rate": 0.0009233553891259738, "loss": 2.6387, "step": 6459 }, { "crossentropy": 2.6698060035705566, "epoch": 0.5495066349098333, "grad_norm": 0.033714376389980316, "grad_norm_var": 2.91233818647766e-06, "learning_rate": 0.0009221956552036992, "loss": 2.6698, "step": 6460 }, { "crossentropy": 2.558511257171631, "epoch": 0.5495916978564137, "grad_norm": 0.03238429129123688, "grad_norm_var": 1.9317033389055008e-06, "learning_rate": 0.000921036576059302, "loss": 2.5585, "step": 6461 }, { "crossentropy": 2.5286712646484375, "epoch": 0.5496767608029942, "grad_norm": 0.03384135290980339, "grad_norm_var": 1.9052810006298451e-06, "learning_rate": 0.0009198781518789007, "loss": 2.5287, "step": 6462 }, { "crossentropy": 2.669872522354126, "epoch": 0.5497618237495747, "grad_norm": 0.03261333331465721, "grad_norm_var": 1.7576308902556362e-06, "learning_rate": 0.0009187203828485008, "loss": 2.6699, "step": 6463 }, { "crossentropy": 2.5880749225616455, "epoch": 0.5498468866961551, "grad_norm": 0.03405701741576195, "grad_norm_var": 1.5201840898754552e-06, "learning_rate": 0.0009175632691540064, "loss": 2.5881, "step": 6464 }, { "crossentropy": 2.6197454929351807, "epoch": 0.5499319496427356, "grad_norm": 0.03174919635057449, "grad_norm_var": 1.7834869466354376e-06, "learning_rate": 0.0009164068109812196, "loss": 2.6197, "step": 6465 }, { "crossentropy": 2.7052345275878906, "epoch": 0.5500170125893161, "grad_norm": 0.0338800773024559, "grad_norm_var": 1.776528271998923e-06, "learning_rate": 0.0009152510085158333, "loss": 2.7052, "step": 6466 }, { "crossentropy": 2.624535322189331, "epoch": 0.5501020755358965, "grad_norm": 0.033239852637052536, "grad_norm_var": 1.7455028005243511e-06, "learning_rate": 0.0009140958619434353, "loss": 2.6245, "step": 6467 }, { "crossentropy": 2.5890183448791504, "epoch": 0.550187138482477, "grad_norm": 0.0350736603140831, "grad_norm_var": 1.6391876494120198e-06, "learning_rate": 0.0009129413714495083, "loss": 2.589, "step": 6468 }, { "crossentropy": 2.6721739768981934, "epoch": 0.5502722014290575, "grad_norm": 0.03324073925614357, "grad_norm_var": 1.5176659478722056e-06, "learning_rate": 0.0009117875372194334, "loss": 2.6722, "step": 6469 }, { "crossentropy": 2.5619020462036133, "epoch": 0.5503572643756379, "grad_norm": 0.03328933194279671, "grad_norm_var": 1.5281684809098985e-06, "learning_rate": 0.000910634359438478, "loss": 2.5619, "step": 6470 }, { "crossentropy": 2.7566914558410645, "epoch": 0.5504423273222184, "grad_norm": 0.033967286348342896, "grad_norm_var": 1.5131767972621664e-06, "learning_rate": 0.0009094818382918141, "loss": 2.7567, "step": 6471 }, { "crossentropy": 2.601672649383545, "epoch": 0.5505273902687989, "grad_norm": 0.03353448584675789, "grad_norm_var": 1.0133562576942919e-06, "learning_rate": 0.0009083299739645007, "loss": 2.6017, "step": 6472 }, { "crossentropy": 2.705526113510132, "epoch": 0.5506124532153793, "grad_norm": 0.033500876277685165, "grad_norm_var": 6.455227274891154e-07, "learning_rate": 0.0009071787666414949, "loss": 2.7055, "step": 6473 }, { "crossentropy": 2.661869764328003, "epoch": 0.5506975161619598, "grad_norm": 0.03291570395231247, "grad_norm_var": 6.578696052729483e-07, "learning_rate": 0.0009060282165076461, "loss": 2.6619, "step": 6474 }, { "crossentropy": 2.61564564704895, "epoch": 0.5507825791085403, "grad_norm": 0.03226514160633087, "grad_norm_var": 6.658721670752914e-07, "learning_rate": 0.0009048783237477021, "loss": 2.6156, "step": 6475 }, { "crossentropy": 2.6688461303710938, "epoch": 0.5508676420551208, "grad_norm": 0.03411925956606865, "grad_norm_var": 6.969129504527082e-07, "learning_rate": 0.0009037290885463017, "loss": 2.6688, "step": 6476 }, { "crossentropy": 2.6122775077819824, "epoch": 0.5509527050017012, "grad_norm": 0.03289889916777611, "grad_norm_var": 6.468956953646169e-07, "learning_rate": 0.0009025805110879765, "loss": 2.6123, "step": 6477 }, { "crossentropy": 2.7204980850219727, "epoch": 0.5510377679482817, "grad_norm": 0.03168642893433571, "grad_norm_var": 8.064767491246238e-07, "learning_rate": 0.0009014325915571575, "loss": 2.7205, "step": 6478 }, { "crossentropy": 2.669588327407837, "epoch": 0.5511228308948622, "grad_norm": 0.032559286803007126, "grad_norm_var": 8.112613544032643e-07, "learning_rate": 0.0009002853301381669, "loss": 2.6696, "step": 6479 }, { "crossentropy": 2.598784923553467, "epoch": 0.5512078938414426, "grad_norm": 0.03351767733693123, "grad_norm_var": 7.71305313949459e-07, "learning_rate": 0.0008991387270152201, "loss": 2.5988, "step": 6480 }, { "crossentropy": 2.676358699798584, "epoch": 0.5512929567880231, "grad_norm": 0.033545803278684616, "grad_norm_var": 6.219442461762846e-07, "learning_rate": 0.000897992782372432, "loss": 2.6764, "step": 6481 }, { "crossentropy": 2.605590343475342, "epoch": 0.5513780197346037, "grad_norm": 0.0421423576772213, "grad_norm_var": 5.497641831706673e-06, "learning_rate": 0.0008968474963938061, "loss": 2.6056, "step": 6482 }, { "crossentropy": 2.602640151977539, "epoch": 0.551463082681184, "grad_norm": 0.030991673469543457, "grad_norm_var": 5.994498654796933e-06, "learning_rate": 0.0008957028692632424, "loss": 2.6026, "step": 6483 }, { "crossentropy": 2.628462553024292, "epoch": 0.5515481456277646, "grad_norm": 0.033272888511419296, "grad_norm_var": 5.8680820022805055e-06, "learning_rate": 0.0008945589011645355, "loss": 2.6285, "step": 6484 }, { "crossentropy": 2.5969207286834717, "epoch": 0.5516332085743451, "grad_norm": 0.03437544405460358, "grad_norm_var": 5.8956390090990235e-06, "learning_rate": 0.0008934155922813736, "loss": 2.5969, "step": 6485 }, { "crossentropy": 2.518031358718872, "epoch": 0.5517182715209255, "grad_norm": 0.0320601686835289, "grad_norm_var": 6.0510457883402225e-06, "learning_rate": 0.0008922729427973375, "loss": 2.518, "step": 6486 }, { "crossentropy": 2.6544370651245117, "epoch": 0.551803334467506, "grad_norm": 0.03758235275745392, "grad_norm_var": 7.0523046501439845e-06, "learning_rate": 0.0008911309528959072, "loss": 2.6544, "step": 6487 }, { "crossentropy": 2.680814504623413, "epoch": 0.5518883974140865, "grad_norm": 0.033637192100286484, "grad_norm_var": 7.049183768733045e-06, "learning_rate": 0.000889989622760451, "loss": 2.6808, "step": 6488 }, { "crossentropy": 2.6352691650390625, "epoch": 0.5519734603606669, "grad_norm": 0.03170214220881462, "grad_norm_var": 7.327202690545163e-06, "learning_rate": 0.0008888489525742338, "loss": 2.6353, "step": 6489 }, { "crossentropy": 2.628852367401123, "epoch": 0.5520585233072474, "grad_norm": 0.03306560590863228, "grad_norm_var": 7.312840968610029e-06, "learning_rate": 0.0008877089425204138, "loss": 2.6289, "step": 6490 }, { "crossentropy": 2.6987173557281494, "epoch": 0.5521435862538279, "grad_norm": 0.03390679508447647, "grad_norm_var": 7.164166601233459e-06, "learning_rate": 0.0008865695927820472, "loss": 2.6987, "step": 6491 }, { "crossentropy": 2.625746011734009, "epoch": 0.5522286492004083, "grad_norm": 0.033870480954647064, "grad_norm_var": 7.157992044781504e-06, "learning_rate": 0.0008854309035420771, "loss": 2.6257, "step": 6492 }, { "crossentropy": 2.636636972427368, "epoch": 0.5523137121469888, "grad_norm": 0.03230726346373558, "grad_norm_var": 7.251027143939428e-06, "learning_rate": 0.000884292874983344, "loss": 2.6366, "step": 6493 }, { "crossentropy": 2.5600922107696533, "epoch": 0.5523987750935693, "grad_norm": 0.03214305266737938, "grad_norm_var": 7.137571289899299e-06, "learning_rate": 0.000883155507288585, "loss": 2.5601, "step": 6494 }, { "crossentropy": 2.540900945663452, "epoch": 0.5524838380401497, "grad_norm": 0.03413550555706024, "grad_norm_var": 7.03367278877583e-06, "learning_rate": 0.0008820188006404267, "loss": 2.5409, "step": 6495 }, { "crossentropy": 2.6465063095092773, "epoch": 0.5525689009867302, "grad_norm": 0.03282912075519562, "grad_norm_var": 7.0975808275706525e-06, "learning_rate": 0.0008808827552213916, "loss": 2.6465, "step": 6496 }, { "crossentropy": 2.6965770721435547, "epoch": 0.5526539639333107, "grad_norm": 0.03216995298862457, "grad_norm_var": 7.271326319064202e-06, "learning_rate": 0.0008797473712138976, "loss": 2.6966, "step": 6497 }, { "crossentropy": 2.538381576538086, "epoch": 0.5527390268798911, "grad_norm": 0.035039279609918594, "grad_norm_var": 2.4878388672960306e-06, "learning_rate": 0.0008786126488002544, "loss": 2.5384, "step": 6498 }, { "crossentropy": 2.743983268737793, "epoch": 0.5528240898264716, "grad_norm": 0.035009682178497314, "grad_norm_var": 2.250539359746559e-06, "learning_rate": 0.0008774785881626618, "loss": 2.744, "step": 6499 }, { "crossentropy": 2.6686975955963135, "epoch": 0.5529091527730521, "grad_norm": 0.03441476821899414, "grad_norm_var": 2.2869214219511936e-06, "learning_rate": 0.0008763451894832219, "loss": 2.6687, "step": 6500 }, { "crossentropy": 2.6637628078460693, "epoch": 0.5529942157196325, "grad_norm": 0.0345161110162735, "grad_norm_var": 2.301941488327038e-06, "learning_rate": 0.0008752124529439242, "loss": 2.6638, "step": 6501 }, { "crossentropy": 2.6305344104766846, "epoch": 0.553079278666213, "grad_norm": 0.0328947976231575, "grad_norm_var": 2.1686299763595818e-06, "learning_rate": 0.0008740803787266521, "loss": 2.6305, "step": 6502 }, { "crossentropy": 2.593679904937744, "epoch": 0.5531643416127935, "grad_norm": 0.03387480601668358, "grad_norm_var": 1.1092930230946832e-06, "learning_rate": 0.0008729489670131874, "loss": 2.5937, "step": 6503 }, { "crossentropy": 2.6216578483581543, "epoch": 0.553249404559374, "grad_norm": 0.034625597298145294, "grad_norm_var": 1.1924142450804587e-06, "learning_rate": 0.0008718182179851997, "loss": 2.6217, "step": 6504 }, { "crossentropy": 2.4893722534179688, "epoch": 0.5533344675059544, "grad_norm": 0.03525955602526665, "grad_norm_var": 1.1156309486094751e-06, "learning_rate": 0.0008706881318242554, "loss": 2.4894, "step": 6505 }, { "crossentropy": 2.623838186264038, "epoch": 0.5534195304525349, "grad_norm": 0.0324217863380909, "grad_norm_var": 1.2006222547523982e-06, "learning_rate": 0.0008695587087118134, "loss": 2.6238, "step": 6506 }, { "crossentropy": 2.625598430633545, "epoch": 0.5535045933991154, "grad_norm": 0.03803428262472153, "grad_norm_var": 2.371670340495004e-06, "learning_rate": 0.0008684299488292274, "loss": 2.6256, "step": 6507 }, { "crossentropy": 2.75783371925354, "epoch": 0.5535896563456958, "grad_norm": 0.03417716547846794, "grad_norm_var": 2.373412783610434e-06, "learning_rate": 0.0008673018523577414, "loss": 2.7578, "step": 6508 }, { "crossentropy": 2.7309534549713135, "epoch": 0.5536747192922763, "grad_norm": 0.03347351402044296, "grad_norm_var": 2.196632200929018e-06, "learning_rate": 0.0008661744194784987, "loss": 2.731, "step": 6509 }, { "crossentropy": 2.5855798721313477, "epoch": 0.5537597822388568, "grad_norm": 0.03224651888012886, "grad_norm_var": 2.170805189818011e-06, "learning_rate": 0.0008650476503725302, "loss": 2.5856, "step": 6510 }, { "crossentropy": 2.635448932647705, "epoch": 0.5538448451854372, "grad_norm": 0.03512988239526749, "grad_norm_var": 2.241268974643301e-06, "learning_rate": 0.0008639215452207639, "loss": 2.6354, "step": 6511 }, { "crossentropy": 2.5622196197509766, "epoch": 0.5539299081320177, "grad_norm": 0.03168819099664688, "grad_norm_var": 2.5208715208585584e-06, "learning_rate": 0.0008627961042040183, "loss": 2.5622, "step": 6512 }, { "crossentropy": 2.655094861984253, "epoch": 0.5540149710785982, "grad_norm": 0.03403085842728615, "grad_norm_var": 2.268100810144148e-06, "learning_rate": 0.0008616713275030075, "loss": 2.6551, "step": 6513 }, { "crossentropy": 2.649341583251953, "epoch": 0.5541000340251786, "grad_norm": 0.03358154371380806, "grad_norm_var": 2.2333744143782432e-06, "learning_rate": 0.0008605472152983384, "loss": 2.6493, "step": 6514 }, { "crossentropy": 2.569742202758789, "epoch": 0.5541850969717591, "grad_norm": 0.034256208688020706, "grad_norm_var": 2.1760802749327955e-06, "learning_rate": 0.0008594237677705103, "loss": 2.5697, "step": 6515 }, { "crossentropy": 2.6814961433410645, "epoch": 0.5542701599183396, "grad_norm": 0.03376765921711922, "grad_norm_var": 2.1698389828738705e-06, "learning_rate": 0.000858300985099918, "loss": 2.6815, "step": 6516 }, { "crossentropy": 2.6420555114746094, "epoch": 0.55435522286492, "grad_norm": 0.0348731204867363, "grad_norm_var": 2.202436531470395e-06, "learning_rate": 0.0008571788674668468, "loss": 2.6421, "step": 6517 }, { "crossentropy": 2.647338628768921, "epoch": 0.5544402858115005, "grad_norm": 0.03096572495996952, "grad_norm_var": 2.7246810506500178e-06, "learning_rate": 0.0008560574150514755, "loss": 2.6473, "step": 6518 }, { "crossentropy": 2.534592866897583, "epoch": 0.554525348758081, "grad_norm": 0.033603519201278687, "grad_norm_var": 2.730206644052527e-06, "learning_rate": 0.0008549366280338799, "loss": 2.5346, "step": 6519 }, { "crossentropy": 2.6098525524139404, "epoch": 0.5546104117046614, "grad_norm": 0.032905131578445435, "grad_norm_var": 2.744960567780521e-06, "learning_rate": 0.0008538165065940262, "loss": 2.6099, "step": 6520 }, { "crossentropy": 2.6311638355255127, "epoch": 0.5546954746512419, "grad_norm": 0.03107263892889023, "grad_norm_var": 3.0123526148145696e-06, "learning_rate": 0.0008526970509117687, "loss": 2.6312, "step": 6521 }, { "crossentropy": 2.5859460830688477, "epoch": 0.5547805375978224, "grad_norm": 0.03252742439508438, "grad_norm_var": 2.9976628691042215e-06, "learning_rate": 0.0008515782611668649, "loss": 2.5859, "step": 6522 }, { "crossentropy": 2.709833860397339, "epoch": 0.5548656005444028, "grad_norm": 0.03440161049365997, "grad_norm_var": 1.6363160509653597e-06, "learning_rate": 0.0008504601375389581, "loss": 2.7098, "step": 6523 }, { "crossentropy": 2.629030227661133, "epoch": 0.5549506634909833, "grad_norm": 0.03321864455938339, "grad_norm_var": 1.5808414219207657e-06, "learning_rate": 0.0008493426802075855, "loss": 2.629, "step": 6524 }, { "crossentropy": 2.607987403869629, "epoch": 0.5550357264375638, "grad_norm": 0.0333903431892395, "grad_norm_var": 1.578616427422745e-06, "learning_rate": 0.0008482258893521811, "loss": 2.608, "step": 6525 }, { "crossentropy": 2.6767523288726807, "epoch": 0.5551207893841442, "grad_norm": 0.03227937966585159, "grad_norm_var": 1.5743806003597985e-06, "learning_rate": 0.0008471097651520687, "loss": 2.6768, "step": 6526 }, { "crossentropy": 2.731788396835327, "epoch": 0.5552058523307247, "grad_norm": 0.03464996814727783, "grad_norm_var": 1.4672522199962822e-06, "learning_rate": 0.0008459943077864651, "loss": 2.7318, "step": 6527 }, { "crossentropy": 2.5862603187561035, "epoch": 0.5552909152773052, "grad_norm": 0.03324979916214943, "grad_norm_var": 1.3047298091950111e-06, "learning_rate": 0.0008448795174344803, "loss": 2.5863, "step": 6528 }, { "crossentropy": 2.6997110843658447, "epoch": 0.5553759782238856, "grad_norm": 0.03582002595067024, "grad_norm_var": 1.6795442498733116e-06, "learning_rate": 0.0008437653942751183, "loss": 2.6997, "step": 6529 }, { "crossentropy": 2.550602436065674, "epoch": 0.5554610411704661, "grad_norm": 0.03252306208014488, "grad_norm_var": 1.7253822836483472e-06, "learning_rate": 0.0008426519384872733, "loss": 2.5506, "step": 6530 }, { "crossentropy": 2.546053409576416, "epoch": 0.5555461041170466, "grad_norm": 0.03235303610563278, "grad_norm_var": 1.7202867938224346e-06, "learning_rate": 0.0008415391502497371, "loss": 2.5461, "step": 6531 }, { "crossentropy": 2.7033700942993164, "epoch": 0.555631167063627, "grad_norm": 0.03492598980665207, "grad_norm_var": 1.8879449029015795e-06, "learning_rate": 0.0008404270297411903, "loss": 2.7034, "step": 6532 }, { "crossentropy": 2.6383469104766846, "epoch": 0.5557162300102075, "grad_norm": 0.03359414264559746, "grad_norm_var": 1.7214840688431112e-06, "learning_rate": 0.0008393155771402072, "loss": 2.6383, "step": 6533 }, { "crossentropy": 2.7431252002716064, "epoch": 0.555801292956788, "grad_norm": 0.03458769991993904, "grad_norm_var": 1.4539399671132172e-06, "learning_rate": 0.0008382047926252545, "loss": 2.7431, "step": 6534 }, { "crossentropy": 2.716106653213501, "epoch": 0.5558863559033685, "grad_norm": 0.03516177088022232, "grad_norm_var": 1.6388626151062416e-06, "learning_rate": 0.0008370946763746928, "loss": 2.7161, "step": 6535 }, { "crossentropy": 2.61521053314209, "epoch": 0.5559714188499489, "grad_norm": 0.03364632651209831, "grad_norm_var": 1.6103290792580937e-06, "learning_rate": 0.0008359852285667751, "loss": 2.6152, "step": 6536 }, { "crossentropy": 2.672739028930664, "epoch": 0.5560564817965294, "grad_norm": 0.03324619308114052, "grad_norm_var": 1.1767415517666482e-06, "learning_rate": 0.0008348764493796446, "loss": 2.6727, "step": 6537 }, { "crossentropy": 2.6653647422790527, "epoch": 0.55614154474311, "grad_norm": 0.03247135132551193, "grad_norm_var": 1.1858801417241026e-06, "learning_rate": 0.0008337683389913425, "loss": 2.6654, "step": 6538 }, { "crossentropy": 2.704423427581787, "epoch": 0.5562266076896903, "grad_norm": 0.03169446811079979, "grad_norm_var": 1.397875187069316e-06, "learning_rate": 0.0008326608975797984, "loss": 2.7044, "step": 6539 }, { "crossentropy": 2.592517137527466, "epoch": 0.5563116706362708, "grad_norm": 0.03222202509641647, "grad_norm_var": 1.5040860374950412e-06, "learning_rate": 0.0008315541253228331, "loss": 2.5925, "step": 6540 }, { "crossentropy": 2.5115203857421875, "epoch": 0.5563967335828514, "grad_norm": 0.032305002212524414, "grad_norm_var": 1.5919095499056235e-06, "learning_rate": 0.000830448022398167, "loss": 2.5115, "step": 6541 }, { "crossentropy": 2.631965160369873, "epoch": 0.5564817965294317, "grad_norm": 0.0321992002427578, "grad_norm_var": 1.6045120926436522e-06, "learning_rate": 0.0008293425889834044, "loss": 2.632, "step": 6542 }, { "crossentropy": 2.58953595161438, "epoch": 0.5565668594760123, "grad_norm": 0.033800557255744934, "grad_norm_var": 1.5098109368972059e-06, "learning_rate": 0.0008282378252560457, "loss": 2.5895, "step": 6543 }, { "crossentropy": 2.694164752960205, "epoch": 0.5566519224225928, "grad_norm": 0.03332211822271347, "grad_norm_var": 1.5090507003469311e-06, "learning_rate": 0.0008271337313934868, "loss": 2.6942, "step": 6544 }, { "crossentropy": 2.618767499923706, "epoch": 0.5567369853691732, "grad_norm": 0.0312447901815176, "grad_norm_var": 1.3209636159811165e-06, "learning_rate": 0.000826030307573013, "loss": 2.6188, "step": 6545 }, { "crossentropy": 2.677751064300537, "epoch": 0.5568220483157537, "grad_norm": 0.033132828772068024, "grad_norm_var": 1.2988316755174842e-06, "learning_rate": 0.0008249275539717999, "loss": 2.6778, "step": 6546 }, { "crossentropy": 2.643451690673828, "epoch": 0.5569071112623342, "grad_norm": 0.03374709561467171, "grad_norm_var": 1.2778803974028737e-06, "learning_rate": 0.0008238254707669207, "loss": 2.6435, "step": 6547 }, { "crossentropy": 2.6166582107543945, "epoch": 0.5569921742089146, "grad_norm": 0.03440825641155243, "grad_norm_var": 1.1759245553578558e-06, "learning_rate": 0.0008227240581353373, "loss": 2.6167, "step": 6548 }, { "crossentropy": 2.6680421829223633, "epoch": 0.5570772371554951, "grad_norm": 0.032530657947063446, "grad_norm_var": 1.1870351291341409e-06, "learning_rate": 0.000821623316253905, "loss": 2.668, "step": 6549 }, { "crossentropy": 2.5707898139953613, "epoch": 0.5571623001020756, "grad_norm": 0.03235313668847084, "grad_norm_var": 1.0581076703734902e-06, "learning_rate": 0.0008205232452993705, "loss": 2.5708, "step": 6550 }, { "crossentropy": 2.6903693675994873, "epoch": 0.557247363048656, "grad_norm": 0.03241319581866264, "grad_norm_var": 7.26257485337287e-07, "learning_rate": 0.0008194238454483737, "loss": 2.6904, "step": 6551 }, { "crossentropy": 2.5933773517608643, "epoch": 0.5573324259952365, "grad_norm": 0.0334513783454895, "grad_norm_var": 7.065321271167476e-07, "learning_rate": 0.0008183251168774475, "loss": 2.5934, "step": 6552 }, { "crossentropy": 2.609924793243408, "epoch": 0.557417488941817, "grad_norm": 0.033978622406721115, "grad_norm_var": 7.852075729016985e-07, "learning_rate": 0.0008172270597630138, "loss": 2.6099, "step": 6553 }, { "crossentropy": 2.6442272663116455, "epoch": 0.5575025518883974, "grad_norm": 0.0334857702255249, "grad_norm_var": 8.010585301700904e-07, "learning_rate": 0.0008161296742813918, "loss": 2.6442, "step": 6554 }, { "crossentropy": 2.57472562789917, "epoch": 0.5575876148349779, "grad_norm": 0.03327954560518265, "grad_norm_var": 7.047712466326141e-07, "learning_rate": 0.0008150329606087881, "loss": 2.5747, "step": 6555 }, { "crossentropy": 2.556407928466797, "epoch": 0.5576726777815584, "grad_norm": 0.03197399899363518, "grad_norm_var": 7.340837471098467e-07, "learning_rate": 0.0008139369189213042, "loss": 2.5564, "step": 6556 }, { "crossentropy": 2.551664113998413, "epoch": 0.5577577407281388, "grad_norm": 0.03610498830676079, "grad_norm_var": 1.2962846016020447e-06, "learning_rate": 0.0008128415493949326, "loss": 2.5517, "step": 6557 }, { "crossentropy": 2.5738799571990967, "epoch": 0.5578428036747193, "grad_norm": 0.03416651114821434, "grad_norm_var": 1.271953796048404e-06, "learning_rate": 0.0008117468522055577, "loss": 2.5739, "step": 6558 }, { "crossentropy": 2.615720510482788, "epoch": 0.5579278666212998, "grad_norm": 0.03348527103662491, "grad_norm_var": 1.2586833530945126e-06, "learning_rate": 0.0008106528275289554, "loss": 2.6157, "step": 6559 }, { "crossentropy": 2.559755563735962, "epoch": 0.5580129295678802, "grad_norm": 0.033266376703977585, "grad_norm_var": 1.25884237229105e-06, "learning_rate": 0.000809559475540797, "loss": 2.5598, "step": 6560 }, { "crossentropy": 2.6259498596191406, "epoch": 0.5580979925144607, "grad_norm": 0.034115493297576904, "grad_norm_var": 9.819269832187876e-07, "learning_rate": 0.000808466796416642, "loss": 2.6259, "step": 6561 }, { "crossentropy": 2.5813190937042236, "epoch": 0.5581830554610412, "grad_norm": 0.0329098105430603, "grad_norm_var": 9.957550495115024e-07, "learning_rate": 0.0008073747903319417, "loss": 2.5813, "step": 6562 }, { "crossentropy": 2.650334358215332, "epoch": 0.5582681184076217, "grad_norm": 0.03279886022210121, "grad_norm_var": 1.0181045207574106e-06, "learning_rate": 0.0008062834574620453, "loss": 2.6503, "step": 6563 }, { "crossentropy": 2.6860735416412354, "epoch": 0.5583531813542021, "grad_norm": 0.03219400718808174, "grad_norm_var": 1.0328041886534593e-06, "learning_rate": 0.0008051927979821849, "loss": 2.6861, "step": 6564 }, { "crossentropy": 2.5145862102508545, "epoch": 0.5584382443007826, "grad_norm": 0.03250071033835411, "grad_norm_var": 1.0358592700950483e-06, "learning_rate": 0.0008041028120674892, "loss": 2.5146, "step": 6565 }, { "crossentropy": 2.5396029949188232, "epoch": 0.5585233072473631, "grad_norm": 0.03336792066693306, "grad_norm_var": 9.74831768249529e-07, "learning_rate": 0.0008030134998929812, "loss": 2.5396, "step": 6566 }, { "crossentropy": 2.626661777496338, "epoch": 0.5586083701939435, "grad_norm": 0.030432960018515587, "grad_norm_var": 1.4654862819352219e-06, "learning_rate": 0.0008019248616335716, "loss": 2.6267, "step": 6567 }, { "crossentropy": 2.6060643196105957, "epoch": 0.558693433140524, "grad_norm": 0.031259387731552124, "grad_norm_var": 1.6980218000775624e-06, "learning_rate": 0.0008008368974640634, "loss": 2.6061, "step": 6568 }, { "crossentropy": 2.647951126098633, "epoch": 0.5587784960871045, "grad_norm": 0.03877868503332138, "grad_norm_var": 3.711575821962659e-06, "learning_rate": 0.0007997496075591548, "loss": 2.648, "step": 6569 }, { "crossentropy": 2.701169967651367, "epoch": 0.5588635590336849, "grad_norm": 0.03268749266862869, "grad_norm_var": 3.7404139654325136e-06, "learning_rate": 0.0007986629920934324, "loss": 2.7012, "step": 6570 }, { "crossentropy": 2.669072389602661, "epoch": 0.5589486219802654, "grad_norm": 0.032708313316106796, "grad_norm_var": 3.7648509590021737e-06, "learning_rate": 0.0007975770512413727, "loss": 2.6691, "step": 6571 }, { "crossentropy": 2.5526504516601562, "epoch": 0.5590336849268459, "grad_norm": 0.03234649449586868, "grad_norm_var": 3.707818523271154e-06, "learning_rate": 0.0007964917851773495, "loss": 2.5527, "step": 6572 }, { "crossentropy": 2.67463755607605, "epoch": 0.5591187478734263, "grad_norm": 0.033298179507255554, "grad_norm_var": 3.158023992084789e-06, "learning_rate": 0.0007954071940756241, "loss": 2.6746, "step": 6573 }, { "crossentropy": 2.607337474822998, "epoch": 0.5592038108200068, "grad_norm": 0.03233637288212776, "grad_norm_var": 3.118040542594948e-06, "learning_rate": 0.000794323278110351, "loss": 2.6073, "step": 6574 }, { "crossentropy": 2.582728862762451, "epoch": 0.5592888737665873, "grad_norm": 0.032250117510557175, "grad_norm_var": 3.1384787361654938e-06, "learning_rate": 0.0007932400374555748, "loss": 2.5827, "step": 6575 }, { "crossentropy": 2.689384937286377, "epoch": 0.5593739367131677, "grad_norm": 0.03372827172279358, "grad_norm_var": 3.1711002942746835e-06, "learning_rate": 0.0007921574722852343, "loss": 2.6894, "step": 6576 }, { "crossentropy": 2.6085476875305176, "epoch": 0.5594589996597482, "grad_norm": 0.03375156223773956, "grad_norm_var": 3.1243796355613495e-06, "learning_rate": 0.0007910755827731574, "loss": 2.6085, "step": 6577 }, { "crossentropy": 2.630650520324707, "epoch": 0.5595440626063287, "grad_norm": 0.0312688909471035, "grad_norm_var": 3.303500700663888e-06, "learning_rate": 0.000789994369093065, "loss": 2.6307, "step": 6578 }, { "crossentropy": 2.6022584438323975, "epoch": 0.5596291255529091, "grad_norm": 0.03477600961923599, "grad_norm_var": 3.5325560540734457e-06, "learning_rate": 0.0007889138314185679, "loss": 2.6023, "step": 6579 }, { "crossentropy": 2.649782180786133, "epoch": 0.5597141884994896, "grad_norm": 0.032461605966091156, "grad_norm_var": 3.5089755395651858e-06, "learning_rate": 0.0007878339699231701, "loss": 2.6498, "step": 6580 }, { "crossentropy": 2.612704277038574, "epoch": 0.5597992514460701, "grad_norm": 0.03387907147407532, "grad_norm_var": 3.536497957150663e-06, "learning_rate": 0.0007867547847802642, "loss": 2.6127, "step": 6581 }, { "crossentropy": 2.6606571674346924, "epoch": 0.5598843143926505, "grad_norm": 0.03195957466959953, "grad_norm_var": 3.6069997590327014e-06, "learning_rate": 0.0007856762761631398, "loss": 2.6607, "step": 6582 }, { "crossentropy": 2.567434787750244, "epoch": 0.559969377339231, "grad_norm": 0.03111589327454567, "grad_norm_var": 3.4028389673747734e-06, "learning_rate": 0.0007845984442449721, "loss": 2.5674, "step": 6583 }, { "crossentropy": 2.582245349884033, "epoch": 0.5600544402858115, "grad_norm": 0.03354765102267265, "grad_norm_var": 3.1874801228422172e-06, "learning_rate": 0.0007835212891988292, "loss": 2.5822, "step": 6584 }, { "crossentropy": 2.6670331954956055, "epoch": 0.5601395032323919, "grad_norm": 0.032529354095458984, "grad_norm_var": 9.640310771949836e-07, "learning_rate": 0.0007824448111976745, "loss": 2.667, "step": 6585 }, { "crossentropy": 2.568781852722168, "epoch": 0.5602245661789724, "grad_norm": 0.03510993346571922, "grad_norm_var": 1.2975877155982825e-06, "learning_rate": 0.0007813690104143556, "loss": 2.5688, "step": 6586 }, { "crossentropy": 2.6817402839660645, "epoch": 0.5603096291255529, "grad_norm": 0.032883286476135254, "grad_norm_var": 1.2940561966600627e-06, "learning_rate": 0.0007802938870216158, "loss": 2.6817, "step": 6587 }, { "crossentropy": 2.665802478790283, "epoch": 0.5603946920721333, "grad_norm": 0.031239639967679977, "grad_norm_var": 1.4600822211607855e-06, "learning_rate": 0.0007792194411920905, "loss": 2.6658, "step": 6588 }, { "crossentropy": 2.541133403778076, "epoch": 0.5604797550187138, "grad_norm": 0.03222084790468216, "grad_norm_var": 1.4730508673434228e-06, "learning_rate": 0.0007781456730983045, "loss": 2.5411, "step": 6589 }, { "crossentropy": 2.7174594402313232, "epoch": 0.5605648179652943, "grad_norm": 0.03190634399652481, "grad_norm_var": 1.5121166040951127e-06, "learning_rate": 0.000777072582912673, "loss": 2.7175, "step": 6590 }, { "crossentropy": 2.694150686264038, "epoch": 0.5606498809118748, "grad_norm": 0.03298155218362808, "grad_norm_var": 1.4929748725520572e-06, "learning_rate": 0.0007760001708075026, "loss": 2.6942, "step": 6591 }, { "crossentropy": 2.7069008350372314, "epoch": 0.5607349438584552, "grad_norm": 0.03243471682071686, "grad_norm_var": 1.4434834967581856e-06, "learning_rate": 0.0007749284369549952, "loss": 2.7069, "step": 6592 }, { "crossentropy": 2.557332992553711, "epoch": 0.5608200068050357, "grad_norm": 0.031458303332328796, "grad_norm_var": 1.4671877695047094e-06, "learning_rate": 0.000773857381527236, "loss": 2.5573, "step": 6593 }, { "crossentropy": 2.6626291275024414, "epoch": 0.5609050697516162, "grad_norm": 0.03356437757611275, "grad_norm_var": 1.3858075970624162e-06, "learning_rate": 0.0007727870046962087, "loss": 2.6626, "step": 6594 }, { "crossentropy": 2.597522497177124, "epoch": 0.5609901326981966, "grad_norm": 0.033895932137966156, "grad_norm_var": 1.1969766255559566e-06, "learning_rate": 0.0007717173066337846, "loss": 2.5975, "step": 6595 }, { "crossentropy": 2.63948655128479, "epoch": 0.5610751956447771, "grad_norm": 0.0324837788939476, "grad_norm_var": 1.1963047688523521e-06, "learning_rate": 0.0007706482875117254, "loss": 2.6395, "step": 6596 }, { "crossentropy": 2.5385024547576904, "epoch": 0.5611602585913577, "grad_norm": 0.0315190926194191, "grad_norm_var": 1.173589089675566e-06, "learning_rate": 0.0007695799475016846, "loss": 2.5385, "step": 6597 }, { "crossentropy": 2.7700366973876953, "epoch": 0.561245321537938, "grad_norm": 0.03352260962128639, "grad_norm_var": 1.2025791992794262e-06, "learning_rate": 0.0007685122867752081, "loss": 2.77, "step": 6598 }, { "crossentropy": 2.618781089782715, "epoch": 0.5613303844845186, "grad_norm": 0.03206922486424446, "grad_norm_var": 1.0642743482105106e-06, "learning_rate": 0.0007674453055037323, "loss": 2.6188, "step": 6599 }, { "crossentropy": 2.7411153316497803, "epoch": 0.5614154474310991, "grad_norm": 0.03351913020014763, "grad_norm_var": 1.0611413678908905e-06, "learning_rate": 0.0007663790038585794, "loss": 2.7411, "step": 6600 }, { "crossentropy": 2.654346227645874, "epoch": 0.5615005103776795, "grad_norm": 0.03438713401556015, "grad_norm_var": 1.2324424670119572e-06, "learning_rate": 0.0007653133820109709, "loss": 2.6543, "step": 6601 }, { "crossentropy": 2.6781346797943115, "epoch": 0.56158557332426, "grad_norm": 0.03709627315402031, "grad_norm_var": 2.0842607213908865e-06, "learning_rate": 0.0007642484401320138, "loss": 2.6781, "step": 6602 }, { "crossentropy": 2.562969207763672, "epoch": 0.5616706362708405, "grad_norm": 0.03563874587416649, "grad_norm_var": 2.534693036322586e-06, "learning_rate": 0.0007631841783927052, "loss": 2.563, "step": 6603 }, { "crossentropy": 2.632786750793457, "epoch": 0.5617556992174209, "grad_norm": 0.03496759384870529, "grad_norm_var": 2.4680928428758945e-06, "learning_rate": 0.0007621205969639377, "loss": 2.6328, "step": 6604 }, { "crossentropy": 2.6270194053649902, "epoch": 0.5618407621640014, "grad_norm": 0.032820794731378555, "grad_norm_var": 2.3999364367384307e-06, "learning_rate": 0.0007610576960164905, "loss": 2.627, "step": 6605 }, { "crossentropy": 2.6813411712646484, "epoch": 0.5619258251105819, "grad_norm": 0.03467709198594093, "grad_norm_var": 2.331048945628661e-06, "learning_rate": 0.0007599954757210353, "loss": 2.6813, "step": 6606 }, { "crossentropy": 2.6629269123077393, "epoch": 0.5620108880571623, "grad_norm": 0.033396996557712555, "grad_norm_var": 2.3095300206533997e-06, "learning_rate": 0.0007589339362481334, "loss": 2.6629, "step": 6607 }, { "crossentropy": 2.669252872467041, "epoch": 0.5620959510037428, "grad_norm": 0.032500870525836945, "grad_norm_var": 2.2996068687895316e-06, "learning_rate": 0.0007578730777682385, "loss": 2.6693, "step": 6608 }, { "crossentropy": 2.671571969985962, "epoch": 0.5621810139503233, "grad_norm": 0.03398972749710083, "grad_norm_var": 1.9789721404743035e-06, "learning_rate": 0.0007568129004516916, "loss": 2.6716, "step": 6609 }, { "crossentropy": 2.6960432529449463, "epoch": 0.5622660768969037, "grad_norm": 0.03254774585366249, "grad_norm_var": 2.0691479741032546e-06, "learning_rate": 0.0007557534044687292, "loss": 2.696, "step": 6610 }, { "crossentropy": 2.6889309883117676, "epoch": 0.5623511398434842, "grad_norm": 0.0346488393843174, "grad_norm_var": 2.1252958825101454e-06, "learning_rate": 0.0007546945899894752, "loss": 2.6889, "step": 6611 }, { "crossentropy": 2.627474308013916, "epoch": 0.5624362027900647, "grad_norm": 0.0328928604722023, "grad_norm_var": 2.0674208088891063e-06, "learning_rate": 0.0007536364571839438, "loss": 2.6275, "step": 6612 }, { "crossentropy": 2.6600804328918457, "epoch": 0.5625212657366451, "grad_norm": 0.033494651317596436, "grad_norm_var": 1.7205034977124089e-06, "learning_rate": 0.0007525790062220417, "loss": 2.6601, "step": 6613 }, { "crossentropy": 2.595332145690918, "epoch": 0.5626063286832256, "grad_norm": 0.03130292519927025, "grad_norm_var": 2.135883570983319e-06, "learning_rate": 0.0007515222372735647, "loss": 2.5953, "step": 6614 }, { "crossentropy": 2.7008821964263916, "epoch": 0.5626913916298061, "grad_norm": 0.033968180418014526, "grad_norm_var": 1.936479929313256e-06, "learning_rate": 0.0007504661505081978, "loss": 2.7009, "step": 6615 }, { "crossentropy": 2.6866414546966553, "epoch": 0.5627764545763865, "grad_norm": 0.03307398036122322, "grad_norm_var": 1.9694288135446714e-06, "learning_rate": 0.0007494107460955207, "loss": 2.6866, "step": 6616 }, { "crossentropy": 2.603262186050415, "epoch": 0.562861517522967, "grad_norm": 0.03201616182923317, "grad_norm_var": 2.1471046945479092e-06, "learning_rate": 0.0007483560242049997, "loss": 2.6033, "step": 6617 }, { "crossentropy": 2.6051762104034424, "epoch": 0.5629465804695475, "grad_norm": 0.03199011832475662, "grad_norm_var": 1.4573155052245711e-06, "learning_rate": 0.0007473019850059931, "loss": 2.6052, "step": 6618 }, { "crossentropy": 2.673445701599121, "epoch": 0.5630316434161279, "grad_norm": 0.036221593618392944, "grad_norm_var": 1.6548232157936303e-06, "learning_rate": 0.0007462486286677483, "loss": 2.6734, "step": 6619 }, { "crossentropy": 2.5550594329833984, "epoch": 0.5631167063627084, "grad_norm": 0.03252869471907616, "grad_norm_var": 1.519065428453222e-06, "learning_rate": 0.0007451959553594051, "loss": 2.5551, "step": 6620 }, { "crossentropy": 2.6391239166259766, "epoch": 0.5632017693092889, "grad_norm": 0.033333919942379, "grad_norm_var": 1.5058521206908735e-06, "learning_rate": 0.0007441439652499937, "loss": 2.6391, "step": 6621 }, { "crossentropy": 2.598907709121704, "epoch": 0.5632868322558694, "grad_norm": 0.03507595509290695, "grad_norm_var": 1.5897482788627035e-06, "learning_rate": 0.0007430926585084291, "loss": 2.5989, "step": 6622 }, { "crossentropy": 2.5036957263946533, "epoch": 0.5633718952024498, "grad_norm": 0.03167952597141266, "grad_norm_var": 1.754515330755028e-06, "learning_rate": 0.0007420420353035251, "loss": 2.5037, "step": 6623 }, { "crossentropy": 2.6142830848693848, "epoch": 0.5634569581490303, "grad_norm": 0.03344561532139778, "grad_norm_var": 1.7217150814448242e-06, "learning_rate": 0.0007409920958039795, "loss": 2.6143, "step": 6624 }, { "crossentropy": 2.5700318813323975, "epoch": 0.5635420210956108, "grad_norm": 0.03317767381668091, "grad_norm_var": 1.6842608511646294e-06, "learning_rate": 0.000739942840178382, "loss": 2.57, "step": 6625 }, { "crossentropy": 2.6883842945098877, "epoch": 0.5636270840421912, "grad_norm": 0.032500337809324265, "grad_norm_var": 1.6886026648815198e-06, "learning_rate": 0.0007388942685952149, "loss": 2.6884, "step": 6626 }, { "crossentropy": 2.6404170989990234, "epoch": 0.5637121469887717, "grad_norm": 0.032485123723745346, "grad_norm_var": 1.565946508733038e-06, "learning_rate": 0.000737846381222847, "loss": 2.6404, "step": 6627 }, { "crossentropy": 2.7096829414367676, "epoch": 0.5637972099353522, "grad_norm": 0.03135349601507187, "grad_norm_var": 1.751270388052004e-06, "learning_rate": 0.0007367991782295391, "loss": 2.7097, "step": 6628 }, { "crossentropy": 2.636251211166382, "epoch": 0.5638822728819326, "grad_norm": 0.035789262503385544, "grad_norm_var": 2.238417327460098e-06, "learning_rate": 0.0007357526597834419, "loss": 2.6363, "step": 6629 }, { "crossentropy": 2.652052879333496, "epoch": 0.5639673358285131, "grad_norm": 0.03221091255545616, "grad_norm_var": 2.069790029559047e-06, "learning_rate": 0.0007347068260525963, "loss": 2.6521, "step": 6630 }, { "crossentropy": 2.5881857872009277, "epoch": 0.5640523987750936, "grad_norm": 0.03788455203175545, "grad_norm_var": 3.4409483420598405e-06, "learning_rate": 0.0007336616772049315, "loss": 2.5882, "step": 6631 }, { "crossentropy": 2.7177791595458984, "epoch": 0.564137461721674, "grad_norm": 0.03224097192287445, "grad_norm_var": 3.5230746519396745e-06, "learning_rate": 0.0007326172134082704, "loss": 2.7178, "step": 6632 }, { "crossentropy": 2.713829278945923, "epoch": 0.5642225246682545, "grad_norm": 0.03238726779818535, "grad_norm_var": 3.4646501058545794e-06, "learning_rate": 0.0007315734348303232, "loss": 2.7138, "step": 6633 }, { "crossentropy": 2.635246992111206, "epoch": 0.564307587614835, "grad_norm": 0.031352847814559937, "grad_norm_var": 3.6093246208232972e-06, "learning_rate": 0.0007305303416386905, "loss": 2.6352, "step": 6634 }, { "crossentropy": 2.6770777702331543, "epoch": 0.5643926505614154, "grad_norm": 0.038114894181489944, "grad_norm_var": 4.557197682034976e-06, "learning_rate": 0.0007294879340008631, "loss": 2.6771, "step": 6635 }, { "crossentropy": 2.57039475440979, "epoch": 0.5644777135079959, "grad_norm": 0.035284899175167084, "grad_norm_var": 4.685122265505464e-06, "learning_rate": 0.0007284462120842217, "loss": 2.5704, "step": 6636 }, { "crossentropy": 2.555102586746216, "epoch": 0.5645627764545764, "grad_norm": 0.04148276150226593, "grad_norm_var": 8.497542517107993e-06, "learning_rate": 0.0007274051760560363, "loss": 2.5551, "step": 6637 }, { "crossentropy": 2.668322801589966, "epoch": 0.5646478394011568, "grad_norm": 0.03311320021748543, "grad_norm_var": 8.497075991779843e-06, "learning_rate": 0.000726364826083466, "loss": 2.6683, "step": 6638 }, { "crossentropy": 2.6022698879241943, "epoch": 0.5647329023477373, "grad_norm": 0.03492949157953262, "grad_norm_var": 8.138058120493672e-06, "learning_rate": 0.0007253251623335644, "loss": 2.6023, "step": 6639 }, { "crossentropy": 2.6044375896453857, "epoch": 0.5648179652943178, "grad_norm": 0.0339067243039608, "grad_norm_var": 8.102840363021813e-06, "learning_rate": 0.0007242861849732695, "loss": 2.6044, "step": 6640 }, { "crossentropy": 2.6846792697906494, "epoch": 0.5649030282408982, "grad_norm": 0.03320496529340744, "grad_norm_var": 8.09893610126209e-06, "learning_rate": 0.0007232478941694104, "loss": 2.6847, "step": 6641 }, { "crossentropy": 2.561967372894287, "epoch": 0.5649880911874787, "grad_norm": 0.03174816444516182, "grad_norm_var": 8.311284694196136e-06, "learning_rate": 0.00072221029008871, "loss": 2.562, "step": 6642 }, { "crossentropy": 2.720942735671997, "epoch": 0.5650731541340592, "grad_norm": 0.03259466215968132, "grad_norm_var": 8.286724335505348e-06, "learning_rate": 0.0007211733728977743, "loss": 2.7209, "step": 6643 }, { "crossentropy": 2.7065601348876953, "epoch": 0.5651582170806396, "grad_norm": 0.032557904720306396, "grad_norm_var": 7.916267568919219e-06, "learning_rate": 0.000720137142763102, "loss": 2.7066, "step": 6644 }, { "crossentropy": 2.5479960441589355, "epoch": 0.5652432800272201, "grad_norm": 0.03333110734820366, "grad_norm_var": 7.805885050166048e-06, "learning_rate": 0.0007191015998510842, "loss": 2.548, "step": 6645 }, { "crossentropy": 2.6924285888671875, "epoch": 0.5653283429738006, "grad_norm": 0.03446770831942558, "grad_norm_var": 7.541750465100451e-06, "learning_rate": 0.0007180667443279982, "loss": 2.6924, "step": 6646 }, { "crossentropy": 2.5796475410461426, "epoch": 0.565413405920381, "grad_norm": 0.03271956741809845, "grad_norm_var": 6.731996009826102e-06, "learning_rate": 0.0007170325763600105, "loss": 2.5796, "step": 6647 }, { "crossentropy": 2.6728858947753906, "epoch": 0.5654984688669615, "grad_norm": 0.037115033715963364, "grad_norm_var": 7.096489561638346e-06, "learning_rate": 0.0007159990961131818, "loss": 2.6729, "step": 6648 }, { "crossentropy": 2.658029794692993, "epoch": 0.565583531813542, "grad_norm": 0.03366749733686447, "grad_norm_var": 6.877642924270576e-06, "learning_rate": 0.0007149663037534587, "loss": 2.658, "step": 6649 }, { "crossentropy": 2.654303789138794, "epoch": 0.5656685947601225, "grad_norm": 0.035228509455919266, "grad_norm_var": 6.2679237641721205e-06, "learning_rate": 0.0007139341994466742, "loss": 2.6543, "step": 6650 }, { "crossentropy": 2.665472984313965, "epoch": 0.5657536577067029, "grad_norm": 0.03339933231472969, "grad_norm_var": 5.442523535928387e-06, "learning_rate": 0.0007129027833585583, "loss": 2.6655, "step": 6651 }, { "crossentropy": 2.6714537143707275, "epoch": 0.5658387206532834, "grad_norm": 0.03220241889357567, "grad_norm_var": 5.630342774270991e-06, "learning_rate": 0.000711872055654726, "loss": 2.6715, "step": 6652 }, { "crossentropy": 2.6520259380340576, "epoch": 0.565923783599864, "grad_norm": 0.03388341888785362, "grad_norm_var": 1.7635397987690478e-06, "learning_rate": 0.0007108420165006801, "loss": 2.652, "step": 6653 }, { "crossentropy": 2.6657192707061768, "epoch": 0.5660088465464443, "grad_norm": 0.03606655076146126, "grad_norm_var": 2.1054301458114353e-06, "learning_rate": 0.0007098126660618187, "loss": 2.6657, "step": 6654 }, { "crossentropy": 2.669435739517212, "epoch": 0.5660939094930248, "grad_norm": 0.0340094156563282, "grad_norm_var": 2.021486720743306e-06, "learning_rate": 0.0007087840045034238, "loss": 2.6694, "step": 6655 }, { "crossentropy": 2.532743453979492, "epoch": 0.5661789724396054, "grad_norm": 0.03619294986128807, "grad_norm_var": 2.3939757259447875e-06, "learning_rate": 0.0007077560319906695, "loss": 2.5327, "step": 6656 }, { "crossentropy": 2.680061101913452, "epoch": 0.5662640353861857, "grad_norm": 0.03187577798962593, "grad_norm_var": 2.6274548602882195e-06, "learning_rate": 0.0007067287486886182, "loss": 2.6801, "step": 6657 }, { "crossentropy": 2.528451442718506, "epoch": 0.5663490983327663, "grad_norm": 0.03318474069237709, "grad_norm_var": 2.3603107418482607e-06, "learning_rate": 0.0007057021547622222, "loss": 2.5285, "step": 6658 }, { "crossentropy": 2.596487283706665, "epoch": 0.5664341612793468, "grad_norm": 0.033127930015325546, "grad_norm_var": 2.2848422658145255e-06, "learning_rate": 0.0007046762503763226, "loss": 2.5965, "step": 6659 }, { "crossentropy": 2.6201024055480957, "epoch": 0.5665192242259272, "grad_norm": 0.03503790497779846, "grad_norm_var": 2.2124389490511705e-06, "learning_rate": 0.0007036510356956494, "loss": 2.6201, "step": 6660 }, { "crossentropy": 2.596234083175659, "epoch": 0.5666042871725077, "grad_norm": 0.0331684835255146, "grad_norm_var": 2.230641738097151e-06, "learning_rate": 0.000702626510884824, "loss": 2.5962, "step": 6661 }, { "crossentropy": 2.6249916553497314, "epoch": 0.5666893501190882, "grad_norm": 0.0346798449754715, "grad_norm_var": 2.2443017799115655e-06, "learning_rate": 0.0007016026761083555, "loss": 2.625, "step": 6662 }, { "crossentropy": 2.695686101913452, "epoch": 0.5667744130656686, "grad_norm": 0.033838480710983276, "grad_norm_var": 2.1169839176021775e-06, "learning_rate": 0.0007005795315306401, "loss": 2.6957, "step": 6663 }, { "crossentropy": 2.61810564994812, "epoch": 0.5668594760122491, "grad_norm": 0.03475860878825188, "grad_norm_var": 1.5379108967391216e-06, "learning_rate": 0.0006995570773159693, "loss": 2.6181, "step": 6664 }, { "crossentropy": 2.604555606842041, "epoch": 0.5669445389588296, "grad_norm": 0.03215058892965317, "grad_norm_var": 1.7530428985557122e-06, "learning_rate": 0.0006985353136285166, "loss": 2.6046, "step": 6665 }, { "crossentropy": 2.571394920349121, "epoch": 0.56702960190541, "grad_norm": 0.03220106661319733, "grad_norm_var": 1.7998326952881407e-06, "learning_rate": 0.000697514240632347, "loss": 2.5714, "step": 6666 }, { "crossentropy": 2.604086399078369, "epoch": 0.5671146648519905, "grad_norm": 0.033743277192115784, "grad_norm_var": 1.791782636835724e-06, "learning_rate": 0.0006964938584914188, "loss": 2.6041, "step": 6667 }, { "crossentropy": 2.7490248680114746, "epoch": 0.567199727798571, "grad_norm": 0.03235292062163353, "grad_norm_var": 1.7619908282785756e-06, "learning_rate": 0.0006954741673695741, "loss": 2.749, "step": 6668 }, { "crossentropy": 2.560379981994629, "epoch": 0.5672847907451514, "grad_norm": 0.034666016697883606, "grad_norm_var": 1.812417685004691e-06, "learning_rate": 0.0006944551674305443, "loss": 2.5604, "step": 6669 }, { "crossentropy": 2.6199791431427, "epoch": 0.5673698536917319, "grad_norm": 0.03305009752511978, "grad_norm_var": 1.4759108367271937e-06, "learning_rate": 0.0006934368588379553, "loss": 2.62, "step": 6670 }, { "crossentropy": 2.640045642852783, "epoch": 0.5674549166383124, "grad_norm": 0.034417375922203064, "grad_norm_var": 1.5070934418908698e-06, "learning_rate": 0.000692419241755316, "loss": 2.64, "step": 6671 }, { "crossentropy": 2.5833165645599365, "epoch": 0.5675399795848928, "grad_norm": 0.03120913729071617, "grad_norm_var": 1.371594440197019e-06, "learning_rate": 0.0006914023163460248, "loss": 2.5833, "step": 6672 }, { "crossentropy": 2.6908371448516846, "epoch": 0.5676250425314733, "grad_norm": 0.0390697717666626, "grad_norm_var": 3.200376522460626e-06, "learning_rate": 0.0006903860827733732, "loss": 2.6908, "step": 6673 }, { "crossentropy": 2.724032402038574, "epoch": 0.5677101054780538, "grad_norm": 0.03304027020931244, "grad_norm_var": 3.2133595114544033e-06, "learning_rate": 0.000689370541200538, "loss": 2.724, "step": 6674 }, { "crossentropy": 2.61667799949646, "epoch": 0.5677951684246342, "grad_norm": 0.03170290216803551, "grad_norm_var": 3.464551597052943e-06, "learning_rate": 0.0006883556917905858, "loss": 2.6167, "step": 6675 }, { "crossentropy": 2.492494583129883, "epoch": 0.5678802313712147, "grad_norm": 0.03240611404180527, "grad_norm_var": 3.425484860121395e-06, "learning_rate": 0.000687341534706471, "loss": 2.4925, "step": 6676 }, { "crossentropy": 2.6491687297821045, "epoch": 0.5679652943177952, "grad_norm": 0.03160552680492401, "grad_norm_var": 3.653173737821804e-06, "learning_rate": 0.0006863280701110408, "loss": 2.6492, "step": 6677 }, { "crossentropy": 2.6485517024993896, "epoch": 0.5680503572643756, "grad_norm": 0.03250015527009964, "grad_norm_var": 3.5870955986968174e-06, "learning_rate": 0.0006853152981670269, "loss": 2.6486, "step": 6678 }, { "crossentropy": 2.626875877380371, "epoch": 0.5681354202109561, "grad_norm": 0.03160477429628372, "grad_norm_var": 3.7369292239369336e-06, "learning_rate": 0.0006843032190370512, "loss": 2.6269, "step": 6679 }, { "crossentropy": 2.712054967880249, "epoch": 0.5682204831575366, "grad_norm": 0.033713024109601974, "grad_norm_var": 3.5816838484116485e-06, "learning_rate": 0.0006832918328836246, "loss": 2.7121, "step": 6680 }, { "crossentropy": 2.6471617221832275, "epoch": 0.5683055461041171, "grad_norm": 0.03433048725128174, "grad_norm_var": 3.605765200201394e-06, "learning_rate": 0.0006822811398691464, "loss": 2.6472, "step": 6681 }, { "crossentropy": 2.614551067352295, "epoch": 0.5683906090506975, "grad_norm": 0.031588032841682434, "grad_norm_var": 3.7130134419333107e-06, "learning_rate": 0.0006812711401559035, "loss": 2.6146, "step": 6682 }, { "crossentropy": 2.557673931121826, "epoch": 0.568475671997278, "grad_norm": 0.03175570070743561, "grad_norm_var": 3.8126286647149898e-06, "learning_rate": 0.0006802618339060756, "loss": 2.5577, "step": 6683 }, { "crossentropy": 2.623338222503662, "epoch": 0.5685607349438585, "grad_norm": 0.03464695066213608, "grad_norm_var": 3.924264725611801e-06, "learning_rate": 0.0006792532212817271, "loss": 2.6233, "step": 6684 }, { "crossentropy": 2.678772449493408, "epoch": 0.5686457978904389, "grad_norm": 0.035408925265073776, "grad_norm_var": 4.103316486344053e-06, "learning_rate": 0.000678245302444811, "loss": 2.6788, "step": 6685 }, { "crossentropy": 2.647491693496704, "epoch": 0.5687308608370194, "grad_norm": 0.03236021846532822, "grad_norm_var": 4.151733225983787e-06, "learning_rate": 0.0006772380775571712, "loss": 2.6475, "step": 6686 }, { "crossentropy": 2.663414716720581, "epoch": 0.5688159237835999, "grad_norm": 0.03161175549030304, "grad_norm_var": 4.192029079350817e-06, "learning_rate": 0.0006762315467805392, "loss": 2.6634, "step": 6687 }, { "crossentropy": 2.635530710220337, "epoch": 0.5689009867301803, "grad_norm": 0.03356030955910683, "grad_norm_var": 3.965263256059763e-06, "learning_rate": 0.0006752257102765325, "loss": 2.6355, "step": 6688 }, { "crossentropy": 2.578054428100586, "epoch": 0.5689860496767608, "grad_norm": 0.03426451236009598, "grad_norm_var": 1.635834319448313e-06, "learning_rate": 0.0006742205682066621, "loss": 2.5781, "step": 6689 }, { "crossentropy": 2.5213799476623535, "epoch": 0.5690711126233413, "grad_norm": 0.03388291597366333, "grad_norm_var": 1.698081313975669e-06, "learning_rate": 0.000673216120732324, "loss": 2.5214, "step": 6690 }, { "crossentropy": 2.5399811267852783, "epoch": 0.5691561755699217, "grad_norm": 0.0311263557523489, "grad_norm_var": 1.813486537780235e-06, "learning_rate": 0.0006722123680148029, "loss": 2.54, "step": 6691 }, { "crossentropy": 2.4900803565979004, "epoch": 0.5692412385165022, "grad_norm": 0.03553912043571472, "grad_norm_var": 2.2215505367978997e-06, "learning_rate": 0.0006712093102152739, "loss": 2.4901, "step": 6692 }, { "crossentropy": 2.625396251678467, "epoch": 0.5693263014630827, "grad_norm": 0.03328806161880493, "grad_norm_var": 2.064635583738751e-06, "learning_rate": 0.0006702069474947991, "loss": 2.6254, "step": 6693 }, { "crossentropy": 2.632108688354492, "epoch": 0.5694113644096631, "grad_norm": 0.0345509797334671, "grad_norm_var": 2.1364549050798307e-06, "learning_rate": 0.0006692052800143267, "loss": 2.6321, "step": 6694 }, { "crossentropy": 2.597200870513916, "epoch": 0.5694964273562436, "grad_norm": 0.03625449910759926, "grad_norm_var": 2.4199796367347134e-06, "learning_rate": 0.0006682043079346983, "loss": 2.5972, "step": 6695 }, { "crossentropy": 2.636577606201172, "epoch": 0.5695814903028241, "grad_norm": 0.03334670886397362, "grad_norm_var": 2.4237063690697116e-06, "learning_rate": 0.00066720403141664, "loss": 2.6366, "step": 6696 }, { "crossentropy": 2.572722911834717, "epoch": 0.5696665532494045, "grad_norm": 0.03134413808584213, "grad_norm_var": 2.688131581044537e-06, "learning_rate": 0.0006662044506207681, "loss": 2.5727, "step": 6697 }, { "crossentropy": 2.609262704849243, "epoch": 0.569751616195985, "grad_norm": 0.03260038048028946, "grad_norm_var": 2.5065159745931655e-06, "learning_rate": 0.0006652055657075845, "loss": 2.6093, "step": 6698 }, { "crossentropy": 2.563366413116455, "epoch": 0.5698366791425655, "grad_norm": 0.03489270433783531, "grad_norm_var": 2.4039674823939645e-06, "learning_rate": 0.000664207376837484, "loss": 2.5634, "step": 6699 }, { "crossentropy": 2.6846923828125, "epoch": 0.5699217420891459, "grad_norm": 0.032771773636341095, "grad_norm_var": 2.378826879169043e-06, "learning_rate": 0.0006632098841707457, "loss": 2.6847, "step": 6700 }, { "crossentropy": 2.6618916988372803, "epoch": 0.5700068050357264, "grad_norm": 0.03332347050309181, "grad_norm_var": 2.1338114156762976e-06, "learning_rate": 0.000662213087867538, "loss": 2.6619, "step": 6701 }, { "crossentropy": 2.6703946590423584, "epoch": 0.5700918679823069, "grad_norm": 0.03362752124667168, "grad_norm_var": 2.0551368566579327e-06, "learning_rate": 0.0006612169880879182, "loss": 2.6704, "step": 6702 }, { "crossentropy": 2.639148473739624, "epoch": 0.5701769309288873, "grad_norm": 0.03229077160358429, "grad_norm_var": 1.913083865066065e-06, "learning_rate": 0.0006602215849918308, "loss": 2.6391, "step": 6703 }, { "crossentropy": 2.626984119415283, "epoch": 0.5702619938754678, "grad_norm": 0.0328100211918354, "grad_norm_var": 1.946386871727934e-06, "learning_rate": 0.0006592268787391076, "loss": 2.627, "step": 6704 }, { "crossentropy": 2.7333085536956787, "epoch": 0.5703470568220483, "grad_norm": 0.03220688924193382, "grad_norm_var": 1.9997806579578053e-06, "learning_rate": 0.000658232869489473, "loss": 2.7333, "step": 6705 }, { "crossentropy": 2.6218857765197754, "epoch": 0.5704321197686287, "grad_norm": 0.032781925052404404, "grad_norm_var": 1.999662192654266e-06, "learning_rate": 0.0006572395574025336, "loss": 2.6219, "step": 6706 }, { "crossentropy": 2.6025803089141846, "epoch": 0.5705171827152092, "grad_norm": 0.03208422288298607, "grad_norm_var": 1.7797548542585727e-06, "learning_rate": 0.0006562469426377876, "loss": 2.6026, "step": 6707 }, { "crossentropy": 2.741621494293213, "epoch": 0.5706022456617897, "grad_norm": 0.03343939408659935, "grad_norm_var": 1.4444147098222523e-06, "learning_rate": 0.0006552550253546207, "loss": 2.7416, "step": 6708 }, { "crossentropy": 2.6531169414520264, "epoch": 0.5706873086083702, "grad_norm": 0.03235454484820366, "grad_norm_var": 1.4911360693675135e-06, "learning_rate": 0.0006542638057123057, "loss": 2.6531, "step": 6709 }, { "crossentropy": 2.635129928588867, "epoch": 0.5707723715549506, "grad_norm": 0.03277526795864105, "grad_norm_var": 1.3606524395359231e-06, "learning_rate": 0.0006532732838700023, "loss": 2.6351, "step": 6710 }, { "crossentropy": 2.674287796020508, "epoch": 0.5708574345015311, "grad_norm": 0.036626074463129044, "grad_norm_var": 1.5277206703522817e-06, "learning_rate": 0.0006522834599867628, "loss": 2.6743, "step": 6711 }, { "crossentropy": 2.6814637184143066, "epoch": 0.5709424974481117, "grad_norm": 0.03269364684820175, "grad_norm_var": 1.531129827016716e-06, "learning_rate": 0.0006512943342215232, "loss": 2.6815, "step": 6712 }, { "crossentropy": 2.6637938022613525, "epoch": 0.571027560394692, "grad_norm": 0.033033452928066254, "grad_norm_var": 1.327754961919873e-06, "learning_rate": 0.0006503059067331085, "loss": 2.6638, "step": 6713 }, { "crossentropy": 2.6080212593078613, "epoch": 0.5711126233412726, "grad_norm": 0.033860739320516586, "grad_norm_var": 1.3355977304595086e-06, "learning_rate": 0.0006493181776802315, "loss": 2.608, "step": 6714 }, { "crossentropy": 2.616450548171997, "epoch": 0.5711976862878531, "grad_norm": 0.033092401921749115, "grad_norm_var": 1.1374357260442633e-06, "learning_rate": 0.0006483311472214931, "loss": 2.6165, "step": 6715 }, { "crossentropy": 2.653639078140259, "epoch": 0.5712827492344335, "grad_norm": 0.032089635729789734, "grad_norm_var": 1.1973488866332446e-06, "learning_rate": 0.0006473448155153805, "loss": 2.6536, "step": 6716 }, { "crossentropy": 2.6887710094451904, "epoch": 0.571367812181014, "grad_norm": 0.03159763664007187, "grad_norm_var": 1.3247471487162188e-06, "learning_rate": 0.000646359182720273, "loss": 2.6888, "step": 6717 }, { "crossentropy": 2.682053565979004, "epoch": 0.5714528751275945, "grad_norm": 0.03266168385744095, "grad_norm_var": 1.29712087076142e-06, "learning_rate": 0.0006453742489944337, "loss": 2.6821, "step": 6718 }, { "crossentropy": 2.7045741081237793, "epoch": 0.5715379380741749, "grad_norm": 0.03285866975784302, "grad_norm_var": 1.271155020757947e-06, "learning_rate": 0.0006443900144960141, "loss": 2.7046, "step": 6719 }, { "crossentropy": 2.6202304363250732, "epoch": 0.5716230010207554, "grad_norm": 0.033022042363882065, "grad_norm_var": 1.2704205284097548e-06, "learning_rate": 0.000643406479383053, "loss": 2.6202, "step": 6720 }, { "crossentropy": 2.5207862854003906, "epoch": 0.5717080639673359, "grad_norm": 0.03139815106987953, "grad_norm_var": 1.391283323265204e-06, "learning_rate": 0.0006424236438134812, "loss": 2.5208, "step": 6721 }, { "crossentropy": 2.5867624282836914, "epoch": 0.5717931269139163, "grad_norm": 0.03283200412988663, "grad_norm_var": 1.390664389348759e-06, "learning_rate": 0.0006414415079451091, "loss": 2.5868, "step": 6722 }, { "crossentropy": 2.7105743885040283, "epoch": 0.5718781898604968, "grad_norm": 0.03603963926434517, "grad_norm_var": 1.9376200317680037e-06, "learning_rate": 0.0006404600719356429, "loss": 2.7106, "step": 6723 }, { "crossentropy": 2.6417524814605713, "epoch": 0.5719632528070773, "grad_norm": 0.033362701535224915, "grad_norm_var": 1.9350124044544465e-06, "learning_rate": 0.0006394793359426721, "loss": 2.6418, "step": 6724 }, { "crossentropy": 2.567119598388672, "epoch": 0.5720483157536577, "grad_norm": 0.03200339525938034, "grad_norm_var": 1.9796645773371865e-06, "learning_rate": 0.0006384993001236744, "loss": 2.5671, "step": 6725 }, { "crossentropy": 2.6895861625671387, "epoch": 0.5721333787002382, "grad_norm": 0.03219400718808174, "grad_norm_var": 2.0276297873905724e-06, "learning_rate": 0.0006375199646360141, "loss": 2.6896, "step": 6726 }, { "crossentropy": 2.6484804153442383, "epoch": 0.5722184416468187, "grad_norm": 0.03469986841082573, "grad_norm_var": 1.3501708594522432e-06, "learning_rate": 0.0006365413296369466, "loss": 2.6485, "step": 6727 }, { "crossentropy": 2.63254714012146, "epoch": 0.5723035045933991, "grad_norm": 0.03166946396231651, "grad_norm_var": 1.4527828747778448e-06, "learning_rate": 0.0006355633952836115, "loss": 2.6325, "step": 6728 }, { "crossentropy": 2.7161643505096436, "epoch": 0.5723885675399796, "grad_norm": 0.034379757940769196, "grad_norm_var": 1.589848405445365e-06, "learning_rate": 0.0006345861617330367, "loss": 2.7162, "step": 6729 }, { "crossentropy": 2.569704532623291, "epoch": 0.5724736304865601, "grad_norm": 0.035989321768283844, "grad_norm_var": 2.121539904446218e-06, "learning_rate": 0.0006336096291421378, "loss": 2.5697, "step": 6730 }, { "crossentropy": 2.5405113697052, "epoch": 0.5725586934331405, "grad_norm": 0.033207278698682785, "grad_norm_var": 2.1219703341119125e-06, "learning_rate": 0.0006326337976677176, "loss": 2.5405, "step": 6731 }, { "crossentropy": 2.725780963897705, "epoch": 0.572643756379721, "grad_norm": 0.03432322293519974, "grad_norm_var": 2.1253359431547617e-06, "learning_rate": 0.0006316586674664653, "loss": 2.7258, "step": 6732 }, { "crossentropy": 2.574519634246826, "epoch": 0.5727288193263015, "grad_norm": 0.032464396208524704, "grad_norm_var": 1.9796050437927147e-06, "learning_rate": 0.0006306842386949619, "loss": 2.5745, "step": 6733 }, { "crossentropy": 2.7075366973876953, "epoch": 0.5728138822728819, "grad_norm": 0.03253195434808731, "grad_norm_var": 1.9920284107445564e-06, "learning_rate": 0.0006297105115096701, "loss": 2.7075, "step": 6734 }, { "crossentropy": 2.631657123565674, "epoch": 0.5728989452194624, "grad_norm": 0.030767066404223442, "grad_norm_var": 2.391597576219826e-06, "learning_rate": 0.0006287374860669437, "loss": 2.6317, "step": 6735 }, { "crossentropy": 2.677624225616455, "epoch": 0.5729840081660429, "grad_norm": 0.03521307557821274, "grad_norm_var": 2.6454133536447786e-06, "learning_rate": 0.0006277651625230218, "loss": 2.6776, "step": 6736 }, { "crossentropy": 2.665407419204712, "epoch": 0.5730690711126234, "grad_norm": 0.03495045006275177, "grad_norm_var": 2.525148935064248e-06, "learning_rate": 0.000626793541034032, "loss": 2.6654, "step": 6737 }, { "crossentropy": 2.685999631881714, "epoch": 0.5731541340592038, "grad_norm": 0.03416319563984871, "grad_norm_var": 2.5103771322773726e-06, "learning_rate": 0.000625822621755987, "loss": 2.686, "step": 6738 }, { "crossentropy": 2.6449995040893555, "epoch": 0.5732391970057843, "grad_norm": 0.03429567068815231, "grad_norm_var": 2.138393449733798e-06, "learning_rate": 0.0006248524048447912, "loss": 2.645, "step": 6739 }, { "crossentropy": 2.640810251235962, "epoch": 0.5733242599523648, "grad_norm": 0.03327103331685066, "grad_norm_var": 2.1407608683310694e-06, "learning_rate": 0.0006238828904562316, "loss": 2.6408, "step": 6740 }, { "crossentropy": 2.6712934970855713, "epoch": 0.5734093228989452, "grad_norm": 0.033713266253471375, "grad_norm_var": 1.980534531333686e-06, "learning_rate": 0.0006229140787459852, "loss": 2.6713, "step": 6741 }, { "crossentropy": 2.6385161876678467, "epoch": 0.5734943858455257, "grad_norm": 0.03220037743449211, "grad_norm_var": 1.979330494375009e-06, "learning_rate": 0.0006219459698696139, "loss": 2.6385, "step": 6742 }, { "crossentropy": 2.640451192855835, "epoch": 0.5735794487921062, "grad_norm": 0.04227280989289284, "grad_norm_var": 6.659127881425752e-06, "learning_rate": 0.0006209785639825716, "loss": 2.6405, "step": 6743 }, { "crossentropy": 2.712294340133667, "epoch": 0.5736645117386866, "grad_norm": 0.03300734609365463, "grad_norm_var": 6.33952119955225e-06, "learning_rate": 0.0006200118612401917, "loss": 2.7123, "step": 6744 }, { "crossentropy": 2.6233558654785156, "epoch": 0.5737495746852671, "grad_norm": 0.03274010866880417, "grad_norm_var": 6.462105010285194e-06, "learning_rate": 0.0006190458617976996, "loss": 2.6234, "step": 6745 }, { "crossentropy": 2.5682830810546875, "epoch": 0.5738346376318476, "grad_norm": 0.03250065818428993, "grad_norm_var": 6.329721931144305e-06, "learning_rate": 0.0006180805658102085, "loss": 2.5683, "step": 6746 }, { "crossentropy": 2.64803147315979, "epoch": 0.573919700578428, "grad_norm": 0.03417165204882622, "grad_norm_var": 6.305028740826339e-06, "learning_rate": 0.0006171159734327164, "loss": 2.648, "step": 6747 }, { "crossentropy": 2.6816744804382324, "epoch": 0.5740047635250085, "grad_norm": 0.03239437937736511, "grad_norm_var": 6.431706240352904e-06, "learning_rate": 0.0006161520848201085, "loss": 2.6817, "step": 6748 }, { "crossentropy": 2.5799760818481445, "epoch": 0.574089826471589, "grad_norm": 0.031412746757268906, "grad_norm_var": 6.686858059001923e-06, "learning_rate": 0.0006151889001271588, "loss": 2.58, "step": 6749 }, { "crossentropy": 2.6263833045959473, "epoch": 0.5741748894181694, "grad_norm": 0.03346194326877594, "grad_norm_var": 6.592932244847448e-06, "learning_rate": 0.0006142264195085274, "loss": 2.6264, "step": 6750 }, { "crossentropy": 2.6446638107299805, "epoch": 0.5742599523647499, "grad_norm": 0.0323396660387516, "grad_norm_var": 6.115016333557479e-06, "learning_rate": 0.0006132646431187572, "loss": 2.6447, "step": 6751 }, { "crossentropy": 2.636986255645752, "epoch": 0.5743450153113304, "grad_norm": 0.031900178641080856, "grad_norm_var": 6.212909596729708e-06, "learning_rate": 0.000612303571112286, "loss": 2.637, "step": 6752 }, { "crossentropy": 2.6146326065063477, "epoch": 0.5744300782579108, "grad_norm": 0.032547976821660995, "grad_norm_var": 6.164996886118962e-06, "learning_rate": 0.0006113432036434324, "loss": 2.6146, "step": 6753 }, { "crossentropy": 2.675194025039673, "epoch": 0.5745151412044913, "grad_norm": 0.03335769474506378, "grad_norm_var": 6.136959652685604e-06, "learning_rate": 0.0006103835408664032, "loss": 2.6752, "step": 6754 }, { "crossentropy": 2.5738134384155273, "epoch": 0.5746002041510718, "grad_norm": 0.032481562346220016, "grad_norm_var": 6.143952885924684e-06, "learning_rate": 0.0006094245829352946, "loss": 2.5738, "step": 6755 }, { "crossentropy": 2.509221076965332, "epoch": 0.5746852670976522, "grad_norm": 0.03212675824761391, "grad_norm_var": 6.23948964955263e-06, "learning_rate": 0.0006084663300040866, "loss": 2.5092, "step": 6756 }, { "crossentropy": 2.6374456882476807, "epoch": 0.5747703300442327, "grad_norm": 0.03436720743775368, "grad_norm_var": 6.303181853991625e-06, "learning_rate": 0.0006075087822266462, "loss": 2.6374, "step": 6757 }, { "crossentropy": 2.6759824752807617, "epoch": 0.5748553929908132, "grad_norm": 0.032129161059856415, "grad_norm_var": 6.314227009166645e-06, "learning_rate": 0.0006065519397567287, "loss": 2.676, "step": 6758 }, { "crossentropy": 2.50099778175354, "epoch": 0.5749404559373936, "grad_norm": 0.032454852014780045, "grad_norm_var": 6.264856409456121e-07, "learning_rate": 0.0006055958027479757, "loss": 2.501, "step": 6759 }, { "crossentropy": 2.4815783500671387, "epoch": 0.5750255188839741, "grad_norm": 0.03284880518913269, "grad_norm_var": 6.21815833184991e-07, "learning_rate": 0.0006046403713539139, "loss": 2.4816, "step": 6760 }, { "crossentropy": 2.5485925674438477, "epoch": 0.5751105818305546, "grad_norm": 0.03268721327185631, "grad_norm_var": 6.217234106551187e-07, "learning_rate": 0.0006036856457279599, "loss": 2.5486, "step": 6761 }, { "crossentropy": 2.582090139389038, "epoch": 0.575195644777135, "grad_norm": 0.03250709921121597, "grad_norm_var": 6.215557498225055e-07, "learning_rate": 0.0006027316260234144, "loss": 2.5821, "step": 6762 }, { "crossentropy": 2.631338596343994, "epoch": 0.5752807077237155, "grad_norm": 0.0324191153049469, "grad_norm_var": 4.6947191500440234e-07, "learning_rate": 0.000601778312393465, "loss": 2.6313, "step": 6763 }, { "crossentropy": 2.6452109813690186, "epoch": 0.575365770670296, "grad_norm": 0.03360307961702347, "grad_norm_var": 5.292921226371719e-07, "learning_rate": 0.0006008257049911863, "loss": 2.6452, "step": 6764 }, { "crossentropy": 2.663374185562134, "epoch": 0.5754508336168764, "grad_norm": 0.03487204760313034, "grad_norm_var": 6.994794757075873e-07, "learning_rate": 0.0005998738039695418, "loss": 2.6634, "step": 6765 }, { "crossentropy": 2.629014730453491, "epoch": 0.5755358965634569, "grad_norm": 0.03332992643117905, "grad_norm_var": 6.903520458639382e-07, "learning_rate": 0.0005989226094813766, "loss": 2.629, "step": 6766 }, { "crossentropy": 2.6999382972717285, "epoch": 0.5756209595100374, "grad_norm": 0.03505140542984009, "grad_norm_var": 9.570145109687988e-07, "learning_rate": 0.0005979721216794248, "loss": 2.6999, "step": 6767 }, { "crossentropy": 2.5670931339263916, "epoch": 0.575706022456618, "grad_norm": 0.03286532685160637, "grad_norm_var": 8.681998551862112e-07, "learning_rate": 0.00059702234071631, "loss": 2.5671, "step": 6768 }, { "crossentropy": 2.5977866649627686, "epoch": 0.5757910854031983, "grad_norm": 0.03313829377293587, "grad_norm_var": 8.462881493494108e-07, "learning_rate": 0.000596073266744539, "loss": 2.5978, "step": 6769 }, { "crossentropy": 2.5724899768829346, "epoch": 0.5758761483497788, "grad_norm": 0.032795701175928116, "grad_norm_var": 8.497134270081993e-07, "learning_rate": 0.0005951248999165032, "loss": 2.5725, "step": 6770 }, { "crossentropy": 2.616197109222412, "epoch": 0.5759612112963594, "grad_norm": 0.03387989476323128, "grad_norm_var": 8.557138543823876e-07, "learning_rate": 0.0005941772403844864, "loss": 2.6162, "step": 6771 }, { "crossentropy": 2.626021385192871, "epoch": 0.5760462742429397, "grad_norm": 0.03146260604262352, "grad_norm_var": 9.776350251150534e-07, "learning_rate": 0.0005932302883006547, "loss": 2.626, "step": 6772 }, { "crossentropy": 2.6720147132873535, "epoch": 0.5761313371895203, "grad_norm": 0.03433142602443695, "grad_norm_var": 9.719114233150764e-07, "learning_rate": 0.0005922840438170585, "loss": 2.672, "step": 6773 }, { "crossentropy": 2.6146576404571533, "epoch": 0.5762164001361008, "grad_norm": 0.033711619675159454, "grad_norm_var": 9.133480956624931e-07, "learning_rate": 0.0005913385070856403, "loss": 2.6147, "step": 6774 }, { "crossentropy": 2.648196220397949, "epoch": 0.5763014630826812, "grad_norm": 0.03230776637792587, "grad_norm_var": 9.302432373903269e-07, "learning_rate": 0.0005903936782582253, "loss": 2.6482, "step": 6775 }, { "crossentropy": 2.605846881866455, "epoch": 0.5763865260292617, "grad_norm": 0.0351358987390995, "grad_norm_var": 1.1384213137306694e-06, "learning_rate": 0.000589449557486525, "loss": 2.6058, "step": 6776 }, { "crossentropy": 2.5952200889587402, "epoch": 0.5764715889758422, "grad_norm": 0.03447907418012619, "grad_norm_var": 1.173302109256739e-06, "learning_rate": 0.0005885061449221391, "loss": 2.5952, "step": 6777 }, { "crossentropy": 2.657944917678833, "epoch": 0.5765566519224226, "grad_norm": 0.03194120526313782, "grad_norm_var": 1.2677163197358696e-06, "learning_rate": 0.0005875634407165519, "loss": 2.6579, "step": 6778 }, { "crossentropy": 2.6218056678771973, "epoch": 0.5766417148690031, "grad_norm": 0.034357525408267975, "grad_norm_var": 1.2341096741215936e-06, "learning_rate": 0.0005866214450211355, "loss": 2.6218, "step": 6779 }, { "crossentropy": 2.641918659210205, "epoch": 0.5767267778155836, "grad_norm": 0.03289052098989487, "grad_norm_var": 1.2635485183052722e-06, "learning_rate": 0.0005856801579871457, "loss": 2.6419, "step": 6780 }, { "crossentropy": 2.6270158290863037, "epoch": 0.576811840762164, "grad_norm": 0.03257770463824272, "grad_norm_var": 1.183343055038919e-06, "learning_rate": 0.0005847395797657268, "loss": 2.627, "step": 6781 }, { "crossentropy": 2.545950412750244, "epoch": 0.5768969037087445, "grad_norm": 0.033824849873781204, "grad_norm_var": 1.1946225787003625e-06, "learning_rate": 0.000583799710507909, "loss": 2.546, "step": 6782 }, { "crossentropy": 2.627460241317749, "epoch": 0.576981966655325, "grad_norm": 0.03388292342424393, "grad_norm_var": 1.026088011623951e-06, "learning_rate": 0.0005828605503646067, "loss": 2.6275, "step": 6783 }, { "crossentropy": 2.6099588871002197, "epoch": 0.5770670296019054, "grad_norm": 0.03222400322556496, "grad_norm_var": 1.0931439269149894e-06, "learning_rate": 0.0005819220994866237, "loss": 2.61, "step": 6784 }, { "crossentropy": 2.564790725708008, "epoch": 0.5771520925484859, "grad_norm": 0.03250943496823311, "grad_norm_var": 1.132158087369106e-06, "learning_rate": 0.0005809843580246477, "loss": 2.5648, "step": 6785 }, { "crossentropy": 2.61043381690979, "epoch": 0.5772371554950664, "grad_norm": 0.033390238881111145, "grad_norm_var": 1.1166906778339291e-06, "learning_rate": 0.0005800473261292527, "loss": 2.6104, "step": 6786 }, { "crossentropy": 2.5720057487487793, "epoch": 0.5773222184416468, "grad_norm": 0.03366262465715408, "grad_norm_var": 1.1030350743364743e-06, "learning_rate": 0.000579111003950899, "loss": 2.572, "step": 6787 }, { "crossentropy": 2.579926013946533, "epoch": 0.5774072813882273, "grad_norm": 0.03212212026119232, "grad_norm_var": 9.692560827829002e-07, "learning_rate": 0.0005781753916399335, "loss": 2.5799, "step": 6788 }, { "crossentropy": 2.5916199684143066, "epoch": 0.5774923443348078, "grad_norm": 0.033922240138053894, "grad_norm_var": 9.2531979714542e-07, "learning_rate": 0.0005772404893465866, "loss": 2.5916, "step": 6789 }, { "crossentropy": 2.6341466903686523, "epoch": 0.5775774072813882, "grad_norm": 0.03159051015973091, "grad_norm_var": 1.0925720622279033e-06, "learning_rate": 0.0005763062972209792, "loss": 2.6341, "step": 6790 }, { "crossentropy": 2.5940818786621094, "epoch": 0.5776624702279687, "grad_norm": 0.03244585916399956, "grad_norm_var": 1.0777746339419094e-06, "learning_rate": 0.000575372815413115, "loss": 2.5941, "step": 6791 }, { "crossentropy": 2.7228901386260986, "epoch": 0.5777475331745492, "grad_norm": 0.03334510698914528, "grad_norm_var": 8.123388884408977e-07, "learning_rate": 0.0005744400440728826, "loss": 2.7229, "step": 6792 }, { "crossentropy": 2.583280086517334, "epoch": 0.5778325961211296, "grad_norm": 0.031069118529558182, "grad_norm_var": 8.997310140988295e-07, "learning_rate": 0.0005735079833500606, "loss": 2.5833, "step": 6793 }, { "crossentropy": 2.643106698989868, "epoch": 0.5779176590677101, "grad_norm": 0.03265004977583885, "grad_norm_var": 8.443208245190528e-07, "learning_rate": 0.0005725766333943111, "loss": 2.6431, "step": 6794 }, { "crossentropy": 2.5512211322784424, "epoch": 0.5780027220142906, "grad_norm": 0.03358815610408783, "grad_norm_var": 7.32215345732902e-07, "learning_rate": 0.0005716459943551788, "loss": 2.5512, "step": 6795 }, { "crossentropy": 2.6574106216430664, "epoch": 0.5780877849608711, "grad_norm": 0.03635244071483612, "grad_norm_var": 1.4972209038412987e-06, "learning_rate": 0.000570716066382101, "loss": 2.6574, "step": 6796 }, { "crossentropy": 2.5584728717803955, "epoch": 0.5781728479074515, "grad_norm": 0.033166803419589996, "grad_norm_var": 1.4800591483804064e-06, "learning_rate": 0.0005697868496243958, "loss": 2.5585, "step": 6797 }, { "crossentropy": 2.5838279724121094, "epoch": 0.578257910854032, "grad_norm": 0.033935610204935074, "grad_norm_var": 1.4913953022978322e-06, "learning_rate": 0.0005688583442312678, "loss": 2.5838, "step": 6798 }, { "crossentropy": 2.5749459266662598, "epoch": 0.5783429738006125, "grad_norm": 0.03627494350075722, "grad_norm_var": 2.09358007241389e-06, "learning_rate": 0.0005679305503518106, "loss": 2.5749, "step": 6799 }, { "crossentropy": 2.7079036235809326, "epoch": 0.5784280367471929, "grad_norm": 0.03288776054978371, "grad_norm_var": 2.028935479091107e-06, "learning_rate": 0.0005670034681349995, "loss": 2.7079, "step": 6800 }, { "crossentropy": 2.6837496757507324, "epoch": 0.5785130996937734, "grad_norm": 0.034002065658569336, "grad_norm_var": 2.0094401257978825e-06, "learning_rate": 0.000566077097729698, "loss": 2.6837, "step": 6801 }, { "crossentropy": 2.58280086517334, "epoch": 0.5785981626403539, "grad_norm": 0.031802576035261154, "grad_norm_var": 2.1691232586125314e-06, "learning_rate": 0.0005651514392846535, "loss": 2.5828, "step": 6802 }, { "crossentropy": 2.6700220108032227, "epoch": 0.5786832255869343, "grad_norm": 0.03522729501128197, "grad_norm_var": 2.3975525648117778e-06, "learning_rate": 0.0005642264929485008, "loss": 2.67, "step": 6803 }, { "crossentropy": 2.722780704498291, "epoch": 0.5787682885335148, "grad_norm": 0.03567586466670036, "grad_norm_var": 2.5818838361247112e-06, "learning_rate": 0.0005633022588697595, "loss": 2.7228, "step": 6804 }, { "crossentropy": 2.5578625202178955, "epoch": 0.5788533514800953, "grad_norm": 0.0336419977247715, "grad_norm_var": 2.5755372247545277e-06, "learning_rate": 0.0005623787371968336, "loss": 2.5579, "step": 6805 }, { "crossentropy": 2.557762861251831, "epoch": 0.5789384144266757, "grad_norm": 0.031978730112314224, "grad_norm_var": 2.4807586742679282e-06, "learning_rate": 0.0005614559280780169, "loss": 2.5578, "step": 6806 }, { "crossentropy": 2.611696243286133, "epoch": 0.5790234773732562, "grad_norm": 0.03262604400515556, "grad_norm_var": 2.454392759164888e-06, "learning_rate": 0.0005605338316614838, "loss": 2.6117, "step": 6807 }, { "crossentropy": 2.5493032932281494, "epoch": 0.5791085403198367, "grad_norm": 0.0335073359310627, "grad_norm_var": 2.449679827095282e-06, "learning_rate": 0.0005596124480952974, "loss": 2.5493, "step": 6808 }, { "crossentropy": 2.723193883895874, "epoch": 0.5791936032664171, "grad_norm": 0.03247900679707527, "grad_norm_var": 2.0889043000423452e-06, "learning_rate": 0.0005586917775274047, "loss": 2.7232, "step": 6809 }, { "crossentropy": 2.588419198989868, "epoch": 0.5792786662129976, "grad_norm": 0.03202467039227486, "grad_norm_var": 2.2040065699333494e-06, "learning_rate": 0.0005577718201056392, "loss": 2.5884, "step": 6810 }, { "crossentropy": 2.570080041885376, "epoch": 0.5793637291595781, "grad_norm": 0.03356429934501648, "grad_norm_var": 2.204392200299878e-06, "learning_rate": 0.0005568525759777188, "loss": 2.5701, "step": 6811 }, { "crossentropy": 2.579836368560791, "epoch": 0.5794487921061585, "grad_norm": 0.03276415169239044, "grad_norm_var": 1.7385294298823128e-06, "learning_rate": 0.0005559340452912487, "loss": 2.5798, "step": 6812 }, { "crossentropy": 2.760707378387451, "epoch": 0.579533855052739, "grad_norm": 0.034412965178489685, "grad_norm_var": 1.784802664923461e-06, "learning_rate": 0.0005550162281937188, "loss": 2.7607, "step": 6813 }, { "crossentropy": 2.598684310913086, "epoch": 0.5796189179993195, "grad_norm": 0.03185330703854561, "grad_norm_var": 1.948833099637189e-06, "learning_rate": 0.0005540991248325023, "loss": 2.5987, "step": 6814 }, { "crossentropy": 2.6775050163269043, "epoch": 0.5797039809458999, "grad_norm": 0.03305009379982948, "grad_norm_var": 1.371324069410207e-06, "learning_rate": 0.0005531827353548619, "loss": 2.6775, "step": 6815 }, { "crossentropy": 2.650458335876465, "epoch": 0.5797890438924804, "grad_norm": 0.0324564091861248, "grad_norm_var": 1.4019828362075066e-06, "learning_rate": 0.0005522670599079416, "loss": 2.6505, "step": 6816 }, { "crossentropy": 2.650209665298462, "epoch": 0.5798741068390609, "grad_norm": 0.03309571370482445, "grad_norm_var": 1.3553918267960574e-06, "learning_rate": 0.0005513520986387721, "loss": 2.6502, "step": 6817 }, { "crossentropy": 2.5620265007019043, "epoch": 0.5799591697856413, "grad_norm": 0.03145035356283188, "grad_norm_var": 1.425721592443474e-06, "learning_rate": 0.0005504378516942715, "loss": 2.562, "step": 6818 }, { "crossentropy": 2.6632251739501953, "epoch": 0.5800442327322218, "grad_norm": 0.032500166445970535, "grad_norm_var": 1.121759983426183e-06, "learning_rate": 0.0005495243192212407, "loss": 2.6632, "step": 6819 }, { "crossentropy": 2.646475315093994, "epoch": 0.5801292956788023, "grad_norm": 0.032199542969465256, "grad_norm_var": 6.101522582751632e-07, "learning_rate": 0.0005486115013663668, "loss": 2.6465, "step": 6820 }, { "crossentropy": 2.5656464099884033, "epoch": 0.5802143586253827, "grad_norm": 0.03273678198456764, "grad_norm_var": 5.50724409984632e-07, "learning_rate": 0.0005476993982762207, "loss": 2.5656, "step": 6821 }, { "crossentropy": 2.6175756454467773, "epoch": 0.5802994215719632, "grad_norm": 0.03214409202337265, "grad_norm_var": 5.372203003700093e-07, "learning_rate": 0.0005467880100972633, "loss": 2.6176, "step": 6822 }, { "crossentropy": 2.5032949447631836, "epoch": 0.5803844845185437, "grad_norm": 0.031093278899788857, "grad_norm_var": 6.948903321864758e-07, "learning_rate": 0.0005458773369758329, "loss": 2.5033, "step": 6823 }, { "crossentropy": 2.5579288005828857, "epoch": 0.5804695474651242, "grad_norm": 0.03162259981036186, "grad_norm_var": 6.846862916682576e-07, "learning_rate": 0.0005449673790581611, "loss": 2.5579, "step": 6824 }, { "crossentropy": 2.601653814315796, "epoch": 0.5805546104117046, "grad_norm": 0.03435820713639259, "grad_norm_var": 9.087915569294062e-07, "learning_rate": 0.0005440581364903591, "loss": 2.6017, "step": 6825 }, { "crossentropy": 2.6371145248413086, "epoch": 0.5806396733582851, "grad_norm": 0.03330526873469353, "grad_norm_var": 9.159691044014015e-07, "learning_rate": 0.0005431496094184252, "loss": 2.6371, "step": 6826 }, { "crossentropy": 2.6765024662017822, "epoch": 0.5807247363048657, "grad_norm": 0.03546616807579994, "grad_norm_var": 1.3706040647540647e-06, "learning_rate": 0.0005422417979882416, "loss": 2.6765, "step": 6827 }, { "crossentropy": 2.6775808334350586, "epoch": 0.580809799251446, "grad_norm": 0.034388065338134766, "grad_norm_var": 1.5315972285540828e-06, "learning_rate": 0.0005413347023455789, "loss": 2.6776, "step": 6828 }, { "crossentropy": 2.6129419803619385, "epoch": 0.5808948621980266, "grad_norm": 0.03158833459019661, "grad_norm_var": 1.454162345322626e-06, "learning_rate": 0.0005404283226360901, "loss": 2.6129, "step": 6829 }, { "crossentropy": 2.6064255237579346, "epoch": 0.5809799251446071, "grad_norm": 0.03262072801589966, "grad_norm_var": 1.403641657947184e-06, "learning_rate": 0.0005395226590053126, "loss": 2.6064, "step": 6830 }, { "crossentropy": 2.709146499633789, "epoch": 0.5810649880911875, "grad_norm": 0.031629953533411026, "grad_norm_var": 1.4737652905200444e-06, "learning_rate": 0.0005386177115986701, "loss": 2.7091, "step": 6831 }, { "crossentropy": 2.665379762649536, "epoch": 0.581150051037768, "grad_norm": 0.03658616170287132, "grad_norm_var": 2.424297585206407e-06, "learning_rate": 0.0005377134805614714, "loss": 2.6654, "step": 6832 }, { "crossentropy": 2.5202038288116455, "epoch": 0.5812351139843485, "grad_norm": 0.032615870237350464, "grad_norm_var": 2.4277077657813694e-06, "learning_rate": 0.0005368099660389092, "loss": 2.5202, "step": 6833 }, { "crossentropy": 2.5567970275878906, "epoch": 0.5813201769309289, "grad_norm": 0.03348880261182785, "grad_norm_var": 2.2950124076506618e-06, "learning_rate": 0.0005359071681760635, "loss": 2.5568, "step": 6834 }, { "crossentropy": 2.7079954147338867, "epoch": 0.5814052398775094, "grad_norm": 0.034694015979766846, "grad_norm_var": 2.4433260062292944e-06, "learning_rate": 0.0005350050871178963, "loss": 2.708, "step": 6835 }, { "crossentropy": 2.6447041034698486, "epoch": 0.5814903028240899, "grad_norm": 0.03286995366215706, "grad_norm_var": 2.385686866252983e-06, "learning_rate": 0.0005341037230092555, "loss": 2.6447, "step": 6836 }, { "crossentropy": 2.64359974861145, "epoch": 0.5815753657706703, "grad_norm": 0.032049473375082016, "grad_norm_var": 2.457708704898841e-06, "learning_rate": 0.000533203075994877, "loss": 2.6436, "step": 6837 }, { "crossentropy": 2.610365390777588, "epoch": 0.5816604287172508, "grad_norm": 0.03275136277079582, "grad_norm_var": 2.3986973180530376e-06, "learning_rate": 0.0005323031462193756, "loss": 2.6104, "step": 6838 }, { "crossentropy": 2.596266508102417, "epoch": 0.5817454916638313, "grad_norm": 0.03058365173637867, "grad_norm_var": 2.5577773768204463e-06, "learning_rate": 0.0005314039338272542, "loss": 2.5963, "step": 6839 }, { "crossentropy": 2.6916568279266357, "epoch": 0.5818305546104117, "grad_norm": 0.03247904032468796, "grad_norm_var": 2.4276432640957796e-06, "learning_rate": 0.0005305054389629022, "loss": 2.6917, "step": 6840 }, { "crossentropy": 2.6359996795654297, "epoch": 0.5819156175569922, "grad_norm": 0.03297913074493408, "grad_norm_var": 2.336702560453785e-06, "learning_rate": 0.0005296076617705914, "loss": 2.636, "step": 6841 }, { "crossentropy": 2.623431444168091, "epoch": 0.5820006805035727, "grad_norm": 0.031700726598501205, "grad_norm_var": 2.460329158502023e-06, "learning_rate": 0.0005287106023944782, "loss": 2.6234, "step": 6842 }, { "crossentropy": 2.692101001739502, "epoch": 0.5820857434501531, "grad_norm": 0.03322282433509827, "grad_norm_var": 2.0463916364429294e-06, "learning_rate": 0.0005278142609786047, "loss": 2.6921, "step": 6843 }, { "crossentropy": 2.6189815998077393, "epoch": 0.5821708063967336, "grad_norm": 0.03505770117044449, "grad_norm_var": 2.20812665066906e-06, "learning_rate": 0.0005269186376668995, "loss": 2.619, "step": 6844 }, { "crossentropy": 2.6165614128112793, "epoch": 0.5822558693433141, "grad_norm": 0.03153402730822563, "grad_norm_var": 2.2180430166965677e-06, "learning_rate": 0.0005260237326031697, "loss": 2.6166, "step": 6845 }, { "crossentropy": 2.7182984352111816, "epoch": 0.5823409322898945, "grad_norm": 0.034980595111846924, "grad_norm_var": 2.4691176814368855e-06, "learning_rate": 0.000525129545931115, "loss": 2.7183, "step": 6846 }, { "crossentropy": 2.59663987159729, "epoch": 0.582425995236475, "grad_norm": 0.034919559955596924, "grad_norm_var": 2.5110057065607366e-06, "learning_rate": 0.000524236077794315, "loss": 2.5966, "step": 6847 }, { "crossentropy": 2.623473882675171, "epoch": 0.5825110581830555, "grad_norm": 0.033097874373197556, "grad_norm_var": 1.734759026814876e-06, "learning_rate": 0.0005233433283362348, "loss": 2.6235, "step": 6848 }, { "crossentropy": 2.6442246437072754, "epoch": 0.5825961211296359, "grad_norm": 0.031818512827157974, "grad_norm_var": 1.822141871085633e-06, "learning_rate": 0.0005224512977002227, "loss": 2.6442, "step": 6849 }, { "crossentropy": 2.7137980461120605, "epoch": 0.5826811840762164, "grad_norm": 0.03314958140254021, "grad_norm_var": 1.8078679236317586e-06, "learning_rate": 0.0005215599860295162, "loss": 2.7138, "step": 6850 }, { "crossentropy": 2.577685832977295, "epoch": 0.5827662470227969, "grad_norm": 0.033357467502355576, "grad_norm_var": 1.616383848595373e-06, "learning_rate": 0.0005206693934672329, "loss": 2.5777, "step": 6851 }, { "crossentropy": 2.618622303009033, "epoch": 0.5828513099693773, "grad_norm": 0.04976575821638107, "grad_norm_var": 1.9369130874905674e-05, "learning_rate": 0.0005197795201563743, "loss": 2.6186, "step": 6852 }, { "crossentropy": 2.5585923194885254, "epoch": 0.5829363729159578, "grad_norm": 0.03240837901830673, "grad_norm_var": 1.928549413151314e-05, "learning_rate": 0.0005188903662398315, "loss": 2.5586, "step": 6853 }, { "crossentropy": 2.599785804748535, "epoch": 0.5830214358625383, "grad_norm": 0.033516936004161835, "grad_norm_var": 1.9195905540777046e-05, "learning_rate": 0.0005180019318603751, "loss": 2.5998, "step": 6854 }, { "crossentropy": 2.6007282733917236, "epoch": 0.5831064988091188, "grad_norm": 0.03258700296282768, "grad_norm_var": 1.852464590662983e-05, "learning_rate": 0.0005171142171606619, "loss": 2.6007, "step": 6855 }, { "crossentropy": 2.622011661529541, "epoch": 0.5831915617556992, "grad_norm": 0.032609157264232635, "grad_norm_var": 1.8496524821548915e-05, "learning_rate": 0.0005162272222832349, "loss": 2.622, "step": 6856 }, { "crossentropy": 2.5329456329345703, "epoch": 0.5832766247022797, "grad_norm": 0.035525646060705185, "grad_norm_var": 1.849779217959757e-05, "learning_rate": 0.0005153409473705195, "loss": 2.5329, "step": 6857 }, { "crossentropy": 2.5745341777801514, "epoch": 0.5833616876488602, "grad_norm": 0.03247355297207832, "grad_norm_var": 1.826437332574903e-05, "learning_rate": 0.0005144553925648249, "loss": 2.5745, "step": 6858 }, { "crossentropy": 2.6182734966278076, "epoch": 0.5834467505954406, "grad_norm": 0.03356323018670082, "grad_norm_var": 1.821925155645465e-05, "learning_rate": 0.0005135705580083461, "loss": 2.6183, "step": 6859 }, { "crossentropy": 2.615828275680542, "epoch": 0.5835318135420211, "grad_norm": 0.032440949231386185, "grad_norm_var": 1.8416977762969234e-05, "learning_rate": 0.0005126864438431628, "loss": 2.6158, "step": 6860 }, { "crossentropy": 2.6774990558624268, "epoch": 0.5836168764886016, "grad_norm": 0.033653583377599716, "grad_norm_var": 1.7934652944828534e-05, "learning_rate": 0.0005118030502112359, "loss": 2.6775, "step": 6861 }, { "crossentropy": 2.623650550842285, "epoch": 0.583701939435182, "grad_norm": 0.03416742756962776, "grad_norm_var": 1.7909424565790088e-05, "learning_rate": 0.0005109203772544163, "loss": 2.6237, "step": 6862 }, { "crossentropy": 2.685408115386963, "epoch": 0.5837870023817625, "grad_norm": 0.03309684619307518, "grad_norm_var": 1.797036415783007e-05, "learning_rate": 0.0005100384251144335, "loss": 2.6854, "step": 6863 }, { "crossentropy": 2.6886162757873535, "epoch": 0.583872065328343, "grad_norm": 0.03248762711882591, "grad_norm_var": 1.8083477400141628e-05, "learning_rate": 0.0005091571939329048, "loss": 2.6886, "step": 6864 }, { "crossentropy": 2.6413393020629883, "epoch": 0.5839571282749234, "grad_norm": 0.03471406549215317, "grad_norm_var": 1.7702017106595706e-05, "learning_rate": 0.0005082766838513292, "loss": 2.6413, "step": 6865 }, { "crossentropy": 2.7461166381835938, "epoch": 0.5840421912215039, "grad_norm": 0.034293029457330704, "grad_norm_var": 1.7601507567855238e-05, "learning_rate": 0.0005073968950110941, "loss": 2.7461, "step": 6866 }, { "crossentropy": 2.621523857116699, "epoch": 0.5841272541680844, "grad_norm": 0.03221310302615166, "grad_norm_var": 1.7844913054284168e-05, "learning_rate": 0.0005065178275534648, "loss": 2.6215, "step": 6867 }, { "crossentropy": 2.6532695293426514, "epoch": 0.5842123171146648, "grad_norm": 0.033230558037757874, "grad_norm_var": 9.346620203955099e-07, "learning_rate": 0.0005056394816195952, "loss": 2.6533, "step": 6868 }, { "crossentropy": 2.6497421264648438, "epoch": 0.5842973800612453, "grad_norm": 0.03192188963294029, "grad_norm_var": 1.008023403770153e-06, "learning_rate": 0.0005047618573505236, "loss": 2.6497, "step": 6869 }, { "crossentropy": 2.6375248432159424, "epoch": 0.5843824430078258, "grad_norm": 0.034162238240242004, "grad_norm_var": 1.0543568477607013e-06, "learning_rate": 0.0005038849548871704, "loss": 2.6375, "step": 6870 }, { "crossentropy": 2.61751127243042, "epoch": 0.5844675059544062, "grad_norm": 0.03310999646782875, "grad_norm_var": 1.0202515342070764e-06, "learning_rate": 0.0005030087743703399, "loss": 2.6175, "step": 6871 }, { "crossentropy": 2.6026806831359863, "epoch": 0.5845525689009867, "grad_norm": 0.03231773152947426, "grad_norm_var": 1.0544991024059752e-06, "learning_rate": 0.0005021333159407232, "loss": 2.6027, "step": 6872 }, { "crossentropy": 2.713364362716675, "epoch": 0.5846376318475672, "grad_norm": 0.033378906548023224, "grad_norm_var": 7.15702174654507e-07, "learning_rate": 0.0005012585797388936, "loss": 2.7134, "step": 6873 }, { "crossentropy": 2.5621514320373535, "epoch": 0.5847226947941476, "grad_norm": 0.031971100717782974, "grad_norm_var": 7.802517109465242e-07, "learning_rate": 0.0005003845659053063, "loss": 2.5622, "step": 6874 }, { "crossentropy": 2.627037525177002, "epoch": 0.5848077577407281, "grad_norm": 0.03399078920483589, "grad_norm_var": 8.140862118808286e-07, "learning_rate": 0.0004995112745803049, "loss": 2.627, "step": 6875 }, { "crossentropy": 2.6012449264526367, "epoch": 0.5848928206873086, "grad_norm": 0.03407953679561615, "grad_norm_var": 8.167455382993363e-07, "learning_rate": 0.0004986387059041142, "loss": 2.6012, "step": 6876 }, { "crossentropy": 2.660292863845825, "epoch": 0.584977883633889, "grad_norm": 0.03431643173098564, "grad_norm_var": 8.755195686988362e-07, "learning_rate": 0.0004977668600168428, "loss": 2.6603, "step": 6877 }, { "crossentropy": 2.5906214714050293, "epoch": 0.5850629465804695, "grad_norm": 0.033586565405130386, "grad_norm_var": 8.32578863088571e-07, "learning_rate": 0.0004968957370584859, "loss": 2.5906, "step": 6878 }, { "crossentropy": 2.5956945419311523, "epoch": 0.58514800952705, "grad_norm": 0.03365504741668701, "grad_norm_var": 8.366055065431773e-07, "learning_rate": 0.0004960253371689194, "loss": 2.5957, "step": 6879 }, { "crossentropy": 2.6644375324249268, "epoch": 0.5852330724736304, "grad_norm": 0.03201477229595184, "grad_norm_var": 9.042749488735157e-07, "learning_rate": 0.0004951556604879049, "loss": 2.6644, "step": 6880 }, { "crossentropy": 2.6991350650787354, "epoch": 0.5853181354202109, "grad_norm": 0.03184469789266586, "grad_norm_var": 8.815823428940175e-07, "learning_rate": 0.0004942867071550866, "loss": 2.6991, "step": 6881 }, { "crossentropy": 2.666810989379883, "epoch": 0.5854031983667914, "grad_norm": 0.0341641865670681, "grad_norm_var": 8.626469955839873e-07, "learning_rate": 0.0004934184773099942, "loss": 2.6668, "step": 6882 }, { "crossentropy": 2.6599738597869873, "epoch": 0.585488261313372, "grad_norm": 0.036172084510326385, "grad_norm_var": 1.3622855703387948e-06, "learning_rate": 0.0004925509710920395, "loss": 2.66, "step": 6883 }, { "crossentropy": 2.654615879058838, "epoch": 0.5855733242599523, "grad_norm": 0.034441206604242325, "grad_norm_var": 1.4314162251389054e-06, "learning_rate": 0.0004916841886405205, "loss": 2.6546, "step": 6884 }, { "crossentropy": 2.5752909183502197, "epoch": 0.5856583872065328, "grad_norm": 0.03298867121338844, "grad_norm_var": 1.2858353380060598e-06, "learning_rate": 0.0004908181300946162, "loss": 2.5753, "step": 6885 }, { "crossentropy": 2.567923069000244, "epoch": 0.5857434501531134, "grad_norm": 0.031105851754546165, "grad_norm_var": 1.60474506984419e-06, "learning_rate": 0.0004899527955933919, "loss": 2.5679, "step": 6886 }, { "crossentropy": 2.6417741775512695, "epoch": 0.5858285130996937, "grad_norm": 0.033740270882844925, "grad_norm_var": 1.6118326366505586e-06, "learning_rate": 0.0004890881852757939, "loss": 2.6418, "step": 6887 }, { "crossentropy": 2.623455047607422, "epoch": 0.5859135760462743, "grad_norm": 0.03181583434343338, "grad_norm_var": 1.6973574876437964e-06, "learning_rate": 0.0004882242992806546, "loss": 2.6235, "step": 6888 }, { "crossentropy": 2.569023370742798, "epoch": 0.5859986389928548, "grad_norm": 0.03246571496129036, "grad_norm_var": 1.7434157208867507e-06, "learning_rate": 0.0004873611377466891, "loss": 2.569, "step": 6889 }, { "crossentropy": 2.6804416179656982, "epoch": 0.5860837019394352, "grad_norm": 0.03302730619907379, "grad_norm_var": 1.6299298772076092e-06, "learning_rate": 0.0004864987008124949, "loss": 2.6804, "step": 6890 }, { "crossentropy": 2.640711784362793, "epoch": 0.5861687648860157, "grad_norm": 0.030847258865833282, "grad_norm_var": 1.9739583101701137e-06, "learning_rate": 0.0004856369886165568, "loss": 2.6407, "step": 6891 }, { "crossentropy": 2.727637529373169, "epoch": 0.5862538278325962, "grad_norm": 0.0337081179022789, "grad_norm_var": 1.9361308137775714e-06, "learning_rate": 0.0004847760012972402, "loss": 2.7276, "step": 6892 }, { "crossentropy": 2.663999319076538, "epoch": 0.5863388907791766, "grad_norm": 0.033637259155511856, "grad_norm_var": 1.8564689937073612e-06, "learning_rate": 0.0004839157389927934, "loss": 2.664, "step": 6893 }, { "crossentropy": 2.5431413650512695, "epoch": 0.5864239537257571, "grad_norm": 0.03349200636148453, "grad_norm_var": 1.8505897791133794e-06, "learning_rate": 0.00048305620184135314, "loss": 2.5431, "step": 6894 }, { "crossentropy": 2.542537212371826, "epoch": 0.5865090166723376, "grad_norm": 0.03223107010126114, "grad_norm_var": 1.8662459127916405e-06, "learning_rate": 0.0004821973899809329, "loss": 2.5425, "step": 6895 }, { "crossentropy": 2.629276752471924, "epoch": 0.586594079618918, "grad_norm": 0.03261706978082657, "grad_norm_var": 1.8113227979472119e-06, "learning_rate": 0.0004813393035494329, "loss": 2.6293, "step": 6896 }, { "crossentropy": 2.669830322265625, "epoch": 0.5866791425654985, "grad_norm": 0.03385253995656967, "grad_norm_var": 1.7490023027957504e-06, "learning_rate": 0.0004804819426846402, "loss": 2.6698, "step": 6897 }, { "crossentropy": 2.5568387508392334, "epoch": 0.586764205512079, "grad_norm": 0.032506246119737625, "grad_norm_var": 1.695312905622863e-06, "learning_rate": 0.0004796253075242202, "loss": 2.5568, "step": 6898 }, { "crossentropy": 2.6724205017089844, "epoch": 0.5868492684586594, "grad_norm": 0.030670996755361557, "grad_norm_var": 1.2897596379337112e-06, "learning_rate": 0.00047876939820572283, "loss": 2.6724, "step": 6899 }, { "crossentropy": 2.5670244693756104, "epoch": 0.5869343314052399, "grad_norm": 0.030229797586798668, "grad_norm_var": 1.4186878160936273e-06, "learning_rate": 0.0004779142148665855, "loss": 2.567, "step": 6900 }, { "crossentropy": 2.6025428771972656, "epoch": 0.5870193943518204, "grad_norm": 0.034288935363292694, "grad_norm_var": 1.6206048455455016e-06, "learning_rate": 0.0004770597576441238, "loss": 2.6025, "step": 6901 }, { "crossentropy": 2.4436731338500977, "epoch": 0.5871044572984008, "grad_norm": 0.03188259154558182, "grad_norm_var": 1.5123978752280456e-06, "learning_rate": 0.0004762060266755397, "loss": 2.4437, "step": 6902 }, { "crossentropy": 2.534590005874634, "epoch": 0.5871895202449813, "grad_norm": 0.031806014478206635, "grad_norm_var": 1.442693799142538e-06, "learning_rate": 0.00047535302209791744, "loss": 2.5346, "step": 6903 }, { "crossentropy": 2.597201347351074, "epoch": 0.5872745831915618, "grad_norm": 0.03688865900039673, "grad_norm_var": 2.6272311162370826e-06, "learning_rate": 0.0004745007440482252, "loss": 2.5972, "step": 6904 }, { "crossentropy": 2.6362268924713135, "epoch": 0.5873596461381422, "grad_norm": 0.032849181443452835, "grad_norm_var": 2.621401958720681e-06, "learning_rate": 0.0004736491926633141, "loss": 2.6362, "step": 6905 }, { "crossentropy": 2.5446033477783203, "epoch": 0.5874447090847227, "grad_norm": 0.032497845590114594, "grad_norm_var": 2.621706866250808e-06, "learning_rate": 0.0004727983680799181, "loss": 2.5446, "step": 6906 }, { "crossentropy": 2.5887885093688965, "epoch": 0.5875297720313032, "grad_norm": 0.03344191610813141, "grad_norm_var": 2.384089910937775e-06, "learning_rate": 0.00047194827043465674, "loss": 2.5888, "step": 6907 }, { "crossentropy": 2.614192008972168, "epoch": 0.5876148349778836, "grad_norm": 0.03312693163752556, "grad_norm_var": 2.3435485743254393e-06, "learning_rate": 0.0004710988998640298, "loss": 2.6142, "step": 6908 }, { "crossentropy": 2.6795194149017334, "epoch": 0.5876998979244641, "grad_norm": 0.03399132937192917, "grad_norm_var": 2.387313459207641e-06, "learning_rate": 0.00047025025650442254, "loss": 2.6795, "step": 6909 }, { "crossentropy": 2.520613670349121, "epoch": 0.5877849608710446, "grad_norm": 0.0312977209687233, "grad_norm_var": 2.5145485466236076e-06, "learning_rate": 0.0004694023404921027, "loss": 2.5206, "step": 6910 }, { "crossentropy": 2.540520429611206, "epoch": 0.587870023817625, "grad_norm": 0.03236718475818634, "grad_norm_var": 2.506085771796747e-06, "learning_rate": 0.0004685551519632203, "loss": 2.5405, "step": 6911 }, { "crossentropy": 2.71378493309021, "epoch": 0.5879550867642055, "grad_norm": 0.03367835283279419, "grad_norm_var": 2.5548851488828614e-06, "learning_rate": 0.0004677086910538092, "loss": 2.7138, "step": 6912 }, { "crossentropy": 2.6914100646972656, "epoch": 0.588040149710786, "grad_norm": 0.033094216138124466, "grad_norm_var": 2.4880454312433868e-06, "learning_rate": 0.00046686295789978794, "loss": 2.6914, "step": 6913 }, { "crossentropy": 2.5968289375305176, "epoch": 0.5881252126573665, "grad_norm": 0.034243110567331314, "grad_norm_var": 2.611196380300597e-06, "learning_rate": 0.0004660179526369568, "loss": 2.5968, "step": 6914 }, { "crossentropy": 2.6307926177978516, "epoch": 0.5882102756039469, "grad_norm": 0.03271329030394554, "grad_norm_var": 2.265680571514429e-06, "learning_rate": 0.0004651736754009972, "loss": 2.6308, "step": 6915 }, { "crossentropy": 2.6558432579040527, "epoch": 0.5882953385505274, "grad_norm": 0.03411569818854332, "grad_norm_var": 1.7612886724064729e-06, "learning_rate": 0.0004643301263274796, "loss": 2.6558, "step": 6916 }, { "crossentropy": 2.686046838760376, "epoch": 0.5883804014971079, "grad_norm": 0.03323156014084816, "grad_norm_var": 1.6871871705139683e-06, "learning_rate": 0.00046348730555185015, "loss": 2.686, "step": 6917 }, { "crossentropy": 2.6793453693389893, "epoch": 0.5884654644436883, "grad_norm": 0.035756830126047134, "grad_norm_var": 1.9439410651464947e-06, "learning_rate": 0.0004626452132094422, "loss": 2.6793, "step": 6918 }, { "crossentropy": 2.636401414871216, "epoch": 0.5885505273902688, "grad_norm": 0.032985035330057144, "grad_norm_var": 1.7733666791543954e-06, "learning_rate": 0.0004618038494354726, "loss": 2.6364, "step": 6919 }, { "crossentropy": 2.6810803413391113, "epoch": 0.5886355903368493, "grad_norm": 0.03391526639461517, "grad_norm_var": 9.894010689875365e-07, "learning_rate": 0.00046096321436504, "loss": 2.6811, "step": 6920 }, { "crossentropy": 2.5799617767333984, "epoch": 0.5887206532834297, "grad_norm": 0.03276918828487396, "grad_norm_var": 9.9494627172995e-07, "learning_rate": 0.00046012330813312465, "loss": 2.58, "step": 6921 }, { "crossentropy": 2.577949285507202, "epoch": 0.5888057162300102, "grad_norm": 0.033210497349500656, "grad_norm_var": 9.479405984256946e-07, "learning_rate": 0.0004592841308745932, "loss": 2.5779, "step": 6922 }, { "crossentropy": 2.703178644180298, "epoch": 0.5888907791765907, "grad_norm": 0.033606331795454025, "grad_norm_var": 9.511818439121115e-07, "learning_rate": 0.00045844568272419394, "loss": 2.7032, "step": 6923 }, { "crossentropy": 2.605609655380249, "epoch": 0.5889758421231711, "grad_norm": 0.03322126716375351, "grad_norm_var": 9.485372091129369e-07, "learning_rate": 0.00045760796381655365, "loss": 2.6056, "step": 6924 }, { "crossentropy": 2.6954636573791504, "epoch": 0.5890609050697516, "grad_norm": 0.03649962693452835, "grad_norm_var": 1.543769221132712e-06, "learning_rate": 0.00045677097428618976, "loss": 2.6955, "step": 6925 }, { "crossentropy": 2.6476287841796875, "epoch": 0.5891459680163321, "grad_norm": 0.03285377472639084, "grad_norm_var": 1.2290412876378789e-06, "learning_rate": 0.00045593471426749743, "loss": 2.6476, "step": 6926 }, { "crossentropy": 2.68768048286438, "epoch": 0.5892310309629125, "grad_norm": 0.03358721733093262, "grad_norm_var": 1.1148052598825355e-06, "learning_rate": 0.0004550991838947555, "loss": 2.6877, "step": 6927 }, { "crossentropy": 2.6275901794433594, "epoch": 0.589316093909493, "grad_norm": 0.03353890776634216, "grad_norm_var": 1.1167498867355996e-06, "learning_rate": 0.0004542643833021254, "loss": 2.6276, "step": 6928 }, { "crossentropy": 2.534749984741211, "epoch": 0.5894011568560735, "grad_norm": 0.03140987455844879, "grad_norm_var": 1.4320996466789249e-06, "learning_rate": 0.0004534303126236533, "loss": 2.5347, "step": 6929 }, { "crossentropy": 2.6749672889709473, "epoch": 0.5894862198026539, "grad_norm": 0.033013466745615005, "grad_norm_var": 1.4217505190129709e-06, "learning_rate": 0.0004525969719932671, "loss": 2.675, "step": 6930 }, { "crossentropy": 2.587869882583618, "epoch": 0.5895712827492344, "grad_norm": 0.034670308232307434, "grad_norm_var": 1.4488624939365883e-06, "learning_rate": 0.000451764361544777, "loss": 2.5879, "step": 6931 }, { "crossentropy": 2.6180837154388428, "epoch": 0.5896563456958149, "grad_norm": 0.03358835726976395, "grad_norm_var": 1.4334322215460438e-06, "learning_rate": 0.0004509324814118754, "loss": 2.6181, "step": 6932 }, { "crossentropy": 2.6532890796661377, "epoch": 0.5897414086423953, "grad_norm": 0.036490023136138916, "grad_norm_var": 1.929965626062899e-06, "learning_rate": 0.0004501013317281394, "loss": 2.6533, "step": 6933 }, { "crossentropy": 2.6106364727020264, "epoch": 0.5898264715889758, "grad_norm": 0.03248538449406624, "grad_norm_var": 1.7539218676684264e-06, "learning_rate": 0.00044927091262702613, "loss": 2.6106, "step": 6934 }, { "crossentropy": 2.7706992626190186, "epoch": 0.5899115345355563, "grad_norm": 0.03474157303571701, "grad_norm_var": 1.7991537464417489e-06, "learning_rate": 0.0004484412242418789, "loss": 2.7707, "step": 6935 }, { "crossentropy": 2.543877363204956, "epoch": 0.5899965974821367, "grad_norm": 0.031484004110097885, "grad_norm_var": 2.1069367622624767e-06, "learning_rate": 0.0004476122667059207, "loss": 2.5439, "step": 6936 }, { "crossentropy": 2.684809446334839, "epoch": 0.5900816604287172, "grad_norm": 0.03249029442667961, "grad_norm_var": 2.141692734063579e-06, "learning_rate": 0.0004467840401522577, "loss": 2.6848, "step": 6937 }, { "crossentropy": 2.6447842121124268, "epoch": 0.5901667233752977, "grad_norm": 0.03297073021531105, "grad_norm_var": 2.1563209368911004e-06, "learning_rate": 0.0004459565447138814, "loss": 2.6448, "step": 6938 }, { "crossentropy": 2.6440086364746094, "epoch": 0.5902517863218781, "grad_norm": 0.0333385244011879, "grad_norm_var": 2.1584597999973454e-06, "learning_rate": 0.0004451297805236615, "loss": 2.644, "step": 6939 }, { "crossentropy": 2.5903069972991943, "epoch": 0.5903368492684586, "grad_norm": 0.03141877427697182, "grad_norm_var": 2.434267596837514e-06, "learning_rate": 0.00044430374771435246, "loss": 2.5903, "step": 6940 }, { "crossentropy": 2.6931958198547363, "epoch": 0.5904219122150391, "grad_norm": 0.03218982368707657, "grad_norm_var": 1.8204918181765248e-06, "learning_rate": 0.00044347844641859314, "loss": 2.6932, "step": 6941 }, { "crossentropy": 2.5427913665771484, "epoch": 0.5905069751616197, "grad_norm": 0.03619968146085739, "grad_norm_var": 2.3916285540446595e-06, "learning_rate": 0.000442653876768902, "loss": 2.5428, "step": 6942 }, { "crossentropy": 2.6796176433563232, "epoch": 0.5905920381082, "grad_norm": 0.03378159925341606, "grad_norm_var": 2.4001107276395867e-06, "learning_rate": 0.00044183003889768145, "loss": 2.6796, "step": 6943 }, { "crossentropy": 2.627262592315674, "epoch": 0.5906771010547806, "grad_norm": 0.034637611359357834, "grad_norm_var": 2.501296513845776e-06, "learning_rate": 0.0004410069329372152, "loss": 2.6273, "step": 6944 }, { "crossentropy": 2.6301522254943848, "epoch": 0.5907621640013611, "grad_norm": 0.03455556184053421, "grad_norm_var": 2.2716775133879706e-06, "learning_rate": 0.00044018455901967233, "loss": 2.6302, "step": 6945 }, { "crossentropy": 2.6801798343658447, "epoch": 0.5908472269479415, "grad_norm": 0.03507233038544655, "grad_norm_var": 2.367778873324511e-06, "learning_rate": 0.00043936291727709965, "loss": 2.6802, "step": 6946 }, { "crossentropy": 2.6020588874816895, "epoch": 0.590932289894522, "grad_norm": 0.03383587673306465, "grad_norm_var": 2.3097016490484025e-06, "learning_rate": 0.00043854200784143203, "loss": 2.6021, "step": 6947 }, { "crossentropy": 2.646007537841797, "epoch": 0.5910173528411025, "grad_norm": 0.03205055743455887, "grad_norm_var": 2.481421766063231e-06, "learning_rate": 0.00043772183084448194, "loss": 2.646, "step": 6948 }, { "crossentropy": 2.6622674465179443, "epoch": 0.5911024157876829, "grad_norm": 0.04274708777666092, "grad_norm_var": 7.332002806525496e-06, "learning_rate": 0.0004369023864179472, "loss": 2.6623, "step": 6949 }, { "crossentropy": 2.6338484287261963, "epoch": 0.5911874787342634, "grad_norm": 0.03304547443985939, "grad_norm_var": 7.238502378635083e-06, "learning_rate": 0.0004360836746934055, "loss": 2.6338, "step": 6950 }, { "crossentropy": 2.6578152179718018, "epoch": 0.5912725416808439, "grad_norm": 0.03371957689523697, "grad_norm_var": 7.2074959297289845e-06, "learning_rate": 0.0004352656958023199, "loss": 2.6578, "step": 6951 }, { "crossentropy": 2.6435325145721436, "epoch": 0.5913576046274243, "grad_norm": 0.03259310498833656, "grad_norm_var": 6.916586289551901e-06, "learning_rate": 0.00043444844987603425, "loss": 2.6435, "step": 6952 }, { "crossentropy": 2.543071746826172, "epoch": 0.5914426675740048, "grad_norm": 0.03516353294253349, "grad_norm_var": 6.810712458062082e-06, "learning_rate": 0.0004336319370457736, "loss": 2.5431, "step": 6953 }, { "crossentropy": 2.7246999740600586, "epoch": 0.5915277305205853, "grad_norm": 0.03284599259495735, "grad_norm_var": 6.832254329924789e-06, "learning_rate": 0.00043281615744264755, "loss": 2.7247, "step": 6954 }, { "crossentropy": 2.6086666584014893, "epoch": 0.5916127934671657, "grad_norm": 0.03200007230043411, "grad_norm_var": 7.097904846534106e-06, "learning_rate": 0.0004320011111976457, "loss": 2.6087, "step": 6955 }, { "crossentropy": 2.6611552238464355, "epoch": 0.5916978564137462, "grad_norm": 0.03297313302755356, "grad_norm_var": 6.689904076710024e-06, "learning_rate": 0.0004311867984416412, "loss": 2.6612, "step": 6956 }, { "crossentropy": 2.584460735321045, "epoch": 0.5917829193603267, "grad_norm": 0.03241609036922455, "grad_norm_var": 6.632061190414706e-06, "learning_rate": 0.00043037321930539, "loss": 2.5845, "step": 6957 }, { "crossentropy": 2.5846171379089355, "epoch": 0.5918679823069071, "grad_norm": 0.0319112129509449, "grad_norm_var": 6.653714177271876e-06, "learning_rate": 0.0004295603739195292, "loss": 2.5846, "step": 6958 }, { "crossentropy": 2.6033270359039307, "epoch": 0.5919530452534876, "grad_norm": 0.03319011628627777, "grad_norm_var": 6.689594270903321e-06, "learning_rate": 0.00042874826241457776, "loss": 2.6033, "step": 6959 }, { "crossentropy": 2.6575307846069336, "epoch": 0.5920381082000681, "grad_norm": 0.03173138573765755, "grad_norm_var": 6.940310546630496e-06, "learning_rate": 0.0004279368849209381, "loss": 2.6575, "step": 6960 }, { "crossentropy": 2.6374588012695312, "epoch": 0.5921231711466485, "grad_norm": 0.031716592609882355, "grad_norm_var": 7.1355934570284385e-06, "learning_rate": 0.0004271262415688937, "loss": 2.6375, "step": 6961 }, { "crossentropy": 2.5416316986083984, "epoch": 0.592208234093229, "grad_norm": 0.03125176951289177, "grad_norm_var": 7.279152873632122e-06, "learning_rate": 0.00042631633248860933, "loss": 2.5416, "step": 6962 }, { "crossentropy": 2.6988024711608887, "epoch": 0.5922932970398095, "grad_norm": 0.03249870613217354, "grad_norm_var": 7.299726673851148e-06, "learning_rate": 0.0004255071578101355, "loss": 2.6988, "step": 6963 }, { "crossentropy": 2.592036008834839, "epoch": 0.5923783599863899, "grad_norm": 0.03222723677754402, "grad_norm_var": 7.27363644708965e-06, "learning_rate": 0.000424698717663401, "loss": 2.592, "step": 6964 }, { "crossentropy": 2.573659658432007, "epoch": 0.5924634229329704, "grad_norm": 0.03241424635052681, "grad_norm_var": 8.650352582774412e-07, "learning_rate": 0.00042389101217821804, "loss": 2.5737, "step": 6965 }, { "crossentropy": 2.668945789337158, "epoch": 0.5925484858795509, "grad_norm": 0.033591751009225845, "grad_norm_var": 9.156861191718379e-07, "learning_rate": 0.00042308404148428124, "loss": 2.6689, "step": 6966 }, { "crossentropy": 2.566718816757202, "epoch": 0.5926335488261313, "grad_norm": 0.034394919872283936, "grad_norm_var": 1.0413774769790773e-06, "learning_rate": 0.0004222778057111665, "loss": 2.5667, "step": 6967 }, { "crossentropy": 2.6337997913360596, "epoch": 0.5927186117727118, "grad_norm": 0.033600982278585434, "grad_norm_var": 1.0928539363184072e-06, "learning_rate": 0.00042147230498833064, "loss": 2.6338, "step": 6968 }, { "crossentropy": 2.5850095748901367, "epoch": 0.5928036747192923, "grad_norm": 0.03538071736693382, "grad_norm_var": 1.1658236850340306e-06, "learning_rate": 0.0004206675394451165, "loss": 2.585, "step": 6969 }, { "crossentropy": 2.568474292755127, "epoch": 0.5928887376658728, "grad_norm": 0.03128277137875557, "grad_norm_var": 1.3004327001665816e-06, "learning_rate": 0.00041986350921074554, "loss": 2.5685, "step": 6970 }, { "crossentropy": 2.686058282852173, "epoch": 0.5929738006124532, "grad_norm": 0.033561836928129196, "grad_norm_var": 1.3151742947973994e-06, "learning_rate": 0.00041906021441432073, "loss": 2.6861, "step": 6971 }, { "crossentropy": 2.727443218231201, "epoch": 0.5930588635590337, "grad_norm": 0.031593192368745804, "grad_norm_var": 1.3947841648781027e-06, "learning_rate": 0.00041825765518482827, "loss": 2.7274, "step": 6972 }, { "crossentropy": 2.6780145168304443, "epoch": 0.5931439265056142, "grad_norm": 0.034514497965574265, "grad_norm_var": 1.5981893698492506e-06, "learning_rate": 0.00041745583165113734, "loss": 2.678, "step": 6973 }, { "crossentropy": 2.545919895172119, "epoch": 0.5932289894521946, "grad_norm": 0.032784003764390945, "grad_norm_var": 1.5419191117461867e-06, "learning_rate": 0.00041665474394199766, "loss": 2.5459, "step": 6974 }, { "crossentropy": 2.557413101196289, "epoch": 0.5933140523987751, "grad_norm": 0.03232147544622421, "grad_norm_var": 1.550661055682058e-06, "learning_rate": 0.00041585439218603805, "loss": 2.5574, "step": 6975 }, { "crossentropy": 2.6532673835754395, "epoch": 0.5933991153453556, "grad_norm": 0.036840394139289856, "grad_norm_var": 2.451279156321512e-06, "learning_rate": 0.00041505477651177457, "loss": 2.6533, "step": 6976 }, { "crossentropy": 2.574068546295166, "epoch": 0.593484178291936, "grad_norm": 0.034082282334566116, "grad_norm_var": 2.357303330468609e-06, "learning_rate": 0.00041425589704760236, "loss": 2.5741, "step": 6977 }, { "crossentropy": 2.5766184329986572, "epoch": 0.5935692412385165, "grad_norm": 0.0329187735915184, "grad_norm_var": 2.082109562462353e-06, "learning_rate": 0.0004134577539217965, "loss": 2.5766, "step": 6978 }, { "crossentropy": 2.556928873062134, "epoch": 0.593654304185097, "grad_norm": 0.03087743930518627, "grad_norm_var": 2.4359239019346255e-06, "learning_rate": 0.00041266034726251846, "loss": 2.5569, "step": 6979 }, { "crossentropy": 2.542167901992798, "epoch": 0.5937393671316774, "grad_norm": 0.033877357840538025, "grad_norm_var": 2.3757656519723106e-06, "learning_rate": 0.0004118636771978074, "loss": 2.5422, "step": 6980 }, { "crossentropy": 2.709869623184204, "epoch": 0.5938244300782579, "grad_norm": 0.033367425203323364, "grad_norm_var": 2.310156294764428e-06, "learning_rate": 0.0004110677438555854, "loss": 2.7099, "step": 6981 }, { "crossentropy": 2.7026238441467285, "epoch": 0.5939094930248384, "grad_norm": 0.032826051115989685, "grad_norm_var": 2.3309868658521417e-06, "learning_rate": 0.00041027254736365705, "loss": 2.7026, "step": 6982 }, { "crossentropy": 2.6471877098083496, "epoch": 0.5939945559714188, "grad_norm": 0.03234490007162094, "grad_norm_var": 2.3186961677170893e-06, "learning_rate": 0.0004094780878497073, "loss": 2.6472, "step": 6983 }, { "crossentropy": 2.5904955863952637, "epoch": 0.5940796189179993, "grad_norm": 0.03163725882768631, "grad_norm_var": 2.4706607156155564e-06, "learning_rate": 0.00040868436544130307, "loss": 2.5905, "step": 6984 }, { "crossentropy": 2.6629438400268555, "epoch": 0.5941646818645798, "grad_norm": 0.03401106595993042, "grad_norm_var": 2.1783689482885907e-06, "learning_rate": 0.0004078913802658946, "loss": 2.6629, "step": 6985 }, { "crossentropy": 2.637996196746826, "epoch": 0.5942497448111602, "grad_norm": 0.032522983849048615, "grad_norm_var": 1.981849110877518e-06, "learning_rate": 0.0004070991324508122, "loss": 2.638, "step": 6986 }, { "crossentropy": 2.583071708679199, "epoch": 0.5943348077577407, "grad_norm": 0.0339861623942852, "grad_norm_var": 2.0175309686543218e-06, "learning_rate": 0.00040630762212326757, "loss": 2.5831, "step": 6987 }, { "crossentropy": 2.558967113494873, "epoch": 0.5944198707043212, "grad_norm": 0.034890126436948776, "grad_norm_var": 2.0096408348819433e-06, "learning_rate": 0.00040551684941035447, "loss": 2.559, "step": 6988 }, { "crossentropy": 2.595078229904175, "epoch": 0.5945049336509016, "grad_norm": 0.03434351831674576, "grad_norm_var": 1.9852086668487816e-06, "learning_rate": 0.0004047268144390481, "loss": 2.5951, "step": 6989 }, { "crossentropy": 2.6555495262145996, "epoch": 0.5945899965974821, "grad_norm": 0.0323786661028862, "grad_norm_var": 2.0261720599402347e-06, "learning_rate": 0.00040393751733620533, "loss": 2.6555, "step": 6990 }, { "crossentropy": 2.7513339519500732, "epoch": 0.5946750595440626, "grad_norm": 0.03282148763537407, "grad_norm_var": 1.9747867140435057e-06, "learning_rate": 0.0004031489582285652, "loss": 2.7513, "step": 6991 }, { "crossentropy": 2.5735678672790527, "epoch": 0.594760122490643, "grad_norm": 0.030848095193505287, "grad_norm_var": 1.4365698575218485e-06, "learning_rate": 0.0004023611372427471, "loss": 2.5736, "step": 6992 }, { "crossentropy": 2.735344171524048, "epoch": 0.5948451854372235, "grad_norm": 0.032718725502491, "grad_norm_var": 1.3529810178186085e-06, "learning_rate": 0.00040157405450525266, "loss": 2.7353, "step": 6993 }, { "crossentropy": 2.5774450302124023, "epoch": 0.594930248383804, "grad_norm": 0.03165271505713463, "grad_norm_var": 1.449677281726893e-06, "learning_rate": 0.0004007877101424634, "loss": 2.5774, "step": 6994 }, { "crossentropy": 2.6687169075012207, "epoch": 0.5950153113303844, "grad_norm": 0.03244849294424057, "grad_norm_var": 1.1972345224916935e-06, "learning_rate": 0.0004000021042806462, "loss": 2.6687, "step": 6995 }, { "crossentropy": 2.5657551288604736, "epoch": 0.5951003742769649, "grad_norm": 0.03236130252480507, "grad_norm_var": 1.1467969018321417e-06, "learning_rate": 0.0003992172370459446, "loss": 2.5658, "step": 6996 }, { "crossentropy": 2.6720223426818848, "epoch": 0.5951854372235454, "grad_norm": 0.03266531229019165, "grad_norm_var": 1.1265878731136209e-06, "learning_rate": 0.000398433108564385, "loss": 2.672, "step": 6997 }, { "crossentropy": 2.624267578125, "epoch": 0.5952705001701258, "grad_norm": 0.03305068239569664, "grad_norm_var": 1.1311641518282756e-06, "learning_rate": 0.00039764971896187774, "loss": 2.6243, "step": 6998 }, { "crossentropy": 2.6589415073394775, "epoch": 0.5953555631167063, "grad_norm": 0.03504020348191261, "grad_norm_var": 1.4243161593594273e-06, "learning_rate": 0.00039686706836421226, "loss": 2.6589, "step": 6999 }, { "crossentropy": 2.710373878479004, "epoch": 0.5954406260632868, "grad_norm": 0.033191509544849396, "grad_norm_var": 1.3009633453184607e-06, "learning_rate": 0.0003960851568970586, "loss": 2.7104, "step": 7000 }, { "crossentropy": 2.5577433109283447, "epoch": 0.5955256890098674, "grad_norm": 0.03469368815422058, "grad_norm_var": 1.4168138439502361e-06, "learning_rate": 0.00039530398468597073, "loss": 2.5577, "step": 7001 }, { "crossentropy": 2.6580960750579834, "epoch": 0.5956107519564477, "grad_norm": 0.031966615468263626, "grad_norm_var": 1.4790283151982676e-06, "learning_rate": 0.00039452355185638224, "loss": 2.6581, "step": 7002 }, { "crossentropy": 2.606799364089966, "epoch": 0.5956958149030283, "grad_norm": 0.031821489334106445, "grad_norm_var": 1.5063348195794157e-06, "learning_rate": 0.0003937438585336062, "loss": 2.6068, "step": 7003 }, { "crossentropy": 2.622116804122925, "epoch": 0.5957808778496088, "grad_norm": 0.03343956172466278, "grad_norm_var": 1.2588907547084512e-06, "learning_rate": 0.0003929649048428408, "loss": 2.6221, "step": 7004 }, { "crossentropy": 2.5646629333496094, "epoch": 0.5958659407961892, "grad_norm": 0.031937070190906525, "grad_norm_var": 1.138450704125229e-06, "learning_rate": 0.0003921866909091626, "loss": 2.5647, "step": 7005 }, { "crossentropy": 2.6795172691345215, "epoch": 0.5959510037427697, "grad_norm": 0.034071385860443115, "grad_norm_var": 1.2473269789456949e-06, "learning_rate": 0.00039140921685753064, "loss": 2.6795, "step": 7006 }, { "crossentropy": 2.519097089767456, "epoch": 0.5960360666893502, "grad_norm": 0.03322070464491844, "grad_norm_var": 1.2586700397421245e-06, "learning_rate": 0.00039063248281278554, "loss": 2.5191, "step": 7007 }, { "crossentropy": 2.623685359954834, "epoch": 0.5961211296359306, "grad_norm": 0.032251935452222824, "grad_norm_var": 1.0126560508480834e-06, "learning_rate": 0.00038985648889964755, "loss": 2.6237, "step": 7008 }, { "crossentropy": 2.6200385093688965, "epoch": 0.5962061925825111, "grad_norm": 0.03322456404566765, "grad_norm_var": 1.0158681363151938e-06, "learning_rate": 0.00038908123524271924, "loss": 2.62, "step": 7009 }, { "crossentropy": 2.5979573726654053, "epoch": 0.5962912555290916, "grad_norm": 0.0309264175593853, "grad_norm_var": 1.1734808876349514e-06, "learning_rate": 0.00038830672196648377, "loss": 2.598, "step": 7010 }, { "crossentropy": 2.635637044906616, "epoch": 0.596376318475672, "grad_norm": 0.03408122435212135, "grad_norm_var": 1.2430139799706729e-06, "learning_rate": 0.0003875329491953061, "loss": 2.6356, "step": 7011 }, { "crossentropy": 2.5875778198242188, "epoch": 0.5964613814222525, "grad_norm": 0.030689289793372154, "grad_norm_var": 1.5593437700878958e-06, "learning_rate": 0.0003867599170534319, "loss": 2.5876, "step": 7012 }, { "crossentropy": 2.538430690765381, "epoch": 0.596546444368833, "grad_norm": 0.031003456562757492, "grad_norm_var": 1.7821788969873126e-06, "learning_rate": 0.00038598762566498636, "loss": 2.5384, "step": 7013 }, { "crossentropy": 2.661959171295166, "epoch": 0.5966315073154134, "grad_norm": 0.03246821090579033, "grad_norm_var": 1.7829915243621943e-06, "learning_rate": 0.00038521607515397903, "loss": 2.662, "step": 7014 }, { "crossentropy": 2.6645379066467285, "epoch": 0.5967165702619939, "grad_norm": 0.030886853113770485, "grad_norm_var": 1.5938133070959407e-06, "learning_rate": 0.00038444526564429814, "loss": 2.6645, "step": 7015 }, { "crossentropy": 2.6760315895080566, "epoch": 0.5968016332085744, "grad_norm": 0.034623172134160995, "grad_norm_var": 1.8554215540440727e-06, "learning_rate": 0.000383675197259713, "loss": 2.676, "step": 7016 }, { "crossentropy": 2.551126480102539, "epoch": 0.5968866961551548, "grad_norm": 0.030668700113892555, "grad_norm_var": 1.7344719759707863e-06, "learning_rate": 0.00038290587012387633, "loss": 2.5511, "step": 7017 }, { "crossentropy": 2.624877691268921, "epoch": 0.5969717591017353, "grad_norm": 0.03435308858752251, "grad_norm_var": 1.9747847404708117e-06, "learning_rate": 0.00038213728436031714, "loss": 2.6249, "step": 7018 }, { "crossentropy": 2.662114143371582, "epoch": 0.5970568220483158, "grad_norm": 0.03347087278962135, "grad_norm_var": 2.000172612595533e-06, "learning_rate": 0.0003813694400924489, "loss": 2.6621, "step": 7019 }, { "crossentropy": 2.6644628047943115, "epoch": 0.5971418849948962, "grad_norm": 0.03139643371105194, "grad_norm_var": 2.0275331671564065e-06, "learning_rate": 0.0003806023374435663, "loss": 2.6645, "step": 7020 }, { "crossentropy": 2.514679431915283, "epoch": 0.5972269479414767, "grad_norm": 0.0312761515378952, "grad_norm_var": 2.10043880999552e-06, "learning_rate": 0.00037983597653684366, "loss": 2.5147, "step": 7021 }, { "crossentropy": 2.6363704204559326, "epoch": 0.5973120108880572, "grad_norm": 0.03216192126274109, "grad_norm_var": 1.906171025598891e-06, "learning_rate": 0.00037907035749533527, "loss": 2.6364, "step": 7022 }, { "crossentropy": 2.700808525085449, "epoch": 0.5973970738346376, "grad_norm": 0.03236103802919388, "grad_norm_var": 1.84613206580321e-06, "learning_rate": 0.00037830548044197944, "loss": 2.7008, "step": 7023 }, { "crossentropy": 2.6182026863098145, "epoch": 0.5974821367812181, "grad_norm": 0.0329020619392395, "grad_norm_var": 1.8735651602777425e-06, "learning_rate": 0.000377541345499593, "loss": 2.6182, "step": 7024 }, { "crossentropy": 2.5905637741088867, "epoch": 0.5975671997277986, "grad_norm": 0.033574629575014114, "grad_norm_var": 1.92527293051517e-06, "learning_rate": 0.00037677795279087155, "loss": 2.5906, "step": 7025 }, { "crossentropy": 2.6352832317352295, "epoch": 0.597652262674379, "grad_norm": 0.031332723796367645, "grad_norm_var": 1.86103068582874e-06, "learning_rate": 0.00037601530243839664, "loss": 2.6353, "step": 7026 }, { "crossentropy": 2.5737712383270264, "epoch": 0.5977373256209595, "grad_norm": 0.036110565066337585, "grad_norm_var": 2.5927740266671553e-06, "learning_rate": 0.0003752533945646275, "loss": 2.5738, "step": 7027 }, { "crossentropy": 2.63997220993042, "epoch": 0.59782238856754, "grad_norm": 0.032176848500967026, "grad_norm_var": 2.3808732484080495e-06, "learning_rate": 0.0003744922292919029, "loss": 2.64, "step": 7028 }, { "crossentropy": 2.7831101417541504, "epoch": 0.5979074515141205, "grad_norm": 0.036221977323293686, "grad_norm_var": 3.008290914947633e-06, "learning_rate": 0.0003737318067424461, "loss": 2.7831, "step": 7029 }, { "crossentropy": 2.6606857776641846, "epoch": 0.5979925144607009, "grad_norm": 0.03425434231758118, "grad_norm_var": 3.111024902487045e-06, "learning_rate": 0.00037297212703835846, "loss": 2.6607, "step": 7030 }, { "crossentropy": 2.6727609634399414, "epoch": 0.5980775774072814, "grad_norm": 0.03185871243476868, "grad_norm_var": 2.8980841671029235e-06, "learning_rate": 0.00037221319030162237, "loss": 2.6728, "step": 7031 }, { "crossentropy": 2.5748116970062256, "epoch": 0.5981626403538619, "grad_norm": 0.034150201827287674, "grad_norm_var": 2.812633261352439e-06, "learning_rate": 0.00037145499665410144, "loss": 2.5748, "step": 7032 }, { "crossentropy": 2.5928590297698975, "epoch": 0.5982477033004423, "grad_norm": 0.03063233569264412, "grad_norm_var": 2.824101327104864e-06, "learning_rate": 0.00037069754621753937, "loss": 2.5929, "step": 7033 }, { "crossentropy": 2.651864528656006, "epoch": 0.5983327662470228, "grad_norm": 0.03413734212517738, "grad_norm_var": 2.788507809181086e-06, "learning_rate": 0.00036994083911356116, "loss": 2.6519, "step": 7034 }, { "crossentropy": 2.7291340827941895, "epoch": 0.5984178291936033, "grad_norm": 0.033427122980356216, "grad_norm_var": 2.7858873111291997e-06, "learning_rate": 0.00036918487546367117, "loss": 2.7291, "step": 7035 }, { "crossentropy": 2.5576133728027344, "epoch": 0.5985028921401837, "grad_norm": 0.0334140881896019, "grad_norm_var": 2.60935834662749e-06, "learning_rate": 0.0003684296553892569, "loss": 2.5576, "step": 7036 }, { "crossentropy": 2.645155191421509, "epoch": 0.5985879550867642, "grad_norm": 0.032074254006147385, "grad_norm_var": 2.4524788728524927e-06, "learning_rate": 0.00036767517901158456, "loss": 2.6452, "step": 7037 }, { "crossentropy": 2.654707908630371, "epoch": 0.5986730180333447, "grad_norm": 0.033391255885362625, "grad_norm_var": 2.3809785666637288e-06, "learning_rate": 0.00036692144645180004, "loss": 2.6547, "step": 7038 }, { "crossentropy": 2.5979108810424805, "epoch": 0.5987580809799251, "grad_norm": 0.03687627613544464, "grad_norm_var": 3.1192724706737815e-06, "learning_rate": 0.0003661684578309338, "loss": 2.5979, "step": 7039 }, { "crossentropy": 2.5718045234680176, "epoch": 0.5988431439265056, "grad_norm": 0.03331269696354866, "grad_norm_var": 3.0952435322935862e-06, "learning_rate": 0.00036541621326989184, "loss": 2.5718, "step": 7040 }, { "crossentropy": 2.5974011421203613, "epoch": 0.5989282068730861, "grad_norm": 0.03372843191027641, "grad_norm_var": 3.0970407355166336e-06, "learning_rate": 0.0003646647128894626, "loss": 2.5974, "step": 7041 }, { "crossentropy": 2.589372158050537, "epoch": 0.5990132698196665, "grad_norm": 0.03594740480184555, "grad_norm_var": 3.0522211510894864e-06, "learning_rate": 0.0003639139568103172, "loss": 2.5894, "step": 7042 }, { "crossentropy": 2.520371198654175, "epoch": 0.599098332766247, "grad_norm": 0.03237692266702652, "grad_norm_var": 2.8016667734563164e-06, "learning_rate": 0.0003631639451530044, "loss": 2.5204, "step": 7043 }, { "crossentropy": 2.621048927307129, "epoch": 0.5991833957128275, "grad_norm": 0.031745824962854385, "grad_norm_var": 2.896432019029958e-06, "learning_rate": 0.00036241467803795394, "loss": 2.621, "step": 7044 }, { "crossentropy": 2.6048312187194824, "epoch": 0.5992684586594079, "grad_norm": 0.03188379481434822, "grad_norm_var": 2.5542188975112105e-06, "learning_rate": 0.0003616661555854778, "loss": 2.6048, "step": 7045 }, { "crossentropy": 2.7071070671081543, "epoch": 0.5993535216059884, "grad_norm": 0.03351476043462753, "grad_norm_var": 2.4968297763418303e-06, "learning_rate": 0.00036091837791576764, "loss": 2.7071, "step": 7046 }, { "crossentropy": 2.6584959030151367, "epoch": 0.5994385845525689, "grad_norm": 0.03224697709083557, "grad_norm_var": 2.4327012615693127e-06, "learning_rate": 0.00036017134514889195, "loss": 2.6585, "step": 7047 }, { "crossentropy": 2.6687123775482178, "epoch": 0.5995236474991493, "grad_norm": 0.0327775739133358, "grad_norm_var": 2.395539312280426e-06, "learning_rate": 0.0003594250574048058, "loss": 2.6687, "step": 7048 }, { "crossentropy": 2.695712089538574, "epoch": 0.5996087104457298, "grad_norm": 0.036575064063072205, "grad_norm_var": 2.5540502650721216e-06, "learning_rate": 0.00035867951480334, "loss": 2.6957, "step": 7049 }, { "crossentropy": 2.6410317420959473, "epoch": 0.5996937733923103, "grad_norm": 0.034029580652713776, "grad_norm_var": 2.5469025615352617e-06, "learning_rate": 0.00035793471746420726, "loss": 2.641, "step": 7050 }, { "crossentropy": 2.5440337657928467, "epoch": 0.5997788363388907, "grad_norm": 0.03277551010251045, "grad_norm_var": 2.5869504629498035e-06, "learning_rate": 0.00035719066550700065, "loss": 2.544, "step": 7051 }, { "crossentropy": 2.6033005714416504, "epoch": 0.5998638992854712, "grad_norm": 0.0331510454416275, "grad_norm_var": 2.595757629921212e-06, "learning_rate": 0.0003564473590511941, "loss": 2.6033, "step": 7052 }, { "crossentropy": 2.574568271636963, "epoch": 0.5999489622320517, "grad_norm": 0.032829541712999344, "grad_norm_var": 2.485267523148297e-06, "learning_rate": 0.000355704798216141, "loss": 2.5746, "step": 7053 }, { "crossentropy": 2.5819714069366455, "epoch": 0.6000340251786321, "grad_norm": 0.03193948045372963, "grad_norm_var": 2.652111400646325e-06, "learning_rate": 0.00035496298312107436, "loss": 2.582, "step": 7054 }, { "crossentropy": 2.57649827003479, "epoch": 0.6001190881252126, "grad_norm": 0.03251378983259201, "grad_norm_var": 1.867195241504055e-06, "learning_rate": 0.0003542219138851094, "loss": 2.5765, "step": 7055 }, { "crossentropy": 2.5594911575317383, "epoch": 0.6002041510717931, "grad_norm": 0.0324445441365242, "grad_norm_var": 1.9023293473746997e-06, "learning_rate": 0.00035348159062724037, "loss": 2.5595, "step": 7056 }, { "crossentropy": 2.616755962371826, "epoch": 0.6002892140183737, "grad_norm": 0.03194793686270714, "grad_norm_var": 1.964335818644726e-06, "learning_rate": 0.00035274201346634004, "loss": 2.6168, "step": 7057 }, { "crossentropy": 2.5929126739501953, "epoch": 0.600374276964954, "grad_norm": 0.03215496242046356, "grad_norm_var": 1.3949825426235255e-06, "learning_rate": 0.0003520031825211656, "loss": 2.5929, "step": 7058 }, { "crossentropy": 2.584994077682495, "epoch": 0.6004593399115346, "grad_norm": 0.03249013051390648, "grad_norm_var": 1.389296218099606e-06, "learning_rate": 0.00035126509791035156, "loss": 2.585, "step": 7059 }, { "crossentropy": 2.6130151748657227, "epoch": 0.6005444028581151, "grad_norm": 0.034108635038137436, "grad_norm_var": 1.4017749576951862e-06, "learning_rate": 0.000350527759752412, "loss": 2.613, "step": 7060 }, { "crossentropy": 2.5132014751434326, "epoch": 0.6006294658046955, "grad_norm": 0.031937822699546814, "grad_norm_var": 1.394194214484184e-06, "learning_rate": 0.00034979116816574285, "loss": 2.5132, "step": 7061 }, { "crossentropy": 2.666728973388672, "epoch": 0.600714528751276, "grad_norm": 0.03338084742426872, "grad_norm_var": 1.385496046656572e-06, "learning_rate": 0.0003490553232686194, "loss": 2.6667, "step": 7062 }, { "crossentropy": 2.5632681846618652, "epoch": 0.6007995916978565, "grad_norm": 0.032360807061195374, "grad_norm_var": 1.3755377416139562e-06, "learning_rate": 0.0003483202251791967, "loss": 2.5633, "step": 7063 }, { "crossentropy": 2.6207058429718018, "epoch": 0.6008846546444369, "grad_norm": 0.032584793865680695, "grad_norm_var": 1.3826415900978935e-06, "learning_rate": 0.00034758587401551076, "loss": 2.6207, "step": 7064 }, { "crossentropy": 2.517838954925537, "epoch": 0.6009697175910174, "grad_norm": 0.03217434138059616, "grad_norm_var": 4.668837428921264e-07, "learning_rate": 0.0003468522698954779, "loss": 2.5178, "step": 7065 }, { "crossentropy": 2.6613070964813232, "epoch": 0.6010547805375979, "grad_norm": 0.03272617235779762, "grad_norm_var": 3.37911941434658e-07, "learning_rate": 0.0003461194129368922, "loss": 2.6613, "step": 7066 }, { "crossentropy": 2.587172031402588, "epoch": 0.6011398434841783, "grad_norm": 0.03364388272166252, "grad_norm_var": 4.0593876880097116e-07, "learning_rate": 0.0003453873032574317, "loss": 2.5872, "step": 7067 }, { "crossentropy": 2.637857437133789, "epoch": 0.6012249064307588, "grad_norm": 0.032203879207372665, "grad_norm_var": 3.9864364406212945e-07, "learning_rate": 0.0003446559409746497, "loss": 2.6379, "step": 7068 }, { "crossentropy": 2.6905195713043213, "epoch": 0.6013099693773393, "grad_norm": 0.03193886950612068, "grad_norm_var": 4.1978925497439486e-07, "learning_rate": 0.0003439253262059822, "loss": 2.6905, "step": 7069 }, { "crossentropy": 2.641800880432129, "epoch": 0.6013950323239197, "grad_norm": 0.03110853210091591, "grad_norm_var": 5.288603727286619e-07, "learning_rate": 0.00034319545906874636, "loss": 2.6418, "step": 7070 }, { "crossentropy": 2.6356797218322754, "epoch": 0.6014800952705002, "grad_norm": 0.032369330525398254, "grad_norm_var": 5.295619087912368e-07, "learning_rate": 0.000342466339680137, "loss": 2.6357, "step": 7071 }, { "crossentropy": 2.600783586502075, "epoch": 0.6015651582170807, "grad_norm": 0.033202700316905975, "grad_norm_var": 5.625631187490427e-07, "learning_rate": 0.0003417379681572297, "loss": 2.6008, "step": 7072 }, { "crossentropy": 2.5982778072357178, "epoch": 0.6016502211636611, "grad_norm": 0.032720599323511124, "grad_norm_var": 5.408533270532049e-07, "learning_rate": 0.0003410103446169793, "loss": 2.5983, "step": 7073 }, { "crossentropy": 2.5811967849731445, "epoch": 0.6017352841102416, "grad_norm": 0.0316271148622036, "grad_norm_var": 5.874172429115587e-07, "learning_rate": 0.00034028346917622297, "loss": 2.5812, "step": 7074 }, { "crossentropy": 2.632603645324707, "epoch": 0.6018203470568221, "grad_norm": 0.033072471618652344, "grad_norm_var": 6.050388245798857e-07, "learning_rate": 0.000339557341951674, "loss": 2.6326, "step": 7075 }, { "crossentropy": 2.576333522796631, "epoch": 0.6019054100034025, "grad_norm": 0.033100396394729614, "grad_norm_var": 4.620741912909898e-07, "learning_rate": 0.000338831963059929, "loss": 2.5763, "step": 7076 }, { "crossentropy": 2.643393039703369, "epoch": 0.601990472949983, "grad_norm": 0.03157246485352516, "grad_norm_var": 4.982677026314659e-07, "learning_rate": 0.0003381073326174616, "loss": 2.6434, "step": 7077 }, { "crossentropy": 2.697075128555298, "epoch": 0.6020755358965635, "grad_norm": 0.031867992132902145, "grad_norm_var": 4.609513545282435e-07, "learning_rate": 0.0003373834507406276, "loss": 2.6971, "step": 7078 }, { "crossentropy": 2.6591262817382812, "epoch": 0.6021605988431439, "grad_norm": 0.032245516777038574, "grad_norm_var": 4.622638502334182e-07, "learning_rate": 0.00033666031754566063, "loss": 2.6591, "step": 7079 }, { "crossentropy": 2.662792682647705, "epoch": 0.6022456617897244, "grad_norm": 0.03237718343734741, "grad_norm_var": 4.594255290141583e-07, "learning_rate": 0.00033593793314867613, "loss": 2.6628, "step": 7080 }, { "crossentropy": 2.5754764080047607, "epoch": 0.6023307247363049, "grad_norm": 0.03168662637472153, "grad_norm_var": 4.871433806253115e-07, "learning_rate": 0.0003352162976656675, "loss": 2.5755, "step": 7081 }, { "crossentropy": 2.6555209159851074, "epoch": 0.6024157876828853, "grad_norm": 0.03244093433022499, "grad_norm_var": 4.775980326345829e-07, "learning_rate": 0.0003344954112125087, "loss": 2.6555, "step": 7082 }, { "crossentropy": 2.535672426223755, "epoch": 0.6025008506294658, "grad_norm": 0.032990530133247375, "grad_norm_var": 3.892675685566079e-07, "learning_rate": 0.00033377527390495375, "loss": 2.5357, "step": 7083 }, { "crossentropy": 2.546753168106079, "epoch": 0.6025859135760463, "grad_norm": 0.03292242810130119, "grad_norm_var": 4.1397392694907574e-07, "learning_rate": 0.0003330558858586352, "loss": 2.5468, "step": 7084 }, { "crossentropy": 2.629624128341675, "epoch": 0.6026709765226267, "grad_norm": 0.034528784453868866, "grad_norm_var": 6.98920328126978e-07, "learning_rate": 0.00033233724718906624, "loss": 2.6296, "step": 7085 }, { "crossentropy": 2.6754167079925537, "epoch": 0.6027560394692072, "grad_norm": 0.03213135898113251, "grad_norm_var": 5.759604231674968e-07, "learning_rate": 0.00033161935801164, "loss": 2.6754, "step": 7086 }, { "crossentropy": 2.6421926021575928, "epoch": 0.6028411024157877, "grad_norm": 0.032875023782253265, "grad_norm_var": 5.795236862597235e-07, "learning_rate": 0.0003309022184416288, "loss": 2.6422, "step": 7087 }, { "crossentropy": 2.6326711177825928, "epoch": 0.6029261653623682, "grad_norm": 0.032400961965322495, "grad_norm_var": 5.536807090579405e-07, "learning_rate": 0.00033018582859418446, "loss": 2.6327, "step": 7088 }, { "crossentropy": 2.635836601257324, "epoch": 0.6030112283089486, "grad_norm": 0.03246661648154259, "grad_norm_var": 5.514280286879221e-07, "learning_rate": 0.0003294701885843382, "loss": 2.6358, "step": 7089 }, { "crossentropy": 2.717167377471924, "epoch": 0.6030962912555291, "grad_norm": 0.0342879556119442, "grad_norm_var": 6.774573924024525e-07, "learning_rate": 0.0003287552985270015, "loss": 2.7172, "step": 7090 }, { "crossentropy": 2.5545570850372314, "epoch": 0.6031813542021096, "grad_norm": 0.03182702511548996, "grad_norm_var": 7.101352963736462e-07, "learning_rate": 0.00032804115853696415, "loss": 2.5546, "step": 7091 }, { "crossentropy": 2.670879602432251, "epoch": 0.60326641714869, "grad_norm": 0.03251316770911217, "grad_norm_var": 6.931040581905264e-07, "learning_rate": 0.0003273277687288978, "loss": 2.6709, "step": 7092 }, { "crossentropy": 2.6175007820129395, "epoch": 0.6033514800952705, "grad_norm": 0.033651627600193024, "grad_norm_var": 6.864955782785616e-07, "learning_rate": 0.0003266151292173519, "loss": 2.6175, "step": 7093 }, { "crossentropy": 2.619774103164673, "epoch": 0.603436543041851, "grad_norm": 0.03208895027637482, "grad_norm_var": 6.650098426502308e-07, "learning_rate": 0.00032590324011675607, "loss": 2.6198, "step": 7094 }, { "crossentropy": 2.6395974159240723, "epoch": 0.6035216059884314, "grad_norm": 0.03258426487445831, "grad_norm_var": 6.509918480147869e-07, "learning_rate": 0.00032519210154141777, "loss": 2.6396, "step": 7095 }, { "crossentropy": 2.595552444458008, "epoch": 0.6036066689350119, "grad_norm": 0.03512020409107208, "grad_norm_var": 9.900783619618455e-07, "learning_rate": 0.00032448171360552835, "loss": 2.5956, "step": 7096 }, { "crossentropy": 2.4278998374938965, "epoch": 0.6036917318815924, "grad_norm": 0.03166135773062706, "grad_norm_var": 9.942308325183147e-07, "learning_rate": 0.00032377207642315166, "loss": 2.4279, "step": 7097 }, { "crossentropy": 2.6450536251068115, "epoch": 0.6037767948281728, "grad_norm": 0.0331864058971405, "grad_norm_var": 9.82767932403844e-07, "learning_rate": 0.00032306319010823813, "loss": 2.6451, "step": 7098 }, { "crossentropy": 2.6958096027374268, "epoch": 0.6038618577747533, "grad_norm": 0.03318759799003601, "grad_norm_var": 9.861999155274618e-07, "learning_rate": 0.0003223550547746135, "loss": 2.6958, "step": 7099 }, { "crossentropy": 2.6092984676361084, "epoch": 0.6039469207213338, "grad_norm": 0.03182670846581459, "grad_norm_var": 1.0673998472376932e-06, "learning_rate": 0.0003216476705359839, "loss": 2.6093, "step": 7100 }, { "crossentropy": 2.654766798019409, "epoch": 0.6040319836679142, "grad_norm": 0.03490587696433067, "grad_norm_var": 1.1583757221368558e-06, "learning_rate": 0.0003209410375059335, "loss": 2.6548, "step": 7101 }, { "crossentropy": 2.605268716812134, "epoch": 0.6041170466144947, "grad_norm": 0.032597340643405914, "grad_norm_var": 1.1229669507811211e-06, "learning_rate": 0.00032023515579792994, "loss": 2.6053, "step": 7102 }, { "crossentropy": 2.7161238193511963, "epoch": 0.6042021095610752, "grad_norm": 0.034209270030260086, "grad_norm_var": 1.2211023431479377e-06, "learning_rate": 0.00031953002552531585, "loss": 2.7161, "step": 7103 }, { "crossentropy": 2.6244118213653564, "epoch": 0.6042871725076556, "grad_norm": 0.0324447900056839, "grad_norm_var": 1.2175335610676088e-06, "learning_rate": 0.00031882564680131396, "loss": 2.6244, "step": 7104 }, { "crossentropy": 2.626504898071289, "epoch": 0.6043722354542361, "grad_norm": 0.03315482288599014, "grad_norm_var": 1.19498477788559e-06, "learning_rate": 0.0003181220197390289, "loss": 2.6265, "step": 7105 }, { "crossentropy": 2.6687448024749756, "epoch": 0.6044572984008166, "grad_norm": 0.03274364024400711, "grad_norm_var": 1.0948930834502856e-06, "learning_rate": 0.0003174191444514424, "loss": 2.6687, "step": 7106 }, { "crossentropy": 2.592843532562256, "epoch": 0.604542361347397, "grad_norm": 0.031842298805713654, "grad_norm_var": 1.092556705750182e-06, "learning_rate": 0.00031671702105141533, "loss": 2.5928, "step": 7107 }, { "crossentropy": 2.594475030899048, "epoch": 0.6046274242939775, "grad_norm": 0.034137554466724396, "grad_norm_var": 1.1558436234978171e-06, "learning_rate": 0.00031601564965168916, "loss": 2.5945, "step": 7108 }, { "crossentropy": 2.581376075744629, "epoch": 0.604712487240558, "grad_norm": 0.031816694885492325, "grad_norm_var": 1.2273855816007905e-06, "learning_rate": 0.0003153150303648838, "loss": 2.5814, "step": 7109 }, { "crossentropy": 2.688568353652954, "epoch": 0.6047975501871384, "grad_norm": 0.03363415226340294, "grad_norm_var": 1.1952510597580678e-06, "learning_rate": 0.00031461516330349824, "loss": 2.6886, "step": 7110 }, { "crossentropy": 2.5183846950531006, "epoch": 0.6048826131337189, "grad_norm": 0.032739996910095215, "grad_norm_var": 1.1867678789817075e-06, "learning_rate": 0.0003139160485799114, "loss": 2.5184, "step": 7111 }, { "crossentropy": 2.6098270416259766, "epoch": 0.6049676760802994, "grad_norm": 0.031730446964502335, "grad_norm_var": 9.808013002566076e-07, "learning_rate": 0.0003132176863063807, "loss": 2.6098, "step": 7112 }, { "crossentropy": 2.6685969829559326, "epoch": 0.6050527390268798, "grad_norm": 0.032448071986436844, "grad_norm_var": 8.933653891549445e-07, "learning_rate": 0.00031252007659504253, "loss": 2.6686, "step": 7113 }, { "crossentropy": 2.667562484741211, "epoch": 0.6051378019734603, "grad_norm": 0.03461020439863205, "grad_norm_var": 1.0719964982770665e-06, "learning_rate": 0.00031182321955791403, "loss": 2.6676, "step": 7114 }, { "crossentropy": 2.570347309112549, "epoch": 0.6052228649200408, "grad_norm": 0.031398992985486984, "grad_norm_var": 1.22764146626315e-06, "learning_rate": 0.00031112711530688954, "loss": 2.5703, "step": 7115 }, { "crossentropy": 2.6467230319976807, "epoch": 0.6053079278666214, "grad_norm": 0.032762087881565094, "grad_norm_var": 1.149707353902997e-06, "learning_rate": 0.0003104317639537435, "loss": 2.6467, "step": 7116 }, { "crossentropy": 2.6409921646118164, "epoch": 0.6053929908132017, "grad_norm": 0.03293513506650925, "grad_norm_var": 8.78119004463912e-07, "learning_rate": 0.0003097371656101289, "loss": 2.641, "step": 7117 }, { "crossentropy": 2.5401086807250977, "epoch": 0.6054780537597823, "grad_norm": 0.039959754794836044, "grad_norm_var": 4.042119922221567e-06, "learning_rate": 0.00030904332038757975, "loss": 2.5401, "step": 7118 }, { "crossentropy": 2.609799861907959, "epoch": 0.6055631167063628, "grad_norm": 0.032496511936187744, "grad_norm_var": 4.0145056914887844e-06, "learning_rate": 0.0003083502283975059, "loss": 2.6098, "step": 7119 }, { "crossentropy": 2.6492183208465576, "epoch": 0.6056481796529432, "grad_norm": 0.034157030284404755, "grad_norm_var": 4.0302481099439385e-06, "learning_rate": 0.0003076578897511978, "loss": 2.6492, "step": 7120 }, { "crossentropy": 2.5930635929107666, "epoch": 0.6057332425995237, "grad_norm": 0.030652323737740517, "grad_norm_var": 4.465244486268046e-06, "learning_rate": 0.000306966304559827, "loss": 2.5931, "step": 7121 }, { "crossentropy": 2.620248556137085, "epoch": 0.6058183055461042, "grad_norm": 0.033221468329429626, "grad_norm_var": 4.454959463431061e-06, "learning_rate": 0.0003062754729344408, "loss": 2.6202, "step": 7122 }, { "crossentropy": 2.625967264175415, "epoch": 0.6059033684926846, "grad_norm": 0.0352611318230629, "grad_norm_var": 4.5853110997673505e-06, "learning_rate": 0.00030558539498596606, "loss": 2.626, "step": 7123 }, { "crossentropy": 2.5665009021759033, "epoch": 0.6059884314392651, "grad_norm": 0.033879369497299194, "grad_norm_var": 4.563143928928504e-06, "learning_rate": 0.00030489607082521134, "loss": 2.5665, "step": 7124 }, { "crossentropy": 2.610426902770996, "epoch": 0.6060734943858456, "grad_norm": 0.03216741234064102, "grad_norm_var": 4.498828559259038e-06, "learning_rate": 0.00030420750056286196, "loss": 2.6104, "step": 7125 }, { "crossentropy": 2.5846970081329346, "epoch": 0.606158557332426, "grad_norm": 0.034044209867715836, "grad_norm_var": 4.5233219049587195e-06, "learning_rate": 0.00030351968430948004, "loss": 2.5847, "step": 7126 }, { "crossentropy": 2.7728958129882812, "epoch": 0.6062436202790065, "grad_norm": 0.0332004688680172, "grad_norm_var": 4.495806178408121e-06, "learning_rate": 0.0003028326221755118, "loss": 2.7729, "step": 7127 }, { "crossentropy": 2.5970547199249268, "epoch": 0.606328683225587, "grad_norm": 0.031965628266334534, "grad_norm_var": 4.445881872905963e-06, "learning_rate": 0.0003021463142712788, "loss": 2.5971, "step": 7128 }, { "crossentropy": 2.6723523139953613, "epoch": 0.6064137461721674, "grad_norm": 0.03316938132047653, "grad_norm_var": 4.382281442572818e-06, "learning_rate": 0.0003014607607069814, "loss": 2.6724, "step": 7129 }, { "crossentropy": 2.6270291805267334, "epoch": 0.6064988091187479, "grad_norm": 0.034007273614406586, "grad_norm_var": 4.315154253755578e-06, "learning_rate": 0.00030077596159270084, "loss": 2.627, "step": 7130 }, { "crossentropy": 2.611140012741089, "epoch": 0.6065838720653284, "grad_norm": 0.030700745061039925, "grad_norm_var": 4.53702923655606e-06, "learning_rate": 0.00030009191703839644, "loss": 2.6111, "step": 7131 }, { "crossentropy": 2.483825922012329, "epoch": 0.6066689350119088, "grad_norm": 0.032029684633016586, "grad_norm_var": 4.63394785905896e-06, "learning_rate": 0.00029940862715390485, "loss": 2.4838, "step": 7132 }, { "crossentropy": 2.647930860519409, "epoch": 0.6067539979584893, "grad_norm": 0.034155018627643585, "grad_norm_var": 4.6569605388508194e-06, "learning_rate": 0.00029872609204894374, "loss": 2.6479, "step": 7133 }, { "crossentropy": 2.6555864810943604, "epoch": 0.6068390609050698, "grad_norm": 0.0317317359149456, "grad_norm_var": 1.7374865067855289e-06, "learning_rate": 0.00029804431183310775, "loss": 2.6556, "step": 7134 }, { "crossentropy": 2.7409679889678955, "epoch": 0.6069241238516502, "grad_norm": 0.03281731531023979, "grad_norm_var": 1.7254853118093641e-06, "learning_rate": 0.00029736328661587045, "loss": 2.741, "step": 7135 }, { "crossentropy": 2.699471950531006, "epoch": 0.6070091867982307, "grad_norm": 0.032682932913303375, "grad_norm_var": 1.6235692074340983e-06, "learning_rate": 0.00029668301650658755, "loss": 2.6995, "step": 7136 }, { "crossentropy": 2.6043508052825928, "epoch": 0.6070942497448112, "grad_norm": 0.0321691632270813, "grad_norm_var": 1.3218113822087053e-06, "learning_rate": 0.00029600350161448853, "loss": 2.6044, "step": 7137 }, { "crossentropy": 2.663288116455078, "epoch": 0.6071793126913916, "grad_norm": 0.03347529098391533, "grad_norm_var": 1.3350190938790034e-06, "learning_rate": 0.0002953247420486849, "loss": 2.6633, "step": 7138 }, { "crossentropy": 2.557910680770874, "epoch": 0.6072643756379721, "grad_norm": 0.03311086818575859, "grad_norm_var": 9.65991392818397e-07, "learning_rate": 0.00029464673791816556, "loss": 2.5579, "step": 7139 }, { "crossentropy": 2.587254524230957, "epoch": 0.6073494385845526, "grad_norm": 0.03196065127849579, "grad_norm_var": 9.280483133837589e-07, "learning_rate": 0.0002939694893317979, "loss": 2.5873, "step": 7140 }, { "crossentropy": 2.666196346282959, "epoch": 0.607434501531133, "grad_norm": 0.032791342586278915, "grad_norm_var": 9.070961839703715e-07, "learning_rate": 0.0002932929963983294, "loss": 2.6662, "step": 7141 }, { "crossentropy": 2.638066291809082, "epoch": 0.6075195644777135, "grad_norm": 0.03272855281829834, "grad_norm_var": 7.883776579209554e-07, "learning_rate": 0.00029261725922638483, "loss": 2.6381, "step": 7142 }, { "crossentropy": 2.6657330989837646, "epoch": 0.607604627424294, "grad_norm": 0.033057183027267456, "grad_norm_var": 7.794977514087673e-07, "learning_rate": 0.0002919422779244685, "loss": 2.6657, "step": 7143 }, { "crossentropy": 2.6465654373168945, "epoch": 0.6076896903708745, "grad_norm": 0.033054426312446594, "grad_norm_var": 7.528518675621685e-07, "learning_rate": 0.00029126805260096256, "loss": 2.6466, "step": 7144 }, { "crossentropy": 2.5421793460845947, "epoch": 0.6077747533174549, "grad_norm": 0.032651983201503754, "grad_norm_var": 7.391061111165862e-07, "learning_rate": 0.0002905945833641277, "loss": 2.5422, "step": 7145 }, { "crossentropy": 2.6063005924224854, "epoch": 0.6078598162640354, "grad_norm": 0.03106163814663887, "grad_norm_var": 7.661091410932162e-07, "learning_rate": 0.00028992187032210513, "loss": 2.6063, "step": 7146 }, { "crossentropy": 2.5424256324768066, "epoch": 0.6079448792106159, "grad_norm": 0.03142966702580452, "grad_norm_var": 6.233637664644631e-07, "learning_rate": 0.0002892499135829135, "loss": 2.5424, "step": 7147 }, { "crossentropy": 2.6472420692443848, "epoch": 0.6080299421571963, "grad_norm": 0.03224455192685127, "grad_norm_var": 6.111503596778479e-07, "learning_rate": 0.0002885787132544465, "loss": 2.6472, "step": 7148 }, { "crossentropy": 2.5576183795928955, "epoch": 0.6081150051037768, "grad_norm": 0.030695244669914246, "grad_norm_var": 6.281703753512983e-07, "learning_rate": 0.0002879082694444829, "loss": 2.5576, "step": 7149 }, { "crossentropy": 2.5811665058135986, "epoch": 0.6082000680503573, "grad_norm": 0.03444464132189751, "grad_norm_var": 8.63108382899667e-07, "learning_rate": 0.0002872385822606749, "loss": 2.5812, "step": 7150 }, { "crossentropy": 2.5673749446868896, "epoch": 0.6082851309969377, "grad_norm": 0.03235438093543053, "grad_norm_var": 8.583649095596224e-07, "learning_rate": 0.00028656965181055483, "loss": 2.5674, "step": 7151 }, { "crossentropy": 2.6225101947784424, "epoch": 0.6083701939435182, "grad_norm": 0.03477827087044716, "grad_norm_var": 1.1854025118051535e-06, "learning_rate": 0.0002859014782015351, "loss": 2.6225, "step": 7152 }, { "crossentropy": 2.6607589721679688, "epoch": 0.6084552568900987, "grad_norm": 0.032935310155153275, "grad_norm_var": 1.1754736120211477e-06, "learning_rate": 0.00028523406154090436, "loss": 2.6608, "step": 7153 }, { "crossentropy": 2.7155532836914062, "epoch": 0.6085403198366791, "grad_norm": 0.032011695206165314, "grad_norm_var": 1.152864911836635e-06, "learning_rate": 0.00028456740193583076, "loss": 2.7156, "step": 7154 }, { "crossentropy": 2.5316693782806396, "epoch": 0.6086253827832596, "grad_norm": 0.032857008278369904, "grad_norm_var": 1.1389882227254751e-06, "learning_rate": 0.00028390149949335974, "loss": 2.5317, "step": 7155 }, { "crossentropy": 2.7484352588653564, "epoch": 0.6087104457298401, "grad_norm": 0.031554803252220154, "grad_norm_var": 1.1820418919108085e-06, "learning_rate": 0.00028323635432041694, "loss": 2.7484, "step": 7156 }, { "crossentropy": 2.663790464401245, "epoch": 0.6087955086764205, "grad_norm": 0.0331239253282547, "grad_norm_var": 1.200071069903883e-06, "learning_rate": 0.00028257196652380523, "loss": 2.6638, "step": 7157 }, { "crossentropy": 2.622987747192383, "epoch": 0.608880571623001, "grad_norm": 0.03412910923361778, "grad_norm_var": 1.3538724225492515e-06, "learning_rate": 0.0002819083362102054, "loss": 2.623, "step": 7158 }, { "crossentropy": 2.609616279602051, "epoch": 0.6089656345695815, "grad_norm": 0.03222616761922836, "grad_norm_var": 1.3518054518858735e-06, "learning_rate": 0.0002812454634861783, "loss": 2.6096, "step": 7159 }, { "crossentropy": 2.546753168106079, "epoch": 0.6090506975161619, "grad_norm": 0.03326285630464554, "grad_norm_var": 1.3672313946139214e-06, "learning_rate": 0.0002805833484581621, "loss": 2.5468, "step": 7160 }, { "crossentropy": 2.6288602352142334, "epoch": 0.6091357604627424, "grad_norm": 0.03257058933377266, "grad_norm_var": 1.3671906815776805e-06, "learning_rate": 0.0002799219912324735, "loss": 2.6289, "step": 7161 }, { "crossentropy": 2.59627628326416, "epoch": 0.6092208234093229, "grad_norm": 0.03167007490992546, "grad_norm_var": 1.2651235205865876e-06, "learning_rate": 0.00027926139191530695, "loss": 2.5963, "step": 7162 }, { "crossentropy": 2.6552891731262207, "epoch": 0.6093058863559033, "grad_norm": 0.032616131007671356, "grad_norm_var": 1.1611581894811762e-06, "learning_rate": 0.0002786015506127365, "loss": 2.6553, "step": 7163 }, { "crossentropy": 2.7000489234924316, "epoch": 0.6093909493024838, "grad_norm": 0.0331159345805645, "grad_norm_var": 1.1537038040909664e-06, "learning_rate": 0.00027794246743071126, "loss": 2.7, "step": 7164 }, { "crossentropy": 2.646392345428467, "epoch": 0.6094760122490643, "grad_norm": 0.0320289172232151, "grad_norm_var": 8.956416754879806e-07, "learning_rate": 0.0002772841424750644, "loss": 2.6464, "step": 7165 }, { "crossentropy": 2.60349178314209, "epoch": 0.6095610751956447, "grad_norm": 0.03211109712719917, "grad_norm_var": 7.413776001810937e-07, "learning_rate": 0.0002766265758515019, "loss": 2.6035, "step": 7166 }, { "crossentropy": 2.5823264122009277, "epoch": 0.6096461381422252, "grad_norm": 0.031799089163541794, "grad_norm_var": 7.869155267611316e-07, "learning_rate": 0.0002759697676656098, "loss": 2.5823, "step": 7167 }, { "crossentropy": 2.6401522159576416, "epoch": 0.6097312010888057, "grad_norm": 0.03294652700424194, "grad_norm_var": 4.82796048489655e-07, "learning_rate": 0.00027531371802285435, "loss": 2.6402, "step": 7168 }, { "crossentropy": 2.58658504486084, "epoch": 0.6098162640353861, "grad_norm": 0.035218603909015656, "grad_norm_var": 9.229090996107705e-07, "learning_rate": 0.00027465842702857624, "loss": 2.5866, "step": 7169 }, { "crossentropy": 2.5487990379333496, "epoch": 0.6099013269819666, "grad_norm": 0.031862180680036545, "grad_norm_var": 9.380807935110979e-07, "learning_rate": 0.00027400389478799694, "loss": 2.5488, "step": 7170 }, { "crossentropy": 2.5584399700164795, "epoch": 0.6099863899285471, "grad_norm": 0.03242086246609688, "grad_norm_var": 9.404504322023273e-07, "learning_rate": 0.0002733501214062162, "loss": 2.5584, "step": 7171 }, { "crossentropy": 2.6336112022399902, "epoch": 0.6100714528751275, "grad_norm": 0.03130427747964859, "grad_norm_var": 9.81492734224559e-07, "learning_rate": 0.00027269710698821003, "loss": 2.6336, "step": 7172 }, { "crossentropy": 2.6518115997314453, "epoch": 0.610156515821708, "grad_norm": 0.03128264471888542, "grad_norm_var": 1.0771341072949673e-06, "learning_rate": 0.0002720448516388341, "loss": 2.6518, "step": 7173 }, { "crossentropy": 2.675367832183838, "epoch": 0.6102415787682886, "grad_norm": 0.0317312516272068, "grad_norm_var": 9.269332474190946e-07, "learning_rate": 0.00027139335546282284, "loss": 2.6754, "step": 7174 }, { "crossentropy": 2.7092692852020264, "epoch": 0.6103266417148691, "grad_norm": 0.034158870577812195, "grad_norm_var": 1.1193458893018111e-06, "learning_rate": 0.00027074261856478764, "loss": 2.7093, "step": 7175 }, { "crossentropy": 2.63523006439209, "epoch": 0.6104117046614495, "grad_norm": 0.032952453941106796, "grad_norm_var": 1.0940538519618159e-06, "learning_rate": 0.00027009264104921603, "loss": 2.6352, "step": 7176 }, { "crossentropy": 2.5006821155548096, "epoch": 0.61049676760803, "grad_norm": 0.0319177582859993, "grad_norm_var": 1.1134010945081096e-06, "learning_rate": 0.0002694434230204779, "loss": 2.5007, "step": 7177 }, { "crossentropy": 2.6272103786468506, "epoch": 0.6105818305546105, "grad_norm": 0.03198784217238426, "grad_norm_var": 1.086835164357952e-06, "learning_rate": 0.00026879496458281807, "loss": 2.6272, "step": 7178 }, { "crossentropy": 2.7141690254211426, "epoch": 0.6106668935011909, "grad_norm": 0.03273583948612213, "grad_norm_var": 1.0901286116961803e-06, "learning_rate": 0.0002681472658403605, "loss": 2.7142, "step": 7179 }, { "crossentropy": 2.6196649074554443, "epoch": 0.6107519564477714, "grad_norm": 0.0324476920068264, "grad_norm_var": 1.0607872856653137e-06, "learning_rate": 0.000267500326897106, "loss": 2.6197, "step": 7180 }, { "crossentropy": 2.5797340869903564, "epoch": 0.6108370193943519, "grad_norm": 0.03256337344646454, "grad_norm_var": 1.0499431215805002e-06, "learning_rate": 0.00026685414785693586, "loss": 2.5797, "step": 7181 }, { "crossentropy": 2.622281551361084, "epoch": 0.6109220823409323, "grad_norm": 0.032194510102272034, "grad_norm_var": 1.046441714071487e-06, "learning_rate": 0.000266208728823607, "loss": 2.6223, "step": 7182 }, { "crossentropy": 2.5930023193359375, "epoch": 0.6110071452875128, "grad_norm": 0.031538479030132294, "grad_norm_var": 1.0740075912039788e-06, "learning_rate": 0.000265564069900755, "loss": 2.593, "step": 7183 }, { "crossentropy": 2.672502040863037, "epoch": 0.6110922082340933, "grad_norm": 0.03191075846552849, "grad_norm_var": 1.0730322330602485e-06, "learning_rate": 0.00026492017119189413, "loss": 2.6725, "step": 7184 }, { "crossentropy": 2.691760778427124, "epoch": 0.6111772711806737, "grad_norm": 0.032447416335344315, "grad_norm_var": 5.075631563752324e-07, "learning_rate": 0.00026427703280041515, "loss": 2.6918, "step": 7185 }, { "crossentropy": 2.5918233394622803, "epoch": 0.6112623341272542, "grad_norm": 0.03190567344427109, "grad_norm_var": 5.056294955468425e-07, "learning_rate": 0.0002636346548295876, "loss": 2.5918, "step": 7186 }, { "crossentropy": 2.571866989135742, "epoch": 0.6113473970738347, "grad_norm": 0.031148817390203476, "grad_norm_var": 5.724780316721893e-07, "learning_rate": 0.00026299303738255955, "loss": 2.5719, "step": 7187 }, { "crossentropy": 2.619558811187744, "epoch": 0.6114324600204151, "grad_norm": 0.03243394196033478, "grad_norm_var": 5.264749382003404e-07, "learning_rate": 0.00026235218056235634, "loss": 2.6196, "step": 7188 }, { "crossentropy": 2.694159507751465, "epoch": 0.6115175229669956, "grad_norm": 0.031929533928632736, "grad_norm_var": 4.72657322380989e-07, "learning_rate": 0.00026171208447187957, "loss": 2.6942, "step": 7189 }, { "crossentropy": 2.6365020275115967, "epoch": 0.6116025859135761, "grad_norm": 0.032745711505413055, "grad_norm_var": 4.667756777792497e-07, "learning_rate": 0.00026107274921391313, "loss": 2.6365, "step": 7190 }, { "crossentropy": 2.692445755004883, "epoch": 0.6116876488601565, "grad_norm": 0.03122321516275406, "grad_norm_var": 2.8315430777714246e-07, "learning_rate": 0.0002604341748911132, "loss": 2.6924, "step": 7191 }, { "crossentropy": 2.5837302207946777, "epoch": 0.611772711806737, "grad_norm": 0.03284849971532822, "grad_norm_var": 2.724326511096183e-07, "learning_rate": 0.00025979636160601674, "loss": 2.5837, "step": 7192 }, { "crossentropy": 2.6450695991516113, "epoch": 0.6118577747533175, "grad_norm": 0.03434918448328972, "grad_norm_var": 5.751607457910382e-07, "learning_rate": 0.0002591593094610395, "loss": 2.6451, "step": 7193 }, { "crossentropy": 2.5670406818389893, "epoch": 0.6119428376998979, "grad_norm": 0.03320073336362839, "grad_norm_var": 6.205599711770853e-07, "learning_rate": 0.0002585230185584725, "loss": 2.567, "step": 7194 }, { "crossentropy": 2.6115589141845703, "epoch": 0.6120279006464784, "grad_norm": 0.030899696052074432, "grad_norm_var": 7.371707446632755e-07, "learning_rate": 0.00025788748900048674, "loss": 2.6116, "step": 7195 }, { "crossentropy": 2.6656947135925293, "epoch": 0.6121129635930589, "grad_norm": 0.03233226016163826, "grad_norm_var": 7.347562020448839e-07, "learning_rate": 0.0002572527208891284, "loss": 2.6657, "step": 7196 }, { "crossentropy": 2.584913492202759, "epoch": 0.6121980265396393, "grad_norm": 0.032809313386678696, "grad_norm_var": 7.494853814071199e-07, "learning_rate": 0.00025661871432632543, "loss": 2.5849, "step": 7197 }, { "crossentropy": 2.5469303131103516, "epoch": 0.6122830894862198, "grad_norm": 0.032413385808467865, "grad_norm_var": 7.510101880343784e-07, "learning_rate": 0.000255985469413878, "loss": 2.5469, "step": 7198 }, { "crossentropy": 2.590280055999756, "epoch": 0.6123681524328003, "grad_norm": 0.032496511936187744, "grad_norm_var": 7.163955895944998e-07, "learning_rate": 0.0002553529862534693, "loss": 2.5903, "step": 7199 }, { "crossentropy": 2.5971415042877197, "epoch": 0.6124532153793807, "grad_norm": 0.035023365169763565, "grad_norm_var": 1.152732019130999e-06, "learning_rate": 0.0002547212649466568, "loss": 2.5971, "step": 7200 }, { "crossentropy": 2.6956255435943604, "epoch": 0.6125382783259612, "grad_norm": 0.034068308770656586, "grad_norm_var": 1.3027739097538904e-06, "learning_rate": 0.0002540903055948773, "loss": 2.6956, "step": 7201 }, { "crossentropy": 2.579430103302002, "epoch": 0.6126233412725417, "grad_norm": 0.03210948407649994, "grad_norm_var": 1.286114433109627e-06, "learning_rate": 0.00025346010829944363, "loss": 2.5794, "step": 7202 }, { "crossentropy": 2.539105176925659, "epoch": 0.6127084042191222, "grad_norm": 0.03508387878537178, "grad_norm_var": 1.4783446690737334e-06, "learning_rate": 0.0002528306731615493, "loss": 2.5391, "step": 7203 }, { "crossentropy": 2.7252297401428223, "epoch": 0.6127934671657026, "grad_norm": 0.03778938576579094, "grad_norm_var": 2.9574234406993042e-06, "learning_rate": 0.0002522020002822628, "loss": 2.7252, "step": 7204 }, { "crossentropy": 2.6338086128234863, "epoch": 0.6128785301122831, "grad_norm": 0.033330973237752914, "grad_norm_var": 2.8413477070748347e-06, "learning_rate": 0.0002515740897625285, "loss": 2.6338, "step": 7205 }, { "crossentropy": 2.708357572555542, "epoch": 0.6129635930588636, "grad_norm": 0.03487948328256607, "grad_norm_var": 2.9695654314177215e-06, "learning_rate": 0.000250946941703174, "loss": 2.7084, "step": 7206 }, { "crossentropy": 2.662482500076294, "epoch": 0.613048656005444, "grad_norm": 0.03350471705198288, "grad_norm_var": 2.6240134828056996e-06, "learning_rate": 0.00025032055620489936, "loss": 2.6625, "step": 7207 }, { "crossentropy": 2.541503667831421, "epoch": 0.6131337189520245, "grad_norm": 0.03181896731257439, "grad_norm_var": 2.7894651621319933e-06, "learning_rate": 0.00024969493336828355, "loss": 2.5415, "step": 7208 }, { "crossentropy": 2.59978985786438, "epoch": 0.613218781898605, "grad_norm": 0.03513878956437111, "grad_norm_var": 2.9171136486140967e-06, "learning_rate": 0.0002490700732937856, "loss": 2.5998, "step": 7209 }, { "crossentropy": 2.6111466884613037, "epoch": 0.6133038448451854, "grad_norm": 0.033345744013786316, "grad_norm_var": 2.9115549794916468e-06, "learning_rate": 0.0002484459760817392, "loss": 2.6111, "step": 7210 }, { "crossentropy": 2.724764108657837, "epoch": 0.6133889077917659, "grad_norm": 0.033021703362464905, "grad_norm_var": 2.438805831676364e-06, "learning_rate": 0.00024782264183235536, "loss": 2.7248, "step": 7211 }, { "crossentropy": 2.619624376296997, "epoch": 0.6134739707383464, "grad_norm": 0.03165266290307045, "grad_norm_var": 2.5914155608255014e-06, "learning_rate": 0.00024720007064572505, "loss": 2.6196, "step": 7212 }, { "crossentropy": 2.632045269012451, "epoch": 0.6135590336849268, "grad_norm": 0.031121745705604553, "grad_norm_var": 2.9597893357157767e-06, "learning_rate": 0.0002465782626218144, "loss": 2.632, "step": 7213 }, { "crossentropy": 2.5634429454803467, "epoch": 0.6136440966315073, "grad_norm": 0.03283160179853439, "grad_norm_var": 2.9073439566271524e-06, "learning_rate": 0.00024595721786046745, "loss": 2.5634, "step": 7214 }, { "crossentropy": 2.575608253479004, "epoch": 0.6137291595780878, "grad_norm": 0.03240735083818436, "grad_norm_var": 2.9206749079102936e-06, "learning_rate": 0.0002453369364614072, "loss": 2.5756, "step": 7215 }, { "crossentropy": 2.5921497344970703, "epoch": 0.6138142225246682, "grad_norm": 0.033515676856040955, "grad_norm_var": 2.770684813579773e-06, "learning_rate": 0.00024471741852423234, "loss": 2.5921, "step": 7216 }, { "crossentropy": 2.5022971630096436, "epoch": 0.6138992854712487, "grad_norm": 0.03204905986785889, "grad_norm_var": 2.866126291735845e-06, "learning_rate": 0.00024409866414841998, "loss": 2.5023, "step": 7217 }, { "crossentropy": 2.682842969894409, "epoch": 0.6139843484178292, "grad_norm": 0.0346461720764637, "grad_norm_var": 2.8487009532395777e-06, "learning_rate": 0.0002434806734333228, "loss": 2.6828, "step": 7218 }, { "crossentropy": 2.5997555255889893, "epoch": 0.6140694113644096, "grad_norm": 0.031864508986473083, "grad_norm_var": 2.8202933776163193e-06, "learning_rate": 0.00024286344647817538, "loss": 2.5998, "step": 7219 }, { "crossentropy": 2.634009599685669, "epoch": 0.6141544743109901, "grad_norm": 0.03204529732465744, "grad_norm_var": 1.4498033551093558e-06, "learning_rate": 0.0002422469833820834, "loss": 2.634, "step": 7220 }, { "crossentropy": 2.6440258026123047, "epoch": 0.6142395372575706, "grad_norm": 0.03394903987646103, "grad_norm_var": 1.505205908173407e-06, "learning_rate": 0.00024163128424403436, "loss": 2.644, "step": 7221 }, { "crossentropy": 2.5441970825195312, "epoch": 0.614324600204151, "grad_norm": 0.03677980974316597, "grad_norm_var": 2.210411675889791e-06, "learning_rate": 0.0002410163491628925, "loss": 2.5442, "step": 7222 }, { "crossentropy": 2.6500518321990967, "epoch": 0.6144096631507315, "grad_norm": 0.0315849669277668, "grad_norm_var": 2.3386429913332026e-06, "learning_rate": 0.00024040217823739773, "loss": 2.6501, "step": 7223 }, { "crossentropy": 2.62384033203125, "epoch": 0.614494726097312, "grad_norm": 0.03194589912891388, "grad_norm_var": 2.319901898401438e-06, "learning_rate": 0.00023978877156616786, "loss": 2.6238, "step": 7224 }, { "crossentropy": 2.6045501232147217, "epoch": 0.6145797890438924, "grad_norm": 0.03184802457690239, "grad_norm_var": 2.055547434601177e-06, "learning_rate": 0.00023917612924769904, "loss": 2.6046, "step": 7225 }, { "crossentropy": 2.596276044845581, "epoch": 0.6146648519904729, "grad_norm": 0.03328647464513779, "grad_norm_var": 2.051359994451919e-06, "learning_rate": 0.00023856425138036485, "loss": 2.5963, "step": 7226 }, { "crossentropy": 2.488363265991211, "epoch": 0.6147499149370534, "grad_norm": 0.03199220821261406, "grad_norm_var": 2.085024095005439e-06, "learning_rate": 0.00023795313806241337, "loss": 2.4884, "step": 7227 }, { "crossentropy": 2.619812250137329, "epoch": 0.6148349778836338, "grad_norm": 0.03273287042975426, "grad_norm_var": 2.0042215423100863e-06, "learning_rate": 0.00023734278939197275, "loss": 2.6198, "step": 7228 }, { "crossentropy": 2.542278289794922, "epoch": 0.6149200408302143, "grad_norm": 0.033082060515880585, "grad_norm_var": 1.809000085778445e-06, "learning_rate": 0.00023673320546704736, "loss": 2.5423, "step": 7229 }, { "crossentropy": 2.5919394493103027, "epoch": 0.6150051037767948, "grad_norm": 0.03141447529196739, "grad_norm_var": 1.9493409795585833e-06, "learning_rate": 0.00023612438638551835, "loss": 2.5919, "step": 7230 }, { "crossentropy": 2.5618896484375, "epoch": 0.6150901667233752, "grad_norm": 0.0331873819231987, "grad_norm_var": 1.9442964616565746e-06, "learning_rate": 0.00023551633224514534, "loss": 2.5619, "step": 7231 }, { "crossentropy": 2.5902023315429688, "epoch": 0.6151752296699557, "grad_norm": 0.03190028667449951, "grad_norm_var": 1.9683728164187597e-06, "learning_rate": 0.0002349090431435641, "loss": 2.5902, "step": 7232 }, { "crossentropy": 2.633439302444458, "epoch": 0.6152602926165363, "grad_norm": 0.03291446343064308, "grad_norm_var": 1.9320759765681835e-06, "learning_rate": 0.00023430251917828772, "loss": 2.6334, "step": 7233 }, { "crossentropy": 2.5527567863464355, "epoch": 0.6153453555631168, "grad_norm": 0.031547319144010544, "grad_norm_var": 1.7791110058655117e-06, "learning_rate": 0.00023369676044670607, "loss": 2.5528, "step": 7234 }, { "crossentropy": 2.669987201690674, "epoch": 0.6154304185096972, "grad_norm": 0.03238799050450325, "grad_norm_var": 1.7428301086631787e-06, "learning_rate": 0.00023309176704608693, "loss": 2.67, "step": 7235 }, { "crossentropy": 2.5901589393615723, "epoch": 0.6155154814562777, "grad_norm": 0.0335954912006855, "grad_norm_var": 1.7654712446061685e-06, "learning_rate": 0.00023248753907357367, "loss": 2.5902, "step": 7236 }, { "crossentropy": 2.6897637844085693, "epoch": 0.6156005444028582, "grad_norm": 0.03167586401104927, "grad_norm_var": 1.7278301648048754e-06, "learning_rate": 0.0002318840766261898, "loss": 2.6898, "step": 7237 }, { "crossentropy": 2.554262399673462, "epoch": 0.6156856073494386, "grad_norm": 0.032301273196935654, "grad_norm_var": 4.957718689755156e-07, "learning_rate": 0.00023128137980083286, "loss": 2.5543, "step": 7238 }, { "crossentropy": 2.6216981410980225, "epoch": 0.6157706702960191, "grad_norm": 0.032337989658117294, "grad_norm_var": 4.556739828108387e-07, "learning_rate": 0.00023067944869427926, "loss": 2.6217, "step": 7239 }, { "crossentropy": 2.6592843532562256, "epoch": 0.6158557332425996, "grad_norm": 0.032671645283699036, "grad_norm_var": 4.461631379880255e-07, "learning_rate": 0.00023007828340318115, "loss": 2.6593, "step": 7240 }, { "crossentropy": 2.5908701419830322, "epoch": 0.61594079618918, "grad_norm": 0.03190917149186134, "grad_norm_var": 4.4165415244984174e-07, "learning_rate": 0.00022947788402406856, "loss": 2.5909, "step": 7241 }, { "crossentropy": 2.6914563179016113, "epoch": 0.6160258591357605, "grad_norm": 0.032269373536109924, "grad_norm_var": 3.906433924652855e-07, "learning_rate": 0.00022887825065334822, "loss": 2.6915, "step": 7242 }, { "crossentropy": 2.60402774810791, "epoch": 0.616110922082341, "grad_norm": 0.032113321125507355, "grad_norm_var": 3.8545957180904923e-07, "learning_rate": 0.00022827938338730315, "loss": 2.604, "step": 7243 }, { "crossentropy": 2.5096499919891357, "epoch": 0.6161959850289214, "grad_norm": 0.03435381501913071, "grad_norm_var": 6.264674741254227e-07, "learning_rate": 0.0002276812823220964, "loss": 2.5096, "step": 7244 }, { "crossentropy": 2.6472036838531494, "epoch": 0.6162810479755019, "grad_norm": 0.034849539399147034, "grad_norm_var": 9.63866494647856e-07, "learning_rate": 0.00022708394755376394, "loss": 2.6472, "step": 7245 }, { "crossentropy": 2.71909761428833, "epoch": 0.6163661109220824, "grad_norm": 0.03209644928574562, "grad_norm_var": 8.861044596422095e-07, "learning_rate": 0.0002264873791782207, "loss": 2.7191, "step": 7246 }, { "crossentropy": 2.6626408100128174, "epoch": 0.6164511738686628, "grad_norm": 0.037755336612463, "grad_norm_var": 2.5285276445424314e-06, "learning_rate": 0.00022589157729126008, "loss": 2.6626, "step": 7247 }, { "crossentropy": 2.682070255279541, "epoch": 0.6165362368152433, "grad_norm": 0.03135056421160698, "grad_norm_var": 2.6219697493898646e-06, "learning_rate": 0.00022529654198854833, "loss": 2.6821, "step": 7248 }, { "crossentropy": 2.5956571102142334, "epoch": 0.6166212997618238, "grad_norm": 0.030988501384854317, "grad_norm_var": 2.8457490058637296e-06, "learning_rate": 0.00022470227336563177, "loss": 2.5957, "step": 7249 }, { "crossentropy": 2.6241767406463623, "epoch": 0.6167063627084042, "grad_norm": 0.033564284443855286, "grad_norm_var": 2.77315003729948e-06, "learning_rate": 0.00022410877151793351, "loss": 2.6242, "step": 7250 }, { "crossentropy": 2.6580259799957275, "epoch": 0.6167914256549847, "grad_norm": 0.03218803554773331, "grad_norm_var": 2.7890005074951053e-06, "learning_rate": 0.00022351603654075238, "loss": 2.658, "step": 7251 }, { "crossentropy": 2.6270592212677, "epoch": 0.6168764886015652, "grad_norm": 0.031853292137384415, "grad_norm_var": 2.8116387752899572e-06, "learning_rate": 0.00022292406852926383, "loss": 2.6271, "step": 7252 }, { "crossentropy": 2.6098783016204834, "epoch": 0.6169615515481456, "grad_norm": 0.03302087262272835, "grad_norm_var": 2.7289536134137885e-06, "learning_rate": 0.00022233286757852188, "loss": 2.6099, "step": 7253 }, { "crossentropy": 2.6642961502075195, "epoch": 0.6170466144947261, "grad_norm": 0.03268202021718025, "grad_norm_var": 2.71008286729008e-06, "learning_rate": 0.0002217424337834567, "loss": 2.6643, "step": 7254 }, { "crossentropy": 2.6347641944885254, "epoch": 0.6171316774413066, "grad_norm": 0.03491031751036644, "grad_norm_var": 2.9393647690394473e-06, "learning_rate": 0.00022115276723887358, "loss": 2.6348, "step": 7255 }, { "crossentropy": 2.6363205909729004, "epoch": 0.617216740387887, "grad_norm": 0.0347151979804039, "grad_norm_var": 3.101085227397261e-06, "learning_rate": 0.00022056386803945727, "loss": 2.6363, "step": 7256 }, { "crossentropy": 2.558332920074463, "epoch": 0.6173018033344675, "grad_norm": 0.0309002548456192, "grad_norm_var": 3.3334742407151263e-06, "learning_rate": 0.00021997573627976718, "loss": 2.5583, "step": 7257 }, { "crossentropy": 2.5493664741516113, "epoch": 0.617386866281048, "grad_norm": 0.031190896406769753, "grad_norm_var": 3.5257107917893714e-06, "learning_rate": 0.00021938837205424, "loss": 2.5494, "step": 7258 }, { "crossentropy": 2.59209942817688, "epoch": 0.6174719292276284, "grad_norm": 0.032139211893081665, "grad_norm_var": 3.52257684814658e-06, "learning_rate": 0.00021880177545719083, "loss": 2.5921, "step": 7259 }, { "crossentropy": 2.6627023220062256, "epoch": 0.6175569921742089, "grad_norm": 0.0318412259221077, "grad_norm_var": 3.475297614881125e-06, "learning_rate": 0.0002182159465828093, "loss": 2.6627, "step": 7260 }, { "crossentropy": 2.5855982303619385, "epoch": 0.6176420551207894, "grad_norm": 0.032690513879060745, "grad_norm_var": 3.1990514049233416e-06, "learning_rate": 0.00021763088552516298, "loss": 2.5856, "step": 7261 }, { "crossentropy": 2.625960350036621, "epoch": 0.6177271180673699, "grad_norm": 0.03150571882724762, "grad_norm_var": 3.271781473258619e-06, "learning_rate": 0.00021704659237819558, "loss": 2.626, "step": 7262 }, { "crossentropy": 2.614941120147705, "epoch": 0.6178121810139503, "grad_norm": 0.03228102996945381, "grad_norm_var": 1.4592457728367554e-06, "learning_rate": 0.0002164630672357276, "loss": 2.6149, "step": 7263 }, { "crossentropy": 2.5579638481140137, "epoch": 0.6178972439605308, "grad_norm": 0.034112270921468735, "grad_norm_var": 1.5628072408258956e-06, "learning_rate": 0.00021588031019145637, "loss": 2.558, "step": 7264 }, { "crossentropy": 2.627775192260742, "epoch": 0.6179823069071113, "grad_norm": 0.03231928497552872, "grad_norm_var": 1.398824256104815e-06, "learning_rate": 0.00021529832133895588, "loss": 2.6278, "step": 7265 }, { "crossentropy": 2.592005491256714, "epoch": 0.6180673698536917, "grad_norm": 0.03320230916142464, "grad_norm_var": 1.36142222982107e-06, "learning_rate": 0.00021471710077167693, "loss": 2.592, "step": 7266 }, { "crossentropy": 2.5968568325042725, "epoch": 0.6181524328002722, "grad_norm": 0.030712667852640152, "grad_norm_var": 1.5779218873326055e-06, "learning_rate": 0.0002141366485829471, "loss": 2.5969, "step": 7267 }, { "crossentropy": 2.618379592895508, "epoch": 0.6182374957468527, "grad_norm": 0.032567281275987625, "grad_norm_var": 1.5477588664911661e-06, "learning_rate": 0.00021355696486596965, "loss": 2.6184, "step": 7268 }, { "crossentropy": 2.732011079788208, "epoch": 0.6183225586934331, "grad_norm": 0.03177899867296219, "grad_norm_var": 1.5660885836533128e-06, "learning_rate": 0.00021297804971382673, "loss": 2.732, "step": 7269 }, { "crossentropy": 2.650998830795288, "epoch": 0.6184076216400136, "grad_norm": 0.031910594552755356, "grad_norm_var": 1.5816621848782485e-06, "learning_rate": 0.0002123999032194729, "loss": 2.651, "step": 7270 }, { "crossentropy": 2.6793246269226074, "epoch": 0.6184926845865941, "grad_norm": 0.03368927165865898, "grad_norm_var": 1.2699957093383843e-06, "learning_rate": 0.00021182252547574277, "loss": 2.6793, "step": 7271 }, { "crossentropy": 2.5699245929718018, "epoch": 0.6185777475331745, "grad_norm": 0.03392859920859337, "grad_norm_var": 1.0603215919087096e-06, "learning_rate": 0.00021124591657534775, "loss": 2.5699, "step": 7272 }, { "crossentropy": 2.6235597133636475, "epoch": 0.618662810479755, "grad_norm": 0.030882013961672783, "grad_norm_var": 1.0637421922564278e-06, "learning_rate": 0.00021067007661087323, "loss": 2.6236, "step": 7273 }, { "crossentropy": 2.573530435562134, "epoch": 0.6187478734263355, "grad_norm": 0.03229231387376785, "grad_norm_var": 9.771256613569223e-07, "learning_rate": 0.000210095005674783, "loss": 2.5735, "step": 7274 }, { "crossentropy": 2.7393112182617188, "epoch": 0.6188329363729159, "grad_norm": 0.0354483425617218, "grad_norm_var": 1.5615336820165438e-06, "learning_rate": 0.00020952070385941713, "loss": 2.7393, "step": 7275 }, { "crossentropy": 2.63547945022583, "epoch": 0.6189179993194964, "grad_norm": 0.03225927799940109, "grad_norm_var": 1.5316867426809807e-06, "learning_rate": 0.00020894717125699236, "loss": 2.6355, "step": 7276 }, { "crossentropy": 2.5919320583343506, "epoch": 0.6190030622660769, "grad_norm": 0.03129623085260391, "grad_norm_var": 1.6361346916426613e-06, "learning_rate": 0.0002083744079595995, "loss": 2.5919, "step": 7277 }, { "crossentropy": 2.5327117443084717, "epoch": 0.6190881252126573, "grad_norm": 0.03261765092611313, "grad_norm_var": 1.564274093579264e-06, "learning_rate": 0.0002078024140592094, "loss": 2.5327, "step": 7278 }, { "crossentropy": 2.5916707515716553, "epoch": 0.6191731881592378, "grad_norm": 0.03561166673898697, "grad_norm_var": 2.1243239098060145e-06, "learning_rate": 0.00020723118964766806, "loss": 2.5917, "step": 7279 }, { "crossentropy": 2.4969444274902344, "epoch": 0.6192582511058183, "grad_norm": 0.03360973671078682, "grad_norm_var": 2.051462509111571e-06, "learning_rate": 0.0002066607348166971, "loss": 2.4969, "step": 7280 }, { "crossentropy": 2.7225546836853027, "epoch": 0.6193433140523987, "grad_norm": 0.03361952677369118, "grad_norm_var": 2.081087782882527e-06, "learning_rate": 0.00020609104965789495, "loss": 2.7226, "step": 7281 }, { "crossentropy": 2.68302059173584, "epoch": 0.6194283769989792, "grad_norm": 0.03369791433215141, "grad_norm_var": 2.1204367790783e-06, "learning_rate": 0.00020552213426273724, "loss": 2.683, "step": 7282 }, { "crossentropy": 2.650296688079834, "epoch": 0.6195134399455597, "grad_norm": 0.03564659133553505, "grad_norm_var": 2.22261109023591e-06, "learning_rate": 0.00020495398872257486, "loss": 2.6503, "step": 7283 }, { "crossentropy": 2.6348342895507812, "epoch": 0.6195985028921401, "grad_norm": 0.03205113112926483, "grad_norm_var": 2.2813259098388433e-06, "learning_rate": 0.00020438661312863595, "loss": 2.6348, "step": 7284 }, { "crossentropy": 2.607545852661133, "epoch": 0.6196835658387206, "grad_norm": 0.03182905539870262, "grad_norm_var": 2.2723572224783573e-06, "learning_rate": 0.00020382000757202424, "loss": 2.6075, "step": 7285 }, { "crossentropy": 2.5893032550811768, "epoch": 0.6197686287853011, "grad_norm": 0.03247163072228432, "grad_norm_var": 2.1993634531544306e-06, "learning_rate": 0.00020325417214372087, "loss": 2.5893, "step": 7286 }, { "crossentropy": 2.6110453605651855, "epoch": 0.6198536917318815, "grad_norm": 0.03286443650722504, "grad_norm_var": 2.1863645411277956e-06, "learning_rate": 0.00020268910693458153, "loss": 2.611, "step": 7287 }, { "crossentropy": 2.628652811050415, "epoch": 0.619938754678462, "grad_norm": 0.032456573098897934, "grad_norm_var": 2.1656178973567987e-06, "learning_rate": 0.00020212481203534083, "loss": 2.6287, "step": 7288 }, { "crossentropy": 2.601597547531128, "epoch": 0.6200238176250426, "grad_norm": 0.032503142952919006, "grad_norm_var": 1.8632313801086298e-06, "learning_rate": 0.00020156128753660797, "loss": 2.6016, "step": 7289 }, { "crossentropy": 2.6565918922424316, "epoch": 0.6201088805716231, "grad_norm": 0.03178989887237549, "grad_norm_var": 1.93594051253228e-06, "learning_rate": 0.00020099853352886722, "loss": 2.6566, "step": 7290 }, { "crossentropy": 2.628977060317993, "epoch": 0.6201939435182034, "grad_norm": 0.03346266970038414, "grad_norm_var": 1.5634923611627475e-06, "learning_rate": 0.00020043655010248406, "loss": 2.629, "step": 7291 }, { "crossentropy": 2.5481760501861572, "epoch": 0.620279006464784, "grad_norm": 0.03239293396472931, "grad_norm_var": 1.551645690807633e-06, "learning_rate": 0.00019987533734769358, "loss": 2.5482, "step": 7292 }, { "crossentropy": 2.741079330444336, "epoch": 0.6203640694113645, "grad_norm": 0.03474533557891846, "grad_norm_var": 1.5139121710139314e-06, "learning_rate": 0.0001993148953546109, "loss": 2.7411, "step": 7293 }, { "crossentropy": 2.616288423538208, "epoch": 0.6204491323579449, "grad_norm": 0.03418930992484093, "grad_norm_var": 1.5440351448680588e-06, "learning_rate": 0.0001987552242132279, "loss": 2.6163, "step": 7294 }, { "crossentropy": 2.664609432220459, "epoch": 0.6205341953045254, "grad_norm": 0.03279300779104233, "grad_norm_var": 1.1751391156236112e-06, "learning_rate": 0.00019819632401341103, "loss": 2.6646, "step": 7295 }, { "crossentropy": 2.621703863143921, "epoch": 0.6206192582511059, "grad_norm": 0.03281598538160324, "grad_norm_var": 1.1640282003268222e-06, "learning_rate": 0.00019763819484490354, "loss": 2.6217, "step": 7296 }, { "crossentropy": 2.671046018600464, "epoch": 0.6207043211976863, "grad_norm": 0.03336421400308609, "grad_norm_var": 1.149840386400515e-06, "learning_rate": 0.00019708083679732536, "loss": 2.671, "step": 7297 }, { "crossentropy": 2.5194931030273438, "epoch": 0.6207893841442668, "grad_norm": 0.031694445759058, "grad_norm_var": 1.2322032411891832e-06, "learning_rate": 0.0001965242499601727, "loss": 2.5195, "step": 7298 }, { "crossentropy": 2.518092632293701, "epoch": 0.6208744470908473, "grad_norm": 0.03344760835170746, "grad_norm_var": 7.414135691441448e-07, "learning_rate": 0.0001959684344228152, "loss": 2.5181, "step": 7299 }, { "crossentropy": 2.6064443588256836, "epoch": 0.6209595100374277, "grad_norm": 0.0319954939186573, "grad_norm_var": 7.47195462322212e-07, "learning_rate": 0.00019541339027450257, "loss": 2.6064, "step": 7300 }, { "crossentropy": 2.5959279537200928, "epoch": 0.6210445729840082, "grad_norm": 0.031905557960271835, "grad_norm_var": 7.376472500763541e-07, "learning_rate": 0.0001948591176043585, "loss": 2.5959, "step": 7301 }, { "crossentropy": 2.633525848388672, "epoch": 0.6211296359305887, "grad_norm": 0.031533386558294296, "grad_norm_var": 8.344661052612941e-07, "learning_rate": 0.0001943056165013829, "loss": 2.6335, "step": 7302 }, { "crossentropy": 2.7850148677825928, "epoch": 0.6212146988771691, "grad_norm": 0.03172861412167549, "grad_norm_var": 8.973308842493329e-07, "learning_rate": 0.00019375288705445194, "loss": 2.785, "step": 7303 }, { "crossentropy": 2.6056666374206543, "epoch": 0.6212997618237496, "grad_norm": 0.030877942219376564, "grad_norm_var": 1.0993001290350434e-06, "learning_rate": 0.00019320092935231958, "loss": 2.6057, "step": 7304 }, { "crossentropy": 2.5385775566101074, "epoch": 0.6213848247703301, "grad_norm": 0.0325230173766613, "grad_norm_var": 1.099127850634586e-06, "learning_rate": 0.00019264974348361276, "loss": 2.5386, "step": 7305 }, { "crossentropy": 2.6781482696533203, "epoch": 0.6214698877169105, "grad_norm": 0.03258918970823288, "grad_norm_var": 1.054991288726885e-06, "learning_rate": 0.00019209932953683618, "loss": 2.6781, "step": 7306 }, { "crossentropy": 2.672586441040039, "epoch": 0.621554950663491, "grad_norm": 0.03353508934378624, "grad_norm_var": 1.0633721426127659e-06, "learning_rate": 0.0001915496876003714, "loss": 2.6726, "step": 7307 }, { "crossentropy": 2.6418521404266357, "epoch": 0.6216400136100715, "grad_norm": 0.03238048776984215, "grad_norm_var": 1.0637805370113887e-06, "learning_rate": 0.0001910008177624739, "loss": 2.6419, "step": 7308 }, { "crossentropy": 2.6673777103424072, "epoch": 0.6217250765566519, "grad_norm": 0.03228858485817909, "grad_norm_var": 7.488854264712004e-07, "learning_rate": 0.00019045272011127657, "loss": 2.6674, "step": 7309 }, { "crossentropy": 2.718140125274658, "epoch": 0.6218101395032324, "grad_norm": 0.03184940665960312, "grad_norm_var": 5.574472301786745e-07, "learning_rate": 0.00018990539473478897, "loss": 2.7181, "step": 7310 }, { "crossentropy": 2.6577441692352295, "epoch": 0.6218952024498129, "grad_norm": 0.0336705707013607, "grad_norm_var": 6.594479373731351e-07, "learning_rate": 0.00018935884172089523, "loss": 2.6577, "step": 7311 }, { "crossentropy": 2.5802557468414307, "epoch": 0.6219802653963933, "grad_norm": 0.031244806945323944, "grad_norm_var": 7.239666366081701e-07, "learning_rate": 0.0001888130611573563, "loss": 2.5803, "step": 7312 }, { "crossentropy": 2.6028854846954346, "epoch": 0.6220653283429738, "grad_norm": 0.03198551759123802, "grad_norm_var": 6.451651210833866e-07, "learning_rate": 0.00018826805313180818, "loss": 2.6029, "step": 7313 }, { "crossentropy": 2.646228790283203, "epoch": 0.6221503912895543, "grad_norm": 0.03170977905392647, "grad_norm_var": 6.441398874125105e-07, "learning_rate": 0.00018772381773176416, "loss": 2.6462, "step": 7314 }, { "crossentropy": 2.6206414699554443, "epoch": 0.6222354542361347, "grad_norm": 0.03292994201183319, "grad_norm_var": 5.750565251734131e-07, "learning_rate": 0.0001871803550446116, "loss": 2.6206, "step": 7315 }, { "crossentropy": 2.5774710178375244, "epoch": 0.6223205171827152, "grad_norm": 0.031238125637173653, "grad_norm_var": 6.287018433465477e-07, "learning_rate": 0.00018663766515761626, "loss": 2.5775, "step": 7316 }, { "crossentropy": 2.6010966300964355, "epoch": 0.6224055801292957, "grad_norm": 0.03156282752752304, "grad_norm_var": 6.460427734070185e-07, "learning_rate": 0.0001860957481579184, "loss": 2.6011, "step": 7317 }, { "crossentropy": 2.6092796325683594, "epoch": 0.6224906430758761, "grad_norm": 0.03209618851542473, "grad_norm_var": 6.230987986822571e-07, "learning_rate": 0.00018555460413253343, "loss": 2.6093, "step": 7318 }, { "crossentropy": 2.659113883972168, "epoch": 0.6225757060224566, "grad_norm": 0.0323568657040596, "grad_norm_var": 6.13463632809635e-07, "learning_rate": 0.00018501423316835352, "loss": 2.6591, "step": 7319 }, { "crossentropy": 2.525420665740967, "epoch": 0.6226607689690371, "grad_norm": 0.03121657855808735, "grad_norm_var": 5.619584748127344e-07, "learning_rate": 0.0001844746353521487, "loss": 2.5254, "step": 7320 }, { "crossentropy": 2.6300485134124756, "epoch": 0.6227458319156176, "grad_norm": 0.03231045603752136, "grad_norm_var": 5.555867892142093e-07, "learning_rate": 0.00018393581077055966, "loss": 2.63, "step": 7321 }, { "crossentropy": 2.636753797531128, "epoch": 0.622830894862198, "grad_norm": 0.03249124065041542, "grad_norm_var": 5.509113535551516e-07, "learning_rate": 0.00018339775951010828, "loss": 2.6368, "step": 7322 }, { "crossentropy": 2.626359701156616, "epoch": 0.6229159578087785, "grad_norm": 0.031963150948286057, "grad_norm_var": 4.21155384716596e-07, "learning_rate": 0.0001828604816571894, "loss": 2.6264, "step": 7323 }, { "crossentropy": 2.672250747680664, "epoch": 0.623001020755359, "grad_norm": 0.033561863005161285, "grad_norm_var": 5.55572154760144e-07, "learning_rate": 0.0001823239772980745, "loss": 2.6723, "step": 7324 }, { "crossentropy": 2.636742115020752, "epoch": 0.6230860837019394, "grad_norm": 0.031764790415763855, "grad_norm_var": 5.633723487898899e-07, "learning_rate": 0.00018178824651891024, "loss": 2.6367, "step": 7325 }, { "crossentropy": 2.6284780502319336, "epoch": 0.6231711466485199, "grad_norm": 0.03129780665040016, "grad_norm_var": 6.024375967625395e-07, "learning_rate": 0.00018125328940572061, "loss": 2.6285, "step": 7326 }, { "crossentropy": 2.623096466064453, "epoch": 0.6232562095951004, "grad_norm": 0.03241826966404915, "grad_norm_var": 4.36128225699689e-07, "learning_rate": 0.00018071910604440357, "loss": 2.6231, "step": 7327 }, { "crossentropy": 2.613609552383423, "epoch": 0.6233412725416808, "grad_norm": 0.03400306776165962, "grad_norm_var": 6.304858178008201e-07, "learning_rate": 0.0001801856965207338, "loss": 2.6136, "step": 7328 }, { "crossentropy": 2.7350046634674072, "epoch": 0.6234263354882613, "grad_norm": 0.032456886023283005, "grad_norm_var": 6.320455561749257e-07, "learning_rate": 0.00017965306092036115, "loss": 2.735, "step": 7329 }, { "crossentropy": 2.5134172439575195, "epoch": 0.6235113984348418, "grad_norm": 0.031102776527404785, "grad_norm_var": 6.95648759211261e-07, "learning_rate": 0.00017912119932881164, "loss": 2.5134, "step": 7330 }, { "crossentropy": 2.5641720294952393, "epoch": 0.6235964613814222, "grad_norm": 0.037028878927230835, "grad_norm_var": 2.1593198138237607e-06, "learning_rate": 0.00017859011183148698, "loss": 2.5642, "step": 7331 }, { "crossentropy": 2.5684430599212646, "epoch": 0.6236815243280027, "grad_norm": 0.033996179699897766, "grad_norm_var": 2.196683243731627e-06, "learning_rate": 0.00017805979851366505, "loss": 2.5684, "step": 7332 }, { "crossentropy": 2.558454751968384, "epoch": 0.6237665872745832, "grad_norm": 0.032656315714120865, "grad_norm_var": 2.1199438395047653e-06, "learning_rate": 0.00017753025946049827, "loss": 2.5585, "step": 7333 }, { "crossentropy": 2.5990076065063477, "epoch": 0.6238516502211636, "grad_norm": 0.031110523268580437, "grad_norm_var": 2.25608710962179e-06, "learning_rate": 0.0001770014947570153, "loss": 2.599, "step": 7334 }, { "crossentropy": 2.6192164421081543, "epoch": 0.6239367131677441, "grad_norm": 0.03366001322865486, "grad_norm_var": 2.318505792572173e-06, "learning_rate": 0.00017647350448812105, "loss": 2.6192, "step": 7335 }, { "crossentropy": 2.5335376262664795, "epoch": 0.6240217761143246, "grad_norm": 0.031319890171289444, "grad_norm_var": 2.29887770264527e-06, "learning_rate": 0.00017594628873859487, "loss": 2.5335, "step": 7336 }, { "crossentropy": 2.666465997695923, "epoch": 0.624106839060905, "grad_norm": 0.03425457701086998, "grad_norm_var": 2.4350649192734677e-06, "learning_rate": 0.00017541984759309292, "loss": 2.6665, "step": 7337 }, { "crossentropy": 2.635990619659424, "epoch": 0.6241919020074855, "grad_norm": 0.03350074961781502, "grad_norm_var": 2.454791885390346e-06, "learning_rate": 0.00017489418113614597, "loss": 2.636, "step": 7338 }, { "crossentropy": 2.5259459018707275, "epoch": 0.624276964954066, "grad_norm": 0.03226875141263008, "grad_norm_var": 2.423230182553336e-06, "learning_rate": 0.000174369289452162, "loss": 2.5259, "step": 7339 }, { "crossentropy": 2.6381428241729736, "epoch": 0.6243620279006464, "grad_norm": 0.034377846866846085, "grad_norm_var": 2.5368447015506793e-06, "learning_rate": 0.00017384517262542256, "loss": 2.6381, "step": 7340 }, { "crossentropy": 2.5977609157562256, "epoch": 0.6244470908472269, "grad_norm": 0.033729203045368195, "grad_norm_var": 2.467311343939945e-06, "learning_rate": 0.0001733218307400858, "loss": 2.5978, "step": 7341 }, { "crossentropy": 2.646188735961914, "epoch": 0.6245321537938074, "grad_norm": 0.03277164697647095, "grad_norm_var": 2.254058602361982e-06, "learning_rate": 0.00017279926388018564, "loss": 2.6462, "step": 7342 }, { "crossentropy": 2.648231267929077, "epoch": 0.6246172167403878, "grad_norm": 0.030976833775639534, "grad_norm_var": 2.5276194758685452e-06, "learning_rate": 0.00017227747212962996, "loss": 2.6482, "step": 7343 }, { "crossentropy": 2.5907413959503174, "epoch": 0.6247022796869683, "grad_norm": 0.032053545117378235, "grad_norm_var": 2.524150554952115e-06, "learning_rate": 0.00017175645557220565, "loss": 2.5907, "step": 7344 }, { "crossentropy": 2.6548774242401123, "epoch": 0.6247873426335488, "grad_norm": 0.03300556540489197, "grad_norm_var": 2.506595806648672e-06, "learning_rate": 0.00017123621429157189, "loss": 2.6549, "step": 7345 }, { "crossentropy": 2.5587236881256104, "epoch": 0.6248724055801292, "grad_norm": 0.03451033681631088, "grad_norm_var": 2.3756270742964284e-06, "learning_rate": 0.0001707167483712646, "loss": 2.5587, "step": 7346 }, { "crossentropy": 2.6197142601013184, "epoch": 0.6249574685267097, "grad_norm": 0.03303394094109535, "grad_norm_var": 1.3343075080794741e-06, "learning_rate": 0.00017019805789469377, "loss": 2.6197, "step": 7347 }, { "crossentropy": 2.670771598815918, "epoch": 0.6250425314732903, "grad_norm": 0.032552070915699005, "grad_norm_var": 1.2635204213837236e-06, "learning_rate": 0.0001696801429451489, "loss": 2.6708, "step": 7348 }, { "crossentropy": 2.5822057723999023, "epoch": 0.6251275944198708, "grad_norm": 0.03372444584965706, "grad_norm_var": 1.305624483841546e-06, "learning_rate": 0.00016916300360579007, "loss": 2.5822, "step": 7349 }, { "crossentropy": 2.5777227878570557, "epoch": 0.6252126573664512, "grad_norm": 0.03269008547067642, "grad_norm_var": 1.0787618158759517e-06, "learning_rate": 0.0001686466399596548, "loss": 2.5777, "step": 7350 }, { "crossentropy": 2.5554614067077637, "epoch": 0.6252977203130317, "grad_norm": 0.03450215980410576, "grad_norm_var": 1.194183676972886e-06, "learning_rate": 0.0001681310520896573, "loss": 2.5555, "step": 7351 }, { "crossentropy": 2.5963964462280273, "epoch": 0.6253827832596122, "grad_norm": 0.03309309855103493, "grad_norm_var": 9.746849556281951e-07, "learning_rate": 0.00016761624007858523, "loss": 2.5964, "step": 7352 }, { "crossentropy": 2.5700199604034424, "epoch": 0.6254678462061926, "grad_norm": 0.032828982919454575, "grad_norm_var": 8.994086109497298e-07, "learning_rate": 0.00016710220400910304, "loss": 2.57, "step": 7353 }, { "crossentropy": 2.6294615268707275, "epoch": 0.6255529091527731, "grad_norm": 0.03241831809282303, "grad_norm_var": 9.149731287124423e-07, "learning_rate": 0.0001665889439637508, "loss": 2.6295, "step": 7354 }, { "crossentropy": 2.6384193897247314, "epoch": 0.6256379720993536, "grad_norm": 0.032055072486400604, "grad_norm_var": 9.396163654662073e-07, "learning_rate": 0.00016607646002494202, "loss": 2.6384, "step": 7355 }, { "crossentropy": 2.573988437652588, "epoch": 0.625723035045934, "grad_norm": 0.03223181515932083, "grad_norm_var": 8.389825179658105e-07, "learning_rate": 0.00016556475227496814, "loss": 2.574, "step": 7356 }, { "crossentropy": 2.680424451828003, "epoch": 0.6258080979925145, "grad_norm": 0.03353568911552429, "grad_norm_var": 8.195685991378334e-07, "learning_rate": 0.0001650538207959934, "loss": 2.6804, "step": 7357 }, { "crossentropy": 2.6065571308135986, "epoch": 0.625893160939095, "grad_norm": 0.033907100558280945, "grad_norm_var": 8.846551269845454e-07, "learning_rate": 0.0001645436656700594, "loss": 2.6066, "step": 7358 }, { "crossentropy": 2.707178831100464, "epoch": 0.6259782238856754, "grad_norm": 0.03302551805973053, "grad_norm_var": 6.093702026587743e-07, "learning_rate": 0.00016403428697908174, "loss": 2.7072, "step": 7359 }, { "crossentropy": 2.6438653469085693, "epoch": 0.6260632868322559, "grad_norm": 0.03312817960977554, "grad_norm_var": 5.35477755631876e-07, "learning_rate": 0.00016352568480485275, "loss": 2.6439, "step": 7360 }, { "crossentropy": 2.590738296508789, "epoch": 0.6261483497788364, "grad_norm": 0.03108362667262554, "grad_norm_var": 8.008314081901867e-07, "learning_rate": 0.00016301785922903935, "loss": 2.5907, "step": 7361 }, { "crossentropy": 2.6575767993927, "epoch": 0.6262334127254168, "grad_norm": 0.0332338772714138, "grad_norm_var": 6.490231227895682e-07, "learning_rate": 0.00016251081033318295, "loss": 2.6576, "step": 7362 }, { "crossentropy": 2.6570465564727783, "epoch": 0.6263184756719973, "grad_norm": 0.03210209682583809, "grad_norm_var": 6.91653106287673e-07, "learning_rate": 0.0001620045381987012, "loss": 2.657, "step": 7363 }, { "crossentropy": 2.649484634399414, "epoch": 0.6264035386185778, "grad_norm": 0.033044926822185516, "grad_norm_var": 6.851532264345728e-07, "learning_rate": 0.00016149904290688744, "loss": 2.6495, "step": 7364 }, { "crossentropy": 2.631946563720703, "epoch": 0.6264886015651582, "grad_norm": 0.03228253126144409, "grad_norm_var": 6.590572049145939e-07, "learning_rate": 0.00016099432453890839, "loss": 2.6319, "step": 7365 }, { "crossentropy": 2.6805684566497803, "epoch": 0.6265736645117387, "grad_norm": 0.03133264183998108, "grad_norm_var": 7.982238912925307e-07, "learning_rate": 0.00016049038317580812, "loss": 2.6806, "step": 7366 }, { "crossentropy": 2.7079153060913086, "epoch": 0.6266587274583192, "grad_norm": 0.030998235568404198, "grad_norm_var": 7.41300012739067e-07, "learning_rate": 0.00015998721889850577, "loss": 2.7079, "step": 7367 }, { "crossentropy": 2.496279001235962, "epoch": 0.6267437904048996, "grad_norm": 0.032215554267168045, "grad_norm_var": 7.222406001126831e-07, "learning_rate": 0.0001594848317877934, "loss": 2.4963, "step": 7368 }, { "crossentropy": 2.6215381622314453, "epoch": 0.6268288533514801, "grad_norm": 0.03295372426509857, "grad_norm_var": 7.292834136108743e-07, "learning_rate": 0.0001589832219243409, "loss": 2.6215, "step": 7369 }, { "crossentropy": 2.692680835723877, "epoch": 0.6269139162980606, "grad_norm": 0.03476022928953171, "grad_norm_var": 1.0553655780079496e-06, "learning_rate": 0.0001584823893886933, "loss": 2.6927, "step": 7370 }, { "crossentropy": 2.5813958644866943, "epoch": 0.626998979244641, "grad_norm": 0.032584693282842636, "grad_norm_var": 1.033132523684382e-06, "learning_rate": 0.00015798233426126795, "loss": 2.5814, "step": 7371 }, { "crossentropy": 2.646960973739624, "epoch": 0.6270840421912215, "grad_norm": 0.03278395161032677, "grad_norm_var": 1.0213058808210274e-06, "learning_rate": 0.00015748305662236008, "loss": 2.647, "step": 7372 }, { "crossentropy": 2.6617178916931152, "epoch": 0.627169105137802, "grad_norm": 0.033776719123125076, "grad_norm_var": 1.0522504672991554e-06, "learning_rate": 0.0001569845565521394, "loss": 2.6617, "step": 7373 }, { "crossentropy": 2.5818238258361816, "epoch": 0.6272541680843824, "grad_norm": 0.03131021186709404, "grad_norm_var": 1.0560735760827e-06, "learning_rate": 0.0001564868341306508, "loss": 2.5818, "step": 7374 }, { "crossentropy": 2.674370765686035, "epoch": 0.6273392310309629, "grad_norm": 0.03276015818119049, "grad_norm_var": 1.0432448141632632e-06, "learning_rate": 0.00015598988943781367, "loss": 2.6744, "step": 7375 }, { "crossentropy": 2.6969921588897705, "epoch": 0.6274242939775434, "grad_norm": 0.03388611227273941, "grad_norm_var": 1.140411850846651e-06, "learning_rate": 0.00015549372255342365, "loss": 2.697, "step": 7376 }, { "crossentropy": 2.6530795097351074, "epoch": 0.6275093569241239, "grad_norm": 0.033840324729681015, "grad_norm_var": 1.0692883924364916e-06, "learning_rate": 0.00015499833355715086, "loss": 2.6531, "step": 7377 }, { "crossentropy": 2.5710666179656982, "epoch": 0.6275944198707043, "grad_norm": 0.03166656196117401, "grad_norm_var": 1.1199494691909907e-06, "learning_rate": 0.0001545037225285395, "loss": 2.5711, "step": 7378 }, { "crossentropy": 2.5594372749328613, "epoch": 0.6276794828172848, "grad_norm": 0.03317156434059143, "grad_norm_var": 1.1142089455058822e-06, "learning_rate": 0.00015400988954701044, "loss": 2.5594, "step": 7379 }, { "crossentropy": 2.651441812515259, "epoch": 0.6277645457638653, "grad_norm": 0.03341374546289444, "grad_norm_var": 1.1391559252746461e-06, "learning_rate": 0.00015351683469185974, "loss": 2.6514, "step": 7380 }, { "crossentropy": 2.6388790607452393, "epoch": 0.6278496087104457, "grad_norm": 0.030124733224511147, "grad_norm_var": 1.5599255430057136e-06, "learning_rate": 0.00015302455804225623, "loss": 2.6389, "step": 7381 }, { "crossentropy": 2.5691211223602295, "epoch": 0.6279346716570262, "grad_norm": 0.032738372683525085, "grad_norm_var": 1.4461326794265595e-06, "learning_rate": 0.00015253305967724673, "loss": 2.5691, "step": 7382 }, { "crossentropy": 2.7174222469329834, "epoch": 0.6280197346036067, "grad_norm": 0.03129403665661812, "grad_norm_var": 1.3850137289138682e-06, "learning_rate": 0.00015204233967575144, "loss": 2.7174, "step": 7383 }, { "crossentropy": 2.6365342140197754, "epoch": 0.6281047975501871, "grad_norm": 0.03357599303126335, "grad_norm_var": 1.411899012120178e-06, "learning_rate": 0.00015155239811656562, "loss": 2.6365, "step": 7384 }, { "crossentropy": 2.631850481033325, "epoch": 0.6281898604967676, "grad_norm": 0.031485434621572495, "grad_norm_var": 1.5146023985533926e-06, "learning_rate": 0.0001510632350783603, "loss": 2.6319, "step": 7385 }, { "crossentropy": 2.5472800731658936, "epoch": 0.6282749234433481, "grad_norm": 0.03146016597747803, "grad_norm_var": 1.2879883859711333e-06, "learning_rate": 0.0001505748506396798, "loss": 2.5473, "step": 7386 }, { "crossentropy": 2.588479518890381, "epoch": 0.6283599863899285, "grad_norm": 0.032085731625556946, "grad_norm_var": 1.2973850769361678e-06, "learning_rate": 0.00015008724487894587, "loss": 2.5885, "step": 7387 }, { "crossentropy": 2.620677947998047, "epoch": 0.628445049336509, "grad_norm": 0.03518286347389221, "grad_norm_var": 1.7604000027459139e-06, "learning_rate": 0.00014960041787445255, "loss": 2.6207, "step": 7388 }, { "crossentropy": 2.688293933868408, "epoch": 0.6285301122830895, "grad_norm": 0.035477783530950546, "grad_norm_var": 2.205692733080753e-06, "learning_rate": 0.0001491143697043712, "loss": 2.6883, "step": 7389 }, { "crossentropy": 2.6191134452819824, "epoch": 0.6286151752296699, "grad_norm": 0.03161551430821419, "grad_norm_var": 2.1542476550259152e-06, "learning_rate": 0.0001486291004467477, "loss": 2.6191, "step": 7390 }, { "crossentropy": 2.643170118331909, "epoch": 0.6287002381762504, "grad_norm": 0.03290515020489693, "grad_norm_var": 2.156024864543776e-06, "learning_rate": 0.0001481446101795003, "loss": 2.6432, "step": 7391 }, { "crossentropy": 2.6687510013580322, "epoch": 0.6287853011228309, "grad_norm": 0.038009997457265854, "grad_norm_var": 3.846228320065675e-06, "learning_rate": 0.00014766089898042678, "loss": 2.6688, "step": 7392 }, { "crossentropy": 2.588026762008667, "epoch": 0.6288703640694113, "grad_norm": 0.03335998207330704, "grad_norm_var": 3.807021741883511e-06, "learning_rate": 0.00014717796692719553, "loss": 2.588, "step": 7393 }, { "crossentropy": 2.6266565322875977, "epoch": 0.6289554270159918, "grad_norm": 0.03517448529601097, "grad_norm_var": 3.9650766578932e-06, "learning_rate": 0.00014669581409735178, "loss": 2.6267, "step": 7394 }, { "crossentropy": 2.6276259422302246, "epoch": 0.6290404899625723, "grad_norm": 0.03214959800243378, "grad_norm_var": 4.0331674821006734e-06, "learning_rate": 0.00014621444056831523, "loss": 2.6276, "step": 7395 }, { "crossentropy": 2.6935694217681885, "epoch": 0.6291255529091527, "grad_norm": 0.03361940756440163, "grad_norm_var": 4.043637063638041e-06, "learning_rate": 0.0001457338464173813, "loss": 2.6936, "step": 7396 }, { "crossentropy": 2.7011313438415527, "epoch": 0.6292106158557332, "grad_norm": 0.03287111967802048, "grad_norm_var": 3.4104663696590243e-06, "learning_rate": 0.0001452540317217188, "loss": 2.7011, "step": 7397 }, { "crossentropy": 2.650268316268921, "epoch": 0.6292956788023137, "grad_norm": 0.034054361283779144, "grad_norm_var": 3.4179043509676685e-06, "learning_rate": 0.0001447749965583728, "loss": 2.6503, "step": 7398 }, { "crossentropy": 2.6418542861938477, "epoch": 0.6293807417488941, "grad_norm": 0.03248897194862366, "grad_norm_var": 3.1723947025293536e-06, "learning_rate": 0.00014429674100426283, "loss": 2.6419, "step": 7399 }, { "crossentropy": 2.661935806274414, "epoch": 0.6294658046954746, "grad_norm": 0.03652575612068176, "grad_norm_var": 3.757985397407198e-06, "learning_rate": 0.00014381926513618137, "loss": 2.6619, "step": 7400 }, { "crossentropy": 2.6347098350524902, "epoch": 0.6295508676420551, "grad_norm": 0.033227864652872086, "grad_norm_var": 3.4438957732853154e-06, "learning_rate": 0.00014334256903079823, "loss": 2.6347, "step": 7401 }, { "crossentropy": 2.613131523132324, "epoch": 0.6296359305886355, "grad_norm": 0.033302705734968185, "grad_norm_var": 3.0903269901532784e-06, "learning_rate": 0.0001428666527646577, "loss": 2.6131, "step": 7402 }, { "crossentropy": 2.5508227348327637, "epoch": 0.629720993535216, "grad_norm": 0.03375118598341942, "grad_norm_var": 2.8656477699316614e-06, "learning_rate": 0.00014239151641417703, "loss": 2.5508, "step": 7403 }, { "crossentropy": 2.6612842082977295, "epoch": 0.6298060564817966, "grad_norm": 0.0347098782658577, "grad_norm_var": 2.8039165816108487e-06, "learning_rate": 0.00014191716005565013, "loss": 2.6613, "step": 7404 }, { "crossentropy": 2.692333459854126, "epoch": 0.6298911194283769, "grad_norm": 0.032635800540447235, "grad_norm_var": 2.7308325742545397e-06, "learning_rate": 0.000141443583765245, "loss": 2.6923, "step": 7405 }, { "crossentropy": 2.7014918327331543, "epoch": 0.6299761823749574, "grad_norm": 0.03303380310535431, "grad_norm_var": 2.4481630760348275e-06, "learning_rate": 0.00014097078761900474, "loss": 2.7015, "step": 7406 }, { "crossentropy": 2.6520307064056396, "epoch": 0.630061245321538, "grad_norm": 0.032773371785879135, "grad_norm_var": 2.466091532991356e-06, "learning_rate": 0.00014049877169284574, "loss": 2.652, "step": 7407 }, { "crossentropy": 2.60638165473938, "epoch": 0.6301463082681185, "grad_norm": 0.03134408965706825, "grad_norm_var": 1.5507867716636643e-06, "learning_rate": 0.0001400275360625608, "loss": 2.6064, "step": 7408 }, { "crossentropy": 2.552825927734375, "epoch": 0.6302313712146989, "grad_norm": 0.030579816550016403, "grad_norm_var": 2.0631228551131846e-06, "learning_rate": 0.00013955708080381658, "loss": 2.5528, "step": 7409 }, { "crossentropy": 2.6947219371795654, "epoch": 0.6303164341612794, "grad_norm": 0.033864494413137436, "grad_norm_var": 1.836880694707175e-06, "learning_rate": 0.00013908740599215432, "loss": 2.6947, "step": 7410 }, { "crossentropy": 2.647625207901001, "epoch": 0.6304014971078599, "grad_norm": 0.030871907249093056, "grad_norm_var": 2.1150056271843396e-06, "learning_rate": 0.0001386185117029909, "loss": 2.6476, "step": 7411 }, { "crossentropy": 2.617915153503418, "epoch": 0.6304865600544403, "grad_norm": 0.03112991712987423, "grad_norm_var": 2.3310766293980003e-06, "learning_rate": 0.0001381503980116172, "loss": 2.6179, "step": 7412 }, { "crossentropy": 2.6846368312835693, "epoch": 0.6305716230010208, "grad_norm": 0.03933868929743767, "grad_norm_var": 4.879279812762307e-06, "learning_rate": 0.00013768306499319872, "loss": 2.6846, "step": 7413 }, { "crossentropy": 2.615145444869995, "epoch": 0.6306566859476013, "grad_norm": 0.032270003110170364, "grad_norm_var": 4.911182939603722e-06, "learning_rate": 0.0001372165127227748, "loss": 2.6151, "step": 7414 }, { "crossentropy": 2.5360989570617676, "epoch": 0.6307417488941817, "grad_norm": 0.03209611028432846, "grad_norm_var": 4.960196252310327e-06, "learning_rate": 0.00013675074127526165, "loss": 2.5361, "step": 7415 }, { "crossentropy": 2.6625423431396484, "epoch": 0.6308268118407622, "grad_norm": 0.033559948205947876, "grad_norm_var": 4.201118855582236e-06, "learning_rate": 0.0001362857507254478, "loss": 2.6625, "step": 7416 }, { "crossentropy": 2.6350603103637695, "epoch": 0.6309118747873427, "grad_norm": 0.03200088068842888, "grad_norm_var": 4.262939739704298e-06, "learning_rate": 0.0001358215411479974, "loss": 2.6351, "step": 7417 }, { "crossentropy": 2.622877597808838, "epoch": 0.6309969377339231, "grad_norm": 0.03285667672753334, "grad_norm_var": 4.254630694308127e-06, "learning_rate": 0.00013535811261745022, "loss": 2.6229, "step": 7418 }, { "crossentropy": 2.6580944061279297, "epoch": 0.6310820006805036, "grad_norm": 0.03272901102900505, "grad_norm_var": 4.207473584817191e-06, "learning_rate": 0.00013489546520821738, "loss": 2.6581, "step": 7419 }, { "crossentropy": 2.671076536178589, "epoch": 0.6311670636270841, "grad_norm": 0.03056729957461357, "grad_norm_var": 4.259452106615547e-06, "learning_rate": 0.00013443359899458995, "loss": 2.6711, "step": 7420 }, { "crossentropy": 2.5984466075897217, "epoch": 0.6312521265736645, "grad_norm": 0.03197886049747467, "grad_norm_var": 4.283573089266385e-06, "learning_rate": 0.00013397251405072698, "loss": 2.5984, "step": 7421 }, { "crossentropy": 2.6007919311523438, "epoch": 0.631337189520245, "grad_norm": 0.03097938932478428, "grad_norm_var": 4.418173708653018e-06, "learning_rate": 0.00013351221045066698, "loss": 2.6008, "step": 7422 }, { "crossentropy": 2.5646731853485107, "epoch": 0.6314222524668255, "grad_norm": 0.0316503643989563, "grad_norm_var": 4.446146628086937e-06, "learning_rate": 0.0001330526882683214, "loss": 2.5647, "step": 7423 }, { "crossentropy": 2.555936336517334, "epoch": 0.6315073154134059, "grad_norm": 0.030966760590672493, "grad_norm_var": 4.5063368766533215e-06, "learning_rate": 0.0001325939475774768, "loss": 2.5559, "step": 7424 }, { "crossentropy": 2.585549831390381, "epoch": 0.6315923783599864, "grad_norm": 0.03216830641031265, "grad_norm_var": 4.291236945339061e-06, "learning_rate": 0.00013213598845179252, "loss": 2.5855, "step": 7425 }, { "crossentropy": 2.6458847522735596, "epoch": 0.6316774413065669, "grad_norm": 0.03293377906084061, "grad_norm_var": 4.168514939959823e-06, "learning_rate": 0.00013167881096480373, "loss": 2.6459, "step": 7426 }, { "crossentropy": 2.6279568672180176, "epoch": 0.6317625042531473, "grad_norm": 0.03258517384529114, "grad_norm_var": 4.0072124872276516e-06, "learning_rate": 0.00013122241518992106, "loss": 2.628, "step": 7427 }, { "crossentropy": 2.704010248184204, "epoch": 0.6318475671997278, "grad_norm": 0.03260629251599312, "grad_norm_var": 3.876065068794214e-06, "learning_rate": 0.00013076680120042817, "loss": 2.704, "step": 7428 }, { "crossentropy": 2.642526388168335, "epoch": 0.6319326301463083, "grad_norm": 0.03203899413347244, "grad_norm_var": 6.286880369946526e-07, "learning_rate": 0.00013031196906948262, "loss": 2.6425, "step": 7429 }, { "crossentropy": 2.587157726287842, "epoch": 0.6320176930928887, "grad_norm": 0.031916502863168716, "grad_norm_var": 6.296279131044631e-07, "learning_rate": 0.00012985791887011876, "loss": 2.5872, "step": 7430 }, { "crossentropy": 2.6589698791503906, "epoch": 0.6321027560394692, "grad_norm": 0.033761054277420044, "grad_norm_var": 8.015402392197793e-07, "learning_rate": 0.00012940465067524275, "loss": 2.659, "step": 7431 }, { "crossentropy": 2.5930728912353516, "epoch": 0.6321878189860497, "grad_norm": 0.035409215837717056, "grad_norm_var": 1.349068076954979e-06, "learning_rate": 0.0001289521645576358, "loss": 2.5931, "step": 7432 }, { "crossentropy": 2.487556219100952, "epoch": 0.6322728819326301, "grad_norm": 0.03312504664063454, "grad_norm_var": 1.3799524181951997e-06, "learning_rate": 0.00012850046058995645, "loss": 2.4876, "step": 7433 }, { "crossentropy": 2.605787754058838, "epoch": 0.6323579448792106, "grad_norm": 0.03385581076145172, "grad_norm_var": 1.5042414045891734e-06, "learning_rate": 0.00012804953884473392, "loss": 2.6058, "step": 7434 }, { "crossentropy": 2.679518699645996, "epoch": 0.6324430078257911, "grad_norm": 0.03138231858611107, "grad_norm_var": 1.5682977129370552e-06, "learning_rate": 0.00012759939939437304, "loss": 2.6795, "step": 7435 }, { "crossentropy": 2.6538169384002686, "epoch": 0.6325280707723716, "grad_norm": 0.03330286219716072, "grad_norm_var": 1.3783663130627229e-06, "learning_rate": 0.0001271500423111538, "loss": 2.6538, "step": 7436 }, { "crossentropy": 2.6329562664031982, "epoch": 0.632613133718952, "grad_norm": 0.032495804131031036, "grad_norm_var": 1.3563019235879672e-06, "learning_rate": 0.00012670146766723012, "loss": 2.633, "step": 7437 }, { "crossentropy": 2.5464611053466797, "epoch": 0.6326981966655325, "grad_norm": 0.031603697687387466, "grad_norm_var": 1.2479577144578936e-06, "learning_rate": 0.00012625367553462886, "loss": 2.5465, "step": 7438 }, { "crossentropy": 2.6170175075531006, "epoch": 0.632783259612113, "grad_norm": 0.03223579004406929, "grad_norm_var": 1.1942670541901691e-06, "learning_rate": 0.00012580666598525415, "loss": 2.617, "step": 7439 }, { "crossentropy": 2.577559471130371, "epoch": 0.6328683225586934, "grad_norm": 0.031222840771079063, "grad_norm_var": 1.1409199214532877e-06, "learning_rate": 0.0001253604390908819, "loss": 2.5776, "step": 7440 }, { "crossentropy": 2.5556931495666504, "epoch": 0.6329533855052739, "grad_norm": 0.03052964061498642, "grad_norm_var": 1.4173161482044453e-06, "learning_rate": 0.00012491499492316372, "loss": 2.5557, "step": 7441 }, { "crossentropy": 2.5607776641845703, "epoch": 0.6330384484518544, "grad_norm": 0.03202679380774498, "grad_norm_var": 1.4238672149562465e-06, "learning_rate": 0.0001244703335536257, "loss": 2.5608, "step": 7442 }, { "crossentropy": 2.6231884956359863, "epoch": 0.6331235113984348, "grad_norm": 0.03313537314534187, "grad_norm_var": 1.4485869247225405e-06, "learning_rate": 0.00012402645505366628, "loss": 2.6232, "step": 7443 }, { "crossentropy": 2.543893814086914, "epoch": 0.6332085743450153, "grad_norm": 0.03561246767640114, "grad_norm_var": 2.039775217813862e-06, "learning_rate": 0.00012358335949455955, "loss": 2.5439, "step": 7444 }, { "crossentropy": 2.6766605377197266, "epoch": 0.6332936372915958, "grad_norm": 0.03549564629793167, "grad_norm_var": 2.468820190249761e-06, "learning_rate": 0.00012314104694745531, "loss": 2.6767, "step": 7445 }, { "crossentropy": 2.632301092147827, "epoch": 0.6333787002381762, "grad_norm": 0.03318508341908455, "grad_norm_var": 2.3955336005855214e-06, "learning_rate": 0.0001226995174833756, "loss": 2.6323, "step": 7446 }, { "crossentropy": 2.5902209281921387, "epoch": 0.6334637631847567, "grad_norm": 0.03206133097410202, "grad_norm_var": 2.4089968571947325e-06, "learning_rate": 0.00012225877117321703, "loss": 2.5902, "step": 7447 }, { "crossentropy": 2.5910565853118896, "epoch": 0.6335488261313372, "grad_norm": 0.03092464990913868, "grad_norm_var": 2.176042853131663e-06, "learning_rate": 0.00012181880808775026, "loss": 2.5911, "step": 7448 }, { "crossentropy": 2.642246723175049, "epoch": 0.6336338890779176, "grad_norm": 0.033797625452280045, "grad_norm_var": 2.2480644456511373e-06, "learning_rate": 0.00012137962829762206, "loss": 2.6422, "step": 7449 }, { "crossentropy": 2.632978677749634, "epoch": 0.6337189520244981, "grad_norm": 0.031509384512901306, "grad_norm_var": 2.2240714581719367e-06, "learning_rate": 0.00012094123187335104, "loss": 2.633, "step": 7450 }, { "crossentropy": 2.5216124057769775, "epoch": 0.6338040149710786, "grad_norm": 0.031745459884405136, "grad_norm_var": 2.1766190217888925e-06, "learning_rate": 0.0001205036188853309, "loss": 2.5216, "step": 7451 }, { "crossentropy": 2.6069138050079346, "epoch": 0.633889077917659, "grad_norm": 0.031427543610334396, "grad_norm_var": 2.209492493526697e-06, "learning_rate": 0.00012006678940383098, "loss": 2.6069, "step": 7452 }, { "crossentropy": 2.657548427581787, "epoch": 0.6339741408642395, "grad_norm": 0.03256043419241905, "grad_norm_var": 2.210251067123516e-06, "learning_rate": 0.00011963074349899183, "loss": 2.6575, "step": 7453 }, { "crossentropy": 2.5884275436401367, "epoch": 0.63405920381082, "grad_norm": 0.031304776668548584, "grad_norm_var": 2.2492515580992943e-06, "learning_rate": 0.0001191954812408308, "loss": 2.5884, "step": 7454 }, { "crossentropy": 2.575681447982788, "epoch": 0.6341442667574004, "grad_norm": 0.03019058331847191, "grad_norm_var": 2.561848635795836e-06, "learning_rate": 0.00011876100269923806, "loss": 2.5757, "step": 7455 }, { "crossentropy": 2.5546274185180664, "epoch": 0.6342293297039809, "grad_norm": 0.03048464097082615, "grad_norm_var": 2.7014956170583656e-06, "learning_rate": 0.0001183273079439795, "loss": 2.5546, "step": 7456 }, { "crossentropy": 2.688558578491211, "epoch": 0.6343143926505614, "grad_norm": 0.03270602971315384, "grad_norm_var": 2.4984699561502363e-06, "learning_rate": 0.00011789439704469217, "loss": 2.6886, "step": 7457 }, { "crossentropy": 2.644827365875244, "epoch": 0.6343994555971418, "grad_norm": 0.03285154327750206, "grad_norm_var": 2.501538700431146e-06, "learning_rate": 0.0001174622700708905, "loss": 2.6448, "step": 7458 }, { "crossentropy": 2.5849356651306152, "epoch": 0.6344845185437223, "grad_norm": 0.032025739550590515, "grad_norm_var": 2.4751743116395324e-06, "learning_rate": 0.00011703092709196117, "loss": 2.5849, "step": 7459 }, { "crossentropy": 2.571363925933838, "epoch": 0.6345695814903028, "grad_norm": 0.030423952266573906, "grad_norm_var": 1.912969359628526e-06, "learning_rate": 0.0001166003681771649, "loss": 2.5714, "step": 7460 }, { "crossentropy": 2.627310037612915, "epoch": 0.6346546444368832, "grad_norm": 0.030490458011627197, "grad_norm_var": 1.1748290251835236e-06, "learning_rate": 0.00011617059339563807, "loss": 2.6273, "step": 7461 }, { "crossentropy": 2.667914867401123, "epoch": 0.6347397073834637, "grad_norm": 0.03131017088890076, "grad_norm_var": 1.0309255043494294e-06, "learning_rate": 0.00011574160281638935, "loss": 2.6679, "step": 7462 }, { "crossentropy": 2.5942301750183105, "epoch": 0.6348247703300443, "grad_norm": 0.03221622109413147, "grad_norm_var": 1.0416757154527948e-06, "learning_rate": 0.00011531339650830252, "loss": 2.5942, "step": 7463 }, { "crossentropy": 2.698089122772217, "epoch": 0.6349098332766248, "grad_norm": 0.03295588120818138, "grad_norm_var": 1.1103892228978355e-06, "learning_rate": 0.00011488597454013538, "loss": 2.6981, "step": 7464 }, { "crossentropy": 2.5722908973693848, "epoch": 0.6349948962232052, "grad_norm": 0.03446311503648758, "grad_norm_var": 1.3197563194247698e-06, "learning_rate": 0.00011445933698051914, "loss": 2.5723, "step": 7465 }, { "crossentropy": 2.6880602836608887, "epoch": 0.6350799591697857, "grad_norm": 0.031226035207509995, "grad_norm_var": 1.3354371103339752e-06, "learning_rate": 0.00011403348389795953, "loss": 2.6881, "step": 7466 }, { "crossentropy": 2.5601277351379395, "epoch": 0.6351650221163662, "grad_norm": 0.031553965061903, "grad_norm_var": 1.3384554484037354e-06, "learning_rate": 0.00011360841536083633, "loss": 2.5601, "step": 7467 }, { "crossentropy": 2.6315832138061523, "epoch": 0.6352500850629466, "grad_norm": 0.03291446715593338, "grad_norm_var": 1.4103424310854078e-06, "learning_rate": 0.00011318413143740436, "loss": 2.6316, "step": 7468 }, { "crossentropy": 2.590704917907715, "epoch": 0.6353351480095271, "grad_norm": 0.03195962682366371, "grad_norm_var": 1.3763824044346617e-06, "learning_rate": 0.0001127606321957897, "loss": 2.5907, "step": 7469 }, { "crossentropy": 2.557582378387451, "epoch": 0.6354202109561076, "grad_norm": 0.03264632448554039, "grad_norm_var": 1.3971856642424596e-06, "learning_rate": 0.00011233791770399516, "loss": 2.5576, "step": 7470 }, { "crossentropy": 2.6150426864624023, "epoch": 0.635505273902688, "grad_norm": 0.0346900075674057, "grad_norm_var": 1.6362648548507812e-06, "learning_rate": 0.00011191598802989644, "loss": 2.615, "step": 7471 }, { "crossentropy": 2.704094409942627, "epoch": 0.6355903368492685, "grad_norm": 0.03445935621857643, "grad_norm_var": 1.7239219520689588e-06, "learning_rate": 0.00011149484324124326, "loss": 2.7041, "step": 7472 }, { "crossentropy": 2.5267491340637207, "epoch": 0.635675399795849, "grad_norm": 0.03246043622493744, "grad_norm_var": 1.7186792869480628e-06, "learning_rate": 0.00011107448340565818, "loss": 2.5267, "step": 7473 }, { "crossentropy": 2.567599296569824, "epoch": 0.6357604627424294, "grad_norm": 0.03253040835261345, "grad_norm_var": 1.7064523946834353e-06, "learning_rate": 0.0001106549085906411, "loss": 2.5676, "step": 7474 }, { "crossentropy": 2.6459407806396484, "epoch": 0.6358455256890099, "grad_norm": 0.03173478692770004, "grad_norm_var": 1.7260831548852047e-06, "learning_rate": 0.000110236118863562, "loss": 2.6459, "step": 7475 }, { "crossentropy": 2.557288408279419, "epoch": 0.6359305886355904, "grad_norm": 0.032217640429735184, "grad_norm_var": 1.4600296490132244e-06, "learning_rate": 0.000109818114291666, "loss": 2.5573, "step": 7476 }, { "crossentropy": 2.692596912384033, "epoch": 0.6360156515821708, "grad_norm": 0.032122790813446045, "grad_norm_var": 1.1915234653409045e-06, "learning_rate": 0.00010940089494207439, "loss": 2.6926, "step": 7477 }, { "crossentropy": 2.5621860027313232, "epoch": 0.6361007145287513, "grad_norm": 0.031172877177596092, "grad_norm_var": 1.216154188384795e-06, "learning_rate": 0.00010898446088178026, "loss": 2.5622, "step": 7478 }, { "crossentropy": 2.558945894241333, "epoch": 0.6361857774753318, "grad_norm": 0.03298819065093994, "grad_norm_var": 1.2156740837708599e-06, "learning_rate": 0.00010856881217764902, "loss": 2.5589, "step": 7479 }, { "crossentropy": 2.6973788738250732, "epoch": 0.6362708404219122, "grad_norm": 0.032396797090768814, "grad_norm_var": 1.2109914789006863e-06, "learning_rate": 0.00010815394889642338, "loss": 2.6974, "step": 7480 }, { "crossentropy": 2.6317379474639893, "epoch": 0.6363559033684927, "grad_norm": 0.031671371310949326, "grad_norm_var": 1.0031243499556644e-06, "learning_rate": 0.00010773987110471895, "loss": 2.6317, "step": 7481 }, { "crossentropy": 2.5680320262908936, "epoch": 0.6364409663150732, "grad_norm": 0.0322793684899807, "grad_norm_var": 9.045629140169564e-07, "learning_rate": 0.00010732657886902308, "loss": 2.568, "step": 7482 }, { "crossentropy": 2.516555070877075, "epoch": 0.6365260292616536, "grad_norm": 0.031500671058893204, "grad_norm_var": 9.113733008407707e-07, "learning_rate": 0.00010691407225569993, "loss": 2.5166, "step": 7483 }, { "crossentropy": 2.643097162246704, "epoch": 0.6366110922082341, "grad_norm": 0.032204486429691315, "grad_norm_var": 9.021346868794221e-07, "learning_rate": 0.0001065023513309854, "loss": 2.6431, "step": 7484 }, { "crossentropy": 2.5815377235412598, "epoch": 0.6366961551548146, "grad_norm": 0.0312562994658947, "grad_norm_var": 9.780709844378476e-07, "learning_rate": 0.00010609141616099106, "loss": 2.5815, "step": 7485 }, { "crossentropy": 2.556173801422119, "epoch": 0.636781218101395, "grad_norm": 0.03125983476638794, "grad_norm_var": 1.0518934027179399e-06, "learning_rate": 0.00010568126681169965, "loss": 2.5562, "step": 7486 }, { "crossentropy": 2.6766676902770996, "epoch": 0.6368662810479755, "grad_norm": 0.03328721225261688, "grad_norm_var": 7.29556368695205e-07, "learning_rate": 0.00010527190334897074, "loss": 2.6767, "step": 7487 }, { "crossentropy": 2.538494110107422, "epoch": 0.636951343994556, "grad_norm": 0.030608758330345154, "grad_norm_var": 5.072585392722062e-07, "learning_rate": 0.00010486332583853564, "loss": 2.5385, "step": 7488 }, { "crossentropy": 2.644798994064331, "epoch": 0.6370364069411364, "grad_norm": 0.031571872532367706, "grad_norm_var": 4.997737072193285e-07, "learning_rate": 0.00010445553434600019, "loss": 2.6448, "step": 7489 }, { "crossentropy": 2.655142068862915, "epoch": 0.6371214698877169, "grad_norm": 0.0314708948135376, "grad_norm_var": 4.844388731624668e-07, "learning_rate": 0.00010404852893684424, "loss": 2.6551, "step": 7490 }, { "crossentropy": 2.559823513031006, "epoch": 0.6372065328342974, "grad_norm": 0.0314195454120636, "grad_norm_var": 4.958705105657096e-07, "learning_rate": 0.00010364230967642051, "loss": 2.5598, "step": 7491 }, { "crossentropy": 2.6897659301757812, "epoch": 0.6372915957808778, "grad_norm": 0.033858004957437515, "grad_norm_var": 7.467966596569429e-07, "learning_rate": 0.00010323687662995684, "loss": 2.6898, "step": 7492 }, { "crossentropy": 2.6701247692108154, "epoch": 0.6373766587274583, "grad_norm": 0.03253492712974548, "grad_norm_var": 7.673577964616701e-07, "learning_rate": 0.00010283222986255391, "loss": 2.6701, "step": 7493 }, { "crossentropy": 2.671225070953369, "epoch": 0.6374617216740388, "grad_norm": 0.03170507401227951, "grad_norm_var": 7.286688571655579e-07, "learning_rate": 0.00010242836943918587, "loss": 2.6712, "step": 7494 }, { "crossentropy": 2.5334420204162598, "epoch": 0.6375467846206193, "grad_norm": 0.03197505697607994, "grad_norm_var": 6.594444797423822e-07, "learning_rate": 0.00010202529542470084, "loss": 2.5334, "step": 7495 }, { "crossentropy": 2.5444161891937256, "epoch": 0.6376318475671997, "grad_norm": 0.034290652722120285, "grad_norm_var": 9.995887736785036e-07, "learning_rate": 0.00010162300788382262, "loss": 2.5444, "step": 7496 }, { "crossentropy": 2.6822991371154785, "epoch": 0.6377169105137802, "grad_norm": 0.034806977957487106, "grad_norm_var": 1.4533361445666243e-06, "learning_rate": 0.00010122150688114506, "loss": 2.6823, "step": 7497 }, { "crossentropy": 2.6137475967407227, "epoch": 0.6378019734603607, "grad_norm": 0.03244154900312424, "grad_norm_var": 1.4555750632840037e-06, "learning_rate": 0.00010082079248113829, "loss": 2.6137, "step": 7498 }, { "crossentropy": 2.642113208770752, "epoch": 0.6378870364069411, "grad_norm": 0.03404327481985092, "grad_norm_var": 1.6015299973414753e-06, "learning_rate": 0.00010042086474814583, "loss": 2.6421, "step": 7499 }, { "crossentropy": 2.633134126663208, "epoch": 0.6379720993535216, "grad_norm": 0.03281429037451744, "grad_norm_var": 1.6071752110167573e-06, "learning_rate": 0.00010002172374638518, "loss": 2.6331, "step": 7500 }, { "crossentropy": 2.589879274368286, "epoch": 0.6380571623001021, "grad_norm": 0.03138652816414833, "grad_norm_var": 1.5873514525567568e-06, "learning_rate": 9.962336953994455e-05, "loss": 2.5899, "step": 7501 }, { "crossentropy": 2.674102544784546, "epoch": 0.6381422252466825, "grad_norm": 0.030853629112243652, "grad_norm_var": 1.6630534289294637e-06, "learning_rate": 9.922580219279053e-05, "loss": 2.6741, "step": 7502 }, { "crossentropy": 2.6856117248535156, "epoch": 0.638227288193263, "grad_norm": 0.03161944821476936, "grad_norm_var": 1.6488924835646988e-06, "learning_rate": 9.882902176875985e-05, "loss": 2.6856, "step": 7503 }, { "crossentropy": 2.555206537246704, "epoch": 0.6383123511398435, "grad_norm": 0.031535450369119644, "grad_norm_var": 1.4889596395691503e-06, "learning_rate": 9.843302833156375e-05, "loss": 2.5552, "step": 7504 }, { "crossentropy": 2.637557029724121, "epoch": 0.6383974140864239, "grad_norm": 0.03151421248912811, "grad_norm_var": 1.4954990891674162e-06, "learning_rate": 9.803782194478861e-05, "loss": 2.6376, "step": 7505 }, { "crossentropy": 2.5976479053497314, "epoch": 0.6384824770330044, "grad_norm": 0.031610507518053055, "grad_norm_var": 1.4795738120770263e-06, "learning_rate": 9.764340267189253e-05, "loss": 2.5976, "step": 7506 }, { "crossentropy": 2.5428168773651123, "epoch": 0.6385675399795849, "grad_norm": 0.030903641134500504, "grad_norm_var": 1.5636906490849405e-06, "learning_rate": 9.72497705762071e-05, "loss": 2.5428, "step": 7507 }, { "crossentropy": 2.670952796936035, "epoch": 0.6386526029261653, "grad_norm": 0.03288767486810684, "grad_norm_var": 1.4298062928546605e-06, "learning_rate": 9.685692572093952e-05, "loss": 2.671, "step": 7508 }, { "crossentropy": 2.6137640476226807, "epoch": 0.6387376658727458, "grad_norm": 0.03249214217066765, "grad_norm_var": 1.4286243398137222e-06, "learning_rate": 9.646486816916877e-05, "loss": 2.6138, "step": 7509 }, { "crossentropy": 2.562087059020996, "epoch": 0.6388227288193263, "grad_norm": 0.031307652592659, "grad_norm_var": 1.470285982048634e-06, "learning_rate": 9.607359798384785e-05, "loss": 2.5621, "step": 7510 }, { "crossentropy": 2.603811264038086, "epoch": 0.6389077917659067, "grad_norm": 0.03365933895111084, "grad_norm_var": 1.5790672695363434e-06, "learning_rate": 9.568311522780371e-05, "loss": 2.6038, "step": 7511 }, { "crossentropy": 2.6188747882843018, "epoch": 0.6389928547124872, "grad_norm": 0.03142272308468819, "grad_norm_var": 1.3645939120875023e-06, "learning_rate": 9.529341996373674e-05, "loss": 2.6189, "step": 7512 }, { "crossentropy": 2.562912940979004, "epoch": 0.6390779176590677, "grad_norm": 0.032641299068927765, "grad_norm_var": 9.067330327304039e-07, "learning_rate": 9.490451225422025e-05, "loss": 2.5629, "step": 7513 }, { "crossentropy": 2.5587081909179688, "epoch": 0.6391629806056481, "grad_norm": 0.03515215590596199, "grad_norm_var": 1.4999261751445665e-06, "learning_rate": 9.451639216170261e-05, "loss": 2.5587, "step": 7514 }, { "crossentropy": 2.618269443511963, "epoch": 0.6392480435522286, "grad_norm": 0.03253567963838577, "grad_norm_var": 1.2795476148722355e-06, "learning_rate": 9.412905974850339e-05, "loss": 2.6183, "step": 7515 }, { "crossentropy": 2.609057664871216, "epoch": 0.6393331064988091, "grad_norm": 0.03319115936756134, "grad_norm_var": 1.3220043873117074e-06, "learning_rate": 9.374251507681841e-05, "loss": 2.6091, "step": 7516 }, { "crossentropy": 2.576920986175537, "epoch": 0.6394181694453895, "grad_norm": 0.032917894423007965, "grad_norm_var": 1.3086873062408942e-06, "learning_rate": 9.335675820871415e-05, "loss": 2.5769, "step": 7517 }, { "crossentropy": 2.6109657287597656, "epoch": 0.63950323239197, "grad_norm": 0.03775293752551079, "grad_norm_var": 2.9851197848596023e-06, "learning_rate": 9.297178920613325e-05, "loss": 2.611, "step": 7518 }, { "crossentropy": 2.6222171783447266, "epoch": 0.6395882953385505, "grad_norm": 0.031188547611236572, "grad_norm_var": 3.0586044980152396e-06, "learning_rate": 9.258760813089018e-05, "loss": 2.6222, "step": 7519 }, { "crossentropy": 2.551988363265991, "epoch": 0.6396733582851309, "grad_norm": 0.03729600831866264, "grad_norm_var": 4.261522951742349e-06, "learning_rate": 9.22042150446728e-05, "loss": 2.552, "step": 7520 }, { "crossentropy": 2.5599629878997803, "epoch": 0.6397584212317114, "grad_norm": 0.03253161907196045, "grad_norm_var": 4.120649228976127e-06, "learning_rate": 9.182161000904465e-05, "loss": 2.56, "step": 7521 }, { "crossentropy": 2.54394793510437, "epoch": 0.639843484178292, "grad_norm": 0.04584681615233421, "grad_norm_var": 1.3973296363421703e-05, "learning_rate": 9.143979308543992e-05, "loss": 2.5439, "step": 7522 }, { "crossentropy": 2.7404003143310547, "epoch": 0.6399285471248725, "grad_norm": 0.032460667192935944, "grad_norm_var": 1.3485540611898882e-05, "learning_rate": 9.105876433516624e-05, "loss": 2.7404, "step": 7523 }, { "crossentropy": 2.6112189292907715, "epoch": 0.6400136100714529, "grad_norm": 0.03247926011681557, "grad_norm_var": 1.3560908887051635e-05, "learning_rate": 9.0678523819408e-05, "loss": 2.6112, "step": 7524 }, { "crossentropy": 2.5803029537200928, "epoch": 0.6400986730180334, "grad_norm": 0.03294384106993675, "grad_norm_var": 1.3479550820920799e-05, "learning_rate": 9.029907159922024e-05, "loss": 2.5803, "step": 7525 }, { "crossentropy": 2.5851404666900635, "epoch": 0.6401837359646139, "grad_norm": 0.03118223510682583, "grad_norm_var": 1.3526943778786868e-05, "learning_rate": 8.992040773553089e-05, "loss": 2.5851, "step": 7526 }, { "crossentropy": 2.609987258911133, "epoch": 0.6402687989111943, "grad_norm": 0.032568275928497314, "grad_norm_var": 1.3661833100966436e-05, "learning_rate": 8.954253228914355e-05, "loss": 2.61, "step": 7527 }, { "crossentropy": 2.546394109725952, "epoch": 0.6403538618577748, "grad_norm": 0.03128223866224289, "grad_norm_var": 1.3711472315636981e-05, "learning_rate": 8.916544532073412e-05, "loss": 2.5464, "step": 7528 }, { "crossentropy": 2.676788330078125, "epoch": 0.6404389248043553, "grad_norm": 0.03095966950058937, "grad_norm_var": 1.4192447405803504e-05, "learning_rate": 8.878914689085193e-05, "loss": 2.6768, "step": 7529 }, { "crossentropy": 2.5627243518829346, "epoch": 0.6405239877509357, "grad_norm": 0.031478796154260635, "grad_norm_var": 1.4419115413105915e-05, "learning_rate": 8.841363705991923e-05, "loss": 2.5627, "step": 7530 }, { "crossentropy": 2.612201452255249, "epoch": 0.6406090506975162, "grad_norm": 0.03159718960523605, "grad_norm_var": 1.4615286787184493e-05, "learning_rate": 8.803891588823221e-05, "loss": 2.6122, "step": 7531 }, { "crossentropy": 2.619753837585449, "epoch": 0.6406941136440967, "grad_norm": 0.03453623503446579, "grad_norm_var": 1.4654175856701118e-05, "learning_rate": 8.766498343596052e-05, "loss": 2.6198, "step": 7532 }, { "crossentropy": 2.5104820728302, "epoch": 0.6407791765906771, "grad_norm": 0.03325161710381508, "grad_norm_var": 1.4626830062361761e-05, "learning_rate": 8.729183976314669e-05, "loss": 2.5105, "step": 7533 }, { "crossentropy": 2.630197286605835, "epoch": 0.6408642395372576, "grad_norm": 0.03329327329993248, "grad_norm_var": 1.3465698336092762e-05, "learning_rate": 8.69194849297067e-05, "loss": 2.6302, "step": 7534 }, { "crossentropy": 2.6522300243377686, "epoch": 0.6409493024838381, "grad_norm": 0.03365689143538475, "grad_norm_var": 1.3108468315544995e-05, "learning_rate": 8.654791899543046e-05, "loss": 2.6522, "step": 7535 }, { "crossentropy": 2.577669858932495, "epoch": 0.6410343654304185, "grad_norm": 0.03301467373967171, "grad_norm_var": 1.2135838650365382e-05, "learning_rate": 8.617714201998083e-05, "loss": 2.5777, "step": 7536 }, { "crossentropy": 2.568361759185791, "epoch": 0.641119428376999, "grad_norm": 0.03266115114092827, "grad_norm_var": 1.2123310844586972e-05, "learning_rate": 8.580715406289352e-05, "loss": 2.5684, "step": 7537 }, { "crossentropy": 2.5829238891601562, "epoch": 0.6412044913235795, "grad_norm": 0.03103708289563656, "grad_norm_var": 1.1069392372576683e-06, "learning_rate": 8.543795518357766e-05, "loss": 2.5829, "step": 7538 }, { "crossentropy": 2.6378705501556396, "epoch": 0.6412895542701599, "grad_norm": 0.032902903854846954, "grad_norm_var": 1.1227283835517816e-06, "learning_rate": 8.506954544131585e-05, "loss": 2.6379, "step": 7539 }, { "crossentropy": 2.6497533321380615, "epoch": 0.6413746172167404, "grad_norm": 0.0334349051117897, "grad_norm_var": 1.186359724282028e-06, "learning_rate": 8.470192489526518e-05, "loss": 2.6498, "step": 7540 }, { "crossentropy": 2.5426852703094482, "epoch": 0.6414596801633209, "grad_norm": 0.031196800991892815, "grad_norm_var": 1.2708338161122965e-06, "learning_rate": 8.433509360445458e-05, "loss": 2.5427, "step": 7541 }, { "crossentropy": 2.5625417232513428, "epoch": 0.6415447431099013, "grad_norm": 0.031501322984695435, "grad_norm_var": 1.226307711903185e-06, "learning_rate": 8.396905162778523e-05, "loss": 2.5625, "step": 7542 }, { "crossentropy": 2.7196576595306396, "epoch": 0.6416298060564818, "grad_norm": 0.03364201635122299, "grad_norm_var": 1.322697752776982e-06, "learning_rate": 8.36037990240357e-05, "loss": 2.7197, "step": 7543 }, { "crossentropy": 2.618732452392578, "epoch": 0.6417148690030623, "grad_norm": 0.03248820826411247, "grad_norm_var": 1.2233441756913261e-06, "learning_rate": 8.323933585185184e-05, "loss": 2.6187, "step": 7544 }, { "crossentropy": 2.6694743633270264, "epoch": 0.6417999319496427, "grad_norm": 0.033605512231588364, "grad_norm_var": 1.10308610410294e-06, "learning_rate": 8.287566216975794e-05, "loss": 2.6695, "step": 7545 }, { "crossentropy": 2.5600335597991943, "epoch": 0.6418849948962232, "grad_norm": 0.034071147441864014, "grad_norm_var": 1.0988690598498496e-06, "learning_rate": 8.251277803614899e-05, "loss": 2.56, "step": 7546 }, { "crossentropy": 2.5815658569335938, "epoch": 0.6419700578428037, "grad_norm": 0.03177173063158989, "grad_norm_var": 1.0711943581805915e-06, "learning_rate": 8.215068350929333e-05, "loss": 2.5816, "step": 7547 }, { "crossentropy": 2.5441441535949707, "epoch": 0.6420551207893841, "grad_norm": 0.0320761539041996, "grad_norm_var": 9.058834754424594e-07, "learning_rate": 8.178937864733338e-05, "loss": 2.5441, "step": 7548 }, { "crossentropy": 2.5922319889068604, "epoch": 0.6421401837359646, "grad_norm": 0.03168167546391487, "grad_norm_var": 9.497643911851594e-07, "learning_rate": 8.142886350828437e-05, "loss": 2.5922, "step": 7549 }, { "crossentropy": 2.624150037765503, "epoch": 0.6422252466825451, "grad_norm": 0.03363091126084328, "grad_norm_var": 9.86874201309764e-07, "learning_rate": 8.106913815003502e-05, "loss": 2.6242, "step": 7550 }, { "crossentropy": 2.695695400238037, "epoch": 0.6423103096291255, "grad_norm": 0.03143980726599693, "grad_norm_var": 9.959449579278098e-07, "learning_rate": 8.071020263034579e-05, "loss": 2.6957, "step": 7551 }, { "crossentropy": 2.6441524028778076, "epoch": 0.642395372575706, "grad_norm": 0.031788043677806854, "grad_norm_var": 1.0074032074480188e-06, "learning_rate": 8.035205700685166e-05, "loss": 2.6442, "step": 7552 }, { "crossentropy": 2.590984344482422, "epoch": 0.6424804355222865, "grad_norm": 0.03059903346002102, "grad_norm_var": 1.2104674592071924e-06, "learning_rate": 7.99947013370611e-05, "loss": 2.591, "step": 7553 }, { "crossentropy": 2.5799834728240967, "epoch": 0.642565498468867, "grad_norm": 0.03253985568881035, "grad_norm_var": 1.0977202584013779e-06, "learning_rate": 7.963813567835488e-05, "loss": 2.58, "step": 7554 }, { "crossentropy": 2.539351224899292, "epoch": 0.6426505614154474, "grad_norm": 0.0321919247508049, "grad_norm_var": 1.0814620041015831e-06, "learning_rate": 7.928236008798661e-05, "loss": 2.5394, "step": 7555 }, { "crossentropy": 2.648669958114624, "epoch": 0.6427356243620279, "grad_norm": 0.03132452815771103, "grad_norm_var": 1.055581657451119e-06, "learning_rate": 7.892737462308452e-05, "loss": 2.6487, "step": 7556 }, { "crossentropy": 2.6290690898895264, "epoch": 0.6428206873086084, "grad_norm": 0.03169921040534973, "grad_norm_var": 1.0026955889241244e-06, "learning_rate": 7.857317934064855e-05, "loss": 2.6291, "step": 7557 }, { "crossentropy": 2.6092092990875244, "epoch": 0.6429057502551888, "grad_norm": 0.03152992203831673, "grad_norm_var": 9.998796733671437e-07, "learning_rate": 7.821977429755212e-05, "loss": 2.6092, "step": 7558 }, { "crossentropy": 2.6458067893981934, "epoch": 0.6429908132017693, "grad_norm": 0.033108048141002655, "grad_norm_var": 9.189486968231523e-07, "learning_rate": 7.786715955054203e-05, "loss": 2.6458, "step": 7559 }, { "crossentropy": 2.6094460487365723, "epoch": 0.6430758761483498, "grad_norm": 0.033744536340236664, "grad_norm_var": 1.0622546903924265e-06, "learning_rate": 7.751533515623799e-05, "loss": 2.6094, "step": 7560 }, { "crossentropy": 2.6016502380371094, "epoch": 0.6431609390949302, "grad_norm": 0.03192872926592827, "grad_norm_var": 9.461335211559715e-07, "learning_rate": 7.716430117113315e-05, "loss": 2.6017, "step": 7561 }, { "crossentropy": 2.6264076232910156, "epoch": 0.6432460020415107, "grad_norm": 0.030549010261893272, "grad_norm_var": 8.405553363459324e-07, "learning_rate": 7.6814057651593e-05, "loss": 2.6264, "step": 7562 }, { "crossentropy": 2.6578009128570557, "epoch": 0.6433310649880912, "grad_norm": 0.032276932150125504, "grad_norm_var": 8.428017181820341e-07, "learning_rate": 7.646460465385696e-05, "loss": 2.6578, "step": 7563 }, { "crossentropy": 2.6805360317230225, "epoch": 0.6434161279346716, "grad_norm": 0.03177308663725853, "grad_norm_var": 8.457386023109798e-07, "learning_rate": 7.611594223403685e-05, "loss": 2.6805, "step": 7564 }, { "crossentropy": 2.6787965297698975, "epoch": 0.6435011908812521, "grad_norm": 0.031369682401418686, "grad_norm_var": 8.645580121724797e-07, "learning_rate": 7.576807044811839e-05, "loss": 2.6788, "step": 7565 }, { "crossentropy": 2.5681514739990234, "epoch": 0.6435862538278326, "grad_norm": 0.030255867168307304, "grad_norm_var": 8.283188256598955e-07, "learning_rate": 7.542098935195918e-05, "loss": 2.5682, "step": 7566 }, { "crossentropy": 2.5780959129333496, "epoch": 0.643671316774413, "grad_norm": 0.03150124475359917, "grad_norm_var": 8.259532159543183e-07, "learning_rate": 7.507469900129017e-05, "loss": 2.5781, "step": 7567 }, { "crossentropy": 2.6096487045288086, "epoch": 0.6437563797209935, "grad_norm": 0.03017858974635601, "grad_norm_var": 9.820952064425066e-07, "learning_rate": 7.47291994517163e-05, "loss": 2.6096, "step": 7568 }, { "crossentropy": 2.476269483566284, "epoch": 0.643841442667574, "grad_norm": 0.03307758644223213, "grad_norm_var": 1.0152144856608587e-06, "learning_rate": 7.438449075871434e-05, "loss": 2.4763, "step": 7569 }, { "crossentropy": 2.680593252182007, "epoch": 0.6439265056141544, "grad_norm": 0.03231149539351463, "grad_norm_var": 9.964199871859698e-07, "learning_rate": 7.4040572977635e-05, "loss": 2.6806, "step": 7570 }, { "crossentropy": 2.5560925006866455, "epoch": 0.6440115685607349, "grad_norm": 0.03218908980488777, "grad_norm_var": 9.962728265640869e-07, "learning_rate": 7.369744616370133e-05, "loss": 2.5561, "step": 7571 }, { "crossentropy": 2.618452548980713, "epoch": 0.6440966315073154, "grad_norm": 0.03221770003437996, "grad_norm_var": 9.893781387731396e-07, "learning_rate": 7.335511037200981e-05, "loss": 2.6185, "step": 7572 }, { "crossentropy": 2.6010799407958984, "epoch": 0.6441816944538958, "grad_norm": 0.033025309443473816, "grad_norm_var": 1.0714015446615695e-06, "learning_rate": 7.301356565752925e-05, "loss": 2.6011, "step": 7573 }, { "crossentropy": 2.7254831790924072, "epoch": 0.6442667574004763, "grad_norm": 0.03324403613805771, "grad_norm_var": 1.1613608000494152e-06, "learning_rate": 7.26728120751019e-05, "loss": 2.7255, "step": 7574 }, { "crossentropy": 2.6728620529174805, "epoch": 0.6443518203470568, "grad_norm": 0.03172633796930313, "grad_norm_var": 1.0851940286120033e-06, "learning_rate": 7.233284967944398e-05, "loss": 2.6729, "step": 7575 }, { "crossentropy": 2.520690441131592, "epoch": 0.6444368832936372, "grad_norm": 0.031011661514639854, "grad_norm_var": 9.019368643743582e-07, "learning_rate": 7.199367852514238e-05, "loss": 2.5207, "step": 7576 }, { "crossentropy": 2.647643804550171, "epoch": 0.6445219462402177, "grad_norm": 0.03293650597333908, "grad_norm_var": 9.840843922415132e-07, "learning_rate": 7.165529866665855e-05, "loss": 2.6476, "step": 7577 }, { "crossentropy": 2.714423894882202, "epoch": 0.6446070091867983, "grad_norm": 0.030064377933740616, "grad_norm_var": 1.0830088116038495e-06, "learning_rate": 7.131771015832678e-05, "loss": 2.7144, "step": 7578 }, { "crossentropy": 2.657353639602661, "epoch": 0.6446920721333786, "grad_norm": 0.03401142731308937, "grad_norm_var": 1.3761402954122338e-06, "learning_rate": 7.098091305435483e-05, "loss": 2.6574, "step": 7579 }, { "crossentropy": 2.581068992614746, "epoch": 0.6447771350799592, "grad_norm": 0.031077414751052856, "grad_norm_var": 1.4210236055737924e-06, "learning_rate": 7.064490740882056e-05, "loss": 2.5811, "step": 7580 }, { "crossentropy": 2.540921449661255, "epoch": 0.6448621980265397, "grad_norm": 0.03148648142814636, "grad_norm_var": 1.413813781078382e-06, "learning_rate": 7.03096932756786e-05, "loss": 2.5409, "step": 7581 }, { "crossentropy": 2.513343572616577, "epoch": 0.6449472609731202, "grad_norm": 0.03459969535470009, "grad_norm_var": 1.6439447408288117e-06, "learning_rate": 6.997527070875421e-05, "loss": 2.5133, "step": 7582 }, { "crossentropy": 2.5981786251068115, "epoch": 0.6450323239197006, "grad_norm": 0.032809946686029434, "grad_norm_var": 1.6349607916816995e-06, "learning_rate": 6.964163976174554e-05, "loss": 2.5982, "step": 7583 }, { "crossentropy": 2.5898404121398926, "epoch": 0.6451173868662811, "grad_norm": 0.03212585300207138, "grad_norm_var": 1.334664471990205e-06, "learning_rate": 6.93088004882253e-05, "loss": 2.5898, "step": 7584 }, { "crossentropy": 2.6124114990234375, "epoch": 0.6452024498128616, "grad_norm": 0.03439104184508324, "grad_norm_var": 1.5664606670745627e-06, "learning_rate": 6.89767529416363e-05, "loss": 2.6124, "step": 7585 }, { "crossentropy": 2.6818556785583496, "epoch": 0.645287512759442, "grad_norm": 0.03274416923522949, "grad_norm_var": 1.5700684657073226e-06, "learning_rate": 6.8645497175297e-05, "loss": 2.6819, "step": 7586 }, { "crossentropy": 2.672581434249878, "epoch": 0.6453725757060225, "grad_norm": 0.032012250274419785, "grad_norm_var": 1.5788543075096699e-06, "learning_rate": 6.831503324239707e-05, "loss": 2.6726, "step": 7587 }, { "crossentropy": 2.560371160507202, "epoch": 0.645457638652603, "grad_norm": 0.031905241310596466, "grad_norm_var": 1.5953741272376932e-06, "learning_rate": 6.798536119599963e-05, "loss": 2.5604, "step": 7588 }, { "crossentropy": 2.604496955871582, "epoch": 0.6455427015991834, "grad_norm": 0.03187796473503113, "grad_norm_var": 1.5893685845659675e-06, "learning_rate": 6.765648108903954e-05, "loss": 2.6045, "step": 7589 }, { "crossentropy": 2.67201566696167, "epoch": 0.6456277645457639, "grad_norm": 0.03175250440835953, "grad_norm_var": 1.5558876153432074e-06, "learning_rate": 6.732839297432736e-05, "loss": 2.672, "step": 7590 }, { "crossentropy": 2.7265336513519287, "epoch": 0.6457128274923444, "grad_norm": 0.03374967351555824, "grad_norm_var": 1.661498149603752e-06, "learning_rate": 6.700109690454315e-05, "loss": 2.7265, "step": 7591 }, { "crossentropy": 2.552912473678589, "epoch": 0.6457978904389248, "grad_norm": 0.03360038623213768, "grad_norm_var": 1.5977682845407885e-06, "learning_rate": 6.667459293224155e-05, "loss": 2.5529, "step": 7592 }, { "crossentropy": 2.663785219192505, "epoch": 0.6458829533855053, "grad_norm": 0.03139456361532211, "grad_norm_var": 1.6713370028599571e-06, "learning_rate": 6.634888110985005e-05, "loss": 2.6638, "step": 7593 }, { "crossentropy": 2.7128794193267822, "epoch": 0.6459680163320858, "grad_norm": 0.03206252306699753, "grad_norm_var": 1.2785873201312872e-06, "learning_rate": 6.602396148966794e-05, "loss": 2.7129, "step": 7594 }, { "crossentropy": 2.620573043823242, "epoch": 0.6460530792786662, "grad_norm": 0.03373131528496742, "grad_norm_var": 1.2307795264713054e-06, "learning_rate": 6.569983412386848e-05, "loss": 2.6206, "step": 7595 }, { "crossentropy": 2.5863115787506104, "epoch": 0.6461381422252467, "grad_norm": 0.03205077722668648, "grad_norm_var": 1.0946534040129353e-06, "learning_rate": 6.537649906449728e-05, "loss": 2.5863, "step": 7596 }, { "crossentropy": 2.563356399536133, "epoch": 0.6462232051718272, "grad_norm": 0.032443758100271225, "grad_norm_var": 1.0042616820882602e-06, "learning_rate": 6.505395636347222e-05, "loss": 2.5634, "step": 7597 }, { "crossentropy": 2.5967326164245605, "epoch": 0.6463082681184076, "grad_norm": 0.03358100727200508, "grad_norm_var": 8.115318293292276e-07, "learning_rate": 6.473220607258467e-05, "loss": 2.5967, "step": 7598 }, { "crossentropy": 2.5313119888305664, "epoch": 0.6463933310649881, "grad_norm": 0.032757021486759186, "grad_norm_var": 8.105045370225087e-07, "learning_rate": 6.441124824349776e-05, "loss": 2.5313, "step": 7599 }, { "crossentropy": 2.670572519302368, "epoch": 0.6464783940115686, "grad_norm": 0.031475529074668884, "grad_norm_var": 8.811938376612297e-07, "learning_rate": 6.409108292774912e-05, "loss": 2.6706, "step": 7600 }, { "crossentropy": 2.66410756111145, "epoch": 0.646563456958149, "grad_norm": 0.03366662561893463, "grad_norm_var": 7.405736542976174e-07, "learning_rate": 6.377171017674822e-05, "loss": 2.6641, "step": 7601 }, { "crossentropy": 2.713761568069458, "epoch": 0.6466485199047295, "grad_norm": 0.03173433244228363, "grad_norm_var": 7.782100918312639e-07, "learning_rate": 6.345313004177511e-05, "loss": 2.7138, "step": 7602 }, { "crossentropy": 2.610227584838867, "epoch": 0.64673358285131, "grad_norm": 0.03286724165081978, "grad_norm_var": 7.697525602325927e-07, "learning_rate": 6.313534257398723e-05, "loss": 2.6102, "step": 7603 }, { "crossentropy": 2.4885079860687256, "epoch": 0.6468186457978904, "grad_norm": 0.031180990859866142, "grad_norm_var": 8.638959597893527e-07, "learning_rate": 6.281834782440987e-05, "loss": 2.4885, "step": 7604 }, { "crossentropy": 2.6695327758789062, "epoch": 0.6469037087444709, "grad_norm": 0.03240285441279411, "grad_norm_var": 8.379047078460687e-07, "learning_rate": 6.250214584394454e-05, "loss": 2.6695, "step": 7605 }, { "crossentropy": 2.6091556549072266, "epoch": 0.6469887716910514, "grad_norm": 0.03084976226091385, "grad_norm_var": 9.822050297750381e-07, "learning_rate": 6.218673668336395e-05, "loss": 2.6092, "step": 7606 }, { "crossentropy": 2.5584707260131836, "epoch": 0.6470738346376318, "grad_norm": 0.03262593597173691, "grad_norm_var": 8.696591323863739e-07, "learning_rate": 6.187212039331314e-05, "loss": 2.5585, "step": 7607 }, { "crossentropy": 2.685941219329834, "epoch": 0.6471588975842123, "grad_norm": 0.032846808433532715, "grad_norm_var": 7.846949835958594e-07, "learning_rate": 6.15582970243117e-05, "loss": 2.6859, "step": 7608 }, { "crossentropy": 2.6478116512298584, "epoch": 0.6472439605307928, "grad_norm": 0.031564489006996155, "grad_norm_var": 7.647519895014142e-07, "learning_rate": 6.124526662674934e-05, "loss": 2.6478, "step": 7609 }, { "crossentropy": 2.617497205734253, "epoch": 0.6473290234773733, "grad_norm": 0.03237050771713257, "grad_norm_var": 7.582568031158036e-07, "learning_rate": 6.093302925088973e-05, "loss": 2.6175, "step": 7610 }, { "crossentropy": 2.611220121383667, "epoch": 0.6474140864239537, "grad_norm": 0.03430720418691635, "grad_norm_var": 8.824148726840752e-07, "learning_rate": 6.062158494686998e-05, "loss": 2.6112, "step": 7611 }, { "crossentropy": 2.7327966690063477, "epoch": 0.6474991493705342, "grad_norm": 0.0313388891518116, "grad_norm_var": 9.491636974695427e-07, "learning_rate": 6.031093376469898e-05, "loss": 2.7328, "step": 7612 }, { "crossentropy": 2.5504000186920166, "epoch": 0.6475842123171147, "grad_norm": 0.033486366271972656, "grad_norm_var": 1.0265489713365822e-06, "learning_rate": 6.000107575425795e-05, "loss": 2.5504, "step": 7613 }, { "crossentropy": 2.5585947036743164, "epoch": 0.6476692752636951, "grad_norm": 0.031835317611694336, "grad_norm_var": 9.516606585211236e-07, "learning_rate": 5.9692010965301526e-05, "loss": 2.5586, "step": 7614 }, { "crossentropy": 2.6610207557678223, "epoch": 0.6477543382102756, "grad_norm": 0.03279034420847893, "grad_norm_var": 9.536190315560424e-07, "learning_rate": 5.938373944745612e-05, "loss": 2.661, "step": 7615 }, { "crossentropy": 2.6027777194976807, "epoch": 0.6478394011568561, "grad_norm": 0.03411215543746948, "grad_norm_var": 1.0863284349348166e-06, "learning_rate": 5.9076261250221585e-05, "loss": 2.6028, "step": 7616 }, { "crossentropy": 2.658329963684082, "epoch": 0.6479244641034365, "grad_norm": 0.03254149854183197, "grad_norm_var": 9.902450992595267e-07, "learning_rate": 5.876957642297009e-05, "loss": 2.6583, "step": 7617 }, { "crossentropy": 2.7045910358428955, "epoch": 0.648009527050017, "grad_norm": 0.03292320668697357, "grad_norm_var": 9.68559812554318e-07, "learning_rate": 5.846368501494615e-05, "loss": 2.7046, "step": 7618 }, { "crossentropy": 2.6759066581726074, "epoch": 0.6480945899965975, "grad_norm": 0.03095404990017414, "grad_norm_var": 1.1043429232896265e-06, "learning_rate": 5.8158587075267666e-05, "loss": 2.6759, "step": 7619 }, { "crossentropy": 2.551748752593994, "epoch": 0.6481796529431779, "grad_norm": 0.032014455646276474, "grad_norm_var": 1.0141652213725561e-06, "learning_rate": 5.7854282652923804e-05, "loss": 2.5517, "step": 7620 }, { "crossentropy": 2.6507813930511475, "epoch": 0.6482647158897584, "grad_norm": 0.03268606960773468, "grad_norm_var": 1.0179554410471298e-06, "learning_rate": 5.7550771796777146e-05, "loss": 2.6508, "step": 7621 }, { "crossentropy": 2.579538345336914, "epoch": 0.6483497788363389, "grad_norm": 0.03114892914891243, "grad_norm_var": 9.59600165251192e-07, "learning_rate": 5.72480545555637e-05, "loss": 2.5795, "step": 7622 }, { "crossentropy": 2.593632459640503, "epoch": 0.6484348417829193, "grad_norm": 0.033631812781095505, "grad_norm_var": 1.0435307281398866e-06, "learning_rate": 5.69461309778907e-05, "loss": 2.5936, "step": 7623 }, { "crossentropy": 2.587095022201538, "epoch": 0.6485199047294998, "grad_norm": 0.03233747184276581, "grad_norm_var": 1.0385358077984273e-06, "learning_rate": 5.664500111223769e-05, "loss": 2.5871, "step": 7624 }, { "crossentropy": 2.5667786598205566, "epoch": 0.6486049676760803, "grad_norm": 0.03241831064224243, "grad_norm_var": 9.772934489982263e-07, "learning_rate": 5.634466500695823e-05, "loss": 2.5668, "step": 7625 }, { "crossentropy": 2.58181095123291, "epoch": 0.6486900306226607, "grad_norm": 0.032044943422079086, "grad_norm_var": 9.919715101463448e-07, "learning_rate": 5.604512271027706e-05, "loss": 2.5818, "step": 7626 }, { "crossentropy": 2.6767823696136475, "epoch": 0.6487750935692412, "grad_norm": 0.033608827739953995, "grad_norm_var": 8.574966894048096e-07, "learning_rate": 5.574637427029239e-05, "loss": 2.6768, "step": 7627 }, { "crossentropy": 2.6576006412506104, "epoch": 0.6488601565158217, "grad_norm": 0.0345756970345974, "grad_norm_var": 1.0146339182997745e-06, "learning_rate": 5.544841973497472e-05, "loss": 2.6576, "step": 7628 }, { "crossentropy": 2.6356842517852783, "epoch": 0.6489452194624021, "grad_norm": 0.034608904272317886, "grad_norm_var": 1.211933431370761e-06, "learning_rate": 5.515125915216745e-05, "loss": 2.6357, "step": 7629 }, { "crossentropy": 2.548654794692993, "epoch": 0.6490302824089826, "grad_norm": 0.031991783529520035, "grad_norm_var": 1.1940788207799447e-06, "learning_rate": 5.485489256958465e-05, "loss": 2.5487, "step": 7630 }, { "crossentropy": 2.6684393882751465, "epoch": 0.6491153453555631, "grad_norm": 0.0335574671626091, "grad_norm_var": 1.2325018938861282e-06, "learning_rate": 5.4559320034814897e-05, "loss": 2.6684, "step": 7631 }, { "crossentropy": 2.602069854736328, "epoch": 0.6492004083021435, "grad_norm": 0.03283877670764923, "grad_norm_var": 1.1148357236100687e-06, "learning_rate": 5.426454159531913e-05, "loss": 2.6021, "step": 7632 }, { "crossentropy": 2.6186270713806152, "epoch": 0.649285471248724, "grad_norm": 0.030545402318239212, "grad_norm_var": 1.417393171577773e-06, "learning_rate": 5.397055729842948e-05, "loss": 2.6186, "step": 7633 }, { "crossentropy": 2.6582000255584717, "epoch": 0.6493705341953045, "grad_norm": 0.031087489798665047, "grad_norm_var": 1.5532771901365558e-06, "learning_rate": 5.367736719135208e-05, "loss": 2.6582, "step": 7634 }, { "crossentropy": 2.6547369956970215, "epoch": 0.6494555971418849, "grad_norm": 0.0322110690176487, "grad_norm_var": 1.39240021957535e-06, "learning_rate": 5.3384971321164264e-05, "loss": 2.6547, "step": 7635 }, { "crossentropy": 2.625725269317627, "epoch": 0.6495406600884654, "grad_norm": 0.032386623322963715, "grad_norm_var": 1.3729083631547662e-06, "learning_rate": 5.309336973481682e-05, "loss": 2.6257, "step": 7636 }, { "crossentropy": 2.63069486618042, "epoch": 0.649625723035046, "grad_norm": 0.03432328253984451, "grad_norm_var": 1.5581403471833967e-06, "learning_rate": 5.280256247913229e-05, "loss": 2.6307, "step": 7637 }, { "crossentropy": 2.65423846244812, "epoch": 0.6497107859816263, "grad_norm": 0.031640779227018356, "grad_norm_var": 1.471062187484888e-06, "learning_rate": 5.2512549600805535e-05, "loss": 2.6542, "step": 7638 }, { "crossentropy": 2.673496723175049, "epoch": 0.6497958489282069, "grad_norm": 0.03274867311120033, "grad_norm_var": 1.4145646840908366e-06, "learning_rate": 5.222333114640543e-05, "loss": 2.6735, "step": 7639 }, { "crossentropy": 2.5044052600860596, "epoch": 0.6498809118747874, "grad_norm": 0.034080494195222855, "grad_norm_var": 1.5241821315633744e-06, "learning_rate": 5.193490716237037e-05, "loss": 2.5044, "step": 7640 }, { "crossentropy": 2.632660150527954, "epoch": 0.6499659748213679, "grad_norm": 0.03296156972646713, "grad_norm_var": 1.515575501378795e-06, "learning_rate": 5.164727769501498e-05, "loss": 2.6327, "step": 7641 }, { "crossentropy": 2.5418899059295654, "epoch": 0.6500510377679483, "grad_norm": 0.03133774548768997, "grad_norm_var": 1.6204569205470793e-06, "learning_rate": 5.136044279052288e-05, "loss": 2.5419, "step": 7642 }, { "crossentropy": 2.6412477493286133, "epoch": 0.6501361007145288, "grad_norm": 0.031856149435043335, "grad_norm_var": 1.6191194610681483e-06, "learning_rate": 5.1074402494951653e-05, "loss": 2.6412, "step": 7643 }, { "crossentropy": 2.540708303451538, "epoch": 0.6502211636611093, "grad_norm": 0.03250683471560478, "grad_norm_var": 1.3614982034597927e-06, "learning_rate": 5.07891568542318e-05, "loss": 2.5407, "step": 7644 }, { "crossentropy": 2.6577975749969482, "epoch": 0.6503062266076897, "grad_norm": 0.031641509383916855, "grad_norm_var": 1.094334686890782e-06, "learning_rate": 5.0504705914165e-05, "loss": 2.6578, "step": 7645 }, { "crossentropy": 2.7218029499053955, "epoch": 0.6503912895542702, "grad_norm": 0.03279650956392288, "grad_norm_var": 1.0955976510956436e-06, "learning_rate": 5.022104972042529e-05, "loss": 2.7218, "step": 7646 }, { "crossentropy": 2.626636266708374, "epoch": 0.6504763525008507, "grad_norm": 0.03335166722536087, "grad_norm_var": 1.0666903070098334e-06, "learning_rate": 4.993818831856123e-05, "loss": 2.6266, "step": 7647 }, { "crossentropy": 2.513533115386963, "epoch": 0.6505614154474311, "grad_norm": 0.03375110402703285, "grad_norm_var": 1.1727354822226583e-06, "learning_rate": 4.965612175399092e-05, "loss": 2.5135, "step": 7648 }, { "crossentropy": 2.6045751571655273, "epoch": 0.6506464783940116, "grad_norm": 0.0312768779695034, "grad_norm_var": 1.020256952879286e-06, "learning_rate": 4.937485007200593e-05, "loss": 2.6046, "step": 7649 }, { "crossentropy": 2.6042206287384033, "epoch": 0.6507315413405921, "grad_norm": 0.03184724226593971, "grad_norm_var": 9.135092084444051e-07, "learning_rate": 4.909437331777178e-05, "loss": 2.6042, "step": 7650 }, { "crossentropy": 2.5825610160827637, "epoch": 0.6508166042871725, "grad_norm": 0.03181586042046547, "grad_norm_var": 9.408612378274927e-07, "learning_rate": 4.8814691536324675e-05, "loss": 2.5826, "step": 7651 }, { "crossentropy": 2.6697916984558105, "epoch": 0.650901667233753, "grad_norm": 0.03480740636587143, "grad_norm_var": 1.2640140550329004e-06, "learning_rate": 4.8535804772572024e-05, "loss": 2.6698, "step": 7652 }, { "crossentropy": 2.6373517513275146, "epoch": 0.6509867301803335, "grad_norm": 0.034181930124759674, "grad_norm_var": 1.234131363070321e-06, "learning_rate": 4.825771307129634e-05, "loss": 2.6374, "step": 7653 }, { "crossentropy": 2.6162338256835938, "epoch": 0.6510717931269139, "grad_norm": 0.03112521395087242, "grad_norm_var": 1.3209896126172826e-06, "learning_rate": 4.798041647715079e-05, "loss": 2.6162, "step": 7654 }, { "crossentropy": 2.6168508529663086, "epoch": 0.6511568560734944, "grad_norm": 0.034065280109643936, "grad_norm_var": 1.4500887909926185e-06, "learning_rate": 4.770391503466087e-05, "loss": 2.6169, "step": 7655 }, { "crossentropy": 2.5980184078216553, "epoch": 0.6512419190200749, "grad_norm": 0.03215081989765167, "grad_norm_var": 1.3308999667218287e-06, "learning_rate": 4.742820878822496e-05, "loss": 2.598, "step": 7656 }, { "crossentropy": 2.541999340057373, "epoch": 0.6513269819666553, "grad_norm": 0.032010603696107864, "grad_norm_var": 1.3405748595443056e-06, "learning_rate": 4.715329778211375e-05, "loss": 2.542, "step": 7657 }, { "crossentropy": 2.6606740951538086, "epoch": 0.6514120449132358, "grad_norm": 0.03330254927277565, "grad_norm_var": 1.2688137013751321e-06, "learning_rate": 4.687918206047026e-05, "loss": 2.6607, "step": 7658 }, { "crossentropy": 2.5308914184570312, "epoch": 0.6514971078598163, "grad_norm": 0.03222635015845299, "grad_norm_var": 1.2379245786267936e-06, "learning_rate": 4.660586166730874e-05, "loss": 2.5309, "step": 7659 }, { "crossentropy": 2.605083703994751, "epoch": 0.6515821708063967, "grad_norm": 0.0318249873816967, "grad_norm_var": 1.2825984130835148e-06, "learning_rate": 4.6333336646517397e-05, "loss": 2.6051, "step": 7660 }, { "crossentropy": 2.599679708480835, "epoch": 0.6516672337529772, "grad_norm": 0.03371979668736458, "grad_norm_var": 1.2769765265050262e-06, "learning_rate": 4.6061607041855136e-05, "loss": 2.5997, "step": 7661 }, { "crossentropy": 2.661062240600586, "epoch": 0.6517522966995577, "grad_norm": 0.03191491961479187, "grad_norm_var": 1.321952091918171e-06, "learning_rate": 4.579067289695427e-05, "loss": 2.6611, "step": 7662 }, { "crossentropy": 2.6040711402893066, "epoch": 0.6518373596461381, "grad_norm": 0.031200913712382317, "grad_norm_var": 1.4272778055780898e-06, "learning_rate": 4.552053425531943e-05, "loss": 2.6041, "step": 7663 }, { "crossentropy": 2.6052262783050537, "epoch": 0.6519224225927186, "grad_norm": 0.0311439111828804, "grad_norm_var": 1.4437495622905086e-06, "learning_rate": 4.52511911603265e-05, "loss": 2.6052, "step": 7664 }, { "crossentropy": 2.5763726234436035, "epoch": 0.6520074855392991, "grad_norm": 0.031569402664899826, "grad_norm_var": 1.4047689887851233e-06, "learning_rate": 4.4982643655224196e-05, "loss": 2.5764, "step": 7665 }, { "crossentropy": 2.6839680671691895, "epoch": 0.6520925484858795, "grad_norm": 0.03285745903849602, "grad_norm_var": 1.3898288469932438e-06, "learning_rate": 4.471489178313415e-05, "loss": 2.684, "step": 7666 }, { "crossentropy": 2.6050214767456055, "epoch": 0.65217761143246, "grad_norm": 0.03431836515665054, "grad_norm_var": 1.5546844322805354e-06, "learning_rate": 4.444793558704918e-05, "loss": 2.605, "step": 7667 }, { "crossentropy": 2.67543625831604, "epoch": 0.6522626743790405, "grad_norm": 0.03251989185810089, "grad_norm_var": 1.2240959994337519e-06, "learning_rate": 4.418177510983445e-05, "loss": 2.6754, "step": 7668 }, { "crossentropy": 2.652660369873047, "epoch": 0.652347737325621, "grad_norm": 0.03206120803952217, "grad_norm_var": 1.031939636056397e-06, "learning_rate": 4.3916410394228e-05, "loss": 2.6527, "step": 7669 }, { "crossentropy": 2.5809175968170166, "epoch": 0.6524328002722014, "grad_norm": 0.03153466060757637, "grad_norm_var": 9.741482901421863e-07, "learning_rate": 4.365184148284018e-05, "loss": 2.5809, "step": 7670 }, { "crossentropy": 2.5028467178344727, "epoch": 0.6525178632187819, "grad_norm": 0.031811222434043884, "grad_norm_var": 7.916084988790339e-07, "learning_rate": 4.3388068418152015e-05, "loss": 2.5028, "step": 7671 }, { "crossentropy": 2.6206274032592773, "epoch": 0.6526029261653624, "grad_norm": 0.03194735199213028, "grad_norm_var": 7.971698741396909e-07, "learning_rate": 4.3125091242519065e-05, "loss": 2.6206, "step": 7672 }, { "crossentropy": 2.518599510192871, "epoch": 0.6526879891119428, "grad_norm": 0.032295361161231995, "grad_norm_var": 7.932348739174972e-07, "learning_rate": 4.2862909998167556e-05, "loss": 2.5186, "step": 7673 }, { "crossentropy": 2.6402013301849365, "epoch": 0.6527730520585233, "grad_norm": 0.0337277390062809, "grad_norm_var": 8.633251352341363e-07, "learning_rate": 4.260152472719547e-05, "loss": 2.6402, "step": 7674 }, { "crossentropy": 2.579754114151001, "epoch": 0.6528581150051038, "grad_norm": 0.03345838189125061, "grad_norm_var": 9.473938455178589e-07, "learning_rate": 4.234093547157425e-05, "loss": 2.5798, "step": 7675 }, { "crossentropy": 2.5851364135742188, "epoch": 0.6529431779516842, "grad_norm": 0.03370105102658272, "grad_norm_var": 1.0312648812770316e-06, "learning_rate": 4.2081142273147635e-05, "loss": 2.5851, "step": 7676 }, { "crossentropy": 2.658400774002075, "epoch": 0.6530282408982647, "grad_norm": 0.03313857689499855, "grad_norm_var": 9.567914336055475e-07, "learning_rate": 4.1822145173630035e-05, "loss": 2.6584, "step": 7677 }, { "crossentropy": 2.6547229290008545, "epoch": 0.6531133038448452, "grad_norm": 0.034182481467723846, "grad_norm_var": 1.1163713154036057e-06, "learning_rate": 4.156394421460929e-05, "loss": 2.6547, "step": 7678 }, { "crossentropy": 2.5221831798553467, "epoch": 0.6531983667914256, "grad_norm": 0.03937883302569389, "grad_norm_var": 3.779717648856827e-06, "learning_rate": 4.1306539437545584e-05, "loss": 2.5222, "step": 7679 }, { "crossentropy": 2.570460081100464, "epoch": 0.6532834297380061, "grad_norm": 0.032766882330179214, "grad_norm_var": 3.5204340110690675e-06, "learning_rate": 4.104993088376974e-05, "loss": 2.5705, "step": 7680 }, { "crossentropy": 2.5780179500579834, "epoch": 0.6533684926845866, "grad_norm": 0.03137540444731712, "grad_norm_var": 3.5650752845991484e-06, "learning_rate": 4.079411859448601e-05, "loss": 2.578, "step": 7681 }, { "crossentropy": 2.4660511016845703, "epoch": 0.653453555631167, "grad_norm": 0.03022245317697525, "grad_norm_var": 4.116627459189541e-06, "learning_rate": 4.053910261077043e-05, "loss": 2.4661, "step": 7682 }, { "crossentropy": 2.652056932449341, "epoch": 0.6535386185777475, "grad_norm": 0.031989771872758865, "grad_norm_var": 4.054734839855206e-06, "learning_rate": 4.028488297357191e-05, "loss": 2.6521, "step": 7683 }, { "crossentropy": 2.66308856010437, "epoch": 0.653623681524328, "grad_norm": 0.03225273638963699, "grad_norm_var": 4.072092526211506e-06, "learning_rate": 4.0031459723709476e-05, "loss": 2.6631, "step": 7684 }, { "crossentropy": 2.6622815132141113, "epoch": 0.6537087444709084, "grad_norm": 0.032819755375385284, "grad_norm_var": 4.026733404965723e-06, "learning_rate": 3.977883290187667e-05, "loss": 2.6623, "step": 7685 }, { "crossentropy": 2.6860368251800537, "epoch": 0.6537938074174889, "grad_norm": 0.03251611068844795, "grad_norm_var": 3.906610312714766e-06, "learning_rate": 3.952700254863828e-05, "loss": 2.686, "step": 7686 }, { "crossentropy": 2.5582289695739746, "epoch": 0.6538788703640694, "grad_norm": 0.032894473522901535, "grad_norm_var": 3.8120048400276546e-06, "learning_rate": 3.927596870442973e-05, "loss": 2.5582, "step": 7687 }, { "crossentropy": 2.517951250076294, "epoch": 0.6539639333106498, "grad_norm": 0.03279934823513031, "grad_norm_var": 3.733054888101159e-06, "learning_rate": 3.9025731409561e-05, "loss": 2.518, "step": 7688 }, { "crossentropy": 2.590229034423828, "epoch": 0.6540489962572303, "grad_norm": 0.0309063121676445, "grad_norm_var": 4.001736891497074e-06, "learning_rate": 3.877629070421273e-05, "loss": 2.5902, "step": 7689 }, { "crossentropy": 2.6446170806884766, "epoch": 0.6541340592038108, "grad_norm": 0.03563130274415016, "grad_norm_var": 4.410848281720334e-06, "learning_rate": 3.852764662843733e-05, "loss": 2.6446, "step": 7690 }, { "crossentropy": 2.57401180267334, "epoch": 0.6542191221503912, "grad_norm": 0.03274271637201309, "grad_norm_var": 4.411249390720515e-06, "learning_rate": 3.8279799222160624e-05, "loss": 2.574, "step": 7691 }, { "crossentropy": 2.5557374954223633, "epoch": 0.6543041850969717, "grad_norm": 0.03162118420004845, "grad_norm_var": 4.510049902737602e-06, "learning_rate": 3.803274852517968e-05, "loss": 2.5557, "step": 7692 }, { "crossentropy": 2.6200244426727295, "epoch": 0.6543892480435523, "grad_norm": 0.030590364709496498, "grad_norm_var": 4.852629510010753e-06, "learning_rate": 3.778649457716277e-05, "loss": 2.62, "step": 7693 }, { "crossentropy": 2.617194652557373, "epoch": 0.6544743109901326, "grad_norm": 0.031274471431970596, "grad_norm_var": 4.842463652421691e-06, "learning_rate": 3.754103741765269e-05, "loss": 2.6172, "step": 7694 }, { "crossentropy": 2.6436362266540527, "epoch": 0.6545593739367132, "grad_norm": 0.03147399425506592, "grad_norm_var": 1.6151210747012419e-06, "learning_rate": 3.7296377086061815e-05, "loss": 2.6436, "step": 7695 }, { "crossentropy": 2.6188697814941406, "epoch": 0.6546444368832937, "grad_norm": 0.031925611197948456, "grad_norm_var": 1.5864947081253247e-06, "learning_rate": 3.705251362167483e-05, "loss": 2.6189, "step": 7696 }, { "crossentropy": 2.625319004058838, "epoch": 0.6547294998298742, "grad_norm": 0.03158506378531456, "grad_norm_var": 1.5699716398575539e-06, "learning_rate": 3.680944706365097e-05, "loss": 2.6253, "step": 7697 }, { "crossentropy": 2.6195554733276367, "epoch": 0.6548145627764546, "grad_norm": 0.03016357123851776, "grad_norm_var": 1.5847549484735891e-06, "learning_rate": 3.6567177451019005e-05, "loss": 2.6196, "step": 7698 }, { "crossentropy": 2.570545196533203, "epoch": 0.6548996257230351, "grad_norm": 0.03210591524839401, "grad_norm_var": 1.5842909915907232e-06, "learning_rate": 3.632570482268005e-05, "loss": 2.5705, "step": 7699 }, { "crossentropy": 2.690812349319458, "epoch": 0.6549846886696156, "grad_norm": 0.031313467770814896, "grad_norm_var": 1.617976795858011e-06, "learning_rate": 3.608502921740753e-05, "loss": 2.6908, "step": 7700 }, { "crossentropy": 2.66182279586792, "epoch": 0.655069751616196, "grad_norm": 0.032138414680957794, "grad_norm_var": 1.5745846606813323e-06, "learning_rate": 3.584515067384775e-05, "loss": 2.6618, "step": 7701 }, { "crossentropy": 2.6292314529418945, "epoch": 0.6551548145627765, "grad_norm": 0.032593902200460434, "grad_norm_var": 1.58052202335449e-06, "learning_rate": 3.560606923051768e-05, "loss": 2.6292, "step": 7702 }, { "crossentropy": 2.607877016067505, "epoch": 0.655239877509357, "grad_norm": 0.03143417835235596, "grad_norm_var": 1.5367223093633627e-06, "learning_rate": 3.536778492580717e-05, "loss": 2.6079, "step": 7703 }, { "crossentropy": 2.6087005138397217, "epoch": 0.6553249404559374, "grad_norm": 0.031178690493106842, "grad_norm_var": 1.5051894403464062e-06, "learning_rate": 3.513029779797783e-05, "loss": 2.6087, "step": 7704 }, { "crossentropy": 2.7784156799316406, "epoch": 0.6554100034025179, "grad_norm": 0.03334125131368637, "grad_norm_var": 1.588056048849743e-06, "learning_rate": 3.489360788516305e-05, "loss": 2.7784, "step": 7705 }, { "crossentropy": 2.6005992889404297, "epoch": 0.6554950663490984, "grad_norm": 0.03159043565392494, "grad_norm_var": 6.222807077875623e-07, "learning_rate": 3.465771522536854e-05, "loss": 2.6006, "step": 7706 }, { "crossentropy": 2.6566343307495117, "epoch": 0.6555801292956788, "grad_norm": 0.03106176108121872, "grad_norm_var": 5.634043470746265e-07, "learning_rate": 3.442261985647177e-05, "loss": 2.6566, "step": 7707 }, { "crossentropy": 2.607597827911377, "epoch": 0.6556651922422593, "grad_norm": 0.0319807343184948, "grad_norm_var": 5.731220743454181e-07, "learning_rate": 3.418832181622311e-05, "loss": 2.6076, "step": 7708 }, { "crossentropy": 2.570606231689453, "epoch": 0.6557502551888398, "grad_norm": 0.032719917595386505, "grad_norm_var": 5.67188683294149e-07, "learning_rate": 3.3954821142242466e-05, "loss": 2.5706, "step": 7709 }, { "crossentropy": 2.6138930320739746, "epoch": 0.6558353181354202, "grad_norm": 0.030282501131296158, "grad_norm_var": 6.906031386438398e-07, "learning_rate": 3.372211787202484e-05, "loss": 2.6139, "step": 7710 }, { "crossentropy": 2.701308250427246, "epoch": 0.6559203810820007, "grad_norm": 0.03148879483342171, "grad_norm_var": 6.902091351734229e-07, "learning_rate": 3.3490212042934785e-05, "loss": 2.7013, "step": 7711 }, { "crossentropy": 2.5972933769226074, "epoch": 0.6560054440285812, "grad_norm": 0.03168170526623726, "grad_norm_var": 6.859890060752216e-07, "learning_rate": 3.325910369220975e-05, "loss": 2.5973, "step": 7712 }, { "crossentropy": 2.5653581619262695, "epoch": 0.6560905069751616, "grad_norm": 0.03202873095870018, "grad_norm_var": 6.934877924837102e-07, "learning_rate": 3.3028792856960034e-05, "loss": 2.5654, "step": 7713 }, { "crossentropy": 2.6680026054382324, "epoch": 0.6561755699217421, "grad_norm": 0.03132085129618645, "grad_norm_var": 5.410428405576313e-07, "learning_rate": 3.279927957416551e-05, "loss": 2.668, "step": 7714 }, { "crossentropy": 2.6497786045074463, "epoch": 0.6562606328683226, "grad_norm": 0.031729843467473984, "grad_norm_var": 5.328543368911254e-07, "learning_rate": 3.2570563880680025e-05, "loss": 2.6498, "step": 7715 }, { "crossentropy": 2.6402015686035156, "epoch": 0.656345695814903, "grad_norm": 0.030131880193948746, "grad_norm_var": 6.877565582187606e-07, "learning_rate": 3.234264581322921e-05, "loss": 2.6402, "step": 7716 }, { "crossentropy": 2.665506601333618, "epoch": 0.6564307587614835, "grad_norm": 0.03117241896688938, "grad_norm_var": 6.856146703881066e-07, "learning_rate": 3.211552540840934e-05, "loss": 2.6655, "step": 7717 }, { "crossentropy": 2.5237019062042236, "epoch": 0.656515821708064, "grad_norm": 0.030699962750077248, "grad_norm_var": 6.609888456847457e-07, "learning_rate": 3.188920270268958e-05, "loss": 2.5237, "step": 7718 }, { "crossentropy": 2.65431547164917, "epoch": 0.6566008846546444, "grad_norm": 0.03211389109492302, "grad_norm_var": 6.847846912104323e-07, "learning_rate": 3.166367773241141e-05, "loss": 2.6543, "step": 7719 }, { "crossentropy": 2.550649642944336, "epoch": 0.6566859476012249, "grad_norm": 0.032311342656612396, "grad_norm_var": 7.11501772570194e-07, "learning_rate": 3.143895053378698e-05, "loss": 2.5506, "step": 7720 }, { "crossentropy": 2.624750852584839, "epoch": 0.6567710105478054, "grad_norm": 0.032136429101228714, "grad_norm_var": 5.230692668283429e-07, "learning_rate": 3.121502114290131e-05, "loss": 2.6248, "step": 7721 }, { "crossentropy": 2.622025489807129, "epoch": 0.6568560734943858, "grad_norm": 0.03186287358403206, "grad_norm_var": 5.299688813053168e-07, "learning_rate": 3.09918895957112e-05, "loss": 2.622, "step": 7722 }, { "crossentropy": 2.5490822792053223, "epoch": 0.6569411364409663, "grad_norm": 0.032698821276426315, "grad_norm_var": 5.919383007331353e-07, "learning_rate": 3.07695559280452e-05, "loss": 2.5491, "step": 7723 }, { "crossentropy": 2.6862716674804688, "epoch": 0.6570261993875468, "grad_norm": 0.03339456021785736, "grad_norm_var": 7.796795921363268e-07, "learning_rate": 3.054802017560254e-05, "loss": 2.6863, "step": 7724 }, { "crossentropy": 2.5964651107788086, "epoch": 0.6571112623341272, "grad_norm": 0.031313199549913406, "grad_norm_var": 7.187948212421131e-07, "learning_rate": 3.0327282373956435e-05, "loss": 2.5965, "step": 7725 }, { "crossentropy": 2.5441226959228516, "epoch": 0.6571963252807077, "grad_norm": 0.030768005177378654, "grad_norm_var": 6.451337108260054e-07, "learning_rate": 3.01073425585513e-05, "loss": 2.5441, "step": 7726 }, { "crossentropy": 2.624569892883301, "epoch": 0.6572813882272882, "grad_norm": 0.03331362083554268, "grad_norm_var": 8.071417930898776e-07, "learning_rate": 2.9888200764702778e-05, "loss": 2.6246, "step": 7727 }, { "crossentropy": 2.559403657913208, "epoch": 0.6573664511738687, "grad_norm": 0.03154020383954048, "grad_norm_var": 8.104813598111867e-07, "learning_rate": 2.966985702759828e-05, "loss": 2.5594, "step": 7728 }, { "crossentropy": 2.6505026817321777, "epoch": 0.6574515141204491, "grad_norm": 0.03090347908437252, "grad_norm_var": 8.528314207933522e-07, "learning_rate": 2.9452311382298647e-05, "loss": 2.6505, "step": 7729 }, { "crossentropy": 2.657280445098877, "epoch": 0.6575365770670296, "grad_norm": 0.032825350761413574, "grad_norm_var": 9.155939183181152e-07, "learning_rate": 2.9235563863734273e-05, "loss": 2.6573, "step": 7730 }, { "crossentropy": 2.520151138305664, "epoch": 0.6576216400136101, "grad_norm": 0.03313502296805382, "grad_norm_var": 1.0245007064891079e-06, "learning_rate": 2.901961450670898e-05, "loss": 2.5202, "step": 7731 }, { "crossentropy": 2.6454784870147705, "epoch": 0.6577067029601905, "grad_norm": 0.03147665411233902, "grad_norm_var": 8.213818562881799e-07, "learning_rate": 2.880446334589837e-05, "loss": 2.6455, "step": 7732 }, { "crossentropy": 2.498042106628418, "epoch": 0.657791765906771, "grad_norm": 0.03311175853013992, "grad_norm_var": 8.478524549898006e-07, "learning_rate": 2.8590110415849247e-05, "loss": 2.498, "step": 7733 }, { "crossentropy": 2.659526824951172, "epoch": 0.6578768288533515, "grad_norm": 0.03411802276968956, "grad_norm_var": 9.398460827217403e-07, "learning_rate": 2.8376555750979637e-05, "loss": 2.6595, "step": 7734 }, { "crossentropy": 2.536825656890869, "epoch": 0.6579618917999319, "grad_norm": 0.030813591554760933, "grad_norm_var": 1.0802050158923894e-06, "learning_rate": 2.816379938558211e-05, "loss": 2.5368, "step": 7735 }, { "crossentropy": 2.565551519393921, "epoch": 0.6580469547465124, "grad_norm": 0.03183838352560997, "grad_norm_var": 1.0892253200124688e-06, "learning_rate": 2.7951841353817674e-05, "loss": 2.5656, "step": 7736 }, { "crossentropy": 2.6219823360443115, "epoch": 0.6581320176930929, "grad_norm": 0.032255880534648895, "grad_norm_var": 1.089054877232787e-06, "learning_rate": 2.7740681689721326e-05, "loss": 2.622, "step": 7737 }, { "crossentropy": 2.596252202987671, "epoch": 0.6582170806396733, "grad_norm": 0.03257225453853607, "grad_norm_var": 1.0876178261915443e-06, "learning_rate": 2.7530320427199272e-05, "loss": 2.5963, "step": 7738 }, { "crossentropy": 2.602555513381958, "epoch": 0.6583021435862538, "grad_norm": 0.03137854114174843, "grad_norm_var": 1.1184218306284673e-06, "learning_rate": 2.7320757600028945e-05, "loss": 2.6026, "step": 7739 }, { "crossentropy": 2.691422939300537, "epoch": 0.6583872065328343, "grad_norm": 0.03302328288555145, "grad_norm_var": 1.0665362740707676e-06, "learning_rate": 2.7111993241860644e-05, "loss": 2.6914, "step": 7740 }, { "crossentropy": 2.607349395751953, "epoch": 0.6584722694794147, "grad_norm": 0.03187418356537819, "grad_norm_var": 1.0236739215472527e-06, "learning_rate": 2.6904027386215334e-05, "loss": 2.6073, "step": 7741 }, { "crossentropy": 2.5738635063171387, "epoch": 0.6585573324259952, "grad_norm": 0.03138524293899536, "grad_norm_var": 9.309294740503585e-07, "learning_rate": 2.6696860066487417e-05, "loss": 2.5739, "step": 7742 }, { "crossentropy": 2.602215528488159, "epoch": 0.6586423953725757, "grad_norm": 0.0364239364862442, "grad_norm_var": 1.9879139295237807e-06, "learning_rate": 2.6490491315940835e-05, "loss": 2.6022, "step": 7743 }, { "crossentropy": 2.6278183460235596, "epoch": 0.6587274583191561, "grad_norm": 0.031171467155218124, "grad_norm_var": 2.0395310823819826e-06, "learning_rate": 2.628492116771297e-05, "loss": 2.6278, "step": 7744 }, { "crossentropy": 2.6405649185180664, "epoch": 0.6588125212657366, "grad_norm": 0.03241473808884621, "grad_norm_var": 1.8818948712169172e-06, "learning_rate": 2.6080149654812977e-05, "loss": 2.6406, "step": 7745 }, { "crossentropy": 2.6133811473846436, "epoch": 0.6588975842123171, "grad_norm": 0.03197423368692398, "grad_norm_var": 1.8889596921413716e-06, "learning_rate": 2.587617681012011e-05, "loss": 2.6134, "step": 7746 }, { "crossentropy": 2.6311333179473877, "epoch": 0.6589826471588975, "grad_norm": 0.031681086868047714, "grad_norm_var": 1.8854623426785606e-06, "learning_rate": 2.567300266638706e-05, "loss": 2.6311, "step": 7747 }, { "crossentropy": 2.6517271995544434, "epoch": 0.659067710105478, "grad_norm": 0.03281485289335251, "grad_norm_var": 1.8425251003579042e-06, "learning_rate": 2.547062725623828e-05, "loss": 2.6517, "step": 7748 }, { "crossentropy": 2.685063123703003, "epoch": 0.6591527730520585, "grad_norm": 0.03359770029783249, "grad_norm_var": 1.901572056963345e-06, "learning_rate": 2.5269050612168887e-05, "loss": 2.6851, "step": 7749 }, { "crossentropy": 2.6365582942962646, "epoch": 0.6592378359986389, "grad_norm": 0.0313541442155838, "grad_norm_var": 1.7674807332064413e-06, "learning_rate": 2.5068272766545775e-05, "loss": 2.6366, "step": 7750 }, { "crossentropy": 2.6679861545562744, "epoch": 0.6593228989452194, "grad_norm": 0.030047303065657616, "grad_norm_var": 1.95460339606616e-06, "learning_rate": 2.4868293751609263e-05, "loss": 2.668, "step": 7751 }, { "crossentropy": 2.7132651805877686, "epoch": 0.6594079618918, "grad_norm": 0.031628791242837906, "grad_norm_var": 1.9685151495060176e-06, "learning_rate": 2.4669113599469774e-05, "loss": 2.7133, "step": 7752 }, { "crossentropy": 2.600417375564575, "epoch": 0.6594930248383803, "grad_norm": 0.031982000917196274, "grad_norm_var": 1.9720702169075904e-06, "learning_rate": 2.4470732342108947e-05, "loss": 2.6004, "step": 7753 }, { "crossentropy": 2.551321029663086, "epoch": 0.6595780877849609, "grad_norm": 0.030700061470270157, "grad_norm_var": 2.100145936584233e-06, "learning_rate": 2.4273150011382415e-05, "loss": 2.5513, "step": 7754 }, { "crossentropy": 2.629969596862793, "epoch": 0.6596631507315414, "grad_norm": 0.030759796500205994, "grad_norm_var": 2.1828282337083325e-06, "learning_rate": 2.4076366639015912e-05, "loss": 2.63, "step": 7755 }, { "crossentropy": 2.6034371852874756, "epoch": 0.6597482136781219, "grad_norm": 0.03400643542408943, "grad_norm_var": 2.3705558685708687e-06, "learning_rate": 2.388038225660638e-05, "loss": 2.6034, "step": 7756 }, { "crossentropy": 2.5913259983062744, "epoch": 0.6598332766247023, "grad_norm": 0.033597469329833984, "grad_norm_var": 2.5011752636802984e-06, "learning_rate": 2.3685196895624206e-05, "loss": 2.5913, "step": 7757 }, { "crossentropy": 2.659160614013672, "epoch": 0.6599183395712828, "grad_norm": 0.03397785499691963, "grad_norm_var": 2.6323013041639935e-06, "learning_rate": 2.349081058740987e-05, "loss": 2.6592, "step": 7758 }, { "crossentropy": 2.6700217723846436, "epoch": 0.6600034025178633, "grad_norm": 0.032559216022491455, "grad_norm_var": 1.4836513853451096e-06, "learning_rate": 2.3297223363176744e-05, "loss": 2.67, "step": 7759 }, { "crossentropy": 2.647006034851074, "epoch": 0.6600884654644437, "grad_norm": 0.030598899349570274, "grad_norm_var": 1.5782106541957035e-06, "learning_rate": 2.3104435254008848e-05, "loss": 2.647, "step": 7760 }, { "crossentropy": 2.562295913696289, "epoch": 0.6601735284110242, "grad_norm": 0.031196920201182365, "grad_norm_var": 1.6207572442057296e-06, "learning_rate": 2.2912446290863088e-05, "loss": 2.5623, "step": 7761 }, { "crossentropy": 2.6762635707855225, "epoch": 0.6602585913576047, "grad_norm": 0.033715613186359406, "grad_norm_var": 1.7973812855413452e-06, "learning_rate": 2.2721256504567022e-05, "loss": 2.6763, "step": 7762 }, { "crossentropy": 2.6621010303497314, "epoch": 0.6603436543041851, "grad_norm": 0.03206297755241394, "grad_norm_var": 1.7831985785141431e-06, "learning_rate": 2.253086592581943e-05, "loss": 2.6621, "step": 7763 }, { "crossentropy": 2.6541855335235596, "epoch": 0.6604287172507656, "grad_norm": 0.031054601073265076, "grad_norm_var": 1.8237471383569417e-06, "learning_rate": 2.2341274585192527e-05, "loss": 2.6542, "step": 7764 }, { "crossentropy": 2.652684211730957, "epoch": 0.6605137801973461, "grad_norm": 0.031657394021749496, "grad_norm_var": 1.6592880258677605e-06, "learning_rate": 2.215248251312918e-05, "loss": 2.6527, "step": 7765 }, { "crossentropy": 2.4887115955352783, "epoch": 0.6605988431439265, "grad_norm": 0.03256770595908165, "grad_norm_var": 1.657958588758288e-06, "learning_rate": 2.1964489739944028e-05, "loss": 2.4887, "step": 7766 }, { "crossentropy": 2.571486711502075, "epoch": 0.660683906090507, "grad_norm": 0.031992316246032715, "grad_norm_var": 1.3861658116468e-06, "learning_rate": 2.177729629582237e-05, "loss": 2.5715, "step": 7767 }, { "crossentropy": 2.619443416595459, "epoch": 0.6607689690370875, "grad_norm": 0.031098898500204086, "grad_norm_var": 1.4390296301729222e-06, "learning_rate": 2.159090221082294e-05, "loss": 2.6194, "step": 7768 }, { "crossentropy": 2.636227607727051, "epoch": 0.6608540319836679, "grad_norm": 0.03403879702091217, "grad_norm_var": 1.672301583669801e-06, "learning_rate": 2.140530751487513e-05, "loss": 2.6362, "step": 7769 }, { "crossentropy": 2.521533966064453, "epoch": 0.6609390949302484, "grad_norm": 0.031868305057287216, "grad_norm_var": 1.5202143138404526e-06, "learning_rate": 2.1220512237779544e-05, "loss": 2.5215, "step": 7770 }, { "crossentropy": 2.5518271923065186, "epoch": 0.6610241578768289, "grad_norm": 0.02999194525182247, "grad_norm_var": 1.7144508665845598e-06, "learning_rate": 2.1036516409210227e-05, "loss": 2.5518, "step": 7771 }, { "crossentropy": 2.604167938232422, "epoch": 0.6611092208234093, "grad_norm": 0.03462501987814903, "grad_norm_var": 1.8833089630743005e-06, "learning_rate": 2.0853320058710212e-05, "loss": 2.6042, "step": 7772 }, { "crossentropy": 2.552304744720459, "epoch": 0.6611942837699898, "grad_norm": 0.03297843784093857, "grad_norm_var": 1.799157618898372e-06, "learning_rate": 2.067092321569597e-05, "loss": 2.5523, "step": 7773 }, { "crossentropy": 2.636570453643799, "epoch": 0.6612793467165703, "grad_norm": 0.03210744634270668, "grad_norm_var": 1.586668084216008e-06, "learning_rate": 2.0489325909455737e-05, "loss": 2.6366, "step": 7774 }, { "crossentropy": 2.700007200241089, "epoch": 0.6613644096631507, "grad_norm": 0.03158284351229668, "grad_norm_var": 1.5906535633368092e-06, "learning_rate": 2.030852816914841e-05, "loss": 2.7, "step": 7775 }, { "crossentropy": 2.5739643573760986, "epoch": 0.6614494726097312, "grad_norm": 0.03184986487030983, "grad_norm_var": 1.4428990025645733e-06, "learning_rate": 2.012853002380466e-05, "loss": 2.574, "step": 7776 }, { "crossentropy": 2.6904964447021484, "epoch": 0.6615345355563117, "grad_norm": 0.03234401345252991, "grad_norm_var": 1.379472738035125e-06, "learning_rate": 1.9949331502327472e-05, "loss": 2.6905, "step": 7777 }, { "crossentropy": 2.503685235977173, "epoch": 0.6616195985028921, "grad_norm": 0.031523533165454865, "grad_norm_var": 1.242960387369873e-06, "learning_rate": 1.9770932633491057e-05, "loss": 2.5037, "step": 7778 }, { "crossentropy": 2.6795356273651123, "epoch": 0.6617046614494726, "grad_norm": 0.03337070345878601, "grad_norm_var": 1.3461779441863252e-06, "learning_rate": 1.9593333445940276e-05, "loss": 2.6795, "step": 7779 }, { "crossentropy": 2.620964527130127, "epoch": 0.6617897243960531, "grad_norm": 0.03161393478512764, "grad_norm_var": 1.2828650706047379e-06, "learning_rate": 1.9416533968193428e-05, "loss": 2.621, "step": 7780 }, { "crossentropy": 2.6203315258026123, "epoch": 0.6618747873426335, "grad_norm": 0.03251408040523529, "grad_norm_var": 1.2666757887053037e-06, "learning_rate": 1.9240534228638363e-05, "loss": 2.6203, "step": 7781 }, { "crossentropy": 2.5781185626983643, "epoch": 0.661959850289214, "grad_norm": 0.03223109617829323, "grad_norm_var": 1.2596886778222603e-06, "learning_rate": 1.9065334255536916e-05, "loss": 2.5781, "step": 7782 }, { "crossentropy": 2.6949894428253174, "epoch": 0.6620449132357945, "grad_norm": 0.03175480291247368, "grad_norm_var": 1.2708429492726125e-06, "learning_rate": 1.8890934077019915e-05, "loss": 2.695, "step": 7783 }, { "crossentropy": 2.638240098953247, "epoch": 0.662129976182375, "grad_norm": 0.032067880034446716, "grad_norm_var": 1.1848944073192808e-06, "learning_rate": 1.8717333721091635e-05, "loss": 2.6382, "step": 7784 }, { "crossentropy": 2.621481418609619, "epoch": 0.6622150391289554, "grad_norm": 0.03192831948399544, "grad_norm_var": 9.680522207535508e-07, "learning_rate": 1.8544533215627546e-05, "loss": 2.6215, "step": 7785 }, { "crossentropy": 2.5219719409942627, "epoch": 0.6623001020755359, "grad_norm": 0.03165871649980545, "grad_norm_var": 9.785862462623745e-07, "learning_rate": 1.8372532588373237e-05, "loss": 2.522, "step": 7786 }, { "crossentropy": 2.6990559101104736, "epoch": 0.6623851650221164, "grad_norm": 0.032347917556762695, "grad_norm_var": 6.526429406637317e-07, "learning_rate": 1.8201331866948833e-05, "loss": 2.6991, "step": 7787 }, { "crossentropy": 2.554176092147827, "epoch": 0.6624702279686968, "grad_norm": 0.030587799847126007, "grad_norm_var": 4.096508275504169e-07, "learning_rate": 1.803093107884235e-05, "loss": 2.5542, "step": 7788 }, { "crossentropy": 2.5901222229003906, "epoch": 0.6625552909152773, "grad_norm": 0.03181048110127449, "grad_norm_var": 3.470294686074941e-07, "learning_rate": 1.7861330251416895e-05, "loss": 2.5901, "step": 7789 }, { "crossentropy": 2.654818058013916, "epoch": 0.6626403538618578, "grad_norm": 0.031926609575748444, "grad_norm_var": 3.454178632085354e-07, "learning_rate": 1.769252941190458e-05, "loss": 2.6548, "step": 7790 }, { "crossentropy": 2.5783443450927734, "epoch": 0.6627254168084382, "grad_norm": 0.03167616203427315, "grad_norm_var": 3.41461770879808e-07, "learning_rate": 1.752452858740983e-05, "loss": 2.5783, "step": 7791 }, { "crossentropy": 2.6048057079315186, "epoch": 0.6628104797550187, "grad_norm": 0.03255792707204819, "grad_norm_var": 3.633077855555907e-07, "learning_rate": 1.735732780490884e-05, "loss": 2.6048, "step": 7792 }, { "crossentropy": 2.584041118621826, "epoch": 0.6628955427015992, "grad_norm": 0.03070679120719433, "grad_norm_var": 4.5456831574730514e-07, "learning_rate": 1.719092709125014e-05, "loss": 2.584, "step": 7793 }, { "crossentropy": 2.546950101852417, "epoch": 0.6629806056481796, "grad_norm": 0.03054039552807808, "grad_norm_var": 5.633177328140711e-07, "learning_rate": 1.7025326473151248e-05, "loss": 2.547, "step": 7794 }, { "crossentropy": 2.6378369331359863, "epoch": 0.6630656685947601, "grad_norm": 0.03197941929101944, "grad_norm_var": 3.9864758330312017e-07, "learning_rate": 1.6860525977204778e-05, "loss": 2.6378, "step": 7795 }, { "crossentropy": 2.5263888835906982, "epoch": 0.6631507315413406, "grad_norm": 0.03418037295341492, "grad_norm_var": 7.658387846728633e-07, "learning_rate": 1.669652562987123e-05, "loss": 2.5264, "step": 7796 }, { "crossentropy": 2.6519036293029785, "epoch": 0.663235794487921, "grad_norm": 0.032077230513095856, "grad_norm_var": 7.422483786129052e-07, "learning_rate": 1.6533325457485095e-05, "loss": 2.6519, "step": 7797 }, { "crossentropy": 2.6288442611694336, "epoch": 0.6633208574345015, "grad_norm": 0.031314916908741, "grad_norm_var": 7.514538994608193e-07, "learning_rate": 1.6370925486252075e-05, "loss": 2.6288, "step": 7798 }, { "crossentropy": 2.5742642879486084, "epoch": 0.663405920381082, "grad_norm": 0.030712377279996872, "grad_norm_var": 8.283943638836041e-07, "learning_rate": 1.620932574224798e-05, "loss": 2.5743, "step": 7799 }, { "crossentropy": 2.644106149673462, "epoch": 0.6634909833276624, "grad_norm": 0.03388688713312149, "grad_norm_var": 1.1111789795231376e-06, "learning_rate": 1.60485262514215e-05, "loss": 2.6441, "step": 7800 }, { "crossentropy": 2.5678930282592773, "epoch": 0.6635760462742429, "grad_norm": 0.032185621559619904, "grad_norm_var": 1.1173768576092408e-06, "learning_rate": 1.588852703959254e-05, "loss": 2.5679, "step": 7801 }, { "crossentropy": 2.613945245742798, "epoch": 0.6636611092208234, "grad_norm": 0.03190985694527626, "grad_norm_var": 1.1137633469096662e-06, "learning_rate": 1.5729328132452204e-05, "loss": 2.6139, "step": 7802 }, { "crossentropy": 2.611565113067627, "epoch": 0.6637461721674038, "grad_norm": 0.030849825590848923, "grad_norm_var": 1.164570828112452e-06, "learning_rate": 1.5570929555563384e-05, "loss": 2.6116, "step": 7803 }, { "crossentropy": 2.5200235843658447, "epoch": 0.6638312351139843, "grad_norm": 0.03139723837375641, "grad_norm_var": 1.074000808464837e-06, "learning_rate": 1.541333133436018e-05, "loss": 2.52, "step": 7804 }, { "crossentropy": 2.6197454929351807, "epoch": 0.6639162980605648, "grad_norm": 0.030361749231815338, "grad_norm_var": 1.2141644635002876e-06, "learning_rate": 1.5256533494148461e-05, "loss": 2.6197, "step": 7805 }, { "crossentropy": 2.5697569847106934, "epoch": 0.6640013610071452, "grad_norm": 0.03060077130794525, "grad_norm_var": 1.2957191507534006e-06, "learning_rate": 1.5100536060105308e-05, "loss": 2.5698, "step": 7806 }, { "crossentropy": 2.5882935523986816, "epoch": 0.6640864239537257, "grad_norm": 0.031668078154325485, "grad_norm_var": 1.295731248244812e-06, "learning_rate": 1.4945339057279016e-05, "loss": 2.5883, "step": 7807 }, { "crossentropy": 2.613172769546509, "epoch": 0.6641714869003063, "grad_norm": 0.03130887821316719, "grad_norm_var": 1.2475439044032683e-06, "learning_rate": 1.4790942510590765e-05, "loss": 2.6132, "step": 7808 }, { "crossentropy": 2.6116037368774414, "epoch": 0.6642565498468866, "grad_norm": 0.03268048167228699, "grad_norm_var": 1.2546315520593301e-06, "learning_rate": 1.4637346444831279e-05, "loss": 2.6116, "step": 7809 }, { "crossentropy": 2.5989043712615967, "epoch": 0.6643416127934672, "grad_norm": 0.0326429083943367, "grad_norm_var": 1.1978825272359582e-06, "learning_rate": 1.448455088466416e-05, "loss": 2.5989, "step": 7810 }, { "crossentropy": 2.6639904975891113, "epoch": 0.6644266757400477, "grad_norm": 0.03175530955195427, "grad_norm_var": 1.1974468726747475e-06, "learning_rate": 1.4332555854623675e-05, "loss": 2.664, "step": 7811 }, { "crossentropy": 2.498948335647583, "epoch": 0.664511738686628, "grad_norm": 0.03445041552186012, "grad_norm_var": 1.286063102939825e-06, "learning_rate": 1.4181361379115853e-05, "loss": 2.4989, "step": 7812 }, { "crossentropy": 2.6152589321136475, "epoch": 0.6645968016332086, "grad_norm": 0.030483299866318703, "grad_norm_var": 1.3992499511006196e-06, "learning_rate": 1.4030967482418499e-05, "loss": 2.6153, "step": 7813 }, { "crossentropy": 2.600602149963379, "epoch": 0.6646818645797891, "grad_norm": 0.03234660252928734, "grad_norm_var": 1.4041306579335207e-06, "learning_rate": 1.3881374188680073e-05, "loss": 2.6006, "step": 7814 }, { "crossentropy": 2.593952178955078, "epoch": 0.6647669275263696, "grad_norm": 0.031595055013895035, "grad_norm_var": 1.3215842409547516e-06, "learning_rate": 1.3732581521921916e-05, "loss": 2.594, "step": 7815 }, { "crossentropy": 2.6655759811401367, "epoch": 0.66485199047295, "grad_norm": 0.03218228742480278, "grad_norm_var": 1.047673289178647e-06, "learning_rate": 1.3584589506034361e-05, "loss": 2.6656, "step": 7816 }, { "crossentropy": 2.605834722518921, "epoch": 0.6649370534195305, "grad_norm": 0.03248409181833267, "grad_norm_var": 1.0695364658065553e-06, "learning_rate": 1.3437398164781733e-05, "loss": 2.6058, "step": 7817 }, { "crossentropy": 2.623103141784668, "epoch": 0.665022116366111, "grad_norm": 0.03144831955432892, "grad_norm_var": 1.0757697933395087e-06, "learning_rate": 1.3291007521799014e-05, "loss": 2.6231, "step": 7818 }, { "crossentropy": 2.5818023681640625, "epoch": 0.6651071793126914, "grad_norm": 0.03230933099985123, "grad_norm_var": 1.0306247101693564e-06, "learning_rate": 1.3145417600590737e-05, "loss": 2.5818, "step": 7819 }, { "crossentropy": 2.5738956928253174, "epoch": 0.6651922422592719, "grad_norm": 0.03191693499684334, "grad_norm_var": 1.0156345852120694e-06, "learning_rate": 1.3000628424535976e-05, "loss": 2.5739, "step": 7820 }, { "crossentropy": 2.603686809539795, "epoch": 0.6652773052058524, "grad_norm": 0.032190952450037, "grad_norm_var": 8.521114025130388e-07, "learning_rate": 1.2856640016883358e-05, "loss": 2.6037, "step": 7821 }, { "crossentropy": 2.68336820602417, "epoch": 0.6653623681524328, "grad_norm": 0.032347533851861954, "grad_norm_var": 7.159999173672691e-07, "learning_rate": 1.2713452400752722e-05, "loss": 2.6834, "step": 7822 }, { "crossentropy": 2.6325910091400146, "epoch": 0.6654474310990133, "grad_norm": 0.03112676553428173, "grad_norm_var": 7.664370554271692e-07, "learning_rate": 1.2571065599136233e-05, "loss": 2.6326, "step": 7823 }, { "crossentropy": 2.594977617263794, "epoch": 0.6655324940455938, "grad_norm": 0.03088507056236267, "grad_norm_var": 8.21198913780972e-07, "learning_rate": 1.2429479634897267e-05, "loss": 2.595, "step": 7824 }, { "crossentropy": 2.6643717288970947, "epoch": 0.6656175569921742, "grad_norm": 0.03169761225581169, "grad_norm_var": 7.993233006490958e-07, "learning_rate": 1.2288694530769862e-05, "loss": 2.6644, "step": 7825 }, { "crossentropy": 2.606196641921997, "epoch": 0.6657026199387547, "grad_norm": 0.03181295096874237, "grad_norm_var": 7.702791944294921e-07, "learning_rate": 1.2148710309360934e-05, "loss": 2.6062, "step": 7826 }, { "crossentropy": 2.6046695709228516, "epoch": 0.6657876828853352, "grad_norm": 0.03150593861937523, "grad_norm_var": 7.802911507282309e-07, "learning_rate": 1.2009526993147501e-05, "loss": 2.6047, "step": 7827 }, { "crossentropy": 2.545795202255249, "epoch": 0.6658727458319156, "grad_norm": 0.03171445056796074, "grad_norm_var": 3.264914812473388e-07, "learning_rate": 1.1871144604478357e-05, "loss": 2.5458, "step": 7828 }, { "crossentropy": 2.51562237739563, "epoch": 0.6659578087784961, "grad_norm": 0.030515456572175026, "grad_norm_var": 3.211124082871363e-07, "learning_rate": 1.1733563165573502e-05, "loss": 2.5156, "step": 7829 }, { "crossentropy": 2.5819122791290283, "epoch": 0.6660428717250766, "grad_norm": 0.0300491601228714, "grad_norm_var": 4.6976710389839044e-07, "learning_rate": 1.159678269852471e-05, "loss": 2.5819, "step": 7830 }, { "crossentropy": 2.6360058784484863, "epoch": 0.666127934671657, "grad_norm": 0.031402587890625, "grad_norm_var": 4.725009943547127e-07, "learning_rate": 1.1460803225295524e-05, "loss": 2.636, "step": 7831 }, { "crossentropy": 2.4986860752105713, "epoch": 0.6662129976182375, "grad_norm": 0.03195013105869293, "grad_norm_var": 4.578248778366415e-07, "learning_rate": 1.1325624767719589e-05, "loss": 2.4987, "step": 7832 }, { "crossentropy": 2.7325525283813477, "epoch": 0.666298060564818, "grad_norm": 0.031720031052827835, "grad_norm_var": 4.0269964121693546e-07, "learning_rate": 1.1191247347502875e-05, "loss": 2.7326, "step": 7833 }, { "crossentropy": 2.520702838897705, "epoch": 0.6663831235113984, "grad_norm": 0.0312434583902359, "grad_norm_var": 4.077470326347213e-07, "learning_rate": 1.1057670986222569e-05, "loss": 2.5207, "step": 7834 }, { "crossentropy": 2.5918896198272705, "epoch": 0.6664681864579789, "grad_norm": 0.0318688228726387, "grad_norm_var": 3.737650641372561e-07, "learning_rate": 1.0924895705327065e-05, "loss": 2.5919, "step": 7835 }, { "crossentropy": 2.603161573410034, "epoch": 0.6665532494045594, "grad_norm": 0.031740181148052216, "grad_norm_var": 3.658148994410693e-07, "learning_rate": 1.0792921526136535e-05, "loss": 2.6032, "step": 7836 }, { "crossentropy": 2.5537571907043457, "epoch": 0.6666383123511398, "grad_norm": 0.0310058556497097, "grad_norm_var": 3.4215336259848145e-07, "learning_rate": 1.0661748469842358e-05, "loss": 2.5538, "step": 7837 }, { "crossentropy": 2.4723331928253174, "epoch": 0.6667233752977203, "grad_norm": 0.033313218504190445, "grad_norm_var": 5.209431918622627e-07, "learning_rate": 1.0531376557506577e-05, "loss": 2.4723, "step": 7838 }, { "crossentropy": 2.585508346557617, "epoch": 0.6668084382443008, "grad_norm": 0.03167120739817619, "grad_norm_var": 5.144093031187683e-07, "learning_rate": 1.0401805810063559e-05, "loss": 2.5855, "step": 7839 }, { "crossentropy": 2.68430495262146, "epoch": 0.6668935011908812, "grad_norm": 0.03244490548968315, "grad_norm_var": 5.373357220215715e-07, "learning_rate": 1.0273036248318323e-05, "loss": 2.6843, "step": 7840 }, { "crossentropy": 2.551060676574707, "epoch": 0.6669785641374617, "grad_norm": 0.03139165788888931, "grad_norm_var": 5.393469391114322e-07, "learning_rate": 1.0145067892947668e-05, "loss": 2.5511, "step": 7841 }, { "crossentropy": 2.6215178966522217, "epoch": 0.6670636270840422, "grad_norm": 0.03183676302433014, "grad_norm_var": 5.401080898256485e-07, "learning_rate": 1.0017900764500154e-05, "loss": 2.6215, "step": 7842 }, { "crossentropy": 2.5625858306884766, "epoch": 0.6671486900306227, "grad_norm": 0.030605848878622055, "grad_norm_var": 6.003352050604754e-07, "learning_rate": 9.891534883394449e-06, "loss": 2.5626, "step": 7843 }, { "crossentropy": 2.5803470611572266, "epoch": 0.6672337529772031, "grad_norm": 0.02972927689552307, "grad_norm_var": 7.977165561009649e-07, "learning_rate": 9.765970269921542e-06, "loss": 2.5803, "step": 7844 }, { "crossentropy": 2.6800198554992676, "epoch": 0.6673188159237836, "grad_norm": 0.03140232339501381, "grad_norm_var": 7.416240290745886e-07, "learning_rate": 9.64120694424364e-06, "loss": 2.68, "step": 7845 }, { "crossentropy": 2.555175304412842, "epoch": 0.6674038788703641, "grad_norm": 0.03195934370160103, "grad_norm_var": 6.101000761739305e-07, "learning_rate": 9.517244926393608e-06, "loss": 2.5552, "step": 7846 }, { "crossentropy": 2.702087879180908, "epoch": 0.6674889418169445, "grad_norm": 0.03242890536785126, "grad_norm_var": 6.516075544154658e-07, "learning_rate": 9.39408423627719e-06, "loss": 2.7021, "step": 7847 }, { "crossentropy": 2.604024887084961, "epoch": 0.667574004763525, "grad_norm": 0.03197946026921272, "grad_norm_var": 6.528565230325943e-07, "learning_rate": 9.27172489366912e-06, "loss": 2.604, "step": 7848 }, { "crossentropy": 2.583162307739258, "epoch": 0.6676590677101055, "grad_norm": 0.029662948101758957, "grad_norm_var": 8.97116013417037e-07, "learning_rate": 9.150166918218128e-06, "loss": 2.5832, "step": 7849 }, { "crossentropy": 2.5891010761260986, "epoch": 0.6677441306566859, "grad_norm": 0.030688541010022163, "grad_norm_var": 9.366572205197667e-07, "learning_rate": 9.029410329441379e-06, "loss": 2.5891, "step": 7850 }, { "crossentropy": 2.7242417335510254, "epoch": 0.6678291936032664, "grad_norm": 0.03161952272057533, "grad_norm_var": 9.277194875851088e-07, "learning_rate": 8.909455146730583e-06, "loss": 2.7242, "step": 7851 }, { "crossentropy": 2.6022050380706787, "epoch": 0.6679142565498469, "grad_norm": 0.031894560903310776, "grad_norm_var": 9.348219689167634e-07, "learning_rate": 8.79030138934589e-06, "loss": 2.6022, "step": 7852 }, { "crossentropy": 2.5323305130004883, "epoch": 0.6679993194964273, "grad_norm": 0.030988622456789017, "grad_norm_var": 9.359234424801636e-07, "learning_rate": 8.671949076420883e-06, "loss": 2.5323, "step": 7853 }, { "crossentropy": 2.7267863750457764, "epoch": 0.6680843824430078, "grad_norm": 0.03356500715017319, "grad_norm_var": 1.0015622348961366e-06, "learning_rate": 8.554398226958138e-06, "loss": 2.7268, "step": 7854 }, { "crossentropy": 2.529902219772339, "epoch": 0.6681694453895883, "grad_norm": 0.031305938959121704, "grad_norm_var": 1.0011637561703102e-06, "learning_rate": 8.437648859834224e-06, "loss": 2.5299, "step": 7855 }, { "crossentropy": 2.6677610874176025, "epoch": 0.6682545083361687, "grad_norm": 0.03060532920062542, "grad_norm_var": 9.732935705610937e-07, "learning_rate": 8.32170099379581e-06, "loss": 2.6678, "step": 7856 }, { "crossentropy": 2.5149788856506348, "epoch": 0.6683395712827492, "grad_norm": 0.03330459073185921, "grad_norm_var": 1.2116047135773387e-06, "learning_rate": 8.206554647459674e-06, "loss": 2.515, "step": 7857 }, { "crossentropy": 2.6004860401153564, "epoch": 0.6684246342293297, "grad_norm": 0.03095921501517296, "grad_norm_var": 1.2172384580497186e-06, "learning_rate": 8.092209839315467e-06, "loss": 2.6005, "step": 7858 }, { "crossentropy": 2.5517752170562744, "epoch": 0.6685096971759101, "grad_norm": 0.03191402181982994, "grad_norm_var": 1.1824131976520547e-06, "learning_rate": 7.978666587724059e-06, "loss": 2.5518, "step": 7859 }, { "crossentropy": 2.572030544281006, "epoch": 0.6685947601224906, "grad_norm": 0.03167475014925003, "grad_norm_var": 9.595247372450044e-07, "learning_rate": 7.865924910916978e-06, "loss": 2.572, "step": 7860 }, { "crossentropy": 2.6232597827911377, "epoch": 0.6686798230690711, "grad_norm": 0.03296169638633728, "grad_norm_var": 1.065814079775386e-06, "learning_rate": 7.753984826997518e-06, "loss": 2.6233, "step": 7861 }, { "crossentropy": 2.6292715072631836, "epoch": 0.6687648860156515, "grad_norm": 0.03303045779466629, "grad_norm_var": 1.1717686946098594e-06, "learning_rate": 7.642846353939637e-06, "loss": 2.6293, "step": 7862 }, { "crossentropy": 2.6410326957702637, "epoch": 0.668849948962232, "grad_norm": 0.03170837089419365, "grad_norm_var": 1.142497526241716e-06, "learning_rate": 7.532509509589058e-06, "loss": 2.641, "step": 7863 }, { "crossentropy": 2.64814829826355, "epoch": 0.6689350119088125, "grad_norm": 0.031591013073921204, "grad_norm_var": 1.1396004327419532e-06, "learning_rate": 7.422974311662722e-06, "loss": 2.6481, "step": 7864 }, { "crossentropy": 2.5648109912872314, "epoch": 0.6690200748553929, "grad_norm": 0.031171534210443497, "grad_norm_var": 8.686455330200275e-07, "learning_rate": 7.314240777749337e-06, "loss": 2.5648, "step": 7865 }, { "crossentropy": 2.63616943359375, "epoch": 0.6691051378019734, "grad_norm": 0.031224433332681656, "grad_norm_var": 8.063600013741377e-07, "learning_rate": 7.206308925307159e-06, "loss": 2.6362, "step": 7866 }, { "crossentropy": 2.550806760787964, "epoch": 0.669190200748554, "grad_norm": 0.03320677950978279, "grad_norm_var": 9.16115166245734e-07, "learning_rate": 7.099178771668435e-06, "loss": 2.5508, "step": 7867 }, { "crossentropy": 2.5754785537719727, "epoch": 0.6692752636951343, "grad_norm": 0.032327037304639816, "grad_norm_var": 9.249457068188077e-07, "learning_rate": 6.992850334033851e-06, "loss": 2.5755, "step": 7868 }, { "crossentropy": 2.6216773986816406, "epoch": 0.6693603266417149, "grad_norm": 0.032174184918403625, "grad_norm_var": 8.574761353420755e-07, "learning_rate": 6.887323629477527e-06, "loss": 2.6217, "step": 7869 }, { "crossentropy": 2.476954936981201, "epoch": 0.6694453895882954, "grad_norm": 0.03163076192140579, "grad_norm_var": 6.993690809476884e-07, "learning_rate": 6.7825986749436855e-06, "loss": 2.477, "step": 7870 }, { "crossentropy": 2.587143659591675, "epoch": 0.6695304525348758, "grad_norm": 0.03157857060432434, "grad_norm_var": 6.815336220064669e-07, "learning_rate": 6.67867548724832e-06, "loss": 2.5871, "step": 7871 }, { "crossentropy": 2.629405975341797, "epoch": 0.6696155154814563, "grad_norm": 0.03153779357671738, "grad_norm_var": 5.697622500383526e-07, "learning_rate": 6.5755540830780835e-06, "loss": 2.6294, "step": 7872 }, { "crossentropy": 2.602827548980713, "epoch": 0.6697005784280368, "grad_norm": 0.030622083693742752, "grad_norm_var": 5.527856090194645e-07, "learning_rate": 6.473234478991952e-06, "loss": 2.6028, "step": 7873 }, { "crossentropy": 2.643470287322998, "epoch": 0.6697856413746173, "grad_norm": 0.030997617170214653, "grad_norm_var": 5.484086440981588e-07, "learning_rate": 6.371716691419005e-06, "loss": 2.6435, "step": 7874 }, { "crossentropy": 2.632204294204712, "epoch": 0.6698707043211977, "grad_norm": 0.030635979026556015, "grad_norm_var": 6.369352410271241e-07, "learning_rate": 6.271000736660648e-06, "loss": 2.6322, "step": 7875 }, { "crossentropy": 2.6204216480255127, "epoch": 0.6699557672677782, "grad_norm": 0.030977359041571617, "grad_norm_var": 6.747541475971934e-07, "learning_rate": 6.171086630887835e-06, "loss": 2.6204, "step": 7876 }, { "crossentropy": 2.5468246936798096, "epoch": 0.6700408302143587, "grad_norm": 0.030500099062919617, "grad_norm_var": 6.429689814730725e-07, "learning_rate": 6.071974390144952e-06, "loss": 2.5468, "step": 7877 }, { "crossentropy": 2.5650618076324463, "epoch": 0.6701258931609391, "grad_norm": 0.03155258297920227, "grad_norm_var": 4.891568390205196e-07, "learning_rate": 5.973664030346493e-06, "loss": 2.5651, "step": 7878 }, { "crossentropy": 2.6564369201660156, "epoch": 0.6702109561075196, "grad_norm": 0.03261779993772507, "grad_norm_var": 5.703874338564355e-07, "learning_rate": 5.876155567277608e-06, "loss": 2.6564, "step": 7879 }, { "crossentropy": 2.582453966140747, "epoch": 0.6702960190541001, "grad_norm": 0.03329925984144211, "grad_norm_var": 7.685786590697422e-07, "learning_rate": 5.7794490165957725e-06, "loss": 2.5825, "step": 7880 }, { "crossentropy": 2.588158130645752, "epoch": 0.6703810820006805, "grad_norm": 0.0317351259291172, "grad_norm_var": 7.541019093142952e-07, "learning_rate": 5.683544393829121e-06, "loss": 2.5882, "step": 7881 }, { "crossentropy": 2.6538546085357666, "epoch": 0.670466144947261, "grad_norm": 0.03360839933156967, "grad_norm_var": 9.697159486912742e-07, "learning_rate": 5.5884417143775565e-06, "loss": 2.6539, "step": 7882 }, { "crossentropy": 2.5776257514953613, "epoch": 0.6705512078938415, "grad_norm": 0.031594324856996536, "grad_norm_var": 8.324741765777799e-07, "learning_rate": 5.494140993511643e-06, "loss": 2.5776, "step": 7883 }, { "crossentropy": 2.5698089599609375, "epoch": 0.6706362708404219, "grad_norm": 0.031154531985521317, "grad_norm_var": 8.222164377340302e-07, "learning_rate": 5.400642246373155e-06, "loss": 2.5698, "step": 7884 }, { "crossentropy": 2.622774124145508, "epoch": 0.6707213337870024, "grad_norm": 0.031157605350017548, "grad_norm_var": 8.142012326115e-07, "learning_rate": 5.307945487975086e-06, "loss": 2.6228, "step": 7885 }, { "crossentropy": 2.6405270099639893, "epoch": 0.6708063967335829, "grad_norm": 0.031737782061100006, "grad_norm_var": 8.157128452465634e-07, "learning_rate": 5.216050733202748e-06, "loss": 2.6405, "step": 7886 }, { "crossentropy": 2.7071995735168457, "epoch": 0.6708914596801633, "grad_norm": 0.031755730509757996, "grad_norm_var": 8.176009483620849e-07, "learning_rate": 5.124957996811008e-06, "loss": 2.7072, "step": 7887 }, { "crossentropy": 2.669809103012085, "epoch": 0.6709765226267438, "grad_norm": 0.031393323093652725, "grad_norm_var": 8.199641312109721e-07, "learning_rate": 5.034667293427053e-06, "loss": 2.6698, "step": 7888 }, { "crossentropy": 2.605922222137451, "epoch": 0.6710615855733243, "grad_norm": 0.03285025432705879, "grad_norm_var": 8.445671267453524e-07, "learning_rate": 4.945178637548731e-06, "loss": 2.6059, "step": 7889 }, { "crossentropy": 2.6596457958221436, "epoch": 0.6711466485199047, "grad_norm": 0.03202785551548004, "grad_norm_var": 8.112637090347859e-07, "learning_rate": 4.856492043545102e-06, "loss": 2.6596, "step": 7890 }, { "crossentropy": 2.501274347305298, "epoch": 0.6712317114664852, "grad_norm": 0.03066674992442131, "grad_norm_var": 8.065989520293492e-07, "learning_rate": 4.768607525657553e-06, "loss": 2.5013, "step": 7891 }, { "crossentropy": 2.6365888118743896, "epoch": 0.6713167744130657, "grad_norm": 0.032164543867111206, "grad_norm_var": 7.661639029319285e-07, "learning_rate": 4.6815250979970195e-06, "loss": 2.6366, "step": 7892 }, { "crossentropy": 2.74845814704895, "epoch": 0.6714018373596461, "grad_norm": 0.0321822352707386, "grad_norm_var": 6.372230880499184e-07, "learning_rate": 4.595244774547313e-06, "loss": 2.7485, "step": 7893 }, { "crossentropy": 2.5112812519073486, "epoch": 0.6714869003062266, "grad_norm": 0.03178185597062111, "grad_norm_var": 6.277899762365442e-07, "learning_rate": 4.509766569161244e-06, "loss": 2.5113, "step": 7894 }, { "crossentropy": 2.597564220428467, "epoch": 0.6715719632528071, "grad_norm": 0.031223025172948837, "grad_norm_var": 6.313163097769354e-07, "learning_rate": 4.425090495565609e-06, "loss": 2.5976, "step": 7895 }, { "crossentropy": 2.6025872230529785, "epoch": 0.6716570261993875, "grad_norm": 0.031227249652147293, "grad_norm_var": 5.119084684932866e-07, "learning_rate": 4.341216567355643e-06, "loss": 2.6026, "step": 7896 }, { "crossentropy": 2.5435726642608643, "epoch": 0.671742089145968, "grad_norm": 0.03174606338143349, "grad_norm_var": 5.118705021299001e-07, "learning_rate": 4.258144798000019e-06, "loss": 2.5436, "step": 7897 }, { "crossentropy": 2.616652727127075, "epoch": 0.6718271520925485, "grad_norm": 0.031838323920965195, "grad_norm_var": 2.7309775122204516e-07, "learning_rate": 4.175875200836954e-06, "loss": 2.6167, "step": 7898 }, { "crossentropy": 2.639899492263794, "epoch": 0.6719122150391289, "grad_norm": 0.03181115537881851, "grad_norm_var": 2.742432881284081e-07, "learning_rate": 4.094407789077548e-06, "loss": 2.6399, "step": 7899 }, { "crossentropy": 2.6748812198638916, "epoch": 0.6719972779857094, "grad_norm": 0.03215387836098671, "grad_norm_var": 2.6799174531721054e-07, "learning_rate": 4.013742575801893e-06, "loss": 2.6749, "step": 7900 }, { "crossentropy": 2.573495626449585, "epoch": 0.6720823409322899, "grad_norm": 0.031109487637877464, "grad_norm_var": 2.7182385153970367e-07, "learning_rate": 3.933879573963517e-06, "loss": 2.5735, "step": 7901 }, { "crossentropy": 2.6300086975097656, "epoch": 0.6721674038788704, "grad_norm": 0.0314929224550724, "grad_norm_var": 2.7529565082769157e-07, "learning_rate": 3.854818796385496e-06, "loss": 2.63, "step": 7902 }, { "crossentropy": 2.6054046154022217, "epoch": 0.6722524668254508, "grad_norm": 0.031120529398322105, "grad_norm_var": 2.969823410888898e-07, "learning_rate": 3.7765602557626735e-06, "loss": 2.6054, "step": 7903 }, { "crossentropy": 2.5391712188720703, "epoch": 0.6723375297720313, "grad_norm": 0.031393617391586304, "grad_norm_var": 2.9697131944343007e-07, "learning_rate": 3.699103964661665e-06, "loss": 2.5392, "step": 7904 }, { "crossentropy": 2.5588061809539795, "epoch": 0.6724225927186118, "grad_norm": 0.031140895560383797, "grad_norm_var": 2.1158698503089857e-07, "learning_rate": 3.6224499355197447e-06, "loss": 2.5588, "step": 7905 }, { "crossentropy": 2.5962352752685547, "epoch": 0.6725076556651922, "grad_norm": 0.030842596665024757, "grad_norm_var": 2.266411748857421e-07, "learning_rate": 3.5465981806442895e-06, "loss": 2.5962, "step": 7906 }, { "crossentropy": 2.591296672821045, "epoch": 0.6725927186117727, "grad_norm": 0.03152317553758621, "grad_norm_var": 1.7808227711982315e-07, "learning_rate": 3.4715487122161105e-06, "loss": 2.5913, "step": 7907 }, { "crossentropy": 2.5003676414489746, "epoch": 0.6726777815583532, "grad_norm": 0.03194679319858551, "grad_norm_var": 1.6311551837080994e-07, "learning_rate": 3.397301542285569e-06, "loss": 2.5004, "step": 7908 }, { "crossentropy": 2.593186378479004, "epoch": 0.6727628445049336, "grad_norm": 0.03186061978340149, "grad_norm_var": 1.4175530993424528e-07, "learning_rate": 3.32385668277424e-06, "loss": 2.5932, "step": 7909 }, { "crossentropy": 2.574021339416504, "epoch": 0.6728479074515141, "grad_norm": 0.03282151371240616, "grad_norm_var": 2.4654361910417254e-07, "learning_rate": 3.2512141454760226e-06, "loss": 2.574, "step": 7910 }, { "crossentropy": 2.736356258392334, "epoch": 0.6729329703980946, "grad_norm": 0.030755700543522835, "grad_norm_var": 2.823265848247384e-07, "learning_rate": 3.1793739420543644e-06, "loss": 2.7364, "step": 7911 }, { "crossentropy": 2.651728868484497, "epoch": 0.673018033344675, "grad_norm": 0.03124191239476204, "grad_norm_var": 2.8171092588424355e-07, "learning_rate": 3.1083360840455932e-06, "loss": 2.6517, "step": 7912 }, { "crossentropy": 2.6857235431671143, "epoch": 0.6731030962912555, "grad_norm": 0.031951986253261566, "grad_norm_var": 2.8974577949007706e-07, "learning_rate": 3.0381005828555854e-06, "loss": 2.6857, "step": 7913 }, { "crossentropy": 2.6853082180023193, "epoch": 0.673188159237836, "grad_norm": 0.03223498910665512, "grad_norm_var": 3.141508147286165e-07, "learning_rate": 2.968667449762541e-06, "loss": 2.6853, "step": 7914 }, { "crossentropy": 2.639704704284668, "epoch": 0.6732732221844164, "grad_norm": 0.030771546065807343, "grad_norm_var": 3.50713503167764e-07, "learning_rate": 2.900036695914765e-06, "loss": 2.6397, "step": 7915 }, { "crossentropy": 2.6248700618743896, "epoch": 0.6733582851309969, "grad_norm": 0.032174982130527496, "grad_norm_var": 3.5251755327613856e-07, "learning_rate": 2.832208332333441e-06, "loss": 2.6249, "step": 7916 }, { "crossentropy": 2.7068238258361816, "epoch": 0.6734433480775774, "grad_norm": 0.032104671001434326, "grad_norm_var": 3.5942088813887414e-07, "learning_rate": 2.7651823699093027e-06, "loss": 2.7068, "step": 7917 }, { "crossentropy": 2.5720551013946533, "epoch": 0.6735284110241578, "grad_norm": 0.03214599937200546, "grad_norm_var": 3.77959483034244e-07, "learning_rate": 2.698958819404851e-06, "loss": 2.5721, "step": 7918 }, { "crossentropy": 2.7111144065856934, "epoch": 0.6736134739707383, "grad_norm": 0.03134356066584587, "grad_norm_var": 3.6600812351888074e-07, "learning_rate": 2.6335376914538025e-06, "loss": 2.7111, "step": 7919 }, { "crossentropy": 2.5746545791625977, "epoch": 0.6736985369173188, "grad_norm": 0.029488196596503258, "grad_norm_var": 6.557485862534439e-07, "learning_rate": 2.568918996560532e-06, "loss": 2.5747, "step": 7920 }, { "crossentropy": 2.685350179672241, "epoch": 0.6737835998638992, "grad_norm": 0.0312592014670372, "grad_norm_var": 6.506145877405948e-07, "learning_rate": 2.5051027451011845e-06, "loss": 2.6854, "step": 7921 }, { "crossentropy": 2.6000165939331055, "epoch": 0.6738686628104797, "grad_norm": 0.030557280406355858, "grad_norm_var": 6.818228838220089e-07, "learning_rate": 2.4420889473231178e-06, "loss": 2.6, "step": 7922 }, { "crossentropy": 2.5460240840911865, "epoch": 0.6739537257570603, "grad_norm": 0.031320855021476746, "grad_norm_var": 6.840631173663971e-07, "learning_rate": 2.379877613343795e-06, "loss": 2.546, "step": 7923 }, { "crossentropy": 2.5222113132476807, "epoch": 0.6740387887036406, "grad_norm": 0.029193194583058357, "grad_norm_var": 9.93455304462478e-07, "learning_rate": 2.3184687531530025e-06, "loss": 2.5222, "step": 7924 }, { "crossentropy": 2.6221437454223633, "epoch": 0.6741238516502212, "grad_norm": 0.031578801572322845, "grad_norm_var": 9.783543789350832e-07, "learning_rate": 2.257862376611741e-06, "loss": 2.6221, "step": 7925 }, { "crossentropy": 2.528515577316284, "epoch": 0.6742089145968017, "grad_norm": 0.03165395185351372, "grad_norm_var": 8.280977827082013e-07, "learning_rate": 2.198058493450561e-06, "loss": 2.5285, "step": 7926 }, { "crossentropy": 2.593019723892212, "epoch": 0.674293977543382, "grad_norm": 0.031374406069517136, "grad_norm_var": 8.123964342546571e-07, "learning_rate": 2.139057113272891e-06, "loss": 2.593, "step": 7927 }, { "crossentropy": 2.6336328983306885, "epoch": 0.6743790404899626, "grad_norm": 0.039869412779808044, "grad_norm_var": 5.426765884741881e-06, "learning_rate": 2.080858245552819e-06, "loss": 2.6336, "step": 7928 }, { "crossentropy": 2.7064809799194336, "epoch": 0.6744641034365431, "grad_norm": 0.03171323984861374, "grad_norm_var": 5.4259339588243746e-06, "learning_rate": 2.0234618996350928e-06, "loss": 2.7065, "step": 7929 }, { "crossentropy": 2.621594190597534, "epoch": 0.6745491663831236, "grad_norm": 0.03211846202611923, "grad_norm_var": 5.420008960679484e-06, "learning_rate": 1.9668680847356735e-06, "loss": 2.6216, "step": 7930 }, { "crossentropy": 2.6389472484588623, "epoch": 0.674634229329704, "grad_norm": 0.030948510393500328, "grad_norm_var": 5.397894625341673e-06, "learning_rate": 1.9110768099422914e-06, "loss": 2.6389, "step": 7931 }, { "crossentropy": 2.5849227905273438, "epoch": 0.6747192922762845, "grad_norm": 0.03170740231871605, "grad_norm_var": 5.388355452792294e-06, "learning_rate": 1.8560880842133366e-06, "loss": 2.5849, "step": 7932 }, { "crossentropy": 2.624626874923706, "epoch": 0.674804355222865, "grad_norm": 0.03130525350570679, "grad_norm_var": 5.393005672164828e-06, "learning_rate": 1.8019019163784122e-06, "loss": 2.6246, "step": 7933 }, { "crossentropy": 2.6341471672058105, "epoch": 0.6748894181694454, "grad_norm": 0.031857769936323166, "grad_norm_var": 5.381965186053902e-06, "learning_rate": 1.7485183151383365e-06, "loss": 2.6341, "step": 7934 }, { "crossentropy": 2.5399012565612793, "epoch": 0.6749744811160259, "grad_norm": 0.03170333802700043, "grad_norm_var": 5.372688332246483e-06, "learning_rate": 1.6959372890645863e-06, "loss": 2.5399, "step": 7935 }, { "crossentropy": 2.6605215072631836, "epoch": 0.6750595440626064, "grad_norm": 0.03128292039036751, "grad_norm_var": 5.038006707556289e-06, "learning_rate": 1.6441588466009628e-06, "loss": 2.6605, "step": 7936 }, { "crossentropy": 2.570080041885376, "epoch": 0.6751446070091868, "grad_norm": 0.03092310205101967, "grad_norm_var": 5.0711055605528875e-06, "learning_rate": 1.5931829960608157e-06, "loss": 2.5701, "step": 7937 }, { "crossentropy": 2.5278499126434326, "epoch": 0.6752296699557673, "grad_norm": 0.02928544394671917, "grad_norm_var": 5.386205033099444e-06, "learning_rate": 1.5430097456292646e-06, "loss": 2.5278, "step": 7938 }, { "crossentropy": 2.662475824356079, "epoch": 0.6753147329023478, "grad_norm": 0.03228887543082237, "grad_norm_var": 5.390704465289122e-06, "learning_rate": 1.493639103363753e-06, "loss": 2.6625, "step": 7939 }, { "crossentropy": 2.5762524604797363, "epoch": 0.6753997958489282, "grad_norm": 0.030610306188464165, "grad_norm_var": 5.023617831068069e-06, "learning_rate": 1.4450710771907183e-06, "loss": 2.5763, "step": 7940 }, { "crossentropy": 2.5287067890167236, "epoch": 0.6754848587955087, "grad_norm": 0.034782957285642624, "grad_norm_var": 5.532832848593308e-06, "learning_rate": 1.397305674909477e-06, "loss": 2.5287, "step": 7941 }, { "crossentropy": 2.6657252311706543, "epoch": 0.6755699217420892, "grad_norm": 0.03193039819598198, "grad_norm_var": 5.5215704825924e-06, "learning_rate": 1.350342904188895e-06, "loss": 2.6657, "step": 7942 }, { "crossentropy": 2.5916292667388916, "epoch": 0.6756549846886696, "grad_norm": 0.03110521286725998, "grad_norm_var": 5.552371232973848e-06, "learning_rate": 1.3041827725712718e-06, "loss": 2.5916, "step": 7943 }, { "crossentropy": 2.59883189201355, "epoch": 0.6757400476352501, "grad_norm": 0.02996261604130268, "grad_norm_var": 1.40992477301131e-06, "learning_rate": 1.2588252874673467e-06, "loss": 2.5988, "step": 7944 }, { "crossentropy": 2.655846118927002, "epoch": 0.6758251105818306, "grad_norm": 0.03198780119419098, "grad_norm_var": 1.4235275470025435e-06, "learning_rate": 1.2142704561601826e-06, "loss": 2.6558, "step": 7945 }, { "crossentropy": 2.5124077796936035, "epoch": 0.675910173528411, "grad_norm": 0.03247535601258278, "grad_norm_var": 1.4615121532330148e-06, "learning_rate": 1.1705182858051667e-06, "loss": 2.5124, "step": 7946 }, { "crossentropy": 2.6747570037841797, "epoch": 0.6759952364749915, "grad_norm": 0.030946239829063416, "grad_norm_var": 1.461682410120813e-06, "learning_rate": 1.12756878342668e-06, "loss": 2.6748, "step": 7947 }, { "crossentropy": 2.615199565887451, "epoch": 0.676080299421572, "grad_norm": 0.03036077879369259, "grad_norm_var": 1.5395198596166355e-06, "learning_rate": 1.0854219559214284e-06, "loss": 2.6152, "step": 7948 }, { "crossentropy": 2.563793182373047, "epoch": 0.6761653623681524, "grad_norm": 0.03377540782094002, "grad_norm_var": 1.8812624929762448e-06, "learning_rate": 1.044077810056776e-06, "loss": 2.5638, "step": 7949 }, { "crossentropy": 2.5831751823425293, "epoch": 0.6762504253147329, "grad_norm": 0.03140034154057503, "grad_norm_var": 1.8773930985463775e-06, "learning_rate": 1.0035363524713016e-06, "loss": 2.5832, "step": 7950 }, { "crossentropy": 2.6163318157196045, "epoch": 0.6763354882613134, "grad_norm": 0.030684366822242737, "grad_norm_var": 1.9216331872970796e-06, "learning_rate": 9.637975896759078e-07, "loss": 2.6163, "step": 7951 }, { "crossentropy": 2.6526575088500977, "epoch": 0.6764205512078938, "grad_norm": 0.03131198510527611, "grad_norm_var": 1.9208926637013952e-06, "learning_rate": 9.248615280499362e-07, "loss": 2.6527, "step": 7952 }, { "crossentropy": 2.582174777984619, "epoch": 0.6765056141544743, "grad_norm": 0.03139864280819893, "grad_norm_var": 1.8991168573631935e-06, "learning_rate": 8.867281738467181e-07, "loss": 2.5822, "step": 7953 }, { "crossentropy": 2.541114568710327, "epoch": 0.6765906771010548, "grad_norm": 0.03212243691086769, "grad_norm_var": 1.5572076781961008e-06, "learning_rate": 8.493975331885784e-07, "loss": 2.5411, "step": 7954 }, { "crossentropy": 2.595641851425171, "epoch": 0.6766757400476352, "grad_norm": 0.03187508508563042, "grad_norm_var": 1.535225561079947e-06, "learning_rate": 8.12869612069611e-07, "loss": 2.5956, "step": 7955 }, { "crossentropy": 2.614267349243164, "epoch": 0.6767608029942157, "grad_norm": 0.031091244891285896, "grad_norm_var": 1.4816890950026246e-06, "learning_rate": 7.771444163556795e-07, "loss": 2.6143, "step": 7956 }, { "crossentropy": 2.5670931339263916, "epoch": 0.6768458659407962, "grad_norm": 0.03190002217888832, "grad_norm_var": 8.163453411568251e-07, "learning_rate": 7.422219517833062e-07, "loss": 2.5671, "step": 7957 }, { "crossentropy": 2.5244946479797363, "epoch": 0.6769309288873766, "grad_norm": 0.03121710754930973, "grad_norm_var": 8.091603930776917e-07, "learning_rate": 7.081022239591173e-07, "loss": 2.5245, "step": 7958 }, { "crossentropy": 2.560701608657837, "epoch": 0.6770159918339571, "grad_norm": 0.030797870829701424, "grad_norm_var": 8.302550823831388e-07, "learning_rate": 6.747852383631736e-07, "loss": 2.5607, "step": 7959 }, { "crossentropy": 2.579286575317383, "epoch": 0.6771010547805376, "grad_norm": 0.03071001172065735, "grad_norm_var": 7.162773710933078e-07, "learning_rate": 6.422710003439747e-07, "loss": 2.5793, "step": 7960 }, { "crossentropy": 2.6236369609832764, "epoch": 0.6771861177271181, "grad_norm": 0.032123517245054245, "grad_norm_var": 7.261936796805942e-07, "learning_rate": 6.105595151228993e-07, "loss": 2.6236, "step": 7961 }, { "crossentropy": 2.588521718978882, "epoch": 0.6772711806736985, "grad_norm": 0.031058121472597122, "grad_norm_var": 6.696693821963325e-07, "learning_rate": 5.796507877919855e-07, "loss": 2.5885, "step": 7962 }, { "crossentropy": 2.653093099594116, "epoch": 0.677356243620279, "grad_norm": 0.03099711798131466, "grad_norm_var": 6.665947488066476e-07, "learning_rate": 5.4954482331393e-07, "loss": 2.6531, "step": 7963 }, { "crossentropy": 2.606996774673462, "epoch": 0.6774413065668595, "grad_norm": 0.03160112723708153, "grad_norm_var": 5.864994058912253e-07, "learning_rate": 5.20241626523199e-07, "loss": 2.607, "step": 7964 }, { "crossentropy": 2.6256356239318848, "epoch": 0.6775263695134399, "grad_norm": 0.03412153571844101, "grad_norm_var": 6.988123624115014e-07, "learning_rate": 4.917412021249179e-07, "loss": 2.6256, "step": 7965 }, { "crossentropy": 2.6261701583862305, "epoch": 0.6776114324600204, "grad_norm": 0.03168410062789917, "grad_norm_var": 6.9910350563625e-07, "learning_rate": 4.640435546954258e-07, "loss": 2.6262, "step": 7966 }, { "crossentropy": 2.6543266773223877, "epoch": 0.6776964954066009, "grad_norm": 0.034110404551029205, "grad_norm_var": 1.040304257175252e-06, "learning_rate": 4.3714868868227617e-07, "loss": 2.6543, "step": 7967 }, { "crossentropy": 2.641119956970215, "epoch": 0.6777815583531813, "grad_norm": 0.031064458191394806, "grad_norm_var": 1.0588378804031e-06, "learning_rate": 4.110566084036815e-07, "loss": 2.6411, "step": 7968 }, { "crossentropy": 2.665416955947876, "epoch": 0.6778666212997618, "grad_norm": 0.03141598030924797, "grad_norm_var": 1.0580628234832986e-06, "learning_rate": 3.857673180496235e-07, "loss": 2.6654, "step": 7969 }, { "crossentropy": 2.502931833267212, "epoch": 0.6779516842463423, "grad_norm": 0.030290542170405388, "grad_norm_var": 1.1751569625209942e-06, "learning_rate": 3.6128082168074283e-07, "loss": 2.5029, "step": 7970 }, { "crossentropy": 2.694362163543701, "epoch": 0.6780367471929227, "grad_norm": 0.03192095831036568, "grad_norm_var": 1.1767958457076782e-06, "learning_rate": 3.3759712322889435e-07, "loss": 2.6944, "step": 7971 }, { "crossentropy": 2.5955851078033447, "epoch": 0.6781218101395032, "grad_norm": 0.03110688179731369, "grad_norm_var": 1.175684722878878e-06, "learning_rate": 3.1471622649714706e-07, "loss": 2.5956, "step": 7972 }, { "crossentropy": 2.5819015502929688, "epoch": 0.6782068730860837, "grad_norm": 0.0324215292930603, "grad_norm_var": 1.2112858420416813e-06, "learning_rate": 2.9263813515922887e-07, "loss": 2.5819, "step": 7973 }, { "crossentropy": 2.459707498550415, "epoch": 0.6782919360326641, "grad_norm": 0.032328829169273376, "grad_norm_var": 1.2221285365027518e-06, "learning_rate": 2.71362852760082e-07, "loss": 2.4597, "step": 7974 }, { "crossentropy": 2.5603578090667725, "epoch": 0.6783769989792446, "grad_norm": 0.030527010560035706, "grad_norm_var": 1.2605421777893884e-06, "learning_rate": 2.508903827164177e-07, "loss": 2.5604, "step": 7975 }, { "crossentropy": 2.5147833824157715, "epoch": 0.6784620619258251, "grad_norm": 0.03044291026890278, "grad_norm_var": 1.3008860707613371e-06, "learning_rate": 2.3122072831505136e-07, "loss": 2.5148, "step": 7976 }, { "crossentropy": 2.599937915802002, "epoch": 0.6785471248724055, "grad_norm": 0.030628031119704247, "grad_norm_var": 1.3564046836509262e-06, "learning_rate": 2.123538927145674e-07, "loss": 2.5999, "step": 7977 }, { "crossentropy": 2.6601085662841797, "epoch": 0.678632187818986, "grad_norm": 0.032269176095724106, "grad_norm_var": 1.3593648488934828e-06, "learning_rate": 1.9428987894476446e-07, "loss": 2.6601, "step": 7978 }, { "crossentropy": 2.6808087825775146, "epoch": 0.6787172507655665, "grad_norm": 0.03196552395820618, "grad_norm_var": 1.3293954206766568e-06, "learning_rate": 1.770286899055451e-07, "loss": 2.6808, "step": 7979 }, { "crossentropy": 2.6949801445007324, "epoch": 0.6788023137121469, "grad_norm": 0.03095400705933571, "grad_norm_var": 1.367868683124643e-06, "learning_rate": 1.605703283691362e-07, "loss": 2.695, "step": 7980 }, { "crossentropy": 2.643425941467285, "epoch": 0.6788873766587274, "grad_norm": 0.03221592679619789, "grad_norm_var": 9.80384937887763e-07, "learning_rate": 1.4491479697786857e-07, "loss": 2.6434, "step": 7981 }, { "crossentropy": 2.5750954151153564, "epoch": 0.678972439605308, "grad_norm": 0.03361961245536804, "grad_norm_var": 1.2403189903436532e-06, "learning_rate": 1.3006209824584226e-07, "loss": 2.5751, "step": 7982 }, { "crossentropy": 2.471358060836792, "epoch": 0.6790575025518883, "grad_norm": 0.03035132586956024, "grad_norm_var": 9.179278535668128e-07, "learning_rate": 1.1601223455781628e-07, "loss": 2.4714, "step": 7983 }, { "crossentropy": 2.626140594482422, "epoch": 0.6791425654984689, "grad_norm": 0.03096604347229004, "grad_norm_var": 9.238569161885835e-07, "learning_rate": 1.0276520816976387e-07, "loss": 2.6261, "step": 7984 }, { "crossentropy": 2.5622358322143555, "epoch": 0.6792276284450494, "grad_norm": 0.029492124915122986, "grad_norm_var": 1.1675054938150292e-06, "learning_rate": 9.032102120887232e-08, "loss": 2.5622, "step": 7985 }, { "crossentropy": 2.5739858150482178, "epoch": 0.6793126913916298, "grad_norm": 0.030851561576128006, "grad_norm_var": 1.1083922206116143e-06, "learning_rate": 7.867967567354306e-08, "loss": 2.574, "step": 7986 }, { "crossentropy": 2.5499939918518066, "epoch": 0.6793977543382103, "grad_norm": 0.03244418650865555, "grad_norm_var": 1.1633228609544965e-06, "learning_rate": 6.784117343228146e-08, "loss": 2.55, "step": 7987 }, { "crossentropy": 2.5534276962280273, "epoch": 0.6794828172847908, "grad_norm": 0.031559910625219345, "grad_norm_var": 1.1577473749597883e-06, "learning_rate": 5.7805516226472343e-08, "loss": 2.5534, "step": 7988 }, { "crossentropy": 2.6463961601257324, "epoch": 0.6795678802313713, "grad_norm": 0.031201712787151337, "grad_norm_var": 1.091083029074585e-06, "learning_rate": 4.85727056670493e-08, "loss": 2.6464, "step": 7989 }, { "crossentropy": 2.703519344329834, "epoch": 0.6796529431779517, "grad_norm": 0.030868005007505417, "grad_norm_var": 1.036458038496807e-06, "learning_rate": 4.014274323616007e-08, "loss": 2.7035, "step": 7990 }, { "crossentropy": 2.7079782485961914, "epoch": 0.6797380061245322, "grad_norm": 0.03244664892554283, "grad_norm_var": 1.0760087896863681e-06, "learning_rate": 3.251563028827676e-08, "loss": 2.708, "step": 7991 }, { "crossentropy": 2.674110174179077, "epoch": 0.6798230690711127, "grad_norm": 0.03100908175110817, "grad_norm_var": 1.024374625094868e-06, "learning_rate": 2.5691368046865115e-08, "loss": 2.6741, "step": 7992 }, { "crossentropy": 2.550798177719116, "epoch": 0.6799081320176931, "grad_norm": 0.03088502585887909, "grad_norm_var": 1.0011017801764278e-06, "learning_rate": 1.9669957608825507e-08, "loss": 2.5508, "step": 7993 }, { "crossentropy": 2.7032172679901123, "epoch": 0.6799931949642736, "grad_norm": 0.030543172731995583, "grad_norm_var": 9.97334509705835e-07, "learning_rate": 1.445139994116218e-08, "loss": 2.7032, "step": 7994 }, { "crossentropy": 2.635073184967041, "epoch": 0.6800782579108541, "grad_norm": 0.03156520798802376, "grad_norm_var": 9.737420772969966e-07, "learning_rate": 1.0035695880428186e-08, "loss": 2.6351, "step": 7995 }, { "crossentropy": 2.5267536640167236, "epoch": 0.6801633208574345, "grad_norm": 0.031038319692015648, "grad_norm_var": 9.701748824340768e-07, "learning_rate": 6.42284613716626e-09, "loss": 2.5268, "step": 7996 }, { "crossentropy": 2.541825771331787, "epoch": 0.680248383804015, "grad_norm": 0.030055927112698555, "grad_norm_var": 1.0026295229397283e-06, "learning_rate": 3.6128512903577103e-09, "loss": 2.5418, "step": 7997 }, { "crossentropy": 2.564561128616333, "epoch": 0.6803334467505955, "grad_norm": 0.03315962105989456, "grad_norm_var": 8.66295753097869e-07, "learning_rate": 1.6057117918633068e-09, "loss": 2.5646, "step": 7998 }, { "crossentropy": 2.5806703567504883, "epoch": 0.6804185096971759, "grad_norm": 0.03140547499060631, "grad_norm_var": 8.231587099423078e-07, "learning_rate": 4.0142796420283846e-10, "loss": 2.5807, "step": 7999 }, { "crossentropy": 2.57578444480896, "epoch": 0.6805035726437564, "grad_norm": 0.03149164095520973, "grad_norm_var": 8.227498462399223e-07, "learning_rate": 0.0, "loss": 2.5758, "step": 8000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.769290988355584e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }