diff --git "a/checkpoint-1572/trainer_state.json" "b/checkpoint-1572/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1572/trainer_state.json" @@ -0,0 +1,10534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8650519031141869, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005767012687427913, + "grad_norm": 0.5134636163711548, + "learning_rate": 0.0, + "loss": 1.6129628419876099, + "step": 1 + }, + { + "epoch": 0.0011534025374855825, + "grad_norm": 0.45678019523620605, + "learning_rate": 4e-05, + "loss": 1.713558554649353, + "step": 2 + }, + { + "epoch": 0.0017301038062283738, + "grad_norm": 0.6324027180671692, + "learning_rate": 8e-05, + "loss": 1.9871511459350586, + "step": 3 + }, + { + "epoch": 0.002306805074971165, + "grad_norm": 0.5307025909423828, + "learning_rate": 0.00012, + "loss": 1.6862211227416992, + "step": 4 + }, + { + "epoch": 0.0028835063437139563, + "grad_norm": 0.616538941860199, + "learning_rate": 0.00016, + "loss": 2.1033642292022705, + "step": 5 + }, + { + "epoch": 0.0034602076124567475, + "grad_norm": 0.7627953290939331, + "learning_rate": 0.0002, + "loss": 2.150984764099121, + "step": 6 + }, + { + "epoch": 0.004036908881199538, + "grad_norm": 0.8402333855628967, + "learning_rate": 0.00019996151625938042, + "loss": 2.0197458267211914, + "step": 7 + }, + { + "epoch": 0.00461361014994233, + "grad_norm": 3.813333034515381, + "learning_rate": 0.00019992303251876084, + "loss": 2.396656036376953, + "step": 8 + }, + { + "epoch": 0.005190311418685121, + "grad_norm": 0.9861733913421631, + "learning_rate": 0.00019988454877814126, + "loss": 2.1392970085144043, + "step": 9 + }, + { + "epoch": 0.0057670126874279125, + "grad_norm": 0.7931668758392334, + "learning_rate": 0.00019984606503752164, + "loss": 1.8062304258346558, + "step": 10 + }, + { + "epoch": 0.006343713956170703, + "grad_norm": 0.8828097581863403, + "learning_rate": 0.00019980758129690206, + "loss": 1.76358962059021, + "step": 11 + }, + { + "epoch": 0.006920415224913495, + "grad_norm": 0.7205682396888733, + "learning_rate": 0.00019976909755628247, + "loss": 1.3197358846664429, + "step": 12 + }, + { + "epoch": 0.007497116493656286, + "grad_norm": 1.2321408987045288, + "learning_rate": 0.0001997306138156629, + "loss": 1.7697328329086304, + "step": 13 + }, + { + "epoch": 0.008073817762399077, + "grad_norm": 0.9804911613464355, + "learning_rate": 0.0001996921300750433, + "loss": 1.7214155197143555, + "step": 14 + }, + { + "epoch": 0.00865051903114187, + "grad_norm": 0.9436901807785034, + "learning_rate": 0.00019965364633442372, + "loss": 1.6395944356918335, + "step": 15 + }, + { + "epoch": 0.00922722029988466, + "grad_norm": 1.6564269065856934, + "learning_rate": 0.00019961516259380414, + "loss": 1.8607707023620605, + "step": 16 + }, + { + "epoch": 0.00980392156862745, + "grad_norm": 1.0676305294036865, + "learning_rate": 0.00019957667885318455, + "loss": 1.4897263050079346, + "step": 17 + }, + { + "epoch": 0.010380622837370242, + "grad_norm": 0.9889469146728516, + "learning_rate": 0.00019953819511256494, + "loss": 1.7445942163467407, + "step": 18 + }, + { + "epoch": 0.010957324106113034, + "grad_norm": 0.8717456459999084, + "learning_rate": 0.00019949971137194535, + "loss": 1.4854474067687988, + "step": 19 + }, + { + "epoch": 0.011534025374855825, + "grad_norm": 1.110196590423584, + "learning_rate": 0.00019946122763132577, + "loss": 1.32136869430542, + "step": 20 + }, + { + "epoch": 0.012110726643598616, + "grad_norm": 0.7795314192771912, + "learning_rate": 0.00019942274389070618, + "loss": 1.7199318408966064, + "step": 21 + }, + { + "epoch": 0.012687427912341407, + "grad_norm": 0.7504187822341919, + "learning_rate": 0.0001993842601500866, + "loss": 1.2975201606750488, + "step": 22 + }, + { + "epoch": 0.0132641291810842, + "grad_norm": 0.8012252449989319, + "learning_rate": 0.00019934577640946702, + "loss": 1.2630457878112793, + "step": 23 + }, + { + "epoch": 0.01384083044982699, + "grad_norm": 0.9531145691871643, + "learning_rate": 0.00019930729266884743, + "loss": 1.6974424123764038, + "step": 24 + }, + { + "epoch": 0.01441753171856978, + "grad_norm": 1.020970106124878, + "learning_rate": 0.00019926880892822785, + "loss": 1.294957160949707, + "step": 25 + }, + { + "epoch": 0.014994232987312572, + "grad_norm": 1.7608129978179932, + "learning_rate": 0.00019923032518760823, + "loss": 1.801735520362854, + "step": 26 + }, + { + "epoch": 0.015570934256055362, + "grad_norm": 0.9601960182189941, + "learning_rate": 0.00019919184144698865, + "loss": 1.4538304805755615, + "step": 27 + }, + { + "epoch": 0.016147635524798153, + "grad_norm": 0.7025886178016663, + "learning_rate": 0.00019915335770636906, + "loss": 1.1746238470077515, + "step": 28 + }, + { + "epoch": 0.016724336793540944, + "grad_norm": 0.8506267666816711, + "learning_rate": 0.00019911487396574948, + "loss": 1.1891943216323853, + "step": 29 + }, + { + "epoch": 0.01730103806228374, + "grad_norm": 0.9117224216461182, + "learning_rate": 0.0001990763902251299, + "loss": 1.4325735569000244, + "step": 30 + }, + { + "epoch": 0.01787773933102653, + "grad_norm": 0.8756442070007324, + "learning_rate": 0.0001990379064845103, + "loss": 1.3962581157684326, + "step": 31 + }, + { + "epoch": 0.01845444059976932, + "grad_norm": 1.0293549299240112, + "learning_rate": 0.00019899942274389073, + "loss": 1.4936443567276, + "step": 32 + }, + { + "epoch": 0.01903114186851211, + "grad_norm": 0.8239012360572815, + "learning_rate": 0.00019896093900327114, + "loss": 1.1294159889221191, + "step": 33 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.6293753385543823, + "learning_rate": 0.00019892245526265153, + "loss": 1.219704031944275, + "step": 34 + }, + { + "epoch": 0.020184544405997693, + "grad_norm": 0.9778785109519958, + "learning_rate": 0.00019888397152203194, + "loss": 1.3405961990356445, + "step": 35 + }, + { + "epoch": 0.020761245674740483, + "grad_norm": 0.9916248917579651, + "learning_rate": 0.00019884548778141236, + "loss": 1.7191007137298584, + "step": 36 + }, + { + "epoch": 0.021337946943483274, + "grad_norm": 0.9758312106132507, + "learning_rate": 0.00019880700404079277, + "loss": 1.2949879169464111, + "step": 37 + }, + { + "epoch": 0.02191464821222607, + "grad_norm": 0.7310605645179749, + "learning_rate": 0.0001987685203001732, + "loss": 1.282931923866272, + "step": 38 + }, + { + "epoch": 0.02249134948096886, + "grad_norm": 0.6537899374961853, + "learning_rate": 0.0001987300365595536, + "loss": 1.4050456285476685, + "step": 39 + }, + { + "epoch": 0.02306805074971165, + "grad_norm": 0.6727839708328247, + "learning_rate": 0.00019869155281893402, + "loss": 1.3566672801971436, + "step": 40 + }, + { + "epoch": 0.02364475201845444, + "grad_norm": 0.6026540994644165, + "learning_rate": 0.00019865306907831444, + "loss": 1.6914572715759277, + "step": 41 + }, + { + "epoch": 0.02422145328719723, + "grad_norm": 0.7345203161239624, + "learning_rate": 0.00019861458533769482, + "loss": 1.3210856914520264, + "step": 42 + }, + { + "epoch": 0.024798154555940023, + "grad_norm": 1.7062476873397827, + "learning_rate": 0.00019857610159707524, + "loss": 1.6727783679962158, + "step": 43 + }, + { + "epoch": 0.025374855824682813, + "grad_norm": 0.7726621627807617, + "learning_rate": 0.00019853761785645565, + "loss": 1.7425484657287598, + "step": 44 + }, + { + "epoch": 0.025951557093425604, + "grad_norm": 0.6947644948959351, + "learning_rate": 0.00019849913411583607, + "loss": 1.0628504753112793, + "step": 45 + }, + { + "epoch": 0.0265282583621684, + "grad_norm": 0.7833652496337891, + "learning_rate": 0.00019846065037521649, + "loss": 1.4800021648406982, + "step": 46 + }, + { + "epoch": 0.02710495963091119, + "grad_norm": 0.8065851926803589, + "learning_rate": 0.0001984221666345969, + "loss": 1.2809616327285767, + "step": 47 + }, + { + "epoch": 0.02768166089965398, + "grad_norm": 1.044630527496338, + "learning_rate": 0.00019838368289397732, + "loss": 1.602962851524353, + "step": 48 + }, + { + "epoch": 0.02825836216839677, + "grad_norm": 0.5969672203063965, + "learning_rate": 0.00019834519915335773, + "loss": 1.5166534185409546, + "step": 49 + }, + { + "epoch": 0.02883506343713956, + "grad_norm": 0.848512589931488, + "learning_rate": 0.00019830671541273812, + "loss": 1.442568063735962, + "step": 50 + }, + { + "epoch": 0.029411764705882353, + "grad_norm": 0.5782500505447388, + "learning_rate": 0.00019826823167211853, + "loss": 1.3492627143859863, + "step": 51 + }, + { + "epoch": 0.029988465974625143, + "grad_norm": 0.850151777267456, + "learning_rate": 0.00019822974793149895, + "loss": 1.5313668251037598, + "step": 52 + }, + { + "epoch": 0.030565167243367934, + "grad_norm": 0.613896906375885, + "learning_rate": 0.00019819126419087937, + "loss": 1.0709185600280762, + "step": 53 + }, + { + "epoch": 0.031141868512110725, + "grad_norm": 0.9450347423553467, + "learning_rate": 0.00019815278045025978, + "loss": 1.5562160015106201, + "step": 54 + }, + { + "epoch": 0.031718569780853516, + "grad_norm": 0.9424428939819336, + "learning_rate": 0.0001981142967096402, + "loss": 1.764065146446228, + "step": 55 + }, + { + "epoch": 0.03229527104959631, + "grad_norm": 0.9744471311569214, + "learning_rate": 0.0001980758129690206, + "loss": 0.9400297403335571, + "step": 56 + }, + { + "epoch": 0.0328719723183391, + "grad_norm": 0.7247487902641296, + "learning_rate": 0.00019803732922840103, + "loss": 1.572107195854187, + "step": 57 + }, + { + "epoch": 0.03344867358708189, + "grad_norm": 0.6125597357749939, + "learning_rate": 0.00019799884548778141, + "loss": 1.2189209461212158, + "step": 58 + }, + { + "epoch": 0.034025374855824686, + "grad_norm": 1.0781699419021606, + "learning_rate": 0.00019796036174716183, + "loss": 1.3933414220809937, + "step": 59 + }, + { + "epoch": 0.03460207612456748, + "grad_norm": 0.8329439759254456, + "learning_rate": 0.00019792187800654224, + "loss": 1.4748475551605225, + "step": 60 + }, + { + "epoch": 0.03517877739331027, + "grad_norm": 0.7766849398612976, + "learning_rate": 0.00019788339426592266, + "loss": 1.4775745868682861, + "step": 61 + }, + { + "epoch": 0.03575547866205306, + "grad_norm": 0.7776947021484375, + "learning_rate": 0.00019784491052530308, + "loss": 1.4959548711776733, + "step": 62 + }, + { + "epoch": 0.03633217993079585, + "grad_norm": 0.7114179134368896, + "learning_rate": 0.0001978064267846835, + "loss": 1.4756664037704468, + "step": 63 + }, + { + "epoch": 0.03690888119953864, + "grad_norm": 0.675800621509552, + "learning_rate": 0.0001977679430440639, + "loss": 1.4753670692443848, + "step": 64 + }, + { + "epoch": 0.03748558246828143, + "grad_norm": 1.5709729194641113, + "learning_rate": 0.00019772945930344432, + "loss": 1.5947999954223633, + "step": 65 + }, + { + "epoch": 0.03806228373702422, + "grad_norm": 0.7363697290420532, + "learning_rate": 0.0001976909755628247, + "loss": 1.2786856889724731, + "step": 66 + }, + { + "epoch": 0.03863898500576701, + "grad_norm": 0.8212243318557739, + "learning_rate": 0.00019765249182220512, + "loss": 1.3553478717803955, + "step": 67 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 0.6724039912223816, + "learning_rate": 0.00019761400808158554, + "loss": 1.3045082092285156, + "step": 68 + }, + { + "epoch": 0.039792387543252594, + "grad_norm": 1.0372695922851562, + "learning_rate": 0.00019757552434096596, + "loss": 1.5149048566818237, + "step": 69 + }, + { + "epoch": 0.040369088811995385, + "grad_norm": 0.7058703303337097, + "learning_rate": 0.00019753704060034637, + "loss": 1.2227076292037964, + "step": 70 + }, + { + "epoch": 0.040945790080738176, + "grad_norm": 0.8637105226516724, + "learning_rate": 0.00019749855685972679, + "loss": 1.0762852430343628, + "step": 71 + }, + { + "epoch": 0.04152249134948097, + "grad_norm": 0.8108904957771301, + "learning_rate": 0.0001974600731191072, + "loss": 1.4130628108978271, + "step": 72 + }, + { + "epoch": 0.04209919261822376, + "grad_norm": 1.2491207122802734, + "learning_rate": 0.00019742158937848762, + "loss": 1.7983347177505493, + "step": 73 + }, + { + "epoch": 0.04267589388696655, + "grad_norm": 1.1523128747940063, + "learning_rate": 0.000197383105637868, + "loss": 1.5859603881835938, + "step": 74 + }, + { + "epoch": 0.04325259515570934, + "grad_norm": 0.7240892648696899, + "learning_rate": 0.00019734462189724842, + "loss": 1.4029178619384766, + "step": 75 + }, + { + "epoch": 0.04382929642445214, + "grad_norm": 0.7445366978645325, + "learning_rate": 0.00019730613815662884, + "loss": 1.351811170578003, + "step": 76 + }, + { + "epoch": 0.04440599769319493, + "grad_norm": 0.9881113767623901, + "learning_rate": 0.00019726765441600925, + "loss": 1.437370777130127, + "step": 77 + }, + { + "epoch": 0.04498269896193772, + "grad_norm": 1.0404249429702759, + "learning_rate": 0.00019722917067538967, + "loss": 1.0401325225830078, + "step": 78 + }, + { + "epoch": 0.04555940023068051, + "grad_norm": 0.998892605304718, + "learning_rate": 0.00019719068693477008, + "loss": 1.2733221054077148, + "step": 79 + }, + { + "epoch": 0.0461361014994233, + "grad_norm": 1.0299255847930908, + "learning_rate": 0.0001971522031941505, + "loss": 1.8878190517425537, + "step": 80 + }, + { + "epoch": 0.04671280276816609, + "grad_norm": 0.6168495416641235, + "learning_rate": 0.0001971137194535309, + "loss": 1.3375468254089355, + "step": 81 + }, + { + "epoch": 0.04728950403690888, + "grad_norm": 0.645830512046814, + "learning_rate": 0.0001970752357129113, + "loss": 0.986657440662384, + "step": 82 + }, + { + "epoch": 0.04786620530565167, + "grad_norm": 0.7971145510673523, + "learning_rate": 0.00019703675197229172, + "loss": 1.3205912113189697, + "step": 83 + }, + { + "epoch": 0.04844290657439446, + "grad_norm": 0.6297418475151062, + "learning_rate": 0.00019699826823167213, + "loss": 1.3360888957977295, + "step": 84 + }, + { + "epoch": 0.049019607843137254, + "grad_norm": 0.9845420718193054, + "learning_rate": 0.00019695978449105255, + "loss": 1.4006659984588623, + "step": 85 + }, + { + "epoch": 0.049596309111880045, + "grad_norm": 0.73700350522995, + "learning_rate": 0.00019692130075043296, + "loss": 1.1298922300338745, + "step": 86 + }, + { + "epoch": 0.050173010380622836, + "grad_norm": 0.7659608721733093, + "learning_rate": 0.00019688281700981338, + "loss": 1.2487225532531738, + "step": 87 + }, + { + "epoch": 0.05074971164936563, + "grad_norm": 0.7576966285705566, + "learning_rate": 0.0001968443332691938, + "loss": 1.346827507019043, + "step": 88 + }, + { + "epoch": 0.05132641291810842, + "grad_norm": 0.6777650117874146, + "learning_rate": 0.0001968058495285742, + "loss": 1.9484481811523438, + "step": 89 + }, + { + "epoch": 0.05190311418685121, + "grad_norm": 0.9935969114303589, + "learning_rate": 0.0001967673657879546, + "loss": 1.1737089157104492, + "step": 90 + }, + { + "epoch": 0.052479815455594, + "grad_norm": 1.0581051111221313, + "learning_rate": 0.000196728882047335, + "loss": 1.2755905389785767, + "step": 91 + }, + { + "epoch": 0.0530565167243368, + "grad_norm": 0.8372200131416321, + "learning_rate": 0.00019669039830671543, + "loss": 1.7988427877426147, + "step": 92 + }, + { + "epoch": 0.05363321799307959, + "grad_norm": 0.8300452828407288, + "learning_rate": 0.00019665191456609584, + "loss": 0.9904743432998657, + "step": 93 + }, + { + "epoch": 0.05420991926182238, + "grad_norm": 0.6703553199768066, + "learning_rate": 0.00019661343082547626, + "loss": 1.2092053890228271, + "step": 94 + }, + { + "epoch": 0.05478662053056517, + "grad_norm": 0.703804075717926, + "learning_rate": 0.00019657494708485667, + "loss": 1.1028215885162354, + "step": 95 + }, + { + "epoch": 0.05536332179930796, + "grad_norm": 0.8232657313346863, + "learning_rate": 0.0001965364633442371, + "loss": 1.3875727653503418, + "step": 96 + }, + { + "epoch": 0.05594002306805075, + "grad_norm": 0.6119164824485779, + "learning_rate": 0.00019649797960361747, + "loss": 1.161183476448059, + "step": 97 + }, + { + "epoch": 0.05651672433679354, + "grad_norm": 0.7460926175117493, + "learning_rate": 0.0001964594958629979, + "loss": 1.3667285442352295, + "step": 98 + }, + { + "epoch": 0.05709342560553633, + "grad_norm": 0.6345133185386658, + "learning_rate": 0.0001964210121223783, + "loss": 1.1740115880966187, + "step": 99 + }, + { + "epoch": 0.05767012687427912, + "grad_norm": 0.800463080406189, + "learning_rate": 0.00019638252838175872, + "loss": 1.1274670362472534, + "step": 100 + }, + { + "epoch": 0.058246828143021914, + "grad_norm": 0.6817663311958313, + "learning_rate": 0.00019634404464113914, + "loss": 1.2432150840759277, + "step": 101 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.7663673162460327, + "learning_rate": 0.00019630556090051955, + "loss": 1.2066948413848877, + "step": 102 + }, + { + "epoch": 0.059400230680507496, + "grad_norm": 1.0259535312652588, + "learning_rate": 0.00019626707715989997, + "loss": 1.3713116645812988, + "step": 103 + }, + { + "epoch": 0.05997693194925029, + "grad_norm": 0.6617158055305481, + "learning_rate": 0.00019622859341928038, + "loss": 1.0320123434066772, + "step": 104 + }, + { + "epoch": 0.06055363321799308, + "grad_norm": 1.0050235986709595, + "learning_rate": 0.00019619010967866077, + "loss": 1.5375267267227173, + "step": 105 + }, + { + "epoch": 0.06113033448673587, + "grad_norm": 0.5563177466392517, + "learning_rate": 0.00019615162593804119, + "loss": 0.9102802276611328, + "step": 106 + }, + { + "epoch": 0.06170703575547866, + "grad_norm": 0.9994164109230042, + "learning_rate": 0.0001961131421974216, + "loss": 1.6505589485168457, + "step": 107 + }, + { + "epoch": 0.06228373702422145, + "grad_norm": 0.907625675201416, + "learning_rate": 0.00019607465845680202, + "loss": 1.6013598442077637, + "step": 108 + }, + { + "epoch": 0.06286043829296424, + "grad_norm": 1.0009554624557495, + "learning_rate": 0.00019603617471618243, + "loss": 1.0403454303741455, + "step": 109 + }, + { + "epoch": 0.06343713956170703, + "grad_norm": 0.8243467807769775, + "learning_rate": 0.00019599769097556285, + "loss": 1.5382654666900635, + "step": 110 + }, + { + "epoch": 0.06401384083044982, + "grad_norm": 1.0160003900527954, + "learning_rate": 0.00019595920723494326, + "loss": 1.2732863426208496, + "step": 111 + }, + { + "epoch": 0.06459054209919261, + "grad_norm": 0.608269453048706, + "learning_rate": 0.00019592072349432368, + "loss": 1.070478916168213, + "step": 112 + }, + { + "epoch": 0.0651672433679354, + "grad_norm": 0.7176778316497803, + "learning_rate": 0.00019588223975370406, + "loss": 1.302718162536621, + "step": 113 + }, + { + "epoch": 0.0657439446366782, + "grad_norm": 0.551771879196167, + "learning_rate": 0.00019584375601308448, + "loss": 0.9242706894874573, + "step": 114 + }, + { + "epoch": 0.06632064590542099, + "grad_norm": 0.9680222868919373, + "learning_rate": 0.0001958052722724649, + "loss": 1.9658548831939697, + "step": 115 + }, + { + "epoch": 0.06689734717416378, + "grad_norm": 0.8025707602500916, + "learning_rate": 0.0001957667885318453, + "loss": 1.5753577947616577, + "step": 116 + }, + { + "epoch": 0.06747404844290658, + "grad_norm": 0.7211287021636963, + "learning_rate": 0.00019572830479122573, + "loss": 1.3677327632904053, + "step": 117 + }, + { + "epoch": 0.06805074971164937, + "grad_norm": 0.7547542452812195, + "learning_rate": 0.00019568982105060614, + "loss": 1.507096767425537, + "step": 118 + }, + { + "epoch": 0.06862745098039216, + "grad_norm": 0.6146650314331055, + "learning_rate": 0.00019565133730998656, + "loss": 1.1320711374282837, + "step": 119 + }, + { + "epoch": 0.06920415224913495, + "grad_norm": 0.7611070275306702, + "learning_rate": 0.00019561285356936697, + "loss": 1.207049012184143, + "step": 120 + }, + { + "epoch": 0.06978085351787774, + "grad_norm": 0.714883029460907, + "learning_rate": 0.00019557436982874736, + "loss": 1.3823729753494263, + "step": 121 + }, + { + "epoch": 0.07035755478662054, + "grad_norm": 0.6768732666969299, + "learning_rate": 0.00019553588608812778, + "loss": 1.3038188219070435, + "step": 122 + }, + { + "epoch": 0.07093425605536333, + "grad_norm": 0.6013675332069397, + "learning_rate": 0.0001954974023475082, + "loss": 1.056199073791504, + "step": 123 + }, + { + "epoch": 0.07151095732410612, + "grad_norm": 0.8240784406661987, + "learning_rate": 0.0001954589186068886, + "loss": 1.4242757558822632, + "step": 124 + }, + { + "epoch": 0.07208765859284891, + "grad_norm": 0.6539785265922546, + "learning_rate": 0.00019542043486626902, + "loss": 1.161075472831726, + "step": 125 + }, + { + "epoch": 0.0726643598615917, + "grad_norm": 0.6347744464874268, + "learning_rate": 0.00019538195112564944, + "loss": 1.179503321647644, + "step": 126 + }, + { + "epoch": 0.07324106113033449, + "grad_norm": 0.7294688820838928, + "learning_rate": 0.00019534346738502985, + "loss": 1.2521535158157349, + "step": 127 + }, + { + "epoch": 0.07381776239907728, + "grad_norm": 0.6087843179702759, + "learning_rate": 0.00019530498364441027, + "loss": 1.0938013792037964, + "step": 128 + }, + { + "epoch": 0.07439446366782007, + "grad_norm": 1.116716980934143, + "learning_rate": 0.00019526649990379066, + "loss": 1.74098539352417, + "step": 129 + }, + { + "epoch": 0.07497116493656286, + "grad_norm": 0.7590331435203552, + "learning_rate": 0.00019522801616317107, + "loss": 1.2943538427352905, + "step": 130 + }, + { + "epoch": 0.07554786620530565, + "grad_norm": 0.9142744541168213, + "learning_rate": 0.00019518953242255149, + "loss": 1.0948201417922974, + "step": 131 + }, + { + "epoch": 0.07612456747404844, + "grad_norm": 0.8165064454078674, + "learning_rate": 0.0001951510486819319, + "loss": 1.5152888298034668, + "step": 132 + }, + { + "epoch": 0.07670126874279123, + "grad_norm": 0.8904751539230347, + "learning_rate": 0.00019511256494131232, + "loss": 1.3492425680160522, + "step": 133 + }, + { + "epoch": 0.07727797001153403, + "grad_norm": 0.632338285446167, + "learning_rate": 0.00019507408120069273, + "loss": 1.1460604667663574, + "step": 134 + }, + { + "epoch": 0.07785467128027682, + "grad_norm": 0.6621445417404175, + "learning_rate": 0.00019503559746007315, + "loss": 1.153398871421814, + "step": 135 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.928593635559082, + "learning_rate": 0.00019499711371945356, + "loss": 1.4575080871582031, + "step": 136 + }, + { + "epoch": 0.0790080738177624, + "grad_norm": 0.9125704765319824, + "learning_rate": 0.00019495862997883395, + "loss": 1.176555871963501, + "step": 137 + }, + { + "epoch": 0.07958477508650519, + "grad_norm": 0.7735126614570618, + "learning_rate": 0.00019492014623821437, + "loss": 1.3028615713119507, + "step": 138 + }, + { + "epoch": 0.08016147635524798, + "grad_norm": 1.4182281494140625, + "learning_rate": 0.00019488166249759478, + "loss": 1.7123095989227295, + "step": 139 + }, + { + "epoch": 0.08073817762399077, + "grad_norm": 0.957777738571167, + "learning_rate": 0.0001948431787569752, + "loss": 1.2952847480773926, + "step": 140 + }, + { + "epoch": 0.08131487889273356, + "grad_norm": 0.6284865140914917, + "learning_rate": 0.0001948046950163556, + "loss": 1.063300609588623, + "step": 141 + }, + { + "epoch": 0.08189158016147635, + "grad_norm": 1.020240068435669, + "learning_rate": 0.00019476621127573603, + "loss": 1.0956578254699707, + "step": 142 + }, + { + "epoch": 0.08246828143021914, + "grad_norm": 0.9629870057106018, + "learning_rate": 0.00019472772753511644, + "loss": 1.6626744270324707, + "step": 143 + }, + { + "epoch": 0.08304498269896193, + "grad_norm": 0.723129391670227, + "learning_rate": 0.00019468924379449686, + "loss": 1.5930454730987549, + "step": 144 + }, + { + "epoch": 0.08362168396770472, + "grad_norm": 0.6031758785247803, + "learning_rate": 0.00019465076005387725, + "loss": 1.3550267219543457, + "step": 145 + }, + { + "epoch": 0.08419838523644751, + "grad_norm": 0.6608120799064636, + "learning_rate": 0.00019461227631325766, + "loss": 1.091226577758789, + "step": 146 + }, + { + "epoch": 0.0847750865051903, + "grad_norm": 0.8583825826644897, + "learning_rate": 0.00019457379257263808, + "loss": 1.2840064764022827, + "step": 147 + }, + { + "epoch": 0.0853517877739331, + "grad_norm": 0.6371753215789795, + "learning_rate": 0.0001945353088320185, + "loss": 1.0223405361175537, + "step": 148 + }, + { + "epoch": 0.08592848904267589, + "grad_norm": 0.6101475954055786, + "learning_rate": 0.0001944968250913989, + "loss": 1.2935165166854858, + "step": 149 + }, + { + "epoch": 0.08650519031141868, + "grad_norm": 0.8921840190887451, + "learning_rate": 0.00019445834135077932, + "loss": 1.3194819688796997, + "step": 150 + }, + { + "epoch": 0.08708189158016148, + "grad_norm": 1.0423651933670044, + "learning_rate": 0.0001944198576101597, + "loss": 1.162503957748413, + "step": 151 + }, + { + "epoch": 0.08765859284890427, + "grad_norm": 0.9011998772621155, + "learning_rate": 0.00019438137386954013, + "loss": 1.4854192733764648, + "step": 152 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 0.6850185990333557, + "learning_rate": 0.00019434289012892054, + "loss": 1.2653287649154663, + "step": 153 + }, + { + "epoch": 0.08881199538638986, + "grad_norm": 0.5742697715759277, + "learning_rate": 0.00019430440638830093, + "loss": 1.1639142036437988, + "step": 154 + }, + { + "epoch": 0.08938869665513265, + "grad_norm": 0.5625914931297302, + "learning_rate": 0.00019426592264768134, + "loss": 1.0387107133865356, + "step": 155 + }, + { + "epoch": 0.08996539792387544, + "grad_norm": 0.7183355689048767, + "learning_rate": 0.00019422743890706176, + "loss": 1.211965799331665, + "step": 156 + }, + { + "epoch": 0.09054209919261823, + "grad_norm": 0.8835011124610901, + "learning_rate": 0.00019418895516644217, + "loss": 1.0958670377731323, + "step": 157 + }, + { + "epoch": 0.09111880046136102, + "grad_norm": 0.6885069608688354, + "learning_rate": 0.0001941504714258226, + "loss": 1.297393798828125, + "step": 158 + }, + { + "epoch": 0.09169550173010381, + "grad_norm": 0.7518923878669739, + "learning_rate": 0.000194111987685203, + "loss": 1.1739790439605713, + "step": 159 + }, + { + "epoch": 0.0922722029988466, + "grad_norm": 0.8452180027961731, + "learning_rate": 0.00019407350394458342, + "loss": 1.2312185764312744, + "step": 160 + }, + { + "epoch": 0.09284890426758939, + "grad_norm": 0.8018324971199036, + "learning_rate": 0.00019403502020396384, + "loss": 1.392999291419983, + "step": 161 + }, + { + "epoch": 0.09342560553633218, + "grad_norm": 0.743302583694458, + "learning_rate": 0.00019399653646334422, + "loss": 1.1602349281311035, + "step": 162 + }, + { + "epoch": 0.09400230680507497, + "grad_norm": 0.551163911819458, + "learning_rate": 0.00019395805272272464, + "loss": 1.0061742067337036, + "step": 163 + }, + { + "epoch": 0.09457900807381776, + "grad_norm": 0.6732088327407837, + "learning_rate": 0.00019391956898210505, + "loss": 1.2422168254852295, + "step": 164 + }, + { + "epoch": 0.09515570934256055, + "grad_norm": 0.6432737708091736, + "learning_rate": 0.00019388108524148547, + "loss": 0.8992981910705566, + "step": 165 + }, + { + "epoch": 0.09573241061130335, + "grad_norm": 0.893099308013916, + "learning_rate": 0.00019384260150086589, + "loss": 1.4426004886627197, + "step": 166 + }, + { + "epoch": 0.09630911188004614, + "grad_norm": 0.7915064692497253, + "learning_rate": 0.0001938041177602463, + "loss": 1.1332988739013672, + "step": 167 + }, + { + "epoch": 0.09688581314878893, + "grad_norm": 0.7785482406616211, + "learning_rate": 0.00019376563401962672, + "loss": 1.1662797927856445, + "step": 168 + }, + { + "epoch": 0.09746251441753172, + "grad_norm": 0.7676025032997131, + "learning_rate": 0.00019372715027900713, + "loss": 1.276615858078003, + "step": 169 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 0.7058248519897461, + "learning_rate": 0.00019368866653838752, + "loss": 1.2280982732772827, + "step": 170 + }, + { + "epoch": 0.0986159169550173, + "grad_norm": 0.7814574241638184, + "learning_rate": 0.00019365018279776793, + "loss": 1.6545538902282715, + "step": 171 + }, + { + "epoch": 0.09919261822376009, + "grad_norm": 0.5429863333702087, + "learning_rate": 0.00019361169905714835, + "loss": 1.047904133796692, + "step": 172 + }, + { + "epoch": 0.09976931949250288, + "grad_norm": 0.7021914124488831, + "learning_rate": 0.00019357321531652876, + "loss": 1.3578035831451416, + "step": 173 + }, + { + "epoch": 0.10034602076124567, + "grad_norm": 0.7608473896980286, + "learning_rate": 0.00019353473157590918, + "loss": 1.3332273960113525, + "step": 174 + }, + { + "epoch": 0.10092272202998846, + "grad_norm": 0.8988219499588013, + "learning_rate": 0.0001934962478352896, + "loss": 1.5955560207366943, + "step": 175 + }, + { + "epoch": 0.10149942329873125, + "grad_norm": 0.8784334659576416, + "learning_rate": 0.00019345776409467, + "loss": 1.4267313480377197, + "step": 176 + }, + { + "epoch": 0.10207612456747404, + "grad_norm": 0.9006462097167969, + "learning_rate": 0.00019341928035405043, + "loss": 1.2960124015808105, + "step": 177 + }, + { + "epoch": 0.10265282583621683, + "grad_norm": 0.7736122608184814, + "learning_rate": 0.00019338079661343081, + "loss": 1.3841434717178345, + "step": 178 + }, + { + "epoch": 0.10322952710495963, + "grad_norm": 0.8202458620071411, + "learning_rate": 0.00019334231287281123, + "loss": 1.2962226867675781, + "step": 179 + }, + { + "epoch": 0.10380622837370242, + "grad_norm": 0.743390679359436, + "learning_rate": 0.00019330382913219164, + "loss": 1.010484218597412, + "step": 180 + }, + { + "epoch": 0.10438292964244521, + "grad_norm": 0.7926476001739502, + "learning_rate": 0.00019326534539157206, + "loss": 1.45333731174469, + "step": 181 + }, + { + "epoch": 0.104959630911188, + "grad_norm": 0.527367889881134, + "learning_rate": 0.00019322686165095248, + "loss": 0.7763160467147827, + "step": 182 + }, + { + "epoch": 0.10553633217993079, + "grad_norm": 1.0006170272827148, + "learning_rate": 0.0001931883779103329, + "loss": 1.089290738105774, + "step": 183 + }, + { + "epoch": 0.1061130334486736, + "grad_norm": 0.7497840523719788, + "learning_rate": 0.0001931498941697133, + "loss": 1.1641783714294434, + "step": 184 + }, + { + "epoch": 0.10668973471741638, + "grad_norm": 0.6732814908027649, + "learning_rate": 0.00019311141042909372, + "loss": 1.0954653024673462, + "step": 185 + }, + { + "epoch": 0.10726643598615918, + "grad_norm": 0.7817464470863342, + "learning_rate": 0.0001930729266884741, + "loss": 1.5050190687179565, + "step": 186 + }, + { + "epoch": 0.10784313725490197, + "grad_norm": 0.813869297504425, + "learning_rate": 0.00019303444294785452, + "loss": 1.5048751831054688, + "step": 187 + }, + { + "epoch": 0.10841983852364476, + "grad_norm": 0.6368386745452881, + "learning_rate": 0.00019299595920723494, + "loss": 1.0601242780685425, + "step": 188 + }, + { + "epoch": 0.10899653979238755, + "grad_norm": 0.817610502243042, + "learning_rate": 0.00019295747546661536, + "loss": 1.2267041206359863, + "step": 189 + }, + { + "epoch": 0.10957324106113034, + "grad_norm": 0.768892228603363, + "learning_rate": 0.00019291899172599577, + "loss": 1.0935152769088745, + "step": 190 + }, + { + "epoch": 0.11014994232987313, + "grad_norm": 0.8072124123573303, + "learning_rate": 0.00019288050798537619, + "loss": 1.5566798448562622, + "step": 191 + }, + { + "epoch": 0.11072664359861592, + "grad_norm": 0.7275574803352356, + "learning_rate": 0.0001928420242447566, + "loss": 1.5278323888778687, + "step": 192 + }, + { + "epoch": 0.11130334486735871, + "grad_norm": 0.6448370814323425, + "learning_rate": 0.00019280354050413702, + "loss": 1.2096084356307983, + "step": 193 + }, + { + "epoch": 0.1118800461361015, + "grad_norm": 0.9334590435028076, + "learning_rate": 0.0001927650567635174, + "loss": 1.2487378120422363, + "step": 194 + }, + { + "epoch": 0.11245674740484429, + "grad_norm": 0.6830427646636963, + "learning_rate": 0.00019272657302289782, + "loss": 1.3567012548446655, + "step": 195 + }, + { + "epoch": 0.11303344867358708, + "grad_norm": 0.9035089612007141, + "learning_rate": 0.00019268808928227823, + "loss": 1.1751577854156494, + "step": 196 + }, + { + "epoch": 0.11361014994232987, + "grad_norm": 0.5569579005241394, + "learning_rate": 0.00019264960554165865, + "loss": 1.0159823894500732, + "step": 197 + }, + { + "epoch": 0.11418685121107267, + "grad_norm": 0.6232113838195801, + "learning_rate": 0.00019261112180103907, + "loss": 1.0779603719711304, + "step": 198 + }, + { + "epoch": 0.11476355247981546, + "grad_norm": 0.7666590213775635, + "learning_rate": 0.00019257263806041948, + "loss": 1.2052793502807617, + "step": 199 + }, + { + "epoch": 0.11534025374855825, + "grad_norm": 0.6218665242195129, + "learning_rate": 0.0001925341543197999, + "loss": 1.2699958086013794, + "step": 200 + }, + { + "epoch": 0.11591695501730104, + "grad_norm": 0.6059345006942749, + "learning_rate": 0.0001924956705791803, + "loss": 1.0522977113723755, + "step": 201 + }, + { + "epoch": 0.11649365628604383, + "grad_norm": 0.6952403783798218, + "learning_rate": 0.0001924571868385607, + "loss": 1.3461261987686157, + "step": 202 + }, + { + "epoch": 0.11707035755478662, + "grad_norm": 0.7097076177597046, + "learning_rate": 0.00019241870309794111, + "loss": 1.0901520252227783, + "step": 203 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 1.3426554203033447, + "learning_rate": 0.00019238021935732153, + "loss": 1.8886399269104004, + "step": 204 + }, + { + "epoch": 0.1182237600922722, + "grad_norm": 1.00478196144104, + "learning_rate": 0.00019234173561670195, + "loss": 1.2172045707702637, + "step": 205 + }, + { + "epoch": 0.11880046136101499, + "grad_norm": 0.8586134314537048, + "learning_rate": 0.00019230325187608236, + "loss": 1.0469045639038086, + "step": 206 + }, + { + "epoch": 0.11937716262975778, + "grad_norm": 0.7872591018676758, + "learning_rate": 0.00019226476813546278, + "loss": 1.1137733459472656, + "step": 207 + }, + { + "epoch": 0.11995386389850057, + "grad_norm": 0.8721824884414673, + "learning_rate": 0.0001922262843948432, + "loss": 1.3743940591812134, + "step": 208 + }, + { + "epoch": 0.12053056516724336, + "grad_norm": 0.6212759613990784, + "learning_rate": 0.0001921878006542236, + "loss": 0.900457501411438, + "step": 209 + }, + { + "epoch": 0.12110726643598616, + "grad_norm": 1.0083750486373901, + "learning_rate": 0.000192149316913604, + "loss": 1.339089035987854, + "step": 210 + }, + { + "epoch": 0.12168396770472895, + "grad_norm": 0.794417142868042, + "learning_rate": 0.0001921108331729844, + "loss": 1.194704532623291, + "step": 211 + }, + { + "epoch": 0.12226066897347174, + "grad_norm": 1.1438184976577759, + "learning_rate": 0.00019207234943236483, + "loss": 1.3168675899505615, + "step": 212 + }, + { + "epoch": 0.12283737024221453, + "grad_norm": 0.5655554533004761, + "learning_rate": 0.00019203386569174524, + "loss": 1.008853793144226, + "step": 213 + }, + { + "epoch": 0.12341407151095732, + "grad_norm": 0.7868179082870483, + "learning_rate": 0.00019199538195112566, + "loss": 1.3174118995666504, + "step": 214 + }, + { + "epoch": 0.12399077277970011, + "grad_norm": 0.6736404299736023, + "learning_rate": 0.00019195689821050607, + "loss": 1.054055094718933, + "step": 215 + }, + { + "epoch": 0.1245674740484429, + "grad_norm": 0.7425172328948975, + "learning_rate": 0.00019191841446988649, + "loss": 1.2892072200775146, + "step": 216 + }, + { + "epoch": 0.1251441753171857, + "grad_norm": 0.7724793553352356, + "learning_rate": 0.00019187993072926687, + "loss": 1.3278907537460327, + "step": 217 + }, + { + "epoch": 0.12572087658592848, + "grad_norm": 0.7415600419044495, + "learning_rate": 0.0001918414469886473, + "loss": 1.1893579959869385, + "step": 218 + }, + { + "epoch": 0.12629757785467127, + "grad_norm": 0.8178536295890808, + "learning_rate": 0.0001918029632480277, + "loss": 1.3486452102661133, + "step": 219 + }, + { + "epoch": 0.12687427912341406, + "grad_norm": 0.803683340549469, + "learning_rate": 0.00019176447950740812, + "loss": 1.297539234161377, + "step": 220 + }, + { + "epoch": 0.12745098039215685, + "grad_norm": 0.6226982474327087, + "learning_rate": 0.00019172599576678854, + "loss": 1.0952314138412476, + "step": 221 + }, + { + "epoch": 0.12802768166089964, + "grad_norm": 0.652317225933075, + "learning_rate": 0.00019168751202616895, + "loss": 0.9360387325286865, + "step": 222 + }, + { + "epoch": 0.12860438292964244, + "grad_norm": 0.8147749900817871, + "learning_rate": 0.00019164902828554937, + "loss": 1.0632787942886353, + "step": 223 + }, + { + "epoch": 0.12918108419838523, + "grad_norm": 0.9202223420143127, + "learning_rate": 0.00019161054454492978, + "loss": 1.3678290843963623, + "step": 224 + }, + { + "epoch": 0.12975778546712802, + "grad_norm": 1.1951165199279785, + "learning_rate": 0.00019157206080431017, + "loss": 1.2670767307281494, + "step": 225 + }, + { + "epoch": 0.1303344867358708, + "grad_norm": 0.7266793847084045, + "learning_rate": 0.00019153357706369058, + "loss": 1.1158084869384766, + "step": 226 + }, + { + "epoch": 0.1309111880046136, + "grad_norm": 0.6181395649909973, + "learning_rate": 0.000191495093323071, + "loss": 1.1156044006347656, + "step": 227 + }, + { + "epoch": 0.1314878892733564, + "grad_norm": 0.7921776175498962, + "learning_rate": 0.00019145660958245142, + "loss": 1.001752257347107, + "step": 228 + }, + { + "epoch": 0.13206459054209918, + "grad_norm": 0.5998401045799255, + "learning_rate": 0.00019141812584183183, + "loss": 0.7688826322555542, + "step": 229 + }, + { + "epoch": 0.13264129181084197, + "grad_norm": 0.7660285234451294, + "learning_rate": 0.00019137964210121225, + "loss": 1.2462745904922485, + "step": 230 + }, + { + "epoch": 0.13321799307958476, + "grad_norm": 0.7925796508789062, + "learning_rate": 0.00019134115836059266, + "loss": 1.1053651571273804, + "step": 231 + }, + { + "epoch": 0.13379469434832755, + "grad_norm": 0.6407649517059326, + "learning_rate": 0.00019130267461997308, + "loss": 0.8710946440696716, + "step": 232 + }, + { + "epoch": 0.13437139561707034, + "grad_norm": 0.7516645789146423, + "learning_rate": 0.00019126419087935346, + "loss": 1.009436011314392, + "step": 233 + }, + { + "epoch": 0.13494809688581316, + "grad_norm": 0.5998948216438293, + "learning_rate": 0.00019122570713873388, + "loss": 1.0309457778930664, + "step": 234 + }, + { + "epoch": 0.13552479815455595, + "grad_norm": 1.1897567510604858, + "learning_rate": 0.0001911872233981143, + "loss": 0.9930981397628784, + "step": 235 + }, + { + "epoch": 0.13610149942329874, + "grad_norm": 0.7404462695121765, + "learning_rate": 0.0001911487396574947, + "loss": 1.1489670276641846, + "step": 236 + }, + { + "epoch": 0.13667820069204153, + "grad_norm": 0.7168471813201904, + "learning_rate": 0.00019111025591687513, + "loss": 1.202157735824585, + "step": 237 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.7502639293670654, + "learning_rate": 0.00019107177217625554, + "loss": 1.022951364517212, + "step": 238 + }, + { + "epoch": 0.13783160322952712, + "grad_norm": 0.6795151233673096, + "learning_rate": 0.00019103328843563596, + "loss": 1.1194236278533936, + "step": 239 + }, + { + "epoch": 0.1384083044982699, + "grad_norm": 0.7620200514793396, + "learning_rate": 0.00019099480469501637, + "loss": 0.8411365747451782, + "step": 240 + }, + { + "epoch": 0.1389850057670127, + "grad_norm": 0.6618032455444336, + "learning_rate": 0.00019095632095439676, + "loss": 0.7801553606987, + "step": 241 + }, + { + "epoch": 0.1395617070357555, + "grad_norm": 0.9366044402122498, + "learning_rate": 0.00019091783721377718, + "loss": 1.0621672868728638, + "step": 242 + }, + { + "epoch": 0.14013840830449828, + "grad_norm": 1.0874788761138916, + "learning_rate": 0.0001908793534731576, + "loss": 1.6787068843841553, + "step": 243 + }, + { + "epoch": 0.14071510957324107, + "grad_norm": 0.8962084054946899, + "learning_rate": 0.000190840869732538, + "loss": 1.1922732591629028, + "step": 244 + }, + { + "epoch": 0.14129181084198386, + "grad_norm": 0.7039315700531006, + "learning_rate": 0.00019080238599191842, + "loss": 1.177897334098816, + "step": 245 + }, + { + "epoch": 0.14186851211072665, + "grad_norm": 0.9172819256782532, + "learning_rate": 0.00019076390225129884, + "loss": 1.3276829719543457, + "step": 246 + }, + { + "epoch": 0.14244521337946944, + "grad_norm": 1.002533197402954, + "learning_rate": 0.00019072541851067925, + "loss": 1.11848783493042, + "step": 247 + }, + { + "epoch": 0.14302191464821223, + "grad_norm": 0.9164738059043884, + "learning_rate": 0.00019068693477005967, + "loss": 0.7153259515762329, + "step": 248 + }, + { + "epoch": 0.14359861591695502, + "grad_norm": 0.7163867354393005, + "learning_rate": 0.00019064845102944006, + "loss": 1.206921100616455, + "step": 249 + }, + { + "epoch": 0.14417531718569782, + "grad_norm": 0.8200199604034424, + "learning_rate": 0.00019060996728882047, + "loss": 0.9798004031181335, + "step": 250 + }, + { + "epoch": 0.1447520184544406, + "grad_norm": 0.9806034564971924, + "learning_rate": 0.00019057148354820089, + "loss": 1.0969898700714111, + "step": 251 + }, + { + "epoch": 0.1453287197231834, + "grad_norm": 1.0849624872207642, + "learning_rate": 0.0001905329998075813, + "loss": 1.2618253231048584, + "step": 252 + }, + { + "epoch": 0.1459054209919262, + "grad_norm": 0.8736698031425476, + "learning_rate": 0.00019049451606696172, + "loss": 1.1534979343414307, + "step": 253 + }, + { + "epoch": 0.14648212226066898, + "grad_norm": 0.6748337745666504, + "learning_rate": 0.00019045603232634213, + "loss": 0.9178370237350464, + "step": 254 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 0.8655548691749573, + "learning_rate": 0.00019041754858572255, + "loss": 1.157179355621338, + "step": 255 + }, + { + "epoch": 0.14763552479815456, + "grad_norm": 0.7558174133300781, + "learning_rate": 0.00019037906484510296, + "loss": 0.7844438552856445, + "step": 256 + }, + { + "epoch": 0.14821222606689735, + "grad_norm": 0.8278117179870605, + "learning_rate": 0.00019034058110448335, + "loss": 1.4085724353790283, + "step": 257 + }, + { + "epoch": 0.14878892733564014, + "grad_norm": 0.9563509225845337, + "learning_rate": 0.00019030209736386377, + "loss": 1.244802713394165, + "step": 258 + }, + { + "epoch": 0.14936562860438293, + "grad_norm": 0.8018333315849304, + "learning_rate": 0.00019026361362324418, + "loss": 0.801522970199585, + "step": 259 + }, + { + "epoch": 0.14994232987312572, + "grad_norm": 0.555248498916626, + "learning_rate": 0.0001902251298826246, + "loss": 0.8989696502685547, + "step": 260 + }, + { + "epoch": 0.15051903114186851, + "grad_norm": 0.5092940926551819, + "learning_rate": 0.000190186646142005, + "loss": 0.8229849338531494, + "step": 261 + }, + { + "epoch": 0.1510957324106113, + "grad_norm": 0.614162266254425, + "learning_rate": 0.00019014816240138543, + "loss": 1.14143705368042, + "step": 262 + }, + { + "epoch": 0.1516724336793541, + "grad_norm": 0.7050411701202393, + "learning_rate": 0.00019010967866076584, + "loss": 1.2602849006652832, + "step": 263 + }, + { + "epoch": 0.1522491349480969, + "grad_norm": 0.8917875289916992, + "learning_rate": 0.00019007119492014626, + "loss": 1.2684617042541504, + "step": 264 + }, + { + "epoch": 0.15282583621683968, + "grad_norm": 0.7177139520645142, + "learning_rate": 0.00019003271117952665, + "loss": 0.664681077003479, + "step": 265 + }, + { + "epoch": 0.15340253748558247, + "grad_norm": 0.7513463497161865, + "learning_rate": 0.00018999422743890706, + "loss": 0.9689874649047852, + "step": 266 + }, + { + "epoch": 0.15397923875432526, + "grad_norm": 0.8350100517272949, + "learning_rate": 0.00018995574369828748, + "loss": 1.222740888595581, + "step": 267 + }, + { + "epoch": 0.15455594002306805, + "grad_norm": 1.152787685394287, + "learning_rate": 0.0001899172599576679, + "loss": 1.0707926750183105, + "step": 268 + }, + { + "epoch": 0.15513264129181084, + "grad_norm": 0.7810789346694946, + "learning_rate": 0.0001898787762170483, + "loss": 1.1552890539169312, + "step": 269 + }, + { + "epoch": 0.15570934256055363, + "grad_norm": 0.864863395690918, + "learning_rate": 0.00018984029247642872, + "loss": 1.2455859184265137, + "step": 270 + }, + { + "epoch": 0.15628604382929642, + "grad_norm": 0.578794002532959, + "learning_rate": 0.00018980180873580914, + "loss": 0.9284070730209351, + "step": 271 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.9245108962059021, + "learning_rate": 0.00018976332499518955, + "loss": 0.8936307430267334, + "step": 272 + }, + { + "epoch": 0.157439446366782, + "grad_norm": 1.022964358329773, + "learning_rate": 0.00018972484125456994, + "loss": 1.2052812576293945, + "step": 273 + }, + { + "epoch": 0.1580161476355248, + "grad_norm": 0.6136555075645447, + "learning_rate": 0.00018968635751395036, + "loss": 0.9395220875740051, + "step": 274 + }, + { + "epoch": 0.15859284890426759, + "grad_norm": 0.49354949593544006, + "learning_rate": 0.00018964787377333077, + "loss": 0.7979940176010132, + "step": 275 + }, + { + "epoch": 0.15916955017301038, + "grad_norm": 0.8118260502815247, + "learning_rate": 0.00018960939003271119, + "loss": 1.3310189247131348, + "step": 276 + }, + { + "epoch": 0.15974625144175317, + "grad_norm": 0.7864040732383728, + "learning_rate": 0.0001895709062920916, + "loss": 0.995107889175415, + "step": 277 + }, + { + "epoch": 0.16032295271049596, + "grad_norm": 0.7795019149780273, + "learning_rate": 0.00018953242255147202, + "loss": 1.031097412109375, + "step": 278 + }, + { + "epoch": 0.16089965397923875, + "grad_norm": 0.7358199954032898, + "learning_rate": 0.00018949393881085243, + "loss": 1.2151832580566406, + "step": 279 + }, + { + "epoch": 0.16147635524798154, + "grad_norm": 0.592187225818634, + "learning_rate": 0.00018945545507023285, + "loss": 1.18082857131958, + "step": 280 + }, + { + "epoch": 0.16205305651672433, + "grad_norm": 0.6349275708198547, + "learning_rate": 0.00018941697132961324, + "loss": 1.0011241436004639, + "step": 281 + }, + { + "epoch": 0.16262975778546712, + "grad_norm": 0.827673614025116, + "learning_rate": 0.00018937848758899365, + "loss": 1.1634137630462646, + "step": 282 + }, + { + "epoch": 0.1632064590542099, + "grad_norm": 0.7459465861320496, + "learning_rate": 0.00018934000384837407, + "loss": 1.2054771184921265, + "step": 283 + }, + { + "epoch": 0.1637831603229527, + "grad_norm": 0.8688679337501526, + "learning_rate": 0.00018930152010775448, + "loss": 1.5523681640625, + "step": 284 + }, + { + "epoch": 0.1643598615916955, + "grad_norm": 0.5501953959465027, + "learning_rate": 0.0001892630363671349, + "loss": 0.8807846903800964, + "step": 285 + }, + { + "epoch": 0.16493656286043828, + "grad_norm": 0.9370623230934143, + "learning_rate": 0.0001892245526265153, + "loss": 1.480832815170288, + "step": 286 + }, + { + "epoch": 0.16551326412918108, + "grad_norm": 0.824664831161499, + "learning_rate": 0.00018918606888589573, + "loss": 1.1490377187728882, + "step": 287 + }, + { + "epoch": 0.16608996539792387, + "grad_norm": 0.6960827708244324, + "learning_rate": 0.00018914758514527614, + "loss": 0.9883493185043335, + "step": 288 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.5384089946746826, + "learning_rate": 0.00018910910140465653, + "loss": 0.9772455096244812, + "step": 289 + }, + { + "epoch": 0.16724336793540945, + "grad_norm": 0.5826528072357178, + "learning_rate": 0.00018907061766403695, + "loss": 0.80659019947052, + "step": 290 + }, + { + "epoch": 0.16782006920415224, + "grad_norm": 0.8662609457969666, + "learning_rate": 0.00018903213392341736, + "loss": 1.438920497894287, + "step": 291 + }, + { + "epoch": 0.16839677047289503, + "grad_norm": 0.8694437742233276, + "learning_rate": 0.00018899365018279778, + "loss": 1.594082236289978, + "step": 292 + }, + { + "epoch": 0.16897347174163782, + "grad_norm": 0.9895355701446533, + "learning_rate": 0.0001889551664421782, + "loss": 1.1623947620391846, + "step": 293 + }, + { + "epoch": 0.1695501730103806, + "grad_norm": 0.7757118940353394, + "learning_rate": 0.0001889166827015586, + "loss": 1.2969348430633545, + "step": 294 + }, + { + "epoch": 0.1701268742791234, + "grad_norm": 1.1235777139663696, + "learning_rate": 0.00018887819896093902, + "loss": 1.5447598695755005, + "step": 295 + }, + { + "epoch": 0.1707035755478662, + "grad_norm": 0.5995392799377441, + "learning_rate": 0.00018883971522031944, + "loss": 1.1860620975494385, + "step": 296 + }, + { + "epoch": 0.17128027681660898, + "grad_norm": 0.7350177764892578, + "learning_rate": 0.00018880123147969983, + "loss": 1.1964070796966553, + "step": 297 + }, + { + "epoch": 0.17185697808535177, + "grad_norm": 0.7769676446914673, + "learning_rate": 0.00018876274773908024, + "loss": 0.9732775688171387, + "step": 298 + }, + { + "epoch": 0.17243367935409457, + "grad_norm": 1.0317054986953735, + "learning_rate": 0.00018872426399846066, + "loss": 1.1931625604629517, + "step": 299 + }, + { + "epoch": 0.17301038062283736, + "grad_norm": 0.855571985244751, + "learning_rate": 0.00018868578025784107, + "loss": 1.2726032733917236, + "step": 300 + }, + { + "epoch": 0.17358708189158017, + "grad_norm": 1.0038337707519531, + "learning_rate": 0.0001886472965172215, + "loss": 1.3021737337112427, + "step": 301 + }, + { + "epoch": 0.17416378316032297, + "grad_norm": 1.05097496509552, + "learning_rate": 0.0001886088127766019, + "loss": 1.6369917392730713, + "step": 302 + }, + { + "epoch": 0.17474048442906576, + "grad_norm": 0.6620575189590454, + "learning_rate": 0.00018857032903598232, + "loss": 1.0873693227767944, + "step": 303 + }, + { + "epoch": 0.17531718569780855, + "grad_norm": 0.8430469036102295, + "learning_rate": 0.0001885318452953627, + "loss": 1.1750123500823975, + "step": 304 + }, + { + "epoch": 0.17589388696655134, + "grad_norm": 0.8181238174438477, + "learning_rate": 0.00018849336155474312, + "loss": 1.3522461652755737, + "step": 305 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.6994307041168213, + "learning_rate": 0.00018845487781412354, + "loss": 1.327797293663025, + "step": 306 + }, + { + "epoch": 0.17704728950403692, + "grad_norm": 0.7090145349502563, + "learning_rate": 0.00018841639407350395, + "loss": 1.3075491189956665, + "step": 307 + }, + { + "epoch": 0.1776239907727797, + "grad_norm": 0.7612029314041138, + "learning_rate": 0.00018837791033288437, + "loss": 1.0585792064666748, + "step": 308 + }, + { + "epoch": 0.1782006920415225, + "grad_norm": 0.8592241406440735, + "learning_rate": 0.00018833942659226478, + "loss": 0.6441008448600769, + "step": 309 + }, + { + "epoch": 0.1787773933102653, + "grad_norm": 1.0303255319595337, + "learning_rate": 0.0001883009428516452, + "loss": 1.520599365234375, + "step": 310 + }, + { + "epoch": 0.17935409457900808, + "grad_norm": 0.80874103307724, + "learning_rate": 0.0001882624591110256, + "loss": 0.902335524559021, + "step": 311 + }, + { + "epoch": 0.17993079584775087, + "grad_norm": 0.7039778232574463, + "learning_rate": 0.000188223975370406, + "loss": 1.0226070880889893, + "step": 312 + }, + { + "epoch": 0.18050749711649366, + "grad_norm": 0.7102690935134888, + "learning_rate": 0.00018818549162978642, + "loss": 1.0590555667877197, + "step": 313 + }, + { + "epoch": 0.18108419838523646, + "grad_norm": 1.0405141115188599, + "learning_rate": 0.00018814700788916683, + "loss": 1.4237335920333862, + "step": 314 + }, + { + "epoch": 0.18166089965397925, + "grad_norm": 0.6633170247077942, + "learning_rate": 0.00018810852414854725, + "loss": 0.9277420043945312, + "step": 315 + }, + { + "epoch": 0.18223760092272204, + "grad_norm": 0.6740328073501587, + "learning_rate": 0.00018807004040792766, + "loss": 1.053580403327942, + "step": 316 + }, + { + "epoch": 0.18281430219146483, + "grad_norm": 0.6842854619026184, + "learning_rate": 0.00018803155666730808, + "loss": 1.0379540920257568, + "step": 317 + }, + { + "epoch": 0.18339100346020762, + "grad_norm": 0.6766674518585205, + "learning_rate": 0.0001879930729266885, + "loss": 0.9214432835578918, + "step": 318 + }, + { + "epoch": 0.1839677047289504, + "grad_norm": 0.8358355164527893, + "learning_rate": 0.0001879545891860689, + "loss": 1.069684624671936, + "step": 319 + }, + { + "epoch": 0.1845444059976932, + "grad_norm": 0.9044516086578369, + "learning_rate": 0.0001879161054454493, + "loss": 1.4757916927337646, + "step": 320 + }, + { + "epoch": 0.185121107266436, + "grad_norm": 0.9662521481513977, + "learning_rate": 0.0001878776217048297, + "loss": 1.3449480533599854, + "step": 321 + }, + { + "epoch": 0.18569780853517878, + "grad_norm": 0.8681714534759521, + "learning_rate": 0.00018783913796421013, + "loss": 1.2057011127471924, + "step": 322 + }, + { + "epoch": 0.18627450980392157, + "grad_norm": 0.7318335175514221, + "learning_rate": 0.00018780065422359054, + "loss": 1.276970386505127, + "step": 323 + }, + { + "epoch": 0.18685121107266436, + "grad_norm": 0.798865556716919, + "learning_rate": 0.00018776217048297096, + "loss": 1.1334099769592285, + "step": 324 + }, + { + "epoch": 0.18742791234140715, + "grad_norm": 0.6787270903587341, + "learning_rate": 0.00018772368674235137, + "loss": 1.0829839706420898, + "step": 325 + }, + { + "epoch": 0.18800461361014995, + "grad_norm": 0.705894947052002, + "learning_rate": 0.0001876852030017318, + "loss": 1.3146710395812988, + "step": 326 + }, + { + "epoch": 0.18858131487889274, + "grad_norm": 0.7403978705406189, + "learning_rate": 0.0001876467192611122, + "loss": 0.7811852693557739, + "step": 327 + }, + { + "epoch": 0.18915801614763553, + "grad_norm": 0.8138331770896912, + "learning_rate": 0.0001876082355204926, + "loss": 1.3800559043884277, + "step": 328 + }, + { + "epoch": 0.18973471741637832, + "grad_norm": 1.0053505897521973, + "learning_rate": 0.000187569751779873, + "loss": 1.502892017364502, + "step": 329 + }, + { + "epoch": 0.1903114186851211, + "grad_norm": 1.2905986309051514, + "learning_rate": 0.00018753126803925342, + "loss": 1.6044906377792358, + "step": 330 + }, + { + "epoch": 0.1908881199538639, + "grad_norm": 0.7266846299171448, + "learning_rate": 0.00018749278429863384, + "loss": 0.8269582390785217, + "step": 331 + }, + { + "epoch": 0.1914648212226067, + "grad_norm": 0.9892683029174805, + "learning_rate": 0.00018745430055801425, + "loss": 1.2374012470245361, + "step": 332 + }, + { + "epoch": 0.19204152249134948, + "grad_norm": 0.8026344180107117, + "learning_rate": 0.00018741581681739467, + "loss": 0.9166598916053772, + "step": 333 + }, + { + "epoch": 0.19261822376009227, + "grad_norm": 0.7790790796279907, + "learning_rate": 0.00018737733307677508, + "loss": 0.8837241530418396, + "step": 334 + }, + { + "epoch": 0.19319492502883506, + "grad_norm": 0.8625907897949219, + "learning_rate": 0.0001873388493361555, + "loss": 1.0963804721832275, + "step": 335 + }, + { + "epoch": 0.19377162629757785, + "grad_norm": 0.8408490419387817, + "learning_rate": 0.00018730036559553589, + "loss": 1.2887423038482666, + "step": 336 + }, + { + "epoch": 0.19434832756632064, + "grad_norm": 0.8141940236091614, + "learning_rate": 0.0001872618818549163, + "loss": 1.234419584274292, + "step": 337 + }, + { + "epoch": 0.19492502883506344, + "grad_norm": 0.7913158535957336, + "learning_rate": 0.00018722339811429672, + "loss": 0.8931217193603516, + "step": 338 + }, + { + "epoch": 0.19550173010380623, + "grad_norm": 0.9377291202545166, + "learning_rate": 0.00018718491437367713, + "loss": 1.1958264112472534, + "step": 339 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.1096664667129517, + "learning_rate": 0.00018714643063305755, + "loss": 0.871677041053772, + "step": 340 + }, + { + "epoch": 0.1966551326412918, + "grad_norm": 0.7379001379013062, + "learning_rate": 0.00018710794689243796, + "loss": 0.9309886693954468, + "step": 341 + }, + { + "epoch": 0.1972318339100346, + "grad_norm": 0.738572895526886, + "learning_rate": 0.00018706946315181838, + "loss": 1.065298080444336, + "step": 342 + }, + { + "epoch": 0.1978085351787774, + "grad_norm": 0.8279491066932678, + "learning_rate": 0.0001870309794111988, + "loss": 1.0682514905929565, + "step": 343 + }, + { + "epoch": 0.19838523644752018, + "grad_norm": 0.9108213782310486, + "learning_rate": 0.00018699249567057918, + "loss": 1.2043181657791138, + "step": 344 + }, + { + "epoch": 0.19896193771626297, + "grad_norm": 0.9347065687179565, + "learning_rate": 0.0001869540119299596, + "loss": 1.5744340419769287, + "step": 345 + }, + { + "epoch": 0.19953863898500576, + "grad_norm": 0.5783383250236511, + "learning_rate": 0.00018691552818934, + "loss": 0.7808327674865723, + "step": 346 + }, + { + "epoch": 0.20011534025374855, + "grad_norm": 0.661321759223938, + "learning_rate": 0.00018687704444872043, + "loss": 0.9458237290382385, + "step": 347 + }, + { + "epoch": 0.20069204152249134, + "grad_norm": 0.5592895746231079, + "learning_rate": 0.00018683856070810084, + "loss": 0.8761368989944458, + "step": 348 + }, + { + "epoch": 0.20126874279123413, + "grad_norm": 0.6626494526863098, + "learning_rate": 0.00018680007696748126, + "loss": 0.9110841751098633, + "step": 349 + }, + { + "epoch": 0.20184544405997693, + "grad_norm": 0.8392354249954224, + "learning_rate": 0.00018676159322686167, + "loss": 1.234721302986145, + "step": 350 + }, + { + "epoch": 0.20242214532871972, + "grad_norm": 0.5596436262130737, + "learning_rate": 0.0001867231094862421, + "loss": 0.837221622467041, + "step": 351 + }, + { + "epoch": 0.2029988465974625, + "grad_norm": 0.5023308992385864, + "learning_rate": 0.00018668462574562248, + "loss": 0.7079763412475586, + "step": 352 + }, + { + "epoch": 0.2035755478662053, + "grad_norm": 0.7946610450744629, + "learning_rate": 0.0001866461420050029, + "loss": 1.3043620586395264, + "step": 353 + }, + { + "epoch": 0.2041522491349481, + "grad_norm": 0.8124772310256958, + "learning_rate": 0.0001866076582643833, + "loss": 1.1276662349700928, + "step": 354 + }, + { + "epoch": 0.20472895040369088, + "grad_norm": 0.5195242166519165, + "learning_rate": 0.00018656917452376372, + "loss": 0.737315833568573, + "step": 355 + }, + { + "epoch": 0.20530565167243367, + "grad_norm": 0.7146646976470947, + "learning_rate": 0.00018653069078314414, + "loss": 1.0838680267333984, + "step": 356 + }, + { + "epoch": 0.20588235294117646, + "grad_norm": 0.7928506135940552, + "learning_rate": 0.00018649220704252455, + "loss": 1.2697861194610596, + "step": 357 + }, + { + "epoch": 0.20645905420991925, + "grad_norm": 0.6152468919754028, + "learning_rate": 0.00018645372330190497, + "loss": 0.9355758428573608, + "step": 358 + }, + { + "epoch": 0.20703575547866204, + "grad_norm": 1.0809266567230225, + "learning_rate": 0.00018641523956128538, + "loss": 1.9420266151428223, + "step": 359 + }, + { + "epoch": 0.20761245674740483, + "grad_norm": 0.59016352891922, + "learning_rate": 0.00018637675582066577, + "loss": 0.9944459199905396, + "step": 360 + }, + { + "epoch": 0.20818915801614762, + "grad_norm": 0.7870339751243591, + "learning_rate": 0.0001863382720800462, + "loss": 1.0614302158355713, + "step": 361 + }, + { + "epoch": 0.20876585928489041, + "grad_norm": 0.7203708291053772, + "learning_rate": 0.0001862997883394266, + "loss": 0.9602723717689514, + "step": 362 + }, + { + "epoch": 0.2093425605536332, + "grad_norm": 0.532341480255127, + "learning_rate": 0.00018626130459880702, + "loss": 0.8718068599700928, + "step": 363 + }, + { + "epoch": 0.209919261822376, + "grad_norm": 0.9565883278846741, + "learning_rate": 0.00018622282085818743, + "loss": 1.278198480606079, + "step": 364 + }, + { + "epoch": 0.2104959630911188, + "grad_norm": 0.7197461724281311, + "learning_rate": 0.00018618433711756785, + "loss": 1.3148860931396484, + "step": 365 + }, + { + "epoch": 0.21107266435986158, + "grad_norm": 0.6119058728218079, + "learning_rate": 0.00018614585337694826, + "loss": 0.9266935586929321, + "step": 366 + }, + { + "epoch": 0.2116493656286044, + "grad_norm": 0.9047015309333801, + "learning_rate": 0.00018610736963632868, + "loss": 1.1473264694213867, + "step": 367 + }, + { + "epoch": 0.2122260668973472, + "grad_norm": 0.6796925663948059, + "learning_rate": 0.00018606888589570907, + "loss": 1.0393201112747192, + "step": 368 + }, + { + "epoch": 0.21280276816608998, + "grad_norm": 0.6059300303459167, + "learning_rate": 0.00018603040215508948, + "loss": 1.001380443572998, + "step": 369 + }, + { + "epoch": 0.21337946943483277, + "grad_norm": 0.6669148206710815, + "learning_rate": 0.0001859919184144699, + "loss": 0.8133573532104492, + "step": 370 + }, + { + "epoch": 0.21395617070357556, + "grad_norm": 0.6025424003601074, + "learning_rate": 0.0001859534346738503, + "loss": 0.9277598261833191, + "step": 371 + }, + { + "epoch": 0.21453287197231835, + "grad_norm": 0.8728757500648499, + "learning_rate": 0.00018591495093323073, + "loss": 1.1496421098709106, + "step": 372 + }, + { + "epoch": 0.21510957324106114, + "grad_norm": 0.587089478969574, + "learning_rate": 0.00018587646719261114, + "loss": 0.8672431707382202, + "step": 373 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.7482187747955322, + "learning_rate": 0.00018583798345199156, + "loss": 1.0713750123977661, + "step": 374 + }, + { + "epoch": 0.21626297577854672, + "grad_norm": 0.8591217398643494, + "learning_rate": 0.00018579949971137197, + "loss": 1.4045636653900146, + "step": 375 + }, + { + "epoch": 0.21683967704728951, + "grad_norm": 0.7630711793899536, + "learning_rate": 0.00018576101597075236, + "loss": 0.9842856526374817, + "step": 376 + }, + { + "epoch": 0.2174163783160323, + "grad_norm": 1.2762526273727417, + "learning_rate": 0.00018572253223013278, + "loss": 1.5381450653076172, + "step": 377 + }, + { + "epoch": 0.2179930795847751, + "grad_norm": 0.7234092950820923, + "learning_rate": 0.0001856840484895132, + "loss": 1.0782972574234009, + "step": 378 + }, + { + "epoch": 0.2185697808535179, + "grad_norm": 0.8868815898895264, + "learning_rate": 0.0001856455647488936, + "loss": 0.9910011291503906, + "step": 379 + }, + { + "epoch": 0.21914648212226068, + "grad_norm": 0.5880477428436279, + "learning_rate": 0.00018560708100827402, + "loss": 0.9178383946418762, + "step": 380 + }, + { + "epoch": 0.21972318339100347, + "grad_norm": 0.7115210294723511, + "learning_rate": 0.00018556859726765444, + "loss": 1.3695993423461914, + "step": 381 + }, + { + "epoch": 0.22029988465974626, + "grad_norm": 0.9036445617675781, + "learning_rate": 0.00018553011352703485, + "loss": 1.049261212348938, + "step": 382 + }, + { + "epoch": 0.22087658592848905, + "grad_norm": 1.044411540031433, + "learning_rate": 0.00018549162978641527, + "loss": 1.272240400314331, + "step": 383 + }, + { + "epoch": 0.22145328719723184, + "grad_norm": 0.6363574862480164, + "learning_rate": 0.00018545314604579566, + "loss": 1.0237360000610352, + "step": 384 + }, + { + "epoch": 0.22202998846597463, + "grad_norm": 0.7671105861663818, + "learning_rate": 0.00018541466230517607, + "loss": 0.9970401525497437, + "step": 385 + }, + { + "epoch": 0.22260668973471742, + "grad_norm": 1.170229434967041, + "learning_rate": 0.0001853761785645565, + "loss": 1.5654575824737549, + "step": 386 + }, + { + "epoch": 0.2231833910034602, + "grad_norm": 0.9486715793609619, + "learning_rate": 0.0001853376948239369, + "loss": 1.8445625305175781, + "step": 387 + }, + { + "epoch": 0.223760092272203, + "grad_norm": 0.7049561142921448, + "learning_rate": 0.00018529921108331732, + "loss": 1.147915005683899, + "step": 388 + }, + { + "epoch": 0.2243367935409458, + "grad_norm": 0.7626886963844299, + "learning_rate": 0.00018526072734269773, + "loss": 0.9354770183563232, + "step": 389 + }, + { + "epoch": 0.22491349480968859, + "grad_norm": 0.8018368482589722, + "learning_rate": 0.00018522224360207815, + "loss": 1.0617220401763916, + "step": 390 + }, + { + "epoch": 0.22549019607843138, + "grad_norm": 0.7590807676315308, + "learning_rate": 0.00018518375986145854, + "loss": 0.9120303988456726, + "step": 391 + }, + { + "epoch": 0.22606689734717417, + "grad_norm": 0.6623148918151855, + "learning_rate": 0.00018514527612083895, + "loss": 0.7569756507873535, + "step": 392 + }, + { + "epoch": 0.22664359861591696, + "grad_norm": 0.5547282099723816, + "learning_rate": 0.00018510679238021937, + "loss": 0.7989190816879272, + "step": 393 + }, + { + "epoch": 0.22722029988465975, + "grad_norm": 0.5765286087989807, + "learning_rate": 0.00018506830863959978, + "loss": 0.6133571863174438, + "step": 394 + }, + { + "epoch": 0.22779700115340254, + "grad_norm": 0.8331816792488098, + "learning_rate": 0.0001850298248989802, + "loss": 1.1577847003936768, + "step": 395 + }, + { + "epoch": 0.22837370242214533, + "grad_norm": 0.7655069231987, + "learning_rate": 0.0001849913411583606, + "loss": 1.0809553861618042, + "step": 396 + }, + { + "epoch": 0.22895040369088812, + "grad_norm": 0.7397854924201965, + "learning_rate": 0.00018495285741774103, + "loss": 0.9830250144004822, + "step": 397 + }, + { + "epoch": 0.2295271049596309, + "grad_norm": 0.6970857381820679, + "learning_rate": 0.00018491437367712144, + "loss": 0.8101853132247925, + "step": 398 + }, + { + "epoch": 0.2301038062283737, + "grad_norm": 0.5724602937698364, + "learning_rate": 0.00018487588993650183, + "loss": 0.70196932554245, + "step": 399 + }, + { + "epoch": 0.2306805074971165, + "grad_norm": 0.9593637585639954, + "learning_rate": 0.00018483740619588225, + "loss": 0.9378552436828613, + "step": 400 + }, + { + "epoch": 0.23125720876585928, + "grad_norm": 0.7079650163650513, + "learning_rate": 0.00018479892245526266, + "loss": 0.8764985799789429, + "step": 401 + }, + { + "epoch": 0.23183391003460208, + "grad_norm": 0.7374391555786133, + "learning_rate": 0.00018476043871464308, + "loss": 0.8556146025657654, + "step": 402 + }, + { + "epoch": 0.23241061130334487, + "grad_norm": 0.6992713809013367, + "learning_rate": 0.0001847219549740235, + "loss": 0.9657334089279175, + "step": 403 + }, + { + "epoch": 0.23298731257208766, + "grad_norm": 0.8299751281738281, + "learning_rate": 0.0001846834712334039, + "loss": 1.2171483039855957, + "step": 404 + }, + { + "epoch": 0.23356401384083045, + "grad_norm": 0.5866743922233582, + "learning_rate": 0.00018464498749278432, + "loss": 0.9809523820877075, + "step": 405 + }, + { + "epoch": 0.23414071510957324, + "grad_norm": 0.8412980437278748, + "learning_rate": 0.00018460650375216474, + "loss": 1.1848514080047607, + "step": 406 + }, + { + "epoch": 0.23471741637831603, + "grad_norm": 0.7566470503807068, + "learning_rate": 0.00018456802001154513, + "loss": 1.0939483642578125, + "step": 407 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.787800669670105, + "learning_rate": 0.00018452953627092554, + "loss": 1.2347867488861084, + "step": 408 + }, + { + "epoch": 0.2358708189158016, + "grad_norm": 0.8511201739311218, + "learning_rate": 0.00018449105253030596, + "loss": 0.9385696053504944, + "step": 409 + }, + { + "epoch": 0.2364475201845444, + "grad_norm": 0.9360937476158142, + "learning_rate": 0.00018445256878968637, + "loss": 1.3519483804702759, + "step": 410 + }, + { + "epoch": 0.2370242214532872, + "grad_norm": 0.556093692779541, + "learning_rate": 0.0001844140850490668, + "loss": 0.8482391238212585, + "step": 411 + }, + { + "epoch": 0.23760092272202998, + "grad_norm": 0.6390929818153381, + "learning_rate": 0.0001843756013084472, + "loss": 1.0374037027359009, + "step": 412 + }, + { + "epoch": 0.23817762399077277, + "grad_norm": 0.5385326743125916, + "learning_rate": 0.00018433711756782762, + "loss": 0.8951395750045776, + "step": 413 + }, + { + "epoch": 0.23875432525951557, + "grad_norm": 0.7417898774147034, + "learning_rate": 0.00018429863382720803, + "loss": 1.1854356527328491, + "step": 414 + }, + { + "epoch": 0.23933102652825836, + "grad_norm": 0.7092972993850708, + "learning_rate": 0.00018426015008658842, + "loss": 1.2556312084197998, + "step": 415 + }, + { + "epoch": 0.23990772779700115, + "grad_norm": 0.6026037931442261, + "learning_rate": 0.00018422166634596884, + "loss": 0.8205006718635559, + "step": 416 + }, + { + "epoch": 0.24048442906574394, + "grad_norm": 0.7460249662399292, + "learning_rate": 0.00018418318260534925, + "loss": 0.9955434203147888, + "step": 417 + }, + { + "epoch": 0.24106113033448673, + "grad_norm": 0.6313579082489014, + "learning_rate": 0.00018414469886472967, + "loss": 1.15024995803833, + "step": 418 + }, + { + "epoch": 0.24163783160322952, + "grad_norm": 0.7596423029899597, + "learning_rate": 0.00018410621512411008, + "loss": 1.196816325187683, + "step": 419 + }, + { + "epoch": 0.2422145328719723, + "grad_norm": 0.7336683869361877, + "learning_rate": 0.0001840677313834905, + "loss": 1.0791605710983276, + "step": 420 + }, + { + "epoch": 0.2427912341407151, + "grad_norm": 0.6802041530609131, + "learning_rate": 0.00018402924764287091, + "loss": 0.8439788222312927, + "step": 421 + }, + { + "epoch": 0.2433679354094579, + "grad_norm": 0.9311268329620361, + "learning_rate": 0.00018399076390225133, + "loss": 1.4188232421875, + "step": 422 + }, + { + "epoch": 0.24394463667820068, + "grad_norm": 0.9715989232063293, + "learning_rate": 0.00018395228016163172, + "loss": 1.149898648262024, + "step": 423 + }, + { + "epoch": 0.24452133794694347, + "grad_norm": 0.6722977161407471, + "learning_rate": 0.00018391379642101213, + "loss": 1.0626373291015625, + "step": 424 + }, + { + "epoch": 0.24509803921568626, + "grad_norm": 0.9417729377746582, + "learning_rate": 0.00018387531268039255, + "loss": 1.277899980545044, + "step": 425 + }, + { + "epoch": 0.24567474048442905, + "grad_norm": 0.8700136542320251, + "learning_rate": 0.00018383682893977296, + "loss": 1.106884479522705, + "step": 426 + }, + { + "epoch": 0.24625144175317185, + "grad_norm": 0.71380615234375, + "learning_rate": 0.00018379834519915338, + "loss": 1.1928266286849976, + "step": 427 + }, + { + "epoch": 0.24682814302191464, + "grad_norm": 0.7276275157928467, + "learning_rate": 0.0001837598614585338, + "loss": 1.2448585033416748, + "step": 428 + }, + { + "epoch": 0.24740484429065743, + "grad_norm": 0.8795212507247925, + "learning_rate": 0.0001837213777179142, + "loss": 1.317166805267334, + "step": 429 + }, + { + "epoch": 0.24798154555940022, + "grad_norm": 0.9904524087905884, + "learning_rate": 0.00018368289397729462, + "loss": 1.166348934173584, + "step": 430 + }, + { + "epoch": 0.248558246828143, + "grad_norm": 0.7632173299789429, + "learning_rate": 0.000183644410236675, + "loss": 1.5664170980453491, + "step": 431 + }, + { + "epoch": 0.2491349480968858, + "grad_norm": 0.8291054964065552, + "learning_rate": 0.00018360592649605543, + "loss": 1.4953291416168213, + "step": 432 + }, + { + "epoch": 0.2497116493656286, + "grad_norm": 0.6445023417472839, + "learning_rate": 0.00018356744275543584, + "loss": 0.8673335313796997, + "step": 433 + }, + { + "epoch": 0.2502883506343714, + "grad_norm": 1.2072186470031738, + "learning_rate": 0.00018352895901481626, + "loss": 1.59421968460083, + "step": 434 + }, + { + "epoch": 0.2508650519031142, + "grad_norm": 0.7409680485725403, + "learning_rate": 0.00018349047527419667, + "loss": 1.0224432945251465, + "step": 435 + }, + { + "epoch": 0.25144175317185696, + "grad_norm": 0.8207524418830872, + "learning_rate": 0.0001834519915335771, + "loss": 1.276658058166504, + "step": 436 + }, + { + "epoch": 0.2520184544405998, + "grad_norm": 0.8591949343681335, + "learning_rate": 0.0001834135077929575, + "loss": 1.1319093704223633, + "step": 437 + }, + { + "epoch": 0.25259515570934254, + "grad_norm": 0.6689372658729553, + "learning_rate": 0.00018337502405233792, + "loss": 0.9691576361656189, + "step": 438 + }, + { + "epoch": 0.25317185697808536, + "grad_norm": 0.9033296704292297, + "learning_rate": 0.0001833365403117183, + "loss": 1.4272680282592773, + "step": 439 + }, + { + "epoch": 0.2537485582468281, + "grad_norm": 0.6959604620933533, + "learning_rate": 0.0001832980565710987, + "loss": 1.1449182033538818, + "step": 440 + }, + { + "epoch": 0.25432525951557095, + "grad_norm": 0.6695550680160522, + "learning_rate": 0.0001832595728304791, + "loss": 1.0492792129516602, + "step": 441 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.710794985294342, + "learning_rate": 0.00018322108908985953, + "loss": 0.9534090757369995, + "step": 442 + }, + { + "epoch": 0.2554786620530565, + "grad_norm": 0.6955594420433044, + "learning_rate": 0.00018318260534923994, + "loss": 0.8743690252304077, + "step": 443 + }, + { + "epoch": 0.2560553633217993, + "grad_norm": 0.6831961274147034, + "learning_rate": 0.00018314412160862036, + "loss": 1.3500818014144897, + "step": 444 + }, + { + "epoch": 0.2566320645905421, + "grad_norm": 0.7839577198028564, + "learning_rate": 0.00018310563786800077, + "loss": 1.0105950832366943, + "step": 445 + }, + { + "epoch": 0.25720876585928487, + "grad_norm": 0.8791704773902893, + "learning_rate": 0.0001830671541273812, + "loss": 1.2243623733520508, + "step": 446 + }, + { + "epoch": 0.2577854671280277, + "grad_norm": 0.7005860209465027, + "learning_rate": 0.0001830286703867616, + "loss": 1.077842354774475, + "step": 447 + }, + { + "epoch": 0.25836216839677045, + "grad_norm": 0.822964072227478, + "learning_rate": 0.000182990186646142, + "loss": 1.2265344858169556, + "step": 448 + }, + { + "epoch": 0.25893886966551327, + "grad_norm": 0.773158609867096, + "learning_rate": 0.0001829517029055224, + "loss": 0.8715431690216064, + "step": 449 + }, + { + "epoch": 0.25951557093425603, + "grad_norm": 0.8603456616401672, + "learning_rate": 0.00018291321916490282, + "loss": 0.9889146089553833, + "step": 450 + }, + { + "epoch": 0.26009227220299885, + "grad_norm": 0.8188443779945374, + "learning_rate": 0.00018287473542428324, + "loss": 0.8885264992713928, + "step": 451 + }, + { + "epoch": 0.2606689734717416, + "grad_norm": 1.0877407789230347, + "learning_rate": 0.00018283625168366365, + "loss": 1.0748121738433838, + "step": 452 + }, + { + "epoch": 0.26124567474048443, + "grad_norm": 0.5481402277946472, + "learning_rate": 0.00018279776794304407, + "loss": 0.807957649230957, + "step": 453 + }, + { + "epoch": 0.2618223760092272, + "grad_norm": 0.8591419458389282, + "learning_rate": 0.00018275928420242448, + "loss": 1.3057336807250977, + "step": 454 + }, + { + "epoch": 0.26239907727797, + "grad_norm": 0.7936019897460938, + "learning_rate": 0.0001827208004618049, + "loss": 1.185962200164795, + "step": 455 + }, + { + "epoch": 0.2629757785467128, + "grad_norm": 0.6581904888153076, + "learning_rate": 0.00018268231672118529, + "loss": 0.8275895118713379, + "step": 456 + }, + { + "epoch": 0.2635524798154556, + "grad_norm": 0.831302285194397, + "learning_rate": 0.0001826438329805657, + "loss": 1.299217939376831, + "step": 457 + }, + { + "epoch": 0.26412918108419836, + "grad_norm": 0.6771467924118042, + "learning_rate": 0.00018260534923994612, + "loss": 0.8427085876464844, + "step": 458 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 0.7914313077926636, + "learning_rate": 0.00018256686549932653, + "loss": 1.369484305381775, + "step": 459 + }, + { + "epoch": 0.26528258362168394, + "grad_norm": 0.5916578769683838, + "learning_rate": 0.00018252838175870695, + "loss": 0.6241229772567749, + "step": 460 + }, + { + "epoch": 0.26585928489042676, + "grad_norm": 0.6836418509483337, + "learning_rate": 0.00018248989801808736, + "loss": 0.8050651550292969, + "step": 461 + }, + { + "epoch": 0.2664359861591695, + "grad_norm": 0.7545502185821533, + "learning_rate": 0.00018245141427746778, + "loss": 0.7911585569381714, + "step": 462 + }, + { + "epoch": 0.26701268742791234, + "grad_norm": 0.6010773181915283, + "learning_rate": 0.0001824129305368482, + "loss": 1.1161192655563354, + "step": 463 + }, + { + "epoch": 0.2675893886966551, + "grad_norm": 0.813204824924469, + "learning_rate": 0.00018237444679622858, + "loss": 1.096695065498352, + "step": 464 + }, + { + "epoch": 0.2681660899653979, + "grad_norm": 0.91140216588974, + "learning_rate": 0.000182335963055609, + "loss": 1.4385195970535278, + "step": 465 + }, + { + "epoch": 0.2687427912341407, + "grad_norm": 0.9745720624923706, + "learning_rate": 0.0001822974793149894, + "loss": 1.3157883882522583, + "step": 466 + }, + { + "epoch": 0.2693194925028835, + "grad_norm": 0.4999851584434509, + "learning_rate": 0.00018225899557436983, + "loss": 0.6729867458343506, + "step": 467 + }, + { + "epoch": 0.2698961937716263, + "grad_norm": 0.9021291732788086, + "learning_rate": 0.00018222051183375024, + "loss": 1.0553233623504639, + "step": 468 + }, + { + "epoch": 0.2704728950403691, + "grad_norm": 0.8061716556549072, + "learning_rate": 0.00018218202809313066, + "loss": 1.3081198930740356, + "step": 469 + }, + { + "epoch": 0.2710495963091119, + "grad_norm": 0.6820981502532959, + "learning_rate": 0.00018214354435251107, + "loss": 0.9388906359672546, + "step": 470 + }, + { + "epoch": 0.27162629757785467, + "grad_norm": 1.0991320610046387, + "learning_rate": 0.0001821050606118915, + "loss": 1.528028964996338, + "step": 471 + }, + { + "epoch": 0.2722029988465975, + "grad_norm": 0.7934592962265015, + "learning_rate": 0.00018206657687127188, + "loss": 1.2054097652435303, + "step": 472 + }, + { + "epoch": 0.27277970011534025, + "grad_norm": 0.7113450765609741, + "learning_rate": 0.0001820280931306523, + "loss": 1.0254576206207275, + "step": 473 + }, + { + "epoch": 0.27335640138408307, + "grad_norm": 0.7593767046928406, + "learning_rate": 0.0001819896093900327, + "loss": 1.284333348274231, + "step": 474 + }, + { + "epoch": 0.27393310265282583, + "grad_norm": 1.006116509437561, + "learning_rate": 0.00018195112564941312, + "loss": 1.3650097846984863, + "step": 475 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.8706763982772827, + "learning_rate": 0.00018191264190879354, + "loss": 1.6067880392074585, + "step": 476 + }, + { + "epoch": 0.2750865051903114, + "grad_norm": 0.7428901195526123, + "learning_rate": 0.00018187415816817395, + "loss": 1.373342514038086, + "step": 477 + }, + { + "epoch": 0.27566320645905423, + "grad_norm": 0.8846433162689209, + "learning_rate": 0.00018183567442755437, + "loss": 1.5520777702331543, + "step": 478 + }, + { + "epoch": 0.276239907727797, + "grad_norm": 0.8808581829071045, + "learning_rate": 0.00018179719068693478, + "loss": 1.1342291831970215, + "step": 479 + }, + { + "epoch": 0.2768166089965398, + "grad_norm": 0.7310512065887451, + "learning_rate": 0.00018175870694631517, + "loss": 0.7762906551361084, + "step": 480 + }, + { + "epoch": 0.2773933102652826, + "grad_norm": 0.8467727303504944, + "learning_rate": 0.0001817202232056956, + "loss": 0.990180253982544, + "step": 481 + }, + { + "epoch": 0.2779700115340254, + "grad_norm": 0.642230212688446, + "learning_rate": 0.000181681739465076, + "loss": 0.845292329788208, + "step": 482 + }, + { + "epoch": 0.27854671280276816, + "grad_norm": 0.7775582075119019, + "learning_rate": 0.00018164325572445642, + "loss": 1.279380202293396, + "step": 483 + }, + { + "epoch": 0.279123414071511, + "grad_norm": 0.6477130651473999, + "learning_rate": 0.00018160477198383683, + "loss": 0.8197907209396362, + "step": 484 + }, + { + "epoch": 0.27970011534025374, + "grad_norm": 0.6508778929710388, + "learning_rate": 0.00018156628824321725, + "loss": 0.9538026452064514, + "step": 485 + }, + { + "epoch": 0.28027681660899656, + "grad_norm": 0.9379159212112427, + "learning_rate": 0.00018152780450259766, + "loss": 1.2874410152435303, + "step": 486 + }, + { + "epoch": 0.2808535178777393, + "grad_norm": 0.8014243245124817, + "learning_rate": 0.00018148932076197808, + "loss": 1.364856481552124, + "step": 487 + }, + { + "epoch": 0.28143021914648214, + "grad_norm": 1.0049822330474854, + "learning_rate": 0.00018145083702135847, + "loss": 1.3461369276046753, + "step": 488 + }, + { + "epoch": 0.2820069204152249, + "grad_norm": 0.8764071464538574, + "learning_rate": 0.00018141235328073888, + "loss": 1.549091100692749, + "step": 489 + }, + { + "epoch": 0.2825836216839677, + "grad_norm": 0.6743770241737366, + "learning_rate": 0.0001813738695401193, + "loss": 0.8718385696411133, + "step": 490 + }, + { + "epoch": 0.2831603229527105, + "grad_norm": 0.8501721024513245, + "learning_rate": 0.0001813353857994997, + "loss": 0.9592713117599487, + "step": 491 + }, + { + "epoch": 0.2837370242214533, + "grad_norm": 0.6727166771888733, + "learning_rate": 0.00018129690205888013, + "loss": 1.0024611949920654, + "step": 492 + }, + { + "epoch": 0.28431372549019607, + "grad_norm": 0.7949026226997375, + "learning_rate": 0.00018125841831826054, + "loss": 0.889624297618866, + "step": 493 + }, + { + "epoch": 0.2848904267589389, + "grad_norm": 0.8814200758934021, + "learning_rate": 0.00018121993457764096, + "loss": 1.7483818531036377, + "step": 494 + }, + { + "epoch": 0.28546712802768165, + "grad_norm": 0.6116936206817627, + "learning_rate": 0.00018118145083702137, + "loss": 1.097643256187439, + "step": 495 + }, + { + "epoch": 0.28604382929642447, + "grad_norm": 0.6951889395713806, + "learning_rate": 0.00018114296709640176, + "loss": 0.9292160272598267, + "step": 496 + }, + { + "epoch": 0.28662053056516723, + "grad_norm": 0.9138390421867371, + "learning_rate": 0.00018110448335578218, + "loss": 1.174808144569397, + "step": 497 + }, + { + "epoch": 0.28719723183391005, + "grad_norm": 0.6442549824714661, + "learning_rate": 0.0001810659996151626, + "loss": 0.9390018582344055, + "step": 498 + }, + { + "epoch": 0.2877739331026528, + "grad_norm": 0.9683842658996582, + "learning_rate": 0.000181027515874543, + "loss": 1.4045450687408447, + "step": 499 + }, + { + "epoch": 0.28835063437139563, + "grad_norm": 0.7444068193435669, + "learning_rate": 0.00018098903213392342, + "loss": 0.9792321920394897, + "step": 500 + }, + { + "epoch": 0.2889273356401384, + "grad_norm": 0.7402380108833313, + "learning_rate": 0.00018095054839330384, + "loss": 1.231440782546997, + "step": 501 + }, + { + "epoch": 0.2895040369088812, + "grad_norm": 0.7022894024848938, + "learning_rate": 0.00018091206465268425, + "loss": 0.856300950050354, + "step": 502 + }, + { + "epoch": 0.290080738177624, + "grad_norm": 0.7641032338142395, + "learning_rate": 0.00018087358091206467, + "loss": 0.9729149341583252, + "step": 503 + }, + { + "epoch": 0.2906574394463668, + "grad_norm": 0.9500510096549988, + "learning_rate": 0.00018083509717144506, + "loss": 1.2449204921722412, + "step": 504 + }, + { + "epoch": 0.29123414071510956, + "grad_norm": 0.6954758763313293, + "learning_rate": 0.00018079661343082547, + "loss": 0.8000816106796265, + "step": 505 + }, + { + "epoch": 0.2918108419838524, + "grad_norm": 0.7313628196716309, + "learning_rate": 0.0001807581296902059, + "loss": 1.233512282371521, + "step": 506 + }, + { + "epoch": 0.29238754325259514, + "grad_norm": 0.8792680501937866, + "learning_rate": 0.0001807196459495863, + "loss": 1.092308521270752, + "step": 507 + }, + { + "epoch": 0.29296424452133796, + "grad_norm": 0.6230028867721558, + "learning_rate": 0.00018068116220896672, + "loss": 0.7719423174858093, + "step": 508 + }, + { + "epoch": 0.2935409457900807, + "grad_norm": 0.8965409398078918, + "learning_rate": 0.00018064267846834713, + "loss": 1.576930284500122, + "step": 509 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.756908118724823, + "learning_rate": 0.00018060419472772755, + "loss": 0.9762069582939148, + "step": 510 + }, + { + "epoch": 0.2946943483275663, + "grad_norm": 0.7524373531341553, + "learning_rate": 0.00018056571098710794, + "loss": 0.9206646680831909, + "step": 511 + }, + { + "epoch": 0.2952710495963091, + "grad_norm": 0.9292136430740356, + "learning_rate": 0.00018052722724648835, + "loss": 1.534470558166504, + "step": 512 + }, + { + "epoch": 0.2958477508650519, + "grad_norm": 1.0442750453948975, + "learning_rate": 0.00018048874350586877, + "loss": 1.2520341873168945, + "step": 513 + }, + { + "epoch": 0.2964244521337947, + "grad_norm": 0.8131316900253296, + "learning_rate": 0.00018045025976524918, + "loss": 1.5056309700012207, + "step": 514 + }, + { + "epoch": 0.29700115340253747, + "grad_norm": 0.7711693048477173, + "learning_rate": 0.0001804117760246296, + "loss": 1.2189143896102905, + "step": 515 + }, + { + "epoch": 0.2975778546712803, + "grad_norm": 0.6610523462295532, + "learning_rate": 0.00018037329228401, + "loss": 1.1120340824127197, + "step": 516 + }, + { + "epoch": 0.29815455594002305, + "grad_norm": 0.7343090772628784, + "learning_rate": 0.00018033480854339043, + "loss": 1.0496878623962402, + "step": 517 + }, + { + "epoch": 0.29873125720876587, + "grad_norm": 0.6952423453330994, + "learning_rate": 0.00018029632480277084, + "loss": 1.0725046396255493, + "step": 518 + }, + { + "epoch": 0.29930795847750863, + "grad_norm": 1.0385462045669556, + "learning_rate": 0.00018025784106215123, + "loss": 1.3104898929595947, + "step": 519 + }, + { + "epoch": 0.29988465974625145, + "grad_norm": 0.6035030484199524, + "learning_rate": 0.00018021935732153165, + "loss": 0.7342404127120972, + "step": 520 + }, + { + "epoch": 0.3004613610149942, + "grad_norm": 0.5726889371871948, + "learning_rate": 0.00018018087358091206, + "loss": 0.9352455139160156, + "step": 521 + }, + { + "epoch": 0.30103806228373703, + "grad_norm": 0.5148364305496216, + "learning_rate": 0.00018014238984029248, + "loss": 0.8527913093566895, + "step": 522 + }, + { + "epoch": 0.3016147635524798, + "grad_norm": 0.8307221531867981, + "learning_rate": 0.0001801039060996729, + "loss": 1.180746078491211, + "step": 523 + }, + { + "epoch": 0.3021914648212226, + "grad_norm": 0.8560492396354675, + "learning_rate": 0.0001800654223590533, + "loss": 1.4329997301101685, + "step": 524 + }, + { + "epoch": 0.3027681660899654, + "grad_norm": 0.5972908139228821, + "learning_rate": 0.00018002693861843372, + "loss": 0.7385514974594116, + "step": 525 + }, + { + "epoch": 0.3033448673587082, + "grad_norm": 0.5159963965415955, + "learning_rate": 0.00017998845487781414, + "loss": 0.646453320980072, + "step": 526 + }, + { + "epoch": 0.30392156862745096, + "grad_norm": 0.9237578511238098, + "learning_rate": 0.00017994997113719453, + "loss": 1.442482590675354, + "step": 527 + }, + { + "epoch": 0.3044982698961938, + "grad_norm": 0.9341033697128296, + "learning_rate": 0.00017991148739657494, + "loss": 1.3850878477096558, + "step": 528 + }, + { + "epoch": 0.30507497116493654, + "grad_norm": 0.5422039031982422, + "learning_rate": 0.00017987300365595536, + "loss": 0.6736562252044678, + "step": 529 + }, + { + "epoch": 0.30565167243367936, + "grad_norm": 0.6220455765724182, + "learning_rate": 0.00017983451991533577, + "loss": 0.7528645992279053, + "step": 530 + }, + { + "epoch": 0.3062283737024221, + "grad_norm": 0.8073663115501404, + "learning_rate": 0.0001797960361747162, + "loss": 1.2123267650604248, + "step": 531 + }, + { + "epoch": 0.30680507497116494, + "grad_norm": 0.5491252541542053, + "learning_rate": 0.0001797575524340966, + "loss": 0.5903505086898804, + "step": 532 + }, + { + "epoch": 0.3073817762399077, + "grad_norm": 1.9019479751586914, + "learning_rate": 0.00017971906869347702, + "loss": 1.4316587448120117, + "step": 533 + }, + { + "epoch": 0.3079584775086505, + "grad_norm": 0.45649734139442444, + "learning_rate": 0.00017968058495285743, + "loss": 0.659195065498352, + "step": 534 + }, + { + "epoch": 0.30853517877739334, + "grad_norm": 0.7406135201454163, + "learning_rate": 0.00017964210121223782, + "loss": 1.0346477031707764, + "step": 535 + }, + { + "epoch": 0.3091118800461361, + "grad_norm": 0.9768670201301575, + "learning_rate": 0.00017960361747161824, + "loss": 1.584676742553711, + "step": 536 + }, + { + "epoch": 0.3096885813148789, + "grad_norm": 0.7869756817817688, + "learning_rate": 0.00017956513373099865, + "loss": 1.0404967069625854, + "step": 537 + }, + { + "epoch": 0.3102652825836217, + "grad_norm": 0.6868966221809387, + "learning_rate": 0.00017952664999037907, + "loss": 0.8878238201141357, + "step": 538 + }, + { + "epoch": 0.3108419838523645, + "grad_norm": 0.7594157457351685, + "learning_rate": 0.00017948816624975948, + "loss": 1.0191287994384766, + "step": 539 + }, + { + "epoch": 0.31141868512110726, + "grad_norm": 0.8346229195594788, + "learning_rate": 0.0001794496825091399, + "loss": 1.021256923675537, + "step": 540 + }, + { + "epoch": 0.3119953863898501, + "grad_norm": 1.0493948459625244, + "learning_rate": 0.00017941119876852031, + "loss": 1.0015616416931152, + "step": 541 + }, + { + "epoch": 0.31257208765859285, + "grad_norm": 0.62034010887146, + "learning_rate": 0.00017937271502790073, + "loss": 0.9237149357795715, + "step": 542 + }, + { + "epoch": 0.31314878892733566, + "grad_norm": 0.7169587016105652, + "learning_rate": 0.00017933423128728112, + "loss": 0.8658795356750488, + "step": 543 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.7205992341041565, + "learning_rate": 0.00017929574754666153, + "loss": 1.1227588653564453, + "step": 544 + }, + { + "epoch": 0.31430219146482125, + "grad_norm": 0.7573957443237305, + "learning_rate": 0.00017925726380604195, + "loss": 0.9638352394104004, + "step": 545 + }, + { + "epoch": 0.314878892733564, + "grad_norm": 0.981253981590271, + "learning_rate": 0.00017921878006542236, + "loss": 1.0400216579437256, + "step": 546 + }, + { + "epoch": 0.3154555940023068, + "grad_norm": 0.6763452291488647, + "learning_rate": 0.00017918029632480278, + "loss": 1.0069935321807861, + "step": 547 + }, + { + "epoch": 0.3160322952710496, + "grad_norm": 0.5641304850578308, + "learning_rate": 0.0001791418125841832, + "loss": 0.7099517583847046, + "step": 548 + }, + { + "epoch": 0.3166089965397924, + "grad_norm": 0.542838454246521, + "learning_rate": 0.0001791033288435636, + "loss": 0.7347281575202942, + "step": 549 + }, + { + "epoch": 0.31718569780853517, + "grad_norm": 0.6865650415420532, + "learning_rate": 0.00017906484510294402, + "loss": 0.9269914031028748, + "step": 550 + }, + { + "epoch": 0.317762399077278, + "grad_norm": 0.6794233322143555, + "learning_rate": 0.0001790263613623244, + "loss": 0.8624827861785889, + "step": 551 + }, + { + "epoch": 0.31833910034602075, + "grad_norm": 0.9417468905448914, + "learning_rate": 0.00017898787762170483, + "loss": 1.2194072008132935, + "step": 552 + }, + { + "epoch": 0.31891580161476357, + "grad_norm": 0.8551915287971497, + "learning_rate": 0.00017894939388108524, + "loss": 1.1121107339859009, + "step": 553 + }, + { + "epoch": 0.31949250288350634, + "grad_norm": 1.0210304260253906, + "learning_rate": 0.00017891091014046566, + "loss": 1.3061752319335938, + "step": 554 + }, + { + "epoch": 0.32006920415224915, + "grad_norm": 0.9833082556724548, + "learning_rate": 0.00017887242639984607, + "loss": 1.3157097101211548, + "step": 555 + }, + { + "epoch": 0.3206459054209919, + "grad_norm": 0.8534771203994751, + "learning_rate": 0.0001788339426592265, + "loss": 1.1443736553192139, + "step": 556 + }, + { + "epoch": 0.32122260668973474, + "grad_norm": 0.5206373929977417, + "learning_rate": 0.0001787954589186069, + "loss": 0.9210702776908875, + "step": 557 + }, + { + "epoch": 0.3217993079584775, + "grad_norm": 0.9890329837799072, + "learning_rate": 0.00017875697517798732, + "loss": 1.1474642753601074, + "step": 558 + }, + { + "epoch": 0.3223760092272203, + "grad_norm": 1.033987045288086, + "learning_rate": 0.0001787184914373677, + "loss": 1.3469852209091187, + "step": 559 + }, + { + "epoch": 0.3229527104959631, + "grad_norm": 0.5397274494171143, + "learning_rate": 0.00017868000769674812, + "loss": 0.8606307506561279, + "step": 560 + }, + { + "epoch": 0.3235294117647059, + "grad_norm": 0.7607125639915466, + "learning_rate": 0.00017864152395612854, + "loss": 1.5313308238983154, + "step": 561 + }, + { + "epoch": 0.32410611303344866, + "grad_norm": 0.8187709450721741, + "learning_rate": 0.00017860304021550895, + "loss": 1.2671842575073242, + "step": 562 + }, + { + "epoch": 0.3246828143021915, + "grad_norm": 0.8652257919311523, + "learning_rate": 0.00017856455647488937, + "loss": 1.0011459589004517, + "step": 563 + }, + { + "epoch": 0.32525951557093424, + "grad_norm": 0.8205957412719727, + "learning_rate": 0.00017852607273426978, + "loss": 0.9995499849319458, + "step": 564 + }, + { + "epoch": 0.32583621683967706, + "grad_norm": 0.8630533814430237, + "learning_rate": 0.0001784875889936502, + "loss": 1.119580864906311, + "step": 565 + }, + { + "epoch": 0.3264129181084198, + "grad_norm": 0.6678904294967651, + "learning_rate": 0.00017844910525303061, + "loss": 0.9301247596740723, + "step": 566 + }, + { + "epoch": 0.32698961937716264, + "grad_norm": 0.7211806774139404, + "learning_rate": 0.000178410621512411, + "loss": 1.3346351385116577, + "step": 567 + }, + { + "epoch": 0.3275663206459054, + "grad_norm": 0.6392566561698914, + "learning_rate": 0.00017837213777179142, + "loss": 0.6997557878494263, + "step": 568 + }, + { + "epoch": 0.3281430219146482, + "grad_norm": 0.8357546329498291, + "learning_rate": 0.00017833365403117183, + "loss": 1.3044462203979492, + "step": 569 + }, + { + "epoch": 0.328719723183391, + "grad_norm": 0.7778827548027039, + "learning_rate": 0.00017829517029055225, + "loss": 0.9234685897827148, + "step": 570 + }, + { + "epoch": 0.3292964244521338, + "grad_norm": 0.7168182730674744, + "learning_rate": 0.00017825668654993266, + "loss": 1.532446265220642, + "step": 571 + }, + { + "epoch": 0.32987312572087657, + "grad_norm": 1.016398549079895, + "learning_rate": 0.00017821820280931308, + "loss": 1.4056748151779175, + "step": 572 + }, + { + "epoch": 0.3304498269896194, + "grad_norm": 0.8056113719940186, + "learning_rate": 0.0001781797190686935, + "loss": 1.0595710277557373, + "step": 573 + }, + { + "epoch": 0.33102652825836215, + "grad_norm": 0.6588327884674072, + "learning_rate": 0.0001781412353280739, + "loss": 0.849087655544281, + "step": 574 + }, + { + "epoch": 0.33160322952710497, + "grad_norm": 0.7659177184104919, + "learning_rate": 0.0001781027515874543, + "loss": 1.1442945003509521, + "step": 575 + }, + { + "epoch": 0.33217993079584773, + "grad_norm": 0.8960584402084351, + "learning_rate": 0.0001780642678468347, + "loss": 1.2777467966079712, + "step": 576 + }, + { + "epoch": 0.33275663206459055, + "grad_norm": 0.8990175724029541, + "learning_rate": 0.00017802578410621513, + "loss": 1.0199333429336548, + "step": 577 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7010089159011841, + "learning_rate": 0.00017798730036559554, + "loss": 1.2177313566207886, + "step": 578 + }, + { + "epoch": 0.33391003460207613, + "grad_norm": 0.8779993057250977, + "learning_rate": 0.00017794881662497596, + "loss": 0.8511064648628235, + "step": 579 + }, + { + "epoch": 0.3344867358708189, + "grad_norm": 0.8380318880081177, + "learning_rate": 0.00017791033288435637, + "loss": 1.0792275667190552, + "step": 580 + }, + { + "epoch": 0.3350634371395617, + "grad_norm": 0.7335569858551025, + "learning_rate": 0.0001778718491437368, + "loss": 1.0502758026123047, + "step": 581 + }, + { + "epoch": 0.3356401384083045, + "grad_norm": 0.7759366631507874, + "learning_rate": 0.0001778333654031172, + "loss": 1.000847578048706, + "step": 582 + }, + { + "epoch": 0.3362168396770473, + "grad_norm": 0.565648078918457, + "learning_rate": 0.0001777948816624976, + "loss": 0.7337483167648315, + "step": 583 + }, + { + "epoch": 0.33679354094579006, + "grad_norm": 0.8646697998046875, + "learning_rate": 0.000177756397921878, + "loss": 1.2806568145751953, + "step": 584 + }, + { + "epoch": 0.3373702422145329, + "grad_norm": 0.9556112289428711, + "learning_rate": 0.00017771791418125842, + "loss": 1.1648443937301636, + "step": 585 + }, + { + "epoch": 0.33794694348327564, + "grad_norm": 0.6629974842071533, + "learning_rate": 0.00017767943044063884, + "loss": 1.0415198802947998, + "step": 586 + }, + { + "epoch": 0.33852364475201846, + "grad_norm": 0.5972018837928772, + "learning_rate": 0.00017764094670001925, + "loss": 0.6916914582252502, + "step": 587 + }, + { + "epoch": 0.3391003460207612, + "grad_norm": 0.7391757965087891, + "learning_rate": 0.00017760246295939967, + "loss": 1.194846510887146, + "step": 588 + }, + { + "epoch": 0.33967704728950404, + "grad_norm": 0.7234671711921692, + "learning_rate": 0.00017756397921878008, + "loss": 0.9572672247886658, + "step": 589 + }, + { + "epoch": 0.3402537485582468, + "grad_norm": 0.6949688792228699, + "learning_rate": 0.0001775254954781605, + "loss": 0.9968490600585938, + "step": 590 + }, + { + "epoch": 0.3408304498269896, + "grad_norm": 0.9384737610816956, + "learning_rate": 0.0001774870117375409, + "loss": 1.106278896331787, + "step": 591 + }, + { + "epoch": 0.3414071510957324, + "grad_norm": 0.8691385388374329, + "learning_rate": 0.0001774485279969213, + "loss": 0.8517290353775024, + "step": 592 + }, + { + "epoch": 0.3419838523644752, + "grad_norm": 0.6864728331565857, + "learning_rate": 0.00017741004425630172, + "loss": 0.9280612468719482, + "step": 593 + }, + { + "epoch": 0.34256055363321797, + "grad_norm": 0.7656051516532898, + "learning_rate": 0.00017737156051568213, + "loss": 1.0975104570388794, + "step": 594 + }, + { + "epoch": 0.3431372549019608, + "grad_norm": 0.6587508916854858, + "learning_rate": 0.00017733307677506255, + "loss": 0.9575508236885071, + "step": 595 + }, + { + "epoch": 0.34371395617070355, + "grad_norm": 0.8466372489929199, + "learning_rate": 0.00017729459303444296, + "loss": 1.2343617677688599, + "step": 596 + }, + { + "epoch": 0.34429065743944637, + "grad_norm": 1.0839906930923462, + "learning_rate": 0.00017725610929382338, + "loss": 1.3552396297454834, + "step": 597 + }, + { + "epoch": 0.34486735870818913, + "grad_norm": 0.7300306558609009, + "learning_rate": 0.00017721762555320377, + "loss": 1.0701713562011719, + "step": 598 + }, + { + "epoch": 0.34544405997693195, + "grad_norm": 0.737766683101654, + "learning_rate": 0.00017717914181258418, + "loss": 1.0968977212905884, + "step": 599 + }, + { + "epoch": 0.3460207612456747, + "grad_norm": 0.749933660030365, + "learning_rate": 0.0001771406580719646, + "loss": 1.3320926427841187, + "step": 600 + }, + { + "epoch": 0.34659746251441753, + "grad_norm": 1.0226854085922241, + "learning_rate": 0.000177102174331345, + "loss": 1.5281516313552856, + "step": 601 + }, + { + "epoch": 0.34717416378316035, + "grad_norm": 0.5458315014839172, + "learning_rate": 0.00017706369059072543, + "loss": 0.6243756413459778, + "step": 602 + }, + { + "epoch": 0.3477508650519031, + "grad_norm": 0.6592231392860413, + "learning_rate": 0.00017702520685010584, + "loss": 1.007111668586731, + "step": 603 + }, + { + "epoch": 0.34832756632064593, + "grad_norm": 0.7599675059318542, + "learning_rate": 0.00017698672310948626, + "loss": 1.059772253036499, + "step": 604 + }, + { + "epoch": 0.3489042675893887, + "grad_norm": 0.7249642610549927, + "learning_rate": 0.00017694823936886667, + "loss": 1.0405762195587158, + "step": 605 + }, + { + "epoch": 0.3494809688581315, + "grad_norm": 0.6669758558273315, + "learning_rate": 0.00017690975562824706, + "loss": 0.8157357573509216, + "step": 606 + }, + { + "epoch": 0.3500576701268743, + "grad_norm": 1.0521658658981323, + "learning_rate": 0.00017687127188762748, + "loss": 1.3226133584976196, + "step": 607 + }, + { + "epoch": 0.3506343713956171, + "grad_norm": 1.190586805343628, + "learning_rate": 0.0001768327881470079, + "loss": 0.9668002724647522, + "step": 608 + }, + { + "epoch": 0.35121107266435986, + "grad_norm": 0.7342950105667114, + "learning_rate": 0.0001767943044063883, + "loss": 1.0137907266616821, + "step": 609 + }, + { + "epoch": 0.3517877739331027, + "grad_norm": 0.8390425443649292, + "learning_rate": 0.00017675582066576872, + "loss": 1.2452900409698486, + "step": 610 + }, + { + "epoch": 0.35236447520184544, + "grad_norm": 0.7040269374847412, + "learning_rate": 0.00017671733692514914, + "loss": 1.1274709701538086, + "step": 611 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.6184991002082825, + "learning_rate": 0.00017667885318452955, + "loss": 0.8320228457450867, + "step": 612 + }, + { + "epoch": 0.353517877739331, + "grad_norm": 0.9174041748046875, + "learning_rate": 0.00017664036944390997, + "loss": 1.0515730381011963, + "step": 613 + }, + { + "epoch": 0.35409457900807384, + "grad_norm": 0.8032795786857605, + "learning_rate": 0.00017660188570329036, + "loss": 0.9692851901054382, + "step": 614 + }, + { + "epoch": 0.3546712802768166, + "grad_norm": 0.854794979095459, + "learning_rate": 0.00017656340196267077, + "loss": 0.9672110676765442, + "step": 615 + }, + { + "epoch": 0.3552479815455594, + "grad_norm": 0.8945924043655396, + "learning_rate": 0.0001765249182220512, + "loss": 1.1629329919815063, + "step": 616 + }, + { + "epoch": 0.3558246828143022, + "grad_norm": 0.8737151622772217, + "learning_rate": 0.0001764864344814316, + "loss": 1.022585153579712, + "step": 617 + }, + { + "epoch": 0.356401384083045, + "grad_norm": 0.7043283581733704, + "learning_rate": 0.00017644795074081202, + "loss": 1.825275182723999, + "step": 618 + }, + { + "epoch": 0.35697808535178777, + "grad_norm": 0.81025230884552, + "learning_rate": 0.00017640946700019243, + "loss": 1.1937224864959717, + "step": 619 + }, + { + "epoch": 0.3575547866205306, + "grad_norm": 0.6064541339874268, + "learning_rate": 0.00017637098325957285, + "loss": 1.144992709159851, + "step": 620 + }, + { + "epoch": 0.35813148788927335, + "grad_norm": 0.7281432747840881, + "learning_rate": 0.00017633249951895326, + "loss": 0.8976823091506958, + "step": 621 + }, + { + "epoch": 0.35870818915801617, + "grad_norm": 0.7124044895172119, + "learning_rate": 0.00017629401577833365, + "loss": 0.9814664721488953, + "step": 622 + }, + { + "epoch": 0.35928489042675893, + "grad_norm": 0.7080062031745911, + "learning_rate": 0.00017625553203771407, + "loss": 0.8040327429771423, + "step": 623 + }, + { + "epoch": 0.35986159169550175, + "grad_norm": 0.9307262897491455, + "learning_rate": 0.00017621704829709448, + "loss": 1.1769636869430542, + "step": 624 + }, + { + "epoch": 0.3604382929642445, + "grad_norm": 0.6040496230125427, + "learning_rate": 0.0001761785645564749, + "loss": 0.8058497905731201, + "step": 625 + }, + { + "epoch": 0.36101499423298733, + "grad_norm": 0.6352747678756714, + "learning_rate": 0.00017614008081585531, + "loss": 1.0901957750320435, + "step": 626 + }, + { + "epoch": 0.3615916955017301, + "grad_norm": 1.0686722993850708, + "learning_rate": 0.00017610159707523573, + "loss": 1.0280206203460693, + "step": 627 + }, + { + "epoch": 0.3621683967704729, + "grad_norm": 0.823551595211029, + "learning_rate": 0.00017606311333461614, + "loss": 1.1255362033843994, + "step": 628 + }, + { + "epoch": 0.3627450980392157, + "grad_norm": 0.8719285726547241, + "learning_rate": 0.00017602462959399656, + "loss": 1.1470766067504883, + "step": 629 + }, + { + "epoch": 0.3633217993079585, + "grad_norm": 0.8169400691986084, + "learning_rate": 0.00017598614585337695, + "loss": 1.0567045211791992, + "step": 630 + }, + { + "epoch": 0.36389850057670126, + "grad_norm": 1.0707166194915771, + "learning_rate": 0.00017594766211275736, + "loss": 1.3314507007598877, + "step": 631 + }, + { + "epoch": 0.3644752018454441, + "grad_norm": 0.6268380284309387, + "learning_rate": 0.00017590917837213778, + "loss": 1.100555419921875, + "step": 632 + }, + { + "epoch": 0.36505190311418684, + "grad_norm": 0.7382054328918457, + "learning_rate": 0.0001758706946315182, + "loss": 0.9670585989952087, + "step": 633 + }, + { + "epoch": 0.36562860438292966, + "grad_norm": 1.193224310874939, + "learning_rate": 0.0001758322108908986, + "loss": 1.3042614459991455, + "step": 634 + }, + { + "epoch": 0.3662053056516724, + "grad_norm": 1.0091503858566284, + "learning_rate": 0.00017579372715027902, + "loss": 1.3520644903182983, + "step": 635 + }, + { + "epoch": 0.36678200692041524, + "grad_norm": 0.6810548901557922, + "learning_rate": 0.00017575524340965944, + "loss": 0.8741036653518677, + "step": 636 + }, + { + "epoch": 0.367358708189158, + "grad_norm": 0.7155483365058899, + "learning_rate": 0.00017571675966903986, + "loss": 0.8751124143600464, + "step": 637 + }, + { + "epoch": 0.3679354094579008, + "grad_norm": 1.0436261892318726, + "learning_rate": 0.00017567827592842024, + "loss": 1.5696821212768555, + "step": 638 + }, + { + "epoch": 0.3685121107266436, + "grad_norm": 0.9394407868385315, + "learning_rate": 0.00017563979218780066, + "loss": 0.8675939440727234, + "step": 639 + }, + { + "epoch": 0.3690888119953864, + "grad_norm": 1.4341135025024414, + "learning_rate": 0.00017560130844718107, + "loss": 1.498160481452942, + "step": 640 + }, + { + "epoch": 0.36966551326412916, + "grad_norm": 1.006375789642334, + "learning_rate": 0.0001755628247065615, + "loss": 1.2490055561065674, + "step": 641 + }, + { + "epoch": 0.370242214532872, + "grad_norm": 0.6104082465171814, + "learning_rate": 0.0001755243409659419, + "loss": 0.8300263285636902, + "step": 642 + }, + { + "epoch": 0.37081891580161475, + "grad_norm": 0.8571838736534119, + "learning_rate": 0.00017548585722532232, + "loss": 1.146481990814209, + "step": 643 + }, + { + "epoch": 0.37139561707035756, + "grad_norm": 0.6824607253074646, + "learning_rate": 0.00017544737348470273, + "loss": 1.2418452501296997, + "step": 644 + }, + { + "epoch": 0.3719723183391003, + "grad_norm": 1.0891611576080322, + "learning_rate": 0.00017540888974408315, + "loss": 1.2160457372665405, + "step": 645 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.6260281801223755, + "learning_rate": 0.00017537040600346354, + "loss": 0.8934881091117859, + "step": 646 + }, + { + "epoch": 0.3731257208765859, + "grad_norm": 0.8351913690567017, + "learning_rate": 0.00017533192226284395, + "loss": 1.5422282218933105, + "step": 647 + }, + { + "epoch": 0.3737024221453287, + "grad_norm": 0.7572267055511475, + "learning_rate": 0.00017529343852222437, + "loss": 1.2659950256347656, + "step": 648 + }, + { + "epoch": 0.3742791234140715, + "grad_norm": 0.7712565064430237, + "learning_rate": 0.00017525495478160478, + "loss": 1.2143782377243042, + "step": 649 + }, + { + "epoch": 0.3748558246828143, + "grad_norm": 0.6880773305892944, + "learning_rate": 0.0001752164710409852, + "loss": 1.0878217220306396, + "step": 650 + }, + { + "epoch": 0.3754325259515571, + "grad_norm": 0.8996551632881165, + "learning_rate": 0.00017517798730036561, + "loss": 0.9668335914611816, + "step": 651 + }, + { + "epoch": 0.3760092272202999, + "grad_norm": 0.921444296836853, + "learning_rate": 0.00017513950355974603, + "loss": 1.1585900783538818, + "step": 652 + }, + { + "epoch": 0.37658592848904265, + "grad_norm": 0.8658480048179626, + "learning_rate": 0.00017510101981912645, + "loss": 1.1533393859863281, + "step": 653 + }, + { + "epoch": 0.3771626297577855, + "grad_norm": 0.6665229797363281, + "learning_rate": 0.00017506253607850683, + "loss": 0.8233336210250854, + "step": 654 + }, + { + "epoch": 0.37773933102652824, + "grad_norm": 0.746337890625, + "learning_rate": 0.00017502405233788725, + "loss": 1.099341630935669, + "step": 655 + }, + { + "epoch": 0.37831603229527105, + "grad_norm": 0.8498716354370117, + "learning_rate": 0.00017498556859726766, + "loss": 1.333115577697754, + "step": 656 + }, + { + "epoch": 0.3788927335640138, + "grad_norm": 0.7371817827224731, + "learning_rate": 0.00017494708485664808, + "loss": 1.05489182472229, + "step": 657 + }, + { + "epoch": 0.37946943483275664, + "grad_norm": 0.7369913458824158, + "learning_rate": 0.0001749086011160285, + "loss": 0.7275075912475586, + "step": 658 + }, + { + "epoch": 0.3800461361014994, + "grad_norm": 1.4918899536132812, + "learning_rate": 0.0001748701173754089, + "loss": 1.2430638074874878, + "step": 659 + }, + { + "epoch": 0.3806228373702422, + "grad_norm": 0.686100423336029, + "learning_rate": 0.00017483163363478933, + "loss": 0.7841339707374573, + "step": 660 + }, + { + "epoch": 0.381199538638985, + "grad_norm": 0.7799985408782959, + "learning_rate": 0.00017479314989416974, + "loss": 1.1784673929214478, + "step": 661 + }, + { + "epoch": 0.3817762399077278, + "grad_norm": 0.7435747385025024, + "learning_rate": 0.00017475466615355013, + "loss": 1.180450439453125, + "step": 662 + }, + { + "epoch": 0.38235294117647056, + "grad_norm": 0.7358818650245667, + "learning_rate": 0.00017471618241293054, + "loss": 0.9987742900848389, + "step": 663 + }, + { + "epoch": 0.3829296424452134, + "grad_norm": 0.7353511452674866, + "learning_rate": 0.00017467769867231096, + "loss": 1.1325185298919678, + "step": 664 + }, + { + "epoch": 0.38350634371395614, + "grad_norm": 0.7735626697540283, + "learning_rate": 0.00017463921493169137, + "loss": 1.0828659534454346, + "step": 665 + }, + { + "epoch": 0.38408304498269896, + "grad_norm": 0.6293249130249023, + "learning_rate": 0.0001746007311910718, + "loss": 0.9253727793693542, + "step": 666 + }, + { + "epoch": 0.3846597462514418, + "grad_norm": 0.6271319389343262, + "learning_rate": 0.0001745622474504522, + "loss": 0.7645162343978882, + "step": 667 + }, + { + "epoch": 0.38523644752018454, + "grad_norm": 0.6632966995239258, + "learning_rate": 0.00017452376370983262, + "loss": 0.9796670079231262, + "step": 668 + }, + { + "epoch": 0.38581314878892736, + "grad_norm": 0.8829965591430664, + "learning_rate": 0.00017448527996921304, + "loss": 0.9777094721794128, + "step": 669 + }, + { + "epoch": 0.3863898500576701, + "grad_norm": 0.7675085663795471, + "learning_rate": 0.00017444679622859342, + "loss": 1.0497252941131592, + "step": 670 + }, + { + "epoch": 0.38696655132641294, + "grad_norm": 0.9194138050079346, + "learning_rate": 0.00017440831248797384, + "loss": 1.0992257595062256, + "step": 671 + }, + { + "epoch": 0.3875432525951557, + "grad_norm": 1.0398883819580078, + "learning_rate": 0.00017436982874735425, + "loss": 1.25284743309021, + "step": 672 + }, + { + "epoch": 0.3881199538638985, + "grad_norm": 0.5921796560287476, + "learning_rate": 0.00017433134500673467, + "loss": 0.6763097047805786, + "step": 673 + }, + { + "epoch": 0.3886966551326413, + "grad_norm": 1.0226387977600098, + "learning_rate": 0.00017429286126611508, + "loss": 1.3273173570632935, + "step": 674 + }, + { + "epoch": 0.3892733564013841, + "grad_norm": 0.5252590179443359, + "learning_rate": 0.0001742543775254955, + "loss": 0.6646312475204468, + "step": 675 + }, + { + "epoch": 0.38985005767012687, + "grad_norm": 0.600639820098877, + "learning_rate": 0.00017421589378487592, + "loss": 0.7095688581466675, + "step": 676 + }, + { + "epoch": 0.3904267589388697, + "grad_norm": 0.7131365537643433, + "learning_rate": 0.00017417741004425633, + "loss": 1.2200595140457153, + "step": 677 + }, + { + "epoch": 0.39100346020761245, + "grad_norm": 0.9018159508705139, + "learning_rate": 0.00017413892630363672, + "loss": 0.9669409394264221, + "step": 678 + }, + { + "epoch": 0.39158016147635527, + "grad_norm": 0.9841684103012085, + "learning_rate": 0.00017410044256301713, + "loss": 1.028241515159607, + "step": 679 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.9678821563720703, + "learning_rate": 0.00017406195882239755, + "loss": 1.3122403621673584, + "step": 680 + }, + { + "epoch": 0.39273356401384085, + "grad_norm": 0.6439565420150757, + "learning_rate": 0.00017402347508177796, + "loss": 0.8441326022148132, + "step": 681 + }, + { + "epoch": 0.3933102652825836, + "grad_norm": 0.8460219502449036, + "learning_rate": 0.00017398499134115838, + "loss": 1.193575382232666, + "step": 682 + }, + { + "epoch": 0.39388696655132643, + "grad_norm": 0.8068860769271851, + "learning_rate": 0.0001739465076005388, + "loss": 1.209285020828247, + "step": 683 + }, + { + "epoch": 0.3944636678200692, + "grad_norm": 0.6420811414718628, + "learning_rate": 0.0001739080238599192, + "loss": 0.9203285574913025, + "step": 684 + }, + { + "epoch": 0.395040369088812, + "grad_norm": 1.1171250343322754, + "learning_rate": 0.0001738695401192996, + "loss": 1.5638062953948975, + "step": 685 + }, + { + "epoch": 0.3956170703575548, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.00017383105637868001, + "loss": 1.1434835195541382, + "step": 686 + }, + { + "epoch": 0.3961937716262976, + "grad_norm": 0.9958249926567078, + "learning_rate": 0.00017379257263806043, + "loss": 0.7441573143005371, + "step": 687 + }, + { + "epoch": 0.39677047289504036, + "grad_norm": 0.8222061395645142, + "learning_rate": 0.00017375408889744084, + "loss": 1.2088245153427124, + "step": 688 + }, + { + "epoch": 0.3973471741637832, + "grad_norm": 0.5759637355804443, + "learning_rate": 0.00017371560515682126, + "loss": 0.9504674077033997, + "step": 689 + }, + { + "epoch": 0.39792387543252594, + "grad_norm": 0.8157130479812622, + "learning_rate": 0.00017367712141620168, + "loss": 1.319948673248291, + "step": 690 + }, + { + "epoch": 0.39850057670126876, + "grad_norm": 0.7266381978988647, + "learning_rate": 0.0001736386376755821, + "loss": 0.8739478588104248, + "step": 691 + }, + { + "epoch": 0.3990772779700115, + "grad_norm": 0.644598126411438, + "learning_rate": 0.0001736001539349625, + "loss": 0.9521651864051819, + "step": 692 + }, + { + "epoch": 0.39965397923875434, + "grad_norm": 0.5922922492027283, + "learning_rate": 0.0001735616701943429, + "loss": 0.7051569223403931, + "step": 693 + }, + { + "epoch": 0.4002306805074971, + "grad_norm": 0.6880702972412109, + "learning_rate": 0.0001735231864537233, + "loss": 1.1202598810195923, + "step": 694 + }, + { + "epoch": 0.4008073817762399, + "grad_norm": 1.1836776733398438, + "learning_rate": 0.00017348470271310372, + "loss": 1.2588169574737549, + "step": 695 + }, + { + "epoch": 0.4013840830449827, + "grad_norm": 0.965606689453125, + "learning_rate": 0.00017344621897248414, + "loss": 0.7970831990242004, + "step": 696 + }, + { + "epoch": 0.4019607843137255, + "grad_norm": 0.8883787989616394, + "learning_rate": 0.00017340773523186456, + "loss": 1.6653708219528198, + "step": 697 + }, + { + "epoch": 0.40253748558246827, + "grad_norm": 0.7349938750267029, + "learning_rate": 0.00017336925149124497, + "loss": 0.7324041724205017, + "step": 698 + }, + { + "epoch": 0.4031141868512111, + "grad_norm": 1.0731885433197021, + "learning_rate": 0.00017333076775062539, + "loss": 0.9731301069259644, + "step": 699 + }, + { + "epoch": 0.40369088811995385, + "grad_norm": 0.8691738843917847, + "learning_rate": 0.0001732922840100058, + "loss": 1.0968525409698486, + "step": 700 + }, + { + "epoch": 0.40426758938869667, + "grad_norm": 0.921116292476654, + "learning_rate": 0.0001732538002693862, + "loss": 1.3427119255065918, + "step": 701 + }, + { + "epoch": 0.40484429065743943, + "grad_norm": 0.8539203405380249, + "learning_rate": 0.0001732153165287666, + "loss": 1.2618871927261353, + "step": 702 + }, + { + "epoch": 0.40542099192618225, + "grad_norm": 0.6238696575164795, + "learning_rate": 0.00017317683278814702, + "loss": 0.7679486274719238, + "step": 703 + }, + { + "epoch": 0.405997693194925, + "grad_norm": 0.6827321648597717, + "learning_rate": 0.00017313834904752743, + "loss": 0.9498722553253174, + "step": 704 + }, + { + "epoch": 0.40657439446366783, + "grad_norm": 0.9637985229492188, + "learning_rate": 0.00017309986530690785, + "loss": 1.2945339679718018, + "step": 705 + }, + { + "epoch": 0.4071510957324106, + "grad_norm": 0.6361503601074219, + "learning_rate": 0.00017306138156628827, + "loss": 1.2040516138076782, + "step": 706 + }, + { + "epoch": 0.4077277970011534, + "grad_norm": 0.713758647441864, + "learning_rate": 0.00017302289782566868, + "loss": 1.1285666227340698, + "step": 707 + }, + { + "epoch": 0.4083044982698962, + "grad_norm": 1.0620390176773071, + "learning_rate": 0.0001729844140850491, + "loss": 1.2117018699645996, + "step": 708 + }, + { + "epoch": 0.408881199538639, + "grad_norm": 0.6957300305366516, + "learning_rate": 0.00017294593034442948, + "loss": 1.2091706991195679, + "step": 709 + }, + { + "epoch": 0.40945790080738176, + "grad_norm": 0.4594845771789551, + "learning_rate": 0.0001729074466038099, + "loss": 3.3324732780456543, + "step": 710 + }, + { + "epoch": 0.4100346020761246, + "grad_norm": 0.8902932405471802, + "learning_rate": 0.00017286896286319031, + "loss": 1.1579055786132812, + "step": 711 + }, + { + "epoch": 0.41061130334486734, + "grad_norm": 0.7140578031539917, + "learning_rate": 0.00017283047912257073, + "loss": 0.877116858959198, + "step": 712 + }, + { + "epoch": 0.41118800461361016, + "grad_norm": 0.8449535369873047, + "learning_rate": 0.00017279199538195115, + "loss": 1.2400063276290894, + "step": 713 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 1.0700358152389526, + "learning_rate": 0.00017275351164133156, + "loss": 1.1401453018188477, + "step": 714 + }, + { + "epoch": 0.41234140715109574, + "grad_norm": 0.6705982685089111, + "learning_rate": 0.00017271502790071198, + "loss": 0.8326209783554077, + "step": 715 + }, + { + "epoch": 0.4129181084198385, + "grad_norm": 0.7149010896682739, + "learning_rate": 0.0001726765441600924, + "loss": 1.0872998237609863, + "step": 716 + }, + { + "epoch": 0.4134948096885813, + "grad_norm": 0.46808966994285583, + "learning_rate": 0.00017263806041947278, + "loss": 0.6795035004615784, + "step": 717 + }, + { + "epoch": 0.4140715109573241, + "grad_norm": 0.8606752157211304, + "learning_rate": 0.0001725995766788532, + "loss": 1.0544252395629883, + "step": 718 + }, + { + "epoch": 0.4146482122260669, + "grad_norm": 0.5839232802391052, + "learning_rate": 0.0001725610929382336, + "loss": 0.7785719633102417, + "step": 719 + }, + { + "epoch": 0.41522491349480967, + "grad_norm": 0.8700772523880005, + "learning_rate": 0.00017252260919761403, + "loss": 0.988602340221405, + "step": 720 + }, + { + "epoch": 0.4158016147635525, + "grad_norm": 0.9886090159416199, + "learning_rate": 0.00017248412545699444, + "loss": 1.3493539094924927, + "step": 721 + }, + { + "epoch": 0.41637831603229525, + "grad_norm": 0.9088316559791565, + "learning_rate": 0.00017244564171637486, + "loss": 1.0131090879440308, + "step": 722 + }, + { + "epoch": 0.41695501730103807, + "grad_norm": 0.9066189527511597, + "learning_rate": 0.00017240715797575527, + "loss": 1.2530944347381592, + "step": 723 + }, + { + "epoch": 0.41753171856978083, + "grad_norm": 0.7733665704727173, + "learning_rate": 0.00017236867423513569, + "loss": 1.1255629062652588, + "step": 724 + }, + { + "epoch": 0.41810841983852365, + "grad_norm": 0.609832763671875, + "learning_rate": 0.00017233019049451607, + "loss": 0.7514859437942505, + "step": 725 + }, + { + "epoch": 0.4186851211072664, + "grad_norm": 0.6903802752494812, + "learning_rate": 0.0001722917067538965, + "loss": 0.8925538063049316, + "step": 726 + }, + { + "epoch": 0.41926182237600923, + "grad_norm": 0.7692581415176392, + "learning_rate": 0.0001722532230132769, + "loss": 1.103420376777649, + "step": 727 + }, + { + "epoch": 0.419838523644752, + "grad_norm": 0.7881311774253845, + "learning_rate": 0.0001722147392726573, + "loss": 1.3109550476074219, + "step": 728 + }, + { + "epoch": 0.4204152249134948, + "grad_norm": 0.6949164271354675, + "learning_rate": 0.0001721762555320377, + "loss": 1.0904300212860107, + "step": 729 + }, + { + "epoch": 0.4209919261822376, + "grad_norm": 0.6746834516525269, + "learning_rate": 0.00017213777179141812, + "loss": 1.240382194519043, + "step": 730 + }, + { + "epoch": 0.4215686274509804, + "grad_norm": 0.8831079602241516, + "learning_rate": 0.00017209928805079854, + "loss": 1.546260118484497, + "step": 731 + }, + { + "epoch": 0.42214532871972316, + "grad_norm": 0.917523205280304, + "learning_rate": 0.00017206080431017895, + "loss": 1.3464173078536987, + "step": 732 + }, + { + "epoch": 0.422722029988466, + "grad_norm": 0.729640007019043, + "learning_rate": 0.00017202232056955937, + "loss": 0.9092597961425781, + "step": 733 + }, + { + "epoch": 0.4232987312572088, + "grad_norm": 0.9597057104110718, + "learning_rate": 0.00017198383682893976, + "loss": 1.449595332145691, + "step": 734 + }, + { + "epoch": 0.42387543252595156, + "grad_norm": 0.570996880531311, + "learning_rate": 0.00017194535308832017, + "loss": 0.660990297794342, + "step": 735 + }, + { + "epoch": 0.4244521337946944, + "grad_norm": 0.8485130071640015, + "learning_rate": 0.0001719068693477006, + "loss": 1.009351372718811, + "step": 736 + }, + { + "epoch": 0.42502883506343714, + "grad_norm": 1.1340487003326416, + "learning_rate": 0.000171868385607081, + "loss": 1.186898946762085, + "step": 737 + }, + { + "epoch": 0.42560553633217996, + "grad_norm": 0.9666796326637268, + "learning_rate": 0.00017182990186646142, + "loss": 1.3713027238845825, + "step": 738 + }, + { + "epoch": 0.4261822376009227, + "grad_norm": 0.8104447722434998, + "learning_rate": 0.00017179141812584183, + "loss": 0.7822756767272949, + "step": 739 + }, + { + "epoch": 0.42675893886966554, + "grad_norm": 0.7587509155273438, + "learning_rate": 0.00017175293438522225, + "loss": 1.1129992008209229, + "step": 740 + }, + { + "epoch": 0.4273356401384083, + "grad_norm": 0.854256272315979, + "learning_rate": 0.00017171445064460266, + "loss": 1.1753698587417603, + "step": 741 + }, + { + "epoch": 0.4279123414071511, + "grad_norm": 0.7335513234138489, + "learning_rate": 0.00017167596690398305, + "loss": 1.1233677864074707, + "step": 742 + }, + { + "epoch": 0.4284890426758939, + "grad_norm": 1.1383814811706543, + "learning_rate": 0.00017163748316336347, + "loss": 1.6328407526016235, + "step": 743 + }, + { + "epoch": 0.4290657439446367, + "grad_norm": 0.5805800557136536, + "learning_rate": 0.00017159899942274388, + "loss": 0.8374234437942505, + "step": 744 + }, + { + "epoch": 0.42964244521337946, + "grad_norm": 0.5744853615760803, + "learning_rate": 0.0001715605156821243, + "loss": 0.7072418332099915, + "step": 745 + }, + { + "epoch": 0.4302191464821223, + "grad_norm": 1.0968151092529297, + "learning_rate": 0.00017152203194150471, + "loss": 0.9308477640151978, + "step": 746 + }, + { + "epoch": 0.43079584775086505, + "grad_norm": 0.7771037220954895, + "learning_rate": 0.00017148354820088513, + "loss": 1.0803910493850708, + "step": 747 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.760296106338501, + "learning_rate": 0.00017144506446026554, + "loss": 0.9416469931602478, + "step": 748 + }, + { + "epoch": 0.43194925028835063, + "grad_norm": 0.8478863835334778, + "learning_rate": 0.00017140658071964596, + "loss": 1.0037909746170044, + "step": 749 + }, + { + "epoch": 0.43252595155709345, + "grad_norm": 0.802010715007782, + "learning_rate": 0.00017136809697902635, + "loss": 1.2789827585220337, + "step": 750 + }, + { + "epoch": 0.4331026528258362, + "grad_norm": 0.7146703004837036, + "learning_rate": 0.00017132961323840676, + "loss": 0.925313413143158, + "step": 751 + }, + { + "epoch": 0.43367935409457903, + "grad_norm": 1.1419707536697388, + "learning_rate": 0.00017129112949778718, + "loss": 1.3266316652297974, + "step": 752 + }, + { + "epoch": 0.4342560553633218, + "grad_norm": 0.5337522029876709, + "learning_rate": 0.0001712526457571676, + "loss": 0.8182927966117859, + "step": 753 + }, + { + "epoch": 0.4348327566320646, + "grad_norm": 0.7067147493362427, + "learning_rate": 0.000171214162016548, + "loss": 1.01529061794281, + "step": 754 + }, + { + "epoch": 0.4354094579008074, + "grad_norm": 0.8742361664772034, + "learning_rate": 0.00017117567827592842, + "loss": 0.9216449856758118, + "step": 755 + }, + { + "epoch": 0.4359861591695502, + "grad_norm": 1.0121413469314575, + "learning_rate": 0.00017113719453530884, + "loss": 1.5315768718719482, + "step": 756 + }, + { + "epoch": 0.43656286043829295, + "grad_norm": 0.970582127571106, + "learning_rate": 0.00017109871079468925, + "loss": 1.1701881885528564, + "step": 757 + }, + { + "epoch": 0.4371395617070358, + "grad_norm": 0.8317894339561462, + "learning_rate": 0.00017106022705406964, + "loss": 1.1619702577590942, + "step": 758 + }, + { + "epoch": 0.43771626297577854, + "grad_norm": 0.6935670375823975, + "learning_rate": 0.00017102174331345006, + "loss": 1.0018664598464966, + "step": 759 + }, + { + "epoch": 0.43829296424452135, + "grad_norm": 1.0123279094696045, + "learning_rate": 0.00017098325957283047, + "loss": 1.1231794357299805, + "step": 760 + }, + { + "epoch": 0.4388696655132641, + "grad_norm": 0.7619280219078064, + "learning_rate": 0.0001709447758322109, + "loss": 1.0395662784576416, + "step": 761 + }, + { + "epoch": 0.43944636678200694, + "grad_norm": 0.8570308089256287, + "learning_rate": 0.0001709062920915913, + "loss": 1.4022446870803833, + "step": 762 + }, + { + "epoch": 0.4400230680507497, + "grad_norm": 1.178285837173462, + "learning_rate": 0.00017086780835097172, + "loss": 1.5245153903961182, + "step": 763 + }, + { + "epoch": 0.4405997693194925, + "grad_norm": 0.876589298248291, + "learning_rate": 0.00017082932461035213, + "loss": 1.482165813446045, + "step": 764 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 0.8614532947540283, + "learning_rate": 0.00017079084086973255, + "loss": 1.312232255935669, + "step": 765 + }, + { + "epoch": 0.4417531718569781, + "grad_norm": 0.6772201061248779, + "learning_rate": 0.00017075235712911294, + "loss": 1.1610076427459717, + "step": 766 + }, + { + "epoch": 0.44232987312572086, + "grad_norm": 0.805927038192749, + "learning_rate": 0.00017071387338849335, + "loss": 1.3874244689941406, + "step": 767 + }, + { + "epoch": 0.4429065743944637, + "grad_norm": 0.5419954061508179, + "learning_rate": 0.00017067538964787377, + "loss": 0.7610808610916138, + "step": 768 + }, + { + "epoch": 0.44348327566320644, + "grad_norm": 0.773598313331604, + "learning_rate": 0.00017063690590725418, + "loss": 0.8612810373306274, + "step": 769 + }, + { + "epoch": 0.44405997693194926, + "grad_norm": 0.6376165151596069, + "learning_rate": 0.0001705984221666346, + "loss": 0.8417828679084778, + "step": 770 + }, + { + "epoch": 0.444636678200692, + "grad_norm": 0.6870789527893066, + "learning_rate": 0.00017055993842601501, + "loss": 1.1764918565750122, + "step": 771 + }, + { + "epoch": 0.44521337946943484, + "grad_norm": 0.5562968254089355, + "learning_rate": 0.00017052145468539543, + "loss": 0.8358933925628662, + "step": 772 + }, + { + "epoch": 0.4457900807381776, + "grad_norm": 0.602963924407959, + "learning_rate": 0.00017048297094477585, + "loss": 1.197677731513977, + "step": 773 + }, + { + "epoch": 0.4463667820069204, + "grad_norm": 1.0190907716751099, + "learning_rate": 0.00017044448720415623, + "loss": 1.4355199337005615, + "step": 774 + }, + { + "epoch": 0.4469434832756632, + "grad_norm": 0.633346676826477, + "learning_rate": 0.00017040600346353665, + "loss": 0.7924656867980957, + "step": 775 + }, + { + "epoch": 0.447520184544406, + "grad_norm": 0.797099232673645, + "learning_rate": 0.00017036751972291706, + "loss": 1.2302619218826294, + "step": 776 + }, + { + "epoch": 0.44809688581314877, + "grad_norm": 0.7166492938995361, + "learning_rate": 0.00017032903598229748, + "loss": 1.063340187072754, + "step": 777 + }, + { + "epoch": 0.4486735870818916, + "grad_norm": 0.9511370062828064, + "learning_rate": 0.0001702905522416779, + "loss": 0.8998168706893921, + "step": 778 + }, + { + "epoch": 0.44925028835063435, + "grad_norm": 0.8487029075622559, + "learning_rate": 0.0001702520685010583, + "loss": 1.1850653886795044, + "step": 779 + }, + { + "epoch": 0.44982698961937717, + "grad_norm": 1.0267854928970337, + "learning_rate": 0.00017021358476043873, + "loss": 1.246724009513855, + "step": 780 + }, + { + "epoch": 0.45040369088811993, + "grad_norm": 1.155428409576416, + "learning_rate": 0.00017017510101981914, + "loss": 1.539854884147644, + "step": 781 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.6774823069572449, + "learning_rate": 0.00017013661727919953, + "loss": 0.7472063302993774, + "step": 782 + }, + { + "epoch": 0.4515570934256055, + "grad_norm": 0.7500667572021484, + "learning_rate": 0.00017009813353857994, + "loss": 0.9946876168251038, + "step": 783 + }, + { + "epoch": 0.45213379469434833, + "grad_norm": 0.7643426656723022, + "learning_rate": 0.00017005964979796036, + "loss": 0.8451071977615356, + "step": 784 + }, + { + "epoch": 0.4527104959630911, + "grad_norm": 0.721379816532135, + "learning_rate": 0.00017002116605734077, + "loss": 0.9988998174667358, + "step": 785 + }, + { + "epoch": 0.4532871972318339, + "grad_norm": 0.8850287199020386, + "learning_rate": 0.0001699826823167212, + "loss": 0.9789897203445435, + "step": 786 + }, + { + "epoch": 0.4538638985005767, + "grad_norm": 1.0076375007629395, + "learning_rate": 0.0001699441985761016, + "loss": 1.3830417394638062, + "step": 787 + }, + { + "epoch": 0.4544405997693195, + "grad_norm": 0.6105207204818726, + "learning_rate": 0.00016990571483548202, + "loss": 0.8870081901550293, + "step": 788 + }, + { + "epoch": 0.45501730103806226, + "grad_norm": 0.7732753157615662, + "learning_rate": 0.00016986723109486244, + "loss": 0.9958963990211487, + "step": 789 + }, + { + "epoch": 0.4555940023068051, + "grad_norm": 0.9871165156364441, + "learning_rate": 0.00016982874735424282, + "loss": 1.1141139268875122, + "step": 790 + }, + { + "epoch": 0.45617070357554784, + "grad_norm": 0.7117231488227844, + "learning_rate": 0.00016979026361362324, + "loss": 1.0168585777282715, + "step": 791 + }, + { + "epoch": 0.45674740484429066, + "grad_norm": 0.6954454183578491, + "learning_rate": 0.00016975177987300365, + "loss": 0.9319931268692017, + "step": 792 + }, + { + "epoch": 0.4573241061130334, + "grad_norm": 0.6463753581047058, + "learning_rate": 0.00016971329613238407, + "loss": 0.9734832644462585, + "step": 793 + }, + { + "epoch": 0.45790080738177624, + "grad_norm": 0.7156365513801575, + "learning_rate": 0.00016967481239176448, + "loss": 1.0014495849609375, + "step": 794 + }, + { + "epoch": 0.458477508650519, + "grad_norm": 0.8648508787155151, + "learning_rate": 0.0001696363286511449, + "loss": 1.3907616138458252, + "step": 795 + }, + { + "epoch": 0.4590542099192618, + "grad_norm": 0.8066338300704956, + "learning_rate": 0.00016959784491052532, + "loss": 1.0530327558517456, + "step": 796 + }, + { + "epoch": 0.4596309111880046, + "grad_norm": 0.8617266416549683, + "learning_rate": 0.00016955936116990573, + "loss": 1.7989249229431152, + "step": 797 + }, + { + "epoch": 0.4602076124567474, + "grad_norm": 0.7956259250640869, + "learning_rate": 0.00016952087742928612, + "loss": 0.928198516368866, + "step": 798 + }, + { + "epoch": 0.46078431372549017, + "grad_norm": 0.8778709173202515, + "learning_rate": 0.00016948239368866653, + "loss": 0.9466978907585144, + "step": 799 + }, + { + "epoch": 0.461361014994233, + "grad_norm": 0.8518659472465515, + "learning_rate": 0.00016944390994804695, + "loss": 1.0593540668487549, + "step": 800 + }, + { + "epoch": 0.4619377162629758, + "grad_norm": 0.79550701379776, + "learning_rate": 0.00016940542620742736, + "loss": 1.1164321899414062, + "step": 801 + }, + { + "epoch": 0.46251441753171857, + "grad_norm": 1.0006239414215088, + "learning_rate": 0.00016936694246680778, + "loss": 1.160499930381775, + "step": 802 + }, + { + "epoch": 0.4630911188004614, + "grad_norm": 0.8525403738021851, + "learning_rate": 0.0001693284587261882, + "loss": 1.0770652294158936, + "step": 803 + }, + { + "epoch": 0.46366782006920415, + "grad_norm": 0.6851354837417603, + "learning_rate": 0.0001692899749855686, + "loss": 1.0310590267181396, + "step": 804 + }, + { + "epoch": 0.46424452133794697, + "grad_norm": 0.6831552386283875, + "learning_rate": 0.000169251491244949, + "loss": 1.0782524347305298, + "step": 805 + }, + { + "epoch": 0.46482122260668973, + "grad_norm": 0.8892863988876343, + "learning_rate": 0.00016921300750432941, + "loss": 1.3154478073120117, + "step": 806 + }, + { + "epoch": 0.46539792387543255, + "grad_norm": 0.6863577961921692, + "learning_rate": 0.00016917452376370983, + "loss": 0.5912436842918396, + "step": 807 + }, + { + "epoch": 0.4659746251441753, + "grad_norm": 0.8612192869186401, + "learning_rate": 0.00016913604002309024, + "loss": 1.0140503644943237, + "step": 808 + }, + { + "epoch": 0.46655132641291813, + "grad_norm": 0.6565495729446411, + "learning_rate": 0.00016909755628247066, + "loss": 0.8388250470161438, + "step": 809 + }, + { + "epoch": 0.4671280276816609, + "grad_norm": 0.5729434490203857, + "learning_rate": 0.00016905907254185107, + "loss": 0.8662521839141846, + "step": 810 + }, + { + "epoch": 0.4677047289504037, + "grad_norm": 0.8261442184448242, + "learning_rate": 0.0001690205888012315, + "loss": 1.1527458429336548, + "step": 811 + }, + { + "epoch": 0.4682814302191465, + "grad_norm": 0.6182582974433899, + "learning_rate": 0.0001689821050606119, + "loss": 0.7817882895469666, + "step": 812 + }, + { + "epoch": 0.4688581314878893, + "grad_norm": 0.5987662672996521, + "learning_rate": 0.0001689436213199923, + "loss": 0.864625871181488, + "step": 813 + }, + { + "epoch": 0.46943483275663206, + "grad_norm": 0.8617327809333801, + "learning_rate": 0.0001689051375793727, + "loss": 1.1531751155853271, + "step": 814 + }, + { + "epoch": 0.4700115340253749, + "grad_norm": 0.8277755379676819, + "learning_rate": 0.00016886665383875312, + "loss": 0.928108811378479, + "step": 815 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.7510029673576355, + "learning_rate": 0.00016882817009813354, + "loss": 1.0068414211273193, + "step": 816 + }, + { + "epoch": 0.47116493656286046, + "grad_norm": 0.8691316246986389, + "learning_rate": 0.00016878968635751395, + "loss": 1.0941516160964966, + "step": 817 + }, + { + "epoch": 0.4717416378316032, + "grad_norm": 0.581984281539917, + "learning_rate": 0.00016875120261689437, + "loss": 0.6039727926254272, + "step": 818 + }, + { + "epoch": 0.47231833910034604, + "grad_norm": 0.7486310005187988, + "learning_rate": 0.00016871271887627479, + "loss": 1.140452265739441, + "step": 819 + }, + { + "epoch": 0.4728950403690888, + "grad_norm": 0.8794305324554443, + "learning_rate": 0.0001686742351356552, + "loss": 1.2717854976654053, + "step": 820 + }, + { + "epoch": 0.4734717416378316, + "grad_norm": 0.8812481164932251, + "learning_rate": 0.0001686357513950356, + "loss": 0.9813717007637024, + "step": 821 + }, + { + "epoch": 0.4740484429065744, + "grad_norm": 0.9091891646385193, + "learning_rate": 0.000168597267654416, + "loss": 1.2938401699066162, + "step": 822 + }, + { + "epoch": 0.4746251441753172, + "grad_norm": 0.9045780301094055, + "learning_rate": 0.00016855878391379642, + "loss": 1.312792181968689, + "step": 823 + }, + { + "epoch": 0.47520184544405997, + "grad_norm": 0.8430265784263611, + "learning_rate": 0.00016852030017317683, + "loss": 1.2679914236068726, + "step": 824 + }, + { + "epoch": 0.4757785467128028, + "grad_norm": 0.6870001554489136, + "learning_rate": 0.00016848181643255725, + "loss": 0.970576822757721, + "step": 825 + }, + { + "epoch": 0.47635524798154555, + "grad_norm": 0.8256406188011169, + "learning_rate": 0.00016844333269193767, + "loss": 1.302760362625122, + "step": 826 + }, + { + "epoch": 0.47693194925028837, + "grad_norm": 0.7057660222053528, + "learning_rate": 0.00016840484895131808, + "loss": 0.9811574220657349, + "step": 827 + }, + { + "epoch": 0.47750865051903113, + "grad_norm": 0.8487821817398071, + "learning_rate": 0.0001683663652106985, + "loss": 1.0537941455841064, + "step": 828 + }, + { + "epoch": 0.47808535178777395, + "grad_norm": 0.7474492788314819, + "learning_rate": 0.00016832788147007888, + "loss": 0.856541633605957, + "step": 829 + }, + { + "epoch": 0.4786620530565167, + "grad_norm": 0.9228368401527405, + "learning_rate": 0.0001682893977294593, + "loss": 1.0505741834640503, + "step": 830 + }, + { + "epoch": 0.47923875432525953, + "grad_norm": 0.9288182854652405, + "learning_rate": 0.00016825091398883971, + "loss": 1.3584654331207275, + "step": 831 + }, + { + "epoch": 0.4798154555940023, + "grad_norm": 1.4403129816055298, + "learning_rate": 0.00016821243024822013, + "loss": 1.911801815032959, + "step": 832 + }, + { + "epoch": 0.4803921568627451, + "grad_norm": 0.6283893585205078, + "learning_rate": 0.00016817394650760055, + "loss": 0.8583131432533264, + "step": 833 + }, + { + "epoch": 0.4809688581314879, + "grad_norm": 0.6910902261734009, + "learning_rate": 0.00016813546276698096, + "loss": 1.3508315086364746, + "step": 834 + }, + { + "epoch": 0.4815455594002307, + "grad_norm": 0.6606875658035278, + "learning_rate": 0.00016809697902636138, + "loss": 1.0815465450286865, + "step": 835 + }, + { + "epoch": 0.48212226066897346, + "grad_norm": 0.8546112775802612, + "learning_rate": 0.0001680584952857418, + "loss": 1.2201032638549805, + "step": 836 + }, + { + "epoch": 0.4826989619377163, + "grad_norm": 0.9130816459655762, + "learning_rate": 0.00016802001154512218, + "loss": 1.208343744277954, + "step": 837 + }, + { + "epoch": 0.48327566320645904, + "grad_norm": 0.7690496444702148, + "learning_rate": 0.0001679815278045026, + "loss": 1.0452954769134521, + "step": 838 + }, + { + "epoch": 0.48385236447520186, + "grad_norm": 0.7210266590118408, + "learning_rate": 0.000167943044063883, + "loss": 0.7897384166717529, + "step": 839 + }, + { + "epoch": 0.4844290657439446, + "grad_norm": 0.5705054402351379, + "learning_rate": 0.00016790456032326342, + "loss": 0.8288441896438599, + "step": 840 + }, + { + "epoch": 0.48500576701268744, + "grad_norm": 0.6143510341644287, + "learning_rate": 0.00016786607658264384, + "loss": 0.8081311583518982, + "step": 841 + }, + { + "epoch": 0.4855824682814302, + "grad_norm": 0.7222305536270142, + "learning_rate": 0.00016782759284202426, + "loss": 1.1107532978057861, + "step": 842 + }, + { + "epoch": 0.486159169550173, + "grad_norm": 0.6712546944618225, + "learning_rate": 0.00016778910910140467, + "loss": 0.8375999927520752, + "step": 843 + }, + { + "epoch": 0.4867358708189158, + "grad_norm": 0.9085020422935486, + "learning_rate": 0.00016775062536078509, + "loss": 0.9624453186988831, + "step": 844 + }, + { + "epoch": 0.4873125720876586, + "grad_norm": 0.773102879524231, + "learning_rate": 0.00016771214162016547, + "loss": 1.0454928874969482, + "step": 845 + }, + { + "epoch": 0.48788927335640137, + "grad_norm": 0.5635338425636292, + "learning_rate": 0.0001676736578795459, + "loss": 0.7329631447792053, + "step": 846 + }, + { + "epoch": 0.4884659746251442, + "grad_norm": 0.8183399438858032, + "learning_rate": 0.0001676351741389263, + "loss": 0.859244704246521, + "step": 847 + }, + { + "epoch": 0.48904267589388695, + "grad_norm": 0.7920128107070923, + "learning_rate": 0.00016759669039830672, + "loss": 0.9889219403266907, + "step": 848 + }, + { + "epoch": 0.48961937716262977, + "grad_norm": 1.1391570568084717, + "learning_rate": 0.00016755820665768714, + "loss": 1.146942138671875, + "step": 849 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.6648845076560974, + "learning_rate": 0.00016751972291706755, + "loss": 0.7090552449226379, + "step": 850 + }, + { + "epoch": 0.49077277970011535, + "grad_norm": 0.7156478762626648, + "learning_rate": 0.00016748123917644797, + "loss": 0.7772218585014343, + "step": 851 + }, + { + "epoch": 0.4913494809688581, + "grad_norm": 0.7279021739959717, + "learning_rate": 0.00016744275543582838, + "loss": 1.0468722581863403, + "step": 852 + }, + { + "epoch": 0.49192618223760093, + "grad_norm": 1.0862352848052979, + "learning_rate": 0.00016740427169520877, + "loss": 1.3199949264526367, + "step": 853 + }, + { + "epoch": 0.4925028835063437, + "grad_norm": 0.5989871025085449, + "learning_rate": 0.00016736578795458918, + "loss": 0.7066143751144409, + "step": 854 + }, + { + "epoch": 0.4930795847750865, + "grad_norm": 0.88418048620224, + "learning_rate": 0.0001673273042139696, + "loss": 0.9679941534996033, + "step": 855 + }, + { + "epoch": 0.4936562860438293, + "grad_norm": 0.7538619637489319, + "learning_rate": 0.00016728882047335002, + "loss": 0.906350314617157, + "step": 856 + }, + { + "epoch": 0.4942329873125721, + "grad_norm": 1.0406384468078613, + "learning_rate": 0.00016725033673273043, + "loss": 1.0761326551437378, + "step": 857 + }, + { + "epoch": 0.49480968858131485, + "grad_norm": 0.9118819236755371, + "learning_rate": 0.00016721185299211085, + "loss": 1.449715495109558, + "step": 858 + }, + { + "epoch": 0.4953863898500577, + "grad_norm": 0.7859880328178406, + "learning_rate": 0.00016717336925149126, + "loss": 1.0066848993301392, + "step": 859 + }, + { + "epoch": 0.49596309111880044, + "grad_norm": 0.7971929907798767, + "learning_rate": 0.00016713488551087168, + "loss": 1.0836429595947266, + "step": 860 + }, + { + "epoch": 0.49653979238754326, + "grad_norm": 0.7688129544258118, + "learning_rate": 0.00016709640177025206, + "loss": 0.8990678191184998, + "step": 861 + }, + { + "epoch": 0.497116493656286, + "grad_norm": 0.6911450028419495, + "learning_rate": 0.00016705791802963248, + "loss": 0.9118435382843018, + "step": 862 + }, + { + "epoch": 0.49769319492502884, + "grad_norm": 0.9296817183494568, + "learning_rate": 0.0001670194342890129, + "loss": 1.0580615997314453, + "step": 863 + }, + { + "epoch": 0.4982698961937716, + "grad_norm": 0.5820940732955933, + "learning_rate": 0.0001669809505483933, + "loss": 0.6944743394851685, + "step": 864 + }, + { + "epoch": 0.4988465974625144, + "grad_norm": 0.9766574501991272, + "learning_rate": 0.00016694246680777373, + "loss": 1.4097439050674438, + "step": 865 + }, + { + "epoch": 0.4994232987312572, + "grad_norm": 0.658211350440979, + "learning_rate": 0.00016690398306715414, + "loss": 0.7773644924163818, + "step": 866 + }, + { + "epoch": 0.5, + "grad_norm": 0.7480500340461731, + "learning_rate": 0.00016686549932653456, + "loss": 1.1536113023757935, + "step": 867 + }, + { + "epoch": 0.5005767012687428, + "grad_norm": 0.5885343551635742, + "learning_rate": 0.00016682701558591497, + "loss": 0.5359970927238464, + "step": 868 + }, + { + "epoch": 0.5011534025374856, + "grad_norm": 0.7808444499969482, + "learning_rate": 0.00016678853184529536, + "loss": 0.6940274834632874, + "step": 869 + }, + { + "epoch": 0.5017301038062284, + "grad_norm": 0.8007370233535767, + "learning_rate": 0.00016675004810467577, + "loss": 1.3268241882324219, + "step": 870 + }, + { + "epoch": 0.5023068050749712, + "grad_norm": 0.6729685068130493, + "learning_rate": 0.0001667115643640562, + "loss": 0.9482746124267578, + "step": 871 + }, + { + "epoch": 0.5028835063437139, + "grad_norm": 0.648239016532898, + "learning_rate": 0.0001666730806234366, + "loss": 0.9904931783676147, + "step": 872 + }, + { + "epoch": 0.5034602076124568, + "grad_norm": 0.7997180223464966, + "learning_rate": 0.00016663459688281702, + "loss": 1.0594019889831543, + "step": 873 + }, + { + "epoch": 0.5040369088811996, + "grad_norm": 0.8298223614692688, + "learning_rate": 0.00016659611314219744, + "loss": 0.9604882597923279, + "step": 874 + }, + { + "epoch": 0.5046136101499423, + "grad_norm": 0.8724483251571655, + "learning_rate": 0.00016655762940157785, + "loss": 1.0515791177749634, + "step": 875 + }, + { + "epoch": 0.5051903114186851, + "grad_norm": 0.7477858662605286, + "learning_rate": 0.00016651914566095827, + "loss": 1.0346887111663818, + "step": 876 + }, + { + "epoch": 0.505767012687428, + "grad_norm": 0.6524494886398315, + "learning_rate": 0.00016648066192033865, + "loss": 0.8699806928634644, + "step": 877 + }, + { + "epoch": 0.5063437139561707, + "grad_norm": 0.7959410548210144, + "learning_rate": 0.00016644217817971907, + "loss": 1.0138338804244995, + "step": 878 + }, + { + "epoch": 0.5069204152249135, + "grad_norm": 0.7872818112373352, + "learning_rate": 0.00016640369443909949, + "loss": 1.0084038972854614, + "step": 879 + }, + { + "epoch": 0.5074971164936563, + "grad_norm": 0.9153385758399963, + "learning_rate": 0.0001663652106984799, + "loss": 0.9120053052902222, + "step": 880 + }, + { + "epoch": 0.5080738177623991, + "grad_norm": 0.8691549301147461, + "learning_rate": 0.00016632672695786032, + "loss": 0.9792031645774841, + "step": 881 + }, + { + "epoch": 0.5086505190311419, + "grad_norm": 0.7193480730056763, + "learning_rate": 0.00016628824321724073, + "loss": 0.9441159963607788, + "step": 882 + }, + { + "epoch": 0.5092272202998847, + "grad_norm": 0.5675065517425537, + "learning_rate": 0.00016624975947662115, + "loss": 0.7550349235534668, + "step": 883 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.45122864842414856, + "learning_rate": 0.00016621127573600156, + "loss": 0.494687020778656, + "step": 884 + }, + { + "epoch": 0.5103806228373703, + "grad_norm": 0.5535047650337219, + "learning_rate": 0.00016617279199538195, + "loss": 1.0048768520355225, + "step": 885 + }, + { + "epoch": 0.510957324106113, + "grad_norm": 1.1627446413040161, + "learning_rate": 0.00016613430825476237, + "loss": 1.3231415748596191, + "step": 886 + }, + { + "epoch": 0.5115340253748558, + "grad_norm": 0.5924594402313232, + "learning_rate": 0.00016609582451414278, + "loss": 0.8373284339904785, + "step": 887 + }, + { + "epoch": 0.5121107266435986, + "grad_norm": 1.071594476699829, + "learning_rate": 0.0001660573407735232, + "loss": 1.1695808172225952, + "step": 888 + }, + { + "epoch": 0.5126874279123415, + "grad_norm": 0.7243885397911072, + "learning_rate": 0.0001660188570329036, + "loss": 0.9688019156455994, + "step": 889 + }, + { + "epoch": 0.5132641291810842, + "grad_norm": 0.7857576012611389, + "learning_rate": 0.00016598037329228403, + "loss": 0.9062821269035339, + "step": 890 + }, + { + "epoch": 0.513840830449827, + "grad_norm": 0.6501168012619019, + "learning_rate": 0.00016594188955166444, + "loss": 0.7230191230773926, + "step": 891 + }, + { + "epoch": 0.5144175317185697, + "grad_norm": 0.7679166197776794, + "learning_rate": 0.00016590340581104483, + "loss": 0.9849987030029297, + "step": 892 + }, + { + "epoch": 0.5149942329873126, + "grad_norm": 0.5687773823738098, + "learning_rate": 0.00016586492207042524, + "loss": 0.5315793752670288, + "step": 893 + }, + { + "epoch": 0.5155709342560554, + "grad_norm": 0.5201639533042908, + "learning_rate": 0.00016582643832980566, + "loss": 0.833229660987854, + "step": 894 + }, + { + "epoch": 0.5161476355247981, + "grad_norm": 0.9703792333602905, + "learning_rate": 0.00016578795458918608, + "loss": 1.2787346839904785, + "step": 895 + }, + { + "epoch": 0.5167243367935409, + "grad_norm": 0.5964572429656982, + "learning_rate": 0.0001657494708485665, + "loss": 0.8054360151290894, + "step": 896 + }, + { + "epoch": 0.5173010380622838, + "grad_norm": 0.8156993389129639, + "learning_rate": 0.0001657109871079469, + "loss": 1.1183547973632812, + "step": 897 + }, + { + "epoch": 0.5178777393310265, + "grad_norm": 0.9944779276847839, + "learning_rate": 0.00016567250336732732, + "loss": 1.4230319261550903, + "step": 898 + }, + { + "epoch": 0.5184544405997693, + "grad_norm": 0.6466273069381714, + "learning_rate": 0.00016563401962670774, + "loss": 0.9248323440551758, + "step": 899 + }, + { + "epoch": 0.5190311418685121, + "grad_norm": 0.6486216187477112, + "learning_rate": 0.00016559553588608812, + "loss": 0.8279266357421875, + "step": 900 + }, + { + "epoch": 0.5196078431372549, + "grad_norm": 0.8492687940597534, + "learning_rate": 0.00016555705214546854, + "loss": 1.1167151927947998, + "step": 901 + }, + { + "epoch": 0.5201845444059977, + "grad_norm": 0.7403521537780762, + "learning_rate": 0.00016551856840484896, + "loss": 0.9129210710525513, + "step": 902 + }, + { + "epoch": 0.5207612456747405, + "grad_norm": 0.9525539875030518, + "learning_rate": 0.00016548008466422937, + "loss": 1.0805696249008179, + "step": 903 + }, + { + "epoch": 0.5213379469434832, + "grad_norm": 0.6410759091377258, + "learning_rate": 0.00016544160092360979, + "loss": 0.7183154821395874, + "step": 904 + }, + { + "epoch": 0.5219146482122261, + "grad_norm": 0.9240155816078186, + "learning_rate": 0.0001654031171829902, + "loss": 1.2977594137191772, + "step": 905 + }, + { + "epoch": 0.5224913494809689, + "grad_norm": 0.5909906625747681, + "learning_rate": 0.00016536463344237062, + "loss": 0.8771336078643799, + "step": 906 + }, + { + "epoch": 0.5230680507497116, + "grad_norm": 0.6739245653152466, + "learning_rate": 0.00016532614970175103, + "loss": 0.9435271620750427, + "step": 907 + }, + { + "epoch": 0.5236447520184544, + "grad_norm": 0.7840787172317505, + "learning_rate": 0.00016528766596113142, + "loss": 0.9116816520690918, + "step": 908 + }, + { + "epoch": 0.5242214532871973, + "grad_norm": 0.7001404762268066, + "learning_rate": 0.00016524918222051184, + "loss": 0.7686711549758911, + "step": 909 + }, + { + "epoch": 0.52479815455594, + "grad_norm": 0.7492363452911377, + "learning_rate": 0.00016521069847989225, + "loss": 0.894406795501709, + "step": 910 + }, + { + "epoch": 0.5253748558246828, + "grad_norm": 0.6643780469894409, + "learning_rate": 0.00016517221473927267, + "loss": 0.9077553153038025, + "step": 911 + }, + { + "epoch": 0.5259515570934256, + "grad_norm": 0.6426498889923096, + "learning_rate": 0.00016513373099865308, + "loss": 0.7784804701805115, + "step": 912 + }, + { + "epoch": 0.5265282583621684, + "grad_norm": 0.6445097923278809, + "learning_rate": 0.0001650952472580335, + "loss": 0.8351481556892395, + "step": 913 + }, + { + "epoch": 0.5271049596309112, + "grad_norm": 0.9749622344970703, + "learning_rate": 0.0001650567635174139, + "loss": 1.3779326677322388, + "step": 914 + }, + { + "epoch": 0.527681660899654, + "grad_norm": 1.0297281742095947, + "learning_rate": 0.00016501827977679433, + "loss": 1.4258373975753784, + "step": 915 + }, + { + "epoch": 0.5282583621683967, + "grad_norm": 0.8116568326950073, + "learning_rate": 0.00016497979603617472, + "loss": 1.120481252670288, + "step": 916 + }, + { + "epoch": 0.5288350634371396, + "grad_norm": 0.8832195401191711, + "learning_rate": 0.00016494131229555513, + "loss": 1.0475956201553345, + "step": 917 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.7668746709823608, + "learning_rate": 0.00016490282855493555, + "loss": 0.9356057643890381, + "step": 918 + }, + { + "epoch": 0.5299884659746251, + "grad_norm": 0.7938312292098999, + "learning_rate": 0.00016486434481431596, + "loss": 1.0766160488128662, + "step": 919 + }, + { + "epoch": 0.5305651672433679, + "grad_norm": 0.6379091739654541, + "learning_rate": 0.00016482586107369638, + "loss": 0.8664296865463257, + "step": 920 + }, + { + "epoch": 0.5311418685121108, + "grad_norm": 0.5966930389404297, + "learning_rate": 0.0001647873773330768, + "loss": 0.7848939299583435, + "step": 921 + }, + { + "epoch": 0.5317185697808535, + "grad_norm": 0.7270369529724121, + "learning_rate": 0.0001647488935924572, + "loss": 0.8690502643585205, + "step": 922 + }, + { + "epoch": 0.5322952710495963, + "grad_norm": 0.7373891472816467, + "learning_rate": 0.00016471040985183762, + "loss": 0.9187401533126831, + "step": 923 + }, + { + "epoch": 0.532871972318339, + "grad_norm": 0.6114344596862793, + "learning_rate": 0.000164671926111218, + "loss": 0.7336284518241882, + "step": 924 + }, + { + "epoch": 0.5334486735870819, + "grad_norm": 0.7629640102386475, + "learning_rate": 0.00016463344237059843, + "loss": 1.0568023920059204, + "step": 925 + }, + { + "epoch": 0.5340253748558247, + "grad_norm": 0.5172185897827148, + "learning_rate": 0.00016459495862997884, + "loss": 0.6043404936790466, + "step": 926 + }, + { + "epoch": 0.5346020761245674, + "grad_norm": 0.6732125282287598, + "learning_rate": 0.00016455647488935926, + "loss": 0.7869133353233337, + "step": 927 + }, + { + "epoch": 0.5351787773933102, + "grad_norm": 0.993881344795227, + "learning_rate": 0.00016451799114873967, + "loss": 1.3750996589660645, + "step": 928 + }, + { + "epoch": 0.5357554786620531, + "grad_norm": 0.6748846173286438, + "learning_rate": 0.0001644795074081201, + "loss": 0.7957302331924438, + "step": 929 + }, + { + "epoch": 0.5363321799307958, + "grad_norm": 0.5961597561836243, + "learning_rate": 0.0001644410236675005, + "loss": 0.817986786365509, + "step": 930 + }, + { + "epoch": 0.5369088811995386, + "grad_norm": 0.8336942195892334, + "learning_rate": 0.00016440253992688092, + "loss": 1.071876883506775, + "step": 931 + }, + { + "epoch": 0.5374855824682814, + "grad_norm": 0.8322470784187317, + "learning_rate": 0.0001643640561862613, + "loss": 0.9675548672676086, + "step": 932 + }, + { + "epoch": 0.5380622837370242, + "grad_norm": 0.8054575324058533, + "learning_rate": 0.00016432557244564172, + "loss": 1.0018256902694702, + "step": 933 + }, + { + "epoch": 0.538638985005767, + "grad_norm": 0.7546166181564331, + "learning_rate": 0.00016428708870502214, + "loss": 0.9199832677841187, + "step": 934 + }, + { + "epoch": 0.5392156862745098, + "grad_norm": 0.6384134292602539, + "learning_rate": 0.00016424860496440255, + "loss": 0.5693946480751038, + "step": 935 + }, + { + "epoch": 0.5397923875432526, + "grad_norm": 0.8509575128555298, + "learning_rate": 0.00016421012122378297, + "loss": 1.3604402542114258, + "step": 936 + }, + { + "epoch": 0.5403690888119954, + "grad_norm": 1.0863171815872192, + "learning_rate": 0.00016417163748316338, + "loss": 1.441767692565918, + "step": 937 + }, + { + "epoch": 0.5409457900807382, + "grad_norm": 0.7680332064628601, + "learning_rate": 0.0001641331537425438, + "loss": 0.8990482091903687, + "step": 938 + }, + { + "epoch": 0.5415224913494809, + "grad_norm": 0.9804447889328003, + "learning_rate": 0.0001640946700019242, + "loss": 1.0421537160873413, + "step": 939 + }, + { + "epoch": 0.5420991926182238, + "grad_norm": 1.0693145990371704, + "learning_rate": 0.0001640561862613046, + "loss": 1.1600146293640137, + "step": 940 + }, + { + "epoch": 0.5426758938869666, + "grad_norm": 0.8488958477973938, + "learning_rate": 0.00016401770252068502, + "loss": 1.2710307836532593, + "step": 941 + }, + { + "epoch": 0.5432525951557093, + "grad_norm": 1.048317313194275, + "learning_rate": 0.00016397921878006543, + "loss": 0.8453274369239807, + "step": 942 + }, + { + "epoch": 0.5438292964244521, + "grad_norm": 0.7326422929763794, + "learning_rate": 0.00016394073503944585, + "loss": 1.0167326927185059, + "step": 943 + }, + { + "epoch": 0.544405997693195, + "grad_norm": 0.877862274646759, + "learning_rate": 0.00016390225129882626, + "loss": 0.9589974880218506, + "step": 944 + }, + { + "epoch": 0.5449826989619377, + "grad_norm": 0.8096463680267334, + "learning_rate": 0.00016386376755820668, + "loss": 0.8364965915679932, + "step": 945 + }, + { + "epoch": 0.5455594002306805, + "grad_norm": 0.9232637882232666, + "learning_rate": 0.0001638252838175871, + "loss": 0.9332213997840881, + "step": 946 + }, + { + "epoch": 0.5461361014994233, + "grad_norm": 0.7885507941246033, + "learning_rate": 0.0001637868000769675, + "loss": 1.0532820224761963, + "step": 947 + }, + { + "epoch": 0.5467128027681661, + "grad_norm": 0.914097249507904, + "learning_rate": 0.0001637483163363479, + "loss": 0.8059665560722351, + "step": 948 + }, + { + "epoch": 0.5472895040369089, + "grad_norm": 0.8124399781227112, + "learning_rate": 0.0001637098325957283, + "loss": 0.7342300415039062, + "step": 949 + }, + { + "epoch": 0.5478662053056517, + "grad_norm": 0.8677952289581299, + "learning_rate": 0.00016367134885510873, + "loss": 1.2200864553451538, + "step": 950 + }, + { + "epoch": 0.5484429065743944, + "grad_norm": 0.8235622048377991, + "learning_rate": 0.00016363286511448914, + "loss": 1.2276276350021362, + "step": 951 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.8734779953956604, + "learning_rate": 0.00016359438137386956, + "loss": 1.481785535812378, + "step": 952 + }, + { + "epoch": 0.5495963091118801, + "grad_norm": 0.7058696746826172, + "learning_rate": 0.00016355589763324997, + "loss": 0.8971320390701294, + "step": 953 + }, + { + "epoch": 0.5501730103806228, + "grad_norm": 0.7818495035171509, + "learning_rate": 0.0001635174138926304, + "loss": 0.9900298118591309, + "step": 954 + }, + { + "epoch": 0.5507497116493656, + "grad_norm": 0.9933992028236389, + "learning_rate": 0.0001634789301520108, + "loss": 1.377812147140503, + "step": 955 + }, + { + "epoch": 0.5513264129181085, + "grad_norm": 0.6487358808517456, + "learning_rate": 0.0001634404464113912, + "loss": 0.8082116842269897, + "step": 956 + }, + { + "epoch": 0.5519031141868512, + "grad_norm": 0.7896233201026917, + "learning_rate": 0.0001634019626707716, + "loss": 0.8894538879394531, + "step": 957 + }, + { + "epoch": 0.552479815455594, + "grad_norm": 0.5499460697174072, + "learning_rate": 0.00016336347893015202, + "loss": 0.7779909372329712, + "step": 958 + }, + { + "epoch": 0.5530565167243368, + "grad_norm": 0.7304683327674866, + "learning_rate": 0.00016332499518953244, + "loss": 0.9466789960861206, + "step": 959 + }, + { + "epoch": 0.5536332179930796, + "grad_norm": 0.8766285181045532, + "learning_rate": 0.00016328651144891285, + "loss": 0.654015064239502, + "step": 960 + }, + { + "epoch": 0.5542099192618224, + "grad_norm": 0.5168980956077576, + "learning_rate": 0.00016324802770829327, + "loss": 0.7942756414413452, + "step": 961 + }, + { + "epoch": 0.5547866205305652, + "grad_norm": 0.8975361585617065, + "learning_rate": 0.00016320954396767368, + "loss": 1.1166660785675049, + "step": 962 + }, + { + "epoch": 0.5553633217993079, + "grad_norm": 0.559033215045929, + "learning_rate": 0.0001631710602270541, + "loss": 0.7238450050354004, + "step": 963 + }, + { + "epoch": 0.5559400230680508, + "grad_norm": 0.5114202499389648, + "learning_rate": 0.00016313257648643449, + "loss": 0.8229402303695679, + "step": 964 + }, + { + "epoch": 0.5565167243367936, + "grad_norm": 0.8146692514419556, + "learning_rate": 0.0001630940927458149, + "loss": 0.9510258436203003, + "step": 965 + }, + { + "epoch": 0.5570934256055363, + "grad_norm": 0.7686490416526794, + "learning_rate": 0.00016305560900519532, + "loss": 1.3754280805587769, + "step": 966 + }, + { + "epoch": 0.5576701268742791, + "grad_norm": 0.6895797252655029, + "learning_rate": 0.00016301712526457573, + "loss": 0.9850455522537231, + "step": 967 + }, + { + "epoch": 0.558246828143022, + "grad_norm": 0.6049807667732239, + "learning_rate": 0.00016297864152395615, + "loss": 0.6829259395599365, + "step": 968 + }, + { + "epoch": 0.5588235294117647, + "grad_norm": 0.7376249432563782, + "learning_rate": 0.00016294015778333656, + "loss": 0.7787905931472778, + "step": 969 + }, + { + "epoch": 0.5594002306805075, + "grad_norm": 0.5940505862236023, + "learning_rate": 0.00016290167404271698, + "loss": 0.7658302783966064, + "step": 970 + }, + { + "epoch": 0.5599769319492502, + "grad_norm": 0.8353221416473389, + "learning_rate": 0.0001628631903020974, + "loss": 1.0191570520401, + "step": 971 + }, + { + "epoch": 0.5605536332179931, + "grad_norm": 0.6136527061462402, + "learning_rate": 0.00016282470656147778, + "loss": 0.9413414001464844, + "step": 972 + }, + { + "epoch": 0.5611303344867359, + "grad_norm": 0.64887535572052, + "learning_rate": 0.0001627862228208582, + "loss": 0.763261616230011, + "step": 973 + }, + { + "epoch": 0.5617070357554786, + "grad_norm": 0.8027318716049194, + "learning_rate": 0.0001627477390802386, + "loss": 1.1142311096191406, + "step": 974 + }, + { + "epoch": 0.5622837370242214, + "grad_norm": 0.6630944609642029, + "learning_rate": 0.00016270925533961903, + "loss": 0.8240130543708801, + "step": 975 + }, + { + "epoch": 0.5628604382929643, + "grad_norm": 0.7404500246047974, + "learning_rate": 0.00016267077159899944, + "loss": 0.9690840244293213, + "step": 976 + }, + { + "epoch": 0.563437139561707, + "grad_norm": 1.0134172439575195, + "learning_rate": 0.00016263228785837986, + "loss": 1.4774882793426514, + "step": 977 + }, + { + "epoch": 0.5640138408304498, + "grad_norm": 0.8651242256164551, + "learning_rate": 0.00016259380411776027, + "loss": 0.898904025554657, + "step": 978 + }, + { + "epoch": 0.5645905420991926, + "grad_norm": 0.6225872039794922, + "learning_rate": 0.00016255532037714066, + "loss": 1.149839162826538, + "step": 979 + }, + { + "epoch": 0.5651672433679354, + "grad_norm": 0.5773558020591736, + "learning_rate": 0.00016251683663652108, + "loss": 0.516633152961731, + "step": 980 + }, + { + "epoch": 0.5657439446366782, + "grad_norm": 0.6350861191749573, + "learning_rate": 0.0001624783528959015, + "loss": 1.0271410942077637, + "step": 981 + }, + { + "epoch": 0.566320645905421, + "grad_norm": 0.8134899139404297, + "learning_rate": 0.0001624398691552819, + "loss": 0.8847084045410156, + "step": 982 + }, + { + "epoch": 0.5668973471741637, + "grad_norm": 0.793136477470398, + "learning_rate": 0.00016240138541466232, + "loss": 1.0517855882644653, + "step": 983 + }, + { + "epoch": 0.5674740484429066, + "grad_norm": 0.6838855743408203, + "learning_rate": 0.00016236290167404274, + "loss": 0.9592060446739197, + "step": 984 + }, + { + "epoch": 0.5680507497116494, + "grad_norm": 0.77060467004776, + "learning_rate": 0.00016232441793342315, + "loss": 1.1476876735687256, + "step": 985 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.6759986281394958, + "learning_rate": 0.00016228593419280357, + "loss": 0.9518548846244812, + "step": 986 + }, + { + "epoch": 0.5692041522491349, + "grad_norm": 0.6088658571243286, + "learning_rate": 0.00016224745045218396, + "loss": 0.6659010648727417, + "step": 987 + }, + { + "epoch": 0.5697808535178778, + "grad_norm": 0.9436719417572021, + "learning_rate": 0.00016220896671156437, + "loss": 1.1346865892410278, + "step": 988 + }, + { + "epoch": 0.5703575547866205, + "grad_norm": 1.0091006755828857, + "learning_rate": 0.0001621704829709448, + "loss": 1.1687716245651245, + "step": 989 + }, + { + "epoch": 0.5709342560553633, + "grad_norm": 0.9080367684364319, + "learning_rate": 0.0001621319992303252, + "loss": 1.0989638566970825, + "step": 990 + }, + { + "epoch": 0.5715109573241061, + "grad_norm": 0.7519204020500183, + "learning_rate": 0.00016209351548970562, + "loss": 1.3017445802688599, + "step": 991 + }, + { + "epoch": 0.5720876585928489, + "grad_norm": 0.545911431312561, + "learning_rate": 0.00016205503174908603, + "loss": 0.7622886300086975, + "step": 992 + }, + { + "epoch": 0.5726643598615917, + "grad_norm": 0.9163870215415955, + "learning_rate": 0.00016201654800846645, + "loss": 1.2744814157485962, + "step": 993 + }, + { + "epoch": 0.5732410611303345, + "grad_norm": 0.7644914388656616, + "learning_rate": 0.00016197806426784686, + "loss": 0.9071030616760254, + "step": 994 + }, + { + "epoch": 0.5738177623990772, + "grad_norm": 0.761933445930481, + "learning_rate": 0.00016193958052722725, + "loss": 1.0261884927749634, + "step": 995 + }, + { + "epoch": 0.5743944636678201, + "grad_norm": 0.5850253701210022, + "learning_rate": 0.00016190109678660767, + "loss": 0.8700547814369202, + "step": 996 + }, + { + "epoch": 0.5749711649365629, + "grad_norm": 0.8303119540214539, + "learning_rate": 0.00016186261304598808, + "loss": 0.7401360273361206, + "step": 997 + }, + { + "epoch": 0.5755478662053056, + "grad_norm": 0.8335464000701904, + "learning_rate": 0.0001618241293053685, + "loss": 1.058925986289978, + "step": 998 + }, + { + "epoch": 0.5761245674740484, + "grad_norm": 0.6967325806617737, + "learning_rate": 0.0001617856455647489, + "loss": 1.3550879955291748, + "step": 999 + }, + { + "epoch": 0.5767012687427913, + "grad_norm": 1.0509662628173828, + "learning_rate": 0.00016174716182412933, + "loss": 1.3809900283813477, + "step": 1000 + }, + { + "epoch": 0.577277970011534, + "grad_norm": 0.7688459157943726, + "learning_rate": 0.00016170867808350974, + "loss": 0.7888709306716919, + "step": 1001 + }, + { + "epoch": 0.5778546712802768, + "grad_norm": 1.4081027507781982, + "learning_rate": 0.00016167019434289016, + "loss": 0.8922286033630371, + "step": 1002 + }, + { + "epoch": 0.5784313725490197, + "grad_norm": 0.8513575196266174, + "learning_rate": 0.00016163171060227055, + "loss": 0.9064381718635559, + "step": 1003 + }, + { + "epoch": 0.5790080738177624, + "grad_norm": 0.8020631670951843, + "learning_rate": 0.00016159322686165096, + "loss": 1.0038318634033203, + "step": 1004 + }, + { + "epoch": 0.5795847750865052, + "grad_norm": 0.6308439373970032, + "learning_rate": 0.00016155474312103138, + "loss": 1.0535993576049805, + "step": 1005 + }, + { + "epoch": 0.580161476355248, + "grad_norm": 0.9487643837928772, + "learning_rate": 0.0001615162593804118, + "loss": 1.0733325481414795, + "step": 1006 + }, + { + "epoch": 0.5807381776239908, + "grad_norm": 0.5813226699829102, + "learning_rate": 0.0001614777756397922, + "loss": 0.6475256085395813, + "step": 1007 + }, + { + "epoch": 0.5813148788927336, + "grad_norm": 0.8787825703620911, + "learning_rate": 0.00016143929189917262, + "loss": 1.2669293880462646, + "step": 1008 + }, + { + "epoch": 0.5818915801614764, + "grad_norm": 0.5114219784736633, + "learning_rate": 0.00016140080815855304, + "loss": 0.5243850946426392, + "step": 1009 + }, + { + "epoch": 0.5824682814302191, + "grad_norm": 0.9315117597579956, + "learning_rate": 0.00016136232441793345, + "loss": 1.0958704948425293, + "step": 1010 + }, + { + "epoch": 0.583044982698962, + "grad_norm": 0.7866684794425964, + "learning_rate": 0.00016132384067731384, + "loss": 1.0202006101608276, + "step": 1011 + }, + { + "epoch": 0.5836216839677048, + "grad_norm": 0.9690834283828735, + "learning_rate": 0.00016128535693669426, + "loss": 0.7898403406143188, + "step": 1012 + }, + { + "epoch": 0.5841983852364475, + "grad_norm": 1.17559015750885, + "learning_rate": 0.00016124687319607467, + "loss": 1.0564637184143066, + "step": 1013 + }, + { + "epoch": 0.5847750865051903, + "grad_norm": 0.9403568506240845, + "learning_rate": 0.0001612083894554551, + "loss": 1.1451847553253174, + "step": 1014 + }, + { + "epoch": 0.5853517877739332, + "grad_norm": 0.7303722500801086, + "learning_rate": 0.0001611699057148355, + "loss": 1.143730878829956, + "step": 1015 + }, + { + "epoch": 0.5859284890426759, + "grad_norm": 0.9661723375320435, + "learning_rate": 0.00016113142197421592, + "loss": 1.1612937450408936, + "step": 1016 + }, + { + "epoch": 0.5865051903114187, + "grad_norm": 0.9506820440292358, + "learning_rate": 0.0001610929382335963, + "loss": 1.3300495147705078, + "step": 1017 + }, + { + "epoch": 0.5870818915801614, + "grad_norm": 0.9524713754653931, + "learning_rate": 0.00016105445449297672, + "loss": 1.4797887802124023, + "step": 1018 + }, + { + "epoch": 0.5876585928489043, + "grad_norm": 0.8756133317947388, + "learning_rate": 0.00016101597075235714, + "loss": 1.0017035007476807, + "step": 1019 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.8561094403266907, + "learning_rate": 0.00016097748701173752, + "loss": 1.4500423669815063, + "step": 1020 + }, + { + "epoch": 0.5888119953863898, + "grad_norm": 0.7503087520599365, + "learning_rate": 0.00016093900327111794, + "loss": 1.0606659650802612, + "step": 1021 + }, + { + "epoch": 0.5893886966551326, + "grad_norm": 0.5415161848068237, + "learning_rate": 0.00016090051953049836, + "loss": 0.6421483159065247, + "step": 1022 + }, + { + "epoch": 0.5899653979238755, + "grad_norm": 0.6148718595504761, + "learning_rate": 0.00016086203578987877, + "loss": 0.94537353515625, + "step": 1023 + }, + { + "epoch": 0.5905420991926182, + "grad_norm": 0.7274061441421509, + "learning_rate": 0.00016082355204925919, + "loss": 1.1045122146606445, + "step": 1024 + }, + { + "epoch": 0.591118800461361, + "grad_norm": 1.0995570421218872, + "learning_rate": 0.0001607850683086396, + "loss": 1.0006502866744995, + "step": 1025 + }, + { + "epoch": 0.5916955017301038, + "grad_norm": 0.6411669850349426, + "learning_rate": 0.00016074658456802002, + "loss": 0.8185054063796997, + "step": 1026 + }, + { + "epoch": 0.5922722029988466, + "grad_norm": 0.8972517848014832, + "learning_rate": 0.00016070810082740043, + "loss": 1.0834156274795532, + "step": 1027 + }, + { + "epoch": 0.5928489042675894, + "grad_norm": 1.3362998962402344, + "learning_rate": 0.00016066961708678082, + "loss": 1.3157958984375, + "step": 1028 + }, + { + "epoch": 0.5934256055363322, + "grad_norm": 0.9085165858268738, + "learning_rate": 0.00016063113334616124, + "loss": 1.0817850828170776, + "step": 1029 + }, + { + "epoch": 0.5940023068050749, + "grad_norm": 1.028162956237793, + "learning_rate": 0.00016059264960554165, + "loss": 1.324896216392517, + "step": 1030 + }, + { + "epoch": 0.5945790080738178, + "grad_norm": 0.6264161467552185, + "learning_rate": 0.00016055416586492207, + "loss": 0.7769796848297119, + "step": 1031 + }, + { + "epoch": 0.5951557093425606, + "grad_norm": 0.6027923822402954, + "learning_rate": 0.00016051568212430248, + "loss": 0.7691771388053894, + "step": 1032 + }, + { + "epoch": 0.5957324106113033, + "grad_norm": 1.1957632303237915, + "learning_rate": 0.0001604771983836829, + "loss": 1.5915735960006714, + "step": 1033 + }, + { + "epoch": 0.5963091118800461, + "grad_norm": 0.8243029713630676, + "learning_rate": 0.0001604387146430633, + "loss": 1.4467861652374268, + "step": 1034 + }, + { + "epoch": 0.596885813148789, + "grad_norm": 0.9241074919700623, + "learning_rate": 0.00016040023090244373, + "loss": 1.2037115097045898, + "step": 1035 + }, + { + "epoch": 0.5974625144175317, + "grad_norm": 0.7573208212852478, + "learning_rate": 0.00016036174716182411, + "loss": 1.111187219619751, + "step": 1036 + }, + { + "epoch": 0.5980392156862745, + "grad_norm": 0.9766779541969299, + "learning_rate": 0.00016032326342120453, + "loss": 1.3394712209701538, + "step": 1037 + }, + { + "epoch": 0.5986159169550173, + "grad_norm": 0.7223910093307495, + "learning_rate": 0.00016028477968058495, + "loss": 0.9714270830154419, + "step": 1038 + }, + { + "epoch": 0.5991926182237601, + "grad_norm": 0.8372020721435547, + "learning_rate": 0.00016024629593996536, + "loss": 0.9755414724349976, + "step": 1039 + }, + { + "epoch": 0.5997693194925029, + "grad_norm": 1.060224175453186, + "learning_rate": 0.00016020781219934578, + "loss": 1.0653870105743408, + "step": 1040 + }, + { + "epoch": 0.6003460207612457, + "grad_norm": 1.0068564414978027, + "learning_rate": 0.0001601693284587262, + "loss": 1.1695475578308105, + "step": 1041 + }, + { + "epoch": 0.6009227220299884, + "grad_norm": 0.8202903866767883, + "learning_rate": 0.0001601308447181066, + "loss": 1.430415391921997, + "step": 1042 + }, + { + "epoch": 0.6014994232987313, + "grad_norm": 0.6556461453437805, + "learning_rate": 0.00016009236097748702, + "loss": 0.6565566658973694, + "step": 1043 + }, + { + "epoch": 0.6020761245674741, + "grad_norm": 1.0711745023727417, + "learning_rate": 0.0001600538772368674, + "loss": 1.4629727602005005, + "step": 1044 + }, + { + "epoch": 0.6026528258362168, + "grad_norm": 0.857792317867279, + "learning_rate": 0.00016001539349624783, + "loss": 1.375361442565918, + "step": 1045 + }, + { + "epoch": 0.6032295271049596, + "grad_norm": 0.8610656261444092, + "learning_rate": 0.00015997690975562824, + "loss": 1.319663166999817, + "step": 1046 + }, + { + "epoch": 0.6038062283737025, + "grad_norm": 0.5466272830963135, + "learning_rate": 0.00015993842601500866, + "loss": 0.9326815009117126, + "step": 1047 + }, + { + "epoch": 0.6043829296424452, + "grad_norm": 0.5424578189849854, + "learning_rate": 0.00015989994227438907, + "loss": 0.8943756818771362, + "step": 1048 + }, + { + "epoch": 0.604959630911188, + "grad_norm": 1.0392166376113892, + "learning_rate": 0.00015986145853376949, + "loss": 1.1610779762268066, + "step": 1049 + }, + { + "epoch": 0.6055363321799307, + "grad_norm": 0.7397944331169128, + "learning_rate": 0.0001598229747931499, + "loss": 0.9297494888305664, + "step": 1050 + }, + { + "epoch": 0.6061130334486736, + "grad_norm": 0.7921435832977295, + "learning_rate": 0.00015978449105253032, + "loss": 0.9271104335784912, + "step": 1051 + }, + { + "epoch": 0.6066897347174164, + "grad_norm": 1.0713645219802856, + "learning_rate": 0.0001597460073119107, + "loss": 1.429350733757019, + "step": 1052 + }, + { + "epoch": 0.6072664359861591, + "grad_norm": 0.7312497496604919, + "learning_rate": 0.00015970752357129112, + "loss": 0.9167627096176147, + "step": 1053 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.7499086260795593, + "learning_rate": 0.00015966903983067154, + "loss": 0.7258137464523315, + "step": 1054 + }, + { + "epoch": 0.6084198385236448, + "grad_norm": 0.7300564646720886, + "learning_rate": 0.00015963055609005195, + "loss": 1.058071494102478, + "step": 1055 + }, + { + "epoch": 0.6089965397923875, + "grad_norm": 0.652527928352356, + "learning_rate": 0.00015959207234943237, + "loss": 0.6544615030288696, + "step": 1056 + }, + { + "epoch": 0.6095732410611303, + "grad_norm": 0.7193166613578796, + "learning_rate": 0.00015955358860881278, + "loss": 0.7395502328872681, + "step": 1057 + }, + { + "epoch": 0.6101499423298731, + "grad_norm": 0.7402684092521667, + "learning_rate": 0.0001595151048681932, + "loss": 0.8958665728569031, + "step": 1058 + }, + { + "epoch": 0.610726643598616, + "grad_norm": 1.0471738576889038, + "learning_rate": 0.0001594766211275736, + "loss": 1.383862018585205, + "step": 1059 + }, + { + "epoch": 0.6113033448673587, + "grad_norm": 0.926358699798584, + "learning_rate": 0.000159438137386954, + "loss": 1.3329360485076904, + "step": 1060 + }, + { + "epoch": 0.6118800461361015, + "grad_norm": 1.3576291799545288, + "learning_rate": 0.00015939965364633442, + "loss": 1.4153847694396973, + "step": 1061 + }, + { + "epoch": 0.6124567474048442, + "grad_norm": 1.043614387512207, + "learning_rate": 0.00015936116990571483, + "loss": 1.1355584859848022, + "step": 1062 + }, + { + "epoch": 0.6130334486735871, + "grad_norm": 0.6180047988891602, + "learning_rate": 0.00015932268616509525, + "loss": 0.7877006530761719, + "step": 1063 + }, + { + "epoch": 0.6136101499423299, + "grad_norm": 1.188005805015564, + "learning_rate": 0.00015928420242447566, + "loss": 1.185757040977478, + "step": 1064 + }, + { + "epoch": 0.6141868512110726, + "grad_norm": 0.6937184929847717, + "learning_rate": 0.00015924571868385608, + "loss": 0.8133529424667358, + "step": 1065 + }, + { + "epoch": 0.6147635524798154, + "grad_norm": 0.5152422785758972, + "learning_rate": 0.0001592072349432365, + "loss": 0.6955524682998657, + "step": 1066 + }, + { + "epoch": 0.6153402537485583, + "grad_norm": 0.8295215964317322, + "learning_rate": 0.0001591687512026169, + "loss": 0.9180642366409302, + "step": 1067 + }, + { + "epoch": 0.615916955017301, + "grad_norm": 1.131622314453125, + "learning_rate": 0.0001591302674619973, + "loss": 1.2194663286209106, + "step": 1068 + }, + { + "epoch": 0.6164936562860438, + "grad_norm": 0.744301438331604, + "learning_rate": 0.0001590917837213777, + "loss": 0.9852138161659241, + "step": 1069 + }, + { + "epoch": 0.6170703575547867, + "grad_norm": 0.7841970920562744, + "learning_rate": 0.00015905329998075813, + "loss": 1.302487850189209, + "step": 1070 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 0.6610711216926575, + "learning_rate": 0.00015901481624013854, + "loss": 0.8427870273590088, + "step": 1071 + }, + { + "epoch": 0.6182237600922722, + "grad_norm": 0.9735661745071411, + "learning_rate": 0.00015897633249951896, + "loss": 1.1720025539398193, + "step": 1072 + }, + { + "epoch": 0.618800461361015, + "grad_norm": 0.6673301458358765, + "learning_rate": 0.00015893784875889937, + "loss": 1.0172441005706787, + "step": 1073 + }, + { + "epoch": 0.6193771626297578, + "grad_norm": 1.0327497720718384, + "learning_rate": 0.0001588993650182798, + "loss": 1.168729305267334, + "step": 1074 + }, + { + "epoch": 0.6199538638985006, + "grad_norm": 0.6887943744659424, + "learning_rate": 0.0001588608812776602, + "loss": 0.9284838438034058, + "step": 1075 + }, + { + "epoch": 0.6205305651672434, + "grad_norm": 0.6660910844802856, + "learning_rate": 0.0001588223975370406, + "loss": 1.1769919395446777, + "step": 1076 + }, + { + "epoch": 0.6211072664359861, + "grad_norm": 0.7416674494743347, + "learning_rate": 0.000158783913796421, + "loss": 0.750725269317627, + "step": 1077 + }, + { + "epoch": 0.621683967704729, + "grad_norm": 0.6302111148834229, + "learning_rate": 0.00015874543005580142, + "loss": 0.8207563161849976, + "step": 1078 + }, + { + "epoch": 0.6222606689734718, + "grad_norm": 0.720021665096283, + "learning_rate": 0.00015870694631518184, + "loss": 1.133636474609375, + "step": 1079 + }, + { + "epoch": 0.6228373702422145, + "grad_norm": 0.9188029170036316, + "learning_rate": 0.00015866846257456225, + "loss": 1.5215458869934082, + "step": 1080 + }, + { + "epoch": 0.6234140715109573, + "grad_norm": 0.7337254881858826, + "learning_rate": 0.00015862997883394267, + "loss": 0.9544572830200195, + "step": 1081 + }, + { + "epoch": 0.6239907727797002, + "grad_norm": 1.0431314706802368, + "learning_rate": 0.00015859149509332308, + "loss": 1.0790281295776367, + "step": 1082 + }, + { + "epoch": 0.6245674740484429, + "grad_norm": 0.6344501376152039, + "learning_rate": 0.0001585530113527035, + "loss": 0.9151628017425537, + "step": 1083 + }, + { + "epoch": 0.6251441753171857, + "grad_norm": 1.332190752029419, + "learning_rate": 0.00015851452761208389, + "loss": 1.5466241836547852, + "step": 1084 + }, + { + "epoch": 0.6257208765859285, + "grad_norm": 0.7802074551582336, + "learning_rate": 0.0001584760438714643, + "loss": 1.1575053930282593, + "step": 1085 + }, + { + "epoch": 0.6262975778546713, + "grad_norm": 0.5755362510681152, + "learning_rate": 0.00015843756013084472, + "loss": 0.6923443078994751, + "step": 1086 + }, + { + "epoch": 0.6268742791234141, + "grad_norm": 0.8710469007492065, + "learning_rate": 0.00015839907639022513, + "loss": 1.0893003940582275, + "step": 1087 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.6689137816429138, + "learning_rate": 0.00015836059264960555, + "loss": 0.9777762293815613, + "step": 1088 + }, + { + "epoch": 0.6280276816608996, + "grad_norm": 0.9923802614212036, + "learning_rate": 0.00015832210890898596, + "loss": 1.2578145265579224, + "step": 1089 + }, + { + "epoch": 0.6286043829296425, + "grad_norm": 0.7596067190170288, + "learning_rate": 0.00015828362516836638, + "loss": 1.0804511308670044, + "step": 1090 + }, + { + "epoch": 0.6291810841983853, + "grad_norm": 0.9255754947662354, + "learning_rate": 0.0001582451414277468, + "loss": 1.2536742687225342, + "step": 1091 + }, + { + "epoch": 0.629757785467128, + "grad_norm": 0.6089752912521362, + "learning_rate": 0.00015820665768712718, + "loss": 0.8234043121337891, + "step": 1092 + }, + { + "epoch": 0.6303344867358708, + "grad_norm": 0.8412203192710876, + "learning_rate": 0.0001581681739465076, + "loss": 0.8689320683479309, + "step": 1093 + }, + { + "epoch": 0.6309111880046137, + "grad_norm": 0.6300414204597473, + "learning_rate": 0.000158129690205888, + "loss": 0.8836315274238586, + "step": 1094 + }, + { + "epoch": 0.6314878892733564, + "grad_norm": 0.8622999787330627, + "learning_rate": 0.00015809120646526843, + "loss": 0.8355990648269653, + "step": 1095 + }, + { + "epoch": 0.6320645905420992, + "grad_norm": 1.0277838706970215, + "learning_rate": 0.00015805272272464884, + "loss": 1.0228278636932373, + "step": 1096 + }, + { + "epoch": 0.6326412918108419, + "grad_norm": 0.7297544479370117, + "learning_rate": 0.00015801423898402926, + "loss": 0.9207032918930054, + "step": 1097 + }, + { + "epoch": 0.6332179930795848, + "grad_norm": 0.6923787593841553, + "learning_rate": 0.00015797575524340967, + "loss": 0.8914310932159424, + "step": 1098 + }, + { + "epoch": 0.6337946943483276, + "grad_norm": 0.984605073928833, + "learning_rate": 0.00015793727150279006, + "loss": 1.030419945716858, + "step": 1099 + }, + { + "epoch": 0.6343713956170703, + "grad_norm": 0.7933477759361267, + "learning_rate": 0.00015789878776217048, + "loss": 0.8263508081436157, + "step": 1100 + }, + { + "epoch": 0.6349480968858131, + "grad_norm": 0.6690862774848938, + "learning_rate": 0.0001578603040215509, + "loss": 0.8062323927879333, + "step": 1101 + }, + { + "epoch": 0.635524798154556, + "grad_norm": 1.1080838441848755, + "learning_rate": 0.0001578218202809313, + "loss": 1.0695234537124634, + "step": 1102 + }, + { + "epoch": 0.6361014994232987, + "grad_norm": 0.7373805046081543, + "learning_rate": 0.00015778333654031172, + "loss": 0.7782353162765503, + "step": 1103 + }, + { + "epoch": 0.6366782006920415, + "grad_norm": 0.9623069167137146, + "learning_rate": 0.00015774485279969214, + "loss": 1.299721121788025, + "step": 1104 + }, + { + "epoch": 0.6372549019607843, + "grad_norm": 0.8447510004043579, + "learning_rate": 0.00015770636905907255, + "loss": 0.751670241355896, + "step": 1105 + }, + { + "epoch": 0.6378316032295271, + "grad_norm": 0.7200034260749817, + "learning_rate": 0.00015766788531845297, + "loss": 0.8565016388893127, + "step": 1106 + }, + { + "epoch": 0.6384083044982699, + "grad_norm": 0.791018545627594, + "learning_rate": 0.00015762940157783336, + "loss": 1.014164924621582, + "step": 1107 + }, + { + "epoch": 0.6389850057670127, + "grad_norm": 0.7488639950752258, + "learning_rate": 0.00015759091783721377, + "loss": 0.7353352904319763, + "step": 1108 + }, + { + "epoch": 0.6395617070357554, + "grad_norm": 0.6376444697380066, + "learning_rate": 0.00015755243409659419, + "loss": 0.8452020287513733, + "step": 1109 + }, + { + "epoch": 0.6401384083044983, + "grad_norm": 0.7400408387184143, + "learning_rate": 0.0001575139503559746, + "loss": 0.8612061738967896, + "step": 1110 + }, + { + "epoch": 0.6407151095732411, + "grad_norm": 0.630378007888794, + "learning_rate": 0.00015747546661535502, + "loss": 0.8225241899490356, + "step": 1111 + }, + { + "epoch": 0.6412918108419838, + "grad_norm": 0.7687711715698242, + "learning_rate": 0.00015743698287473543, + "loss": 1.0129132270812988, + "step": 1112 + }, + { + "epoch": 0.6418685121107266, + "grad_norm": 0.8225964903831482, + "learning_rate": 0.00015739849913411585, + "loss": 1.0317823886871338, + "step": 1113 + }, + { + "epoch": 0.6424452133794695, + "grad_norm": 0.8062997460365295, + "learning_rate": 0.00015736001539349626, + "loss": 1.2668901681900024, + "step": 1114 + }, + { + "epoch": 0.6430219146482122, + "grad_norm": 0.7937533855438232, + "learning_rate": 0.00015732153165287665, + "loss": 0.5984291434288025, + "step": 1115 + }, + { + "epoch": 0.643598615916955, + "grad_norm": 0.6556064486503601, + "learning_rate": 0.00015728304791225707, + "loss": 0.6811074018478394, + "step": 1116 + }, + { + "epoch": 0.6441753171856978, + "grad_norm": 0.6815225481987, + "learning_rate": 0.00015724456417163748, + "loss": 0.8315191268920898, + "step": 1117 + }, + { + "epoch": 0.6447520184544406, + "grad_norm": 0.8624749779701233, + "learning_rate": 0.0001572060804310179, + "loss": 1.024225115776062, + "step": 1118 + }, + { + "epoch": 0.6453287197231834, + "grad_norm": 0.9867150187492371, + "learning_rate": 0.0001571675966903983, + "loss": 1.1838812828063965, + "step": 1119 + }, + { + "epoch": 0.6459054209919262, + "grad_norm": 0.9800993204116821, + "learning_rate": 0.00015712911294977873, + "loss": 1.0964932441711426, + "step": 1120 + }, + { + "epoch": 0.6464821222606689, + "grad_norm": 0.6755380034446716, + "learning_rate": 0.00015709062920915914, + "loss": 0.6732958555221558, + "step": 1121 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.6237842440605164, + "learning_rate": 0.00015705214546853956, + "loss": 0.769539475440979, + "step": 1122 + }, + { + "epoch": 0.6476355247981546, + "grad_norm": 0.9327729344367981, + "learning_rate": 0.00015701366172791995, + "loss": 1.2593892812728882, + "step": 1123 + }, + { + "epoch": 0.6482122260668973, + "grad_norm": 0.7165786623954773, + "learning_rate": 0.00015697517798730036, + "loss": 0.8721244931221008, + "step": 1124 + }, + { + "epoch": 0.6487889273356401, + "grad_norm": 0.7718213200569153, + "learning_rate": 0.00015693669424668078, + "loss": 0.9298558235168457, + "step": 1125 + }, + { + "epoch": 0.649365628604383, + "grad_norm": 0.7327983975410461, + "learning_rate": 0.0001568982105060612, + "loss": 0.9947003722190857, + "step": 1126 + }, + { + "epoch": 0.6499423298731257, + "grad_norm": 0.8242558240890503, + "learning_rate": 0.0001568597267654416, + "loss": 1.3076270818710327, + "step": 1127 + }, + { + "epoch": 0.6505190311418685, + "grad_norm": 0.5866062641143799, + "learning_rate": 0.00015682124302482202, + "loss": 0.7161552309989929, + "step": 1128 + }, + { + "epoch": 0.6510957324106112, + "grad_norm": 0.690351665019989, + "learning_rate": 0.00015678275928420244, + "loss": 0.7334930896759033, + "step": 1129 + }, + { + "epoch": 0.6516724336793541, + "grad_norm": 0.7475882172584534, + "learning_rate": 0.00015674427554358285, + "loss": 0.8960260152816772, + "step": 1130 + }, + { + "epoch": 0.6522491349480969, + "grad_norm": 0.7973214983940125, + "learning_rate": 0.00015670579180296324, + "loss": 0.9681750535964966, + "step": 1131 + }, + { + "epoch": 0.6528258362168397, + "grad_norm": 0.7747503519058228, + "learning_rate": 0.00015666730806234366, + "loss": 1.051071047782898, + "step": 1132 + }, + { + "epoch": 0.6534025374855824, + "grad_norm": 0.6149755120277405, + "learning_rate": 0.00015662882432172407, + "loss": 1.0745124816894531, + "step": 1133 + }, + { + "epoch": 0.6539792387543253, + "grad_norm": 0.8245506286621094, + "learning_rate": 0.0001565903405811045, + "loss": 1.3383489847183228, + "step": 1134 + }, + { + "epoch": 0.654555940023068, + "grad_norm": 0.754502534866333, + "learning_rate": 0.0001565518568404849, + "loss": 0.709721028804779, + "step": 1135 + }, + { + "epoch": 0.6551326412918108, + "grad_norm": 0.5991480946540833, + "learning_rate": 0.00015651337309986532, + "loss": 0.6601396203041077, + "step": 1136 + }, + { + "epoch": 0.6557093425605537, + "grad_norm": 0.7160611152648926, + "learning_rate": 0.00015647488935924573, + "loss": 1.244566559791565, + "step": 1137 + }, + { + "epoch": 0.6562860438292965, + "grad_norm": 0.6996898055076599, + "learning_rate": 0.00015643640561862615, + "loss": 0.7976762056350708, + "step": 1138 + }, + { + "epoch": 0.6568627450980392, + "grad_norm": 1.1391624212265015, + "learning_rate": 0.00015639792187800654, + "loss": 1.1150181293487549, + "step": 1139 + }, + { + "epoch": 0.657439446366782, + "grad_norm": 0.6305305361747742, + "learning_rate": 0.00015635943813738695, + "loss": 0.9086626768112183, + "step": 1140 + }, + { + "epoch": 0.6580161476355249, + "grad_norm": 1.1590427160263062, + "learning_rate": 0.00015632095439676737, + "loss": 1.2399204969406128, + "step": 1141 + }, + { + "epoch": 0.6585928489042676, + "grad_norm": 0.6845443844795227, + "learning_rate": 0.00015628247065614778, + "loss": 0.9434126019477844, + "step": 1142 + }, + { + "epoch": 0.6591695501730104, + "grad_norm": 0.8011909127235413, + "learning_rate": 0.0001562439869155282, + "loss": 0.9793667197227478, + "step": 1143 + }, + { + "epoch": 0.6597462514417531, + "grad_norm": 0.7350550293922424, + "learning_rate": 0.0001562055031749086, + "loss": 1.27531099319458, + "step": 1144 + }, + { + "epoch": 0.660322952710496, + "grad_norm": 0.9062415361404419, + "learning_rate": 0.00015616701943428903, + "loss": 0.9977236986160278, + "step": 1145 + }, + { + "epoch": 0.6608996539792388, + "grad_norm": 0.8427753448486328, + "learning_rate": 0.00015612853569366944, + "loss": 1.3097494840621948, + "step": 1146 + }, + { + "epoch": 0.6614763552479815, + "grad_norm": 0.7309291958808899, + "learning_rate": 0.00015609005195304983, + "loss": 1.1841623783111572, + "step": 1147 + }, + { + "epoch": 0.6620530565167243, + "grad_norm": 0.8518312573432922, + "learning_rate": 0.00015605156821243025, + "loss": 1.0959196090698242, + "step": 1148 + }, + { + "epoch": 0.6626297577854672, + "grad_norm": 0.7902095317840576, + "learning_rate": 0.00015601308447181066, + "loss": 1.186163067817688, + "step": 1149 + }, + { + "epoch": 0.6632064590542099, + "grad_norm": 0.8482567071914673, + "learning_rate": 0.00015597460073119108, + "loss": 0.9569811820983887, + "step": 1150 + }, + { + "epoch": 0.6637831603229527, + "grad_norm": 0.5328805446624756, + "learning_rate": 0.0001559361169905715, + "loss": 0.6388610005378723, + "step": 1151 + }, + { + "epoch": 0.6643598615916955, + "grad_norm": 0.6060228943824768, + "learning_rate": 0.0001558976332499519, + "loss": 0.7743721008300781, + "step": 1152 + }, + { + "epoch": 0.6649365628604383, + "grad_norm": 0.615100085735321, + "learning_rate": 0.00015585914950933232, + "loss": 0.8808379769325256, + "step": 1153 + }, + { + "epoch": 0.6655132641291811, + "grad_norm": 1.1238489151000977, + "learning_rate": 0.00015582066576871274, + "loss": 1.2252037525177002, + "step": 1154 + }, + { + "epoch": 0.6660899653979239, + "grad_norm": 0.8212980628013611, + "learning_rate": 0.00015578218202809313, + "loss": 1.0264016389846802, + "step": 1155 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.8575494885444641, + "learning_rate": 0.00015574369828747354, + "loss": 0.9453893899917603, + "step": 1156 + }, + { + "epoch": 0.6672433679354095, + "grad_norm": 0.8559103608131409, + "learning_rate": 0.00015570521454685396, + "loss": 1.01399564743042, + "step": 1157 + }, + { + "epoch": 0.6678200692041523, + "grad_norm": 0.8769490122795105, + "learning_rate": 0.00015566673080623437, + "loss": 1.1861730813980103, + "step": 1158 + }, + { + "epoch": 0.668396770472895, + "grad_norm": 0.5112201571464539, + "learning_rate": 0.0001556282470656148, + "loss": 0.6198689341545105, + "step": 1159 + }, + { + "epoch": 0.6689734717416378, + "grad_norm": 0.6346172094345093, + "learning_rate": 0.0001555897633249952, + "loss": 0.757227897644043, + "step": 1160 + }, + { + "epoch": 0.6695501730103807, + "grad_norm": 0.7918882966041565, + "learning_rate": 0.00015555127958437562, + "loss": 0.7224777936935425, + "step": 1161 + }, + { + "epoch": 0.6701268742791234, + "grad_norm": 0.5124825835227966, + "learning_rate": 0.00015551279584375603, + "loss": 0.7446980476379395, + "step": 1162 + }, + { + "epoch": 0.6707035755478662, + "grad_norm": 0.6950685977935791, + "learning_rate": 0.00015547431210313642, + "loss": 0.8628665804862976, + "step": 1163 + }, + { + "epoch": 0.671280276816609, + "grad_norm": 0.8380517363548279, + "learning_rate": 0.00015543582836251684, + "loss": 1.0211181640625, + "step": 1164 + }, + { + "epoch": 0.6718569780853518, + "grad_norm": 0.732266902923584, + "learning_rate": 0.00015539734462189725, + "loss": 0.7137742042541504, + "step": 1165 + }, + { + "epoch": 0.6724336793540946, + "grad_norm": 0.7325503826141357, + "learning_rate": 0.00015535886088127767, + "loss": 1.0089268684387207, + "step": 1166 + }, + { + "epoch": 0.6730103806228374, + "grad_norm": 0.8091567158699036, + "learning_rate": 0.00015532037714065808, + "loss": 1.0261311531066895, + "step": 1167 + }, + { + "epoch": 0.6735870818915801, + "grad_norm": 0.8078528642654419, + "learning_rate": 0.0001552818934000385, + "loss": 1.0196332931518555, + "step": 1168 + }, + { + "epoch": 0.674163783160323, + "grad_norm": 0.5558749437332153, + "learning_rate": 0.0001552434096594189, + "loss": 0.8882730007171631, + "step": 1169 + }, + { + "epoch": 0.6747404844290658, + "grad_norm": 0.7303665280342102, + "learning_rate": 0.00015520492591879933, + "loss": 0.9657995700836182, + "step": 1170 + }, + { + "epoch": 0.6753171856978085, + "grad_norm": 0.7512165904045105, + "learning_rate": 0.00015516644217817972, + "loss": 1.0741921663284302, + "step": 1171 + }, + { + "epoch": 0.6758938869665513, + "grad_norm": 0.7227686047554016, + "learning_rate": 0.0001549788542868128, + "loss": 1.0935313701629639, + "step": 1172 + }, + { + "epoch": 0.6764705882352942, + "grad_norm": 0.9613728523254395, + "learning_rate": 0.00015494040753556324, + "loss": 1.0458366870880127, + "step": 1173 + }, + { + "epoch": 0.6770472895040369, + "grad_norm": 0.7592456936836243, + "learning_rate": 0.00015490196078431375, + "loss": 1.048318862915039, + "step": 1174 + }, + { + "epoch": 0.6776239907727797, + "grad_norm": 0.6358122229576111, + "learning_rate": 0.0001548635140330642, + "loss": 0.9271713495254517, + "step": 1175 + }, + { + "epoch": 0.6782006920415224, + "grad_norm": 0.6779629588127136, + "learning_rate": 0.0001548250672818147, + "loss": 0.8732894062995911, + "step": 1176 + }, + { + "epoch": 0.6787773933102653, + "grad_norm": 0.7252342700958252, + "learning_rate": 0.00015478662053056518, + "loss": 1.016528606414795, + "step": 1177 + }, + { + "epoch": 0.6793540945790081, + "grad_norm": 0.5252419710159302, + "learning_rate": 0.00015474817377931566, + "loss": 0.6656200885772705, + "step": 1178 + }, + { + "epoch": 0.6799307958477508, + "grad_norm": 0.7480099201202393, + "learning_rate": 0.00015470972702806614, + "loss": 0.9825901389122009, + "step": 1179 + }, + { + "epoch": 0.6805074971164936, + "grad_norm": 0.5403528809547424, + "learning_rate": 0.00015467128027681662, + "loss": 0.8263649344444275, + "step": 1180 + }, + { + "epoch": 0.6810841983852365, + "grad_norm": 0.909685492515564, + "learning_rate": 0.0001546328335255671, + "loss": 1.1039624214172363, + "step": 1181 + }, + { + "epoch": 0.6816608996539792, + "grad_norm": 0.6782054305076599, + "learning_rate": 0.00015459438677431757, + "loss": 0.8667647242546082, + "step": 1182 + }, + { + "epoch": 0.682237600922722, + "grad_norm": 0.9437413811683655, + "learning_rate": 0.00015455594002306805, + "loss": 1.0089085102081299, + "step": 1183 + }, + { + "epoch": 0.6828143021914648, + "grad_norm": 0.631352424621582, + "learning_rate": 0.00015451749327181856, + "loss": 0.8900731801986694, + "step": 1184 + }, + { + "epoch": 0.6833910034602076, + "grad_norm": 0.9895037412643433, + "learning_rate": 0.000154479046520569, + "loss": 1.4409505128860474, + "step": 1185 + }, + { + "epoch": 0.6839677047289504, + "grad_norm": 0.655288815498352, + "learning_rate": 0.00015444059976931951, + "loss": 0.8149420022964478, + "step": 1186 + }, + { + "epoch": 0.6845444059976932, + "grad_norm": 0.906093418598175, + "learning_rate": 0.00015440215301806997, + "loss": 1.440996527671814, + "step": 1187 + }, + { + "epoch": 0.6851211072664359, + "grad_norm": 0.7067789435386658, + "learning_rate": 0.00015436370626682047, + "loss": 0.6415053009986877, + "step": 1188 + }, + { + "epoch": 0.6856978085351788, + "grad_norm": 0.7950546741485596, + "learning_rate": 0.00015432525951557095, + "loss": 1.0555880069732666, + "step": 1189 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.7521815299987793, + "learning_rate": 0.00015428681276432143, + "loss": 1.0289030075073242, + "step": 1190 + }, + { + "epoch": 0.6868512110726643, + "grad_norm": 0.8053890466690063, + "learning_rate": 0.0001542483660130719, + "loss": 1.0104256868362427, + "step": 1191 + }, + { + "epoch": 0.6874279123414071, + "grad_norm": 0.8960652351379395, + "learning_rate": 0.00015420991926182238, + "loss": 1.3124630451202393, + "step": 1192 + }, + { + "epoch": 0.68800461361015, + "grad_norm": 0.6445242762565613, + "learning_rate": 0.00015417147251057286, + "loss": 0.7147958278656006, + "step": 1193 + }, + { + "epoch": 0.6885813148788927, + "grad_norm": 0.8771377801895142, + "learning_rate": 0.00015413302575932334, + "loss": 1.1068731546401978, + "step": 1194 + }, + { + "epoch": 0.6891580161476355, + "grad_norm": 0.746562659740448, + "learning_rate": 0.00015409457900807382, + "loss": 0.8577734231948853, + "step": 1195 + }, + { + "epoch": 0.6897347174163783, + "grad_norm": 0.8225957155227661, + "learning_rate": 0.00015405613225682432, + "loss": 1.137495994567871, + "step": 1196 + }, + { + "epoch": 0.6903114186851211, + "grad_norm": 1.2180874347686768, + "learning_rate": 0.00015401768550557478, + "loss": 1.3055964708328247, + "step": 1197 + }, + { + "epoch": 0.6908881199538639, + "grad_norm": 0.8417837619781494, + "learning_rate": 0.00015397923875432528, + "loss": 0.719217836856842, + "step": 1198 + }, + { + "epoch": 0.6914648212226067, + "grad_norm": 0.5893595218658447, + "learning_rate": 0.00015394079200307573, + "loss": 0.7719886302947998, + "step": 1199 + }, + { + "epoch": 0.6920415224913494, + "grad_norm": 0.6734403371810913, + "learning_rate": 0.00015390234525182624, + "loss": 0.960877537727356, + "step": 1200 + }, + { + "epoch": 0.6926182237600923, + "grad_norm": 0.7350678443908691, + "learning_rate": 0.00015386389850057672, + "loss": 1.039952278137207, + "step": 1201 + }, + { + "epoch": 0.6931949250288351, + "grad_norm": 0.8072929978370667, + "learning_rate": 0.0001538254517493272, + "loss": 0.9792311787605286, + "step": 1202 + }, + { + "epoch": 0.6937716262975778, + "grad_norm": 0.6742820739746094, + "learning_rate": 0.00015378700499807767, + "loss": 0.8704882860183716, + "step": 1203 + }, + { + "epoch": 0.6943483275663207, + "grad_norm": 0.6590847969055176, + "learning_rate": 0.00015374855824682815, + "loss": 0.7836930155754089, + "step": 1204 + }, + { + "epoch": 0.6949250288350635, + "grad_norm": 0.6364882588386536, + "learning_rate": 0.00015371011149557863, + "loss": 0.6790116429328918, + "step": 1205 + }, + { + "epoch": 0.6955017301038062, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0001536716647443291, + "loss": 1.1667858362197876, + "step": 1206 + }, + { + "epoch": 0.696078431372549, + "grad_norm": 0.9262224435806274, + "learning_rate": 0.00015363321799307959, + "loss": 1.2684681415557861, + "step": 1207 + }, + { + "epoch": 0.6966551326412919, + "grad_norm": 0.7098090052604675, + "learning_rate": 0.0001535947712418301, + "loss": 1.108170986175537, + "step": 1208 + }, + { + "epoch": 0.6972318339100346, + "grad_norm": 0.8219681978225708, + "learning_rate": 0.00015355632449058054, + "loss": 1.1987258195877075, + "step": 1209 + }, + { + "epoch": 0.6978085351787774, + "grad_norm": 0.7267138957977295, + "learning_rate": 0.00015351787773933105, + "loss": 0.8790909051895142, + "step": 1210 + }, + { + "epoch": 0.6983852364475202, + "grad_norm": 0.9880861043930054, + "learning_rate": 0.0001534794309880815, + "loss": 0.7550561428070068, + "step": 1211 + }, + { + "epoch": 0.698961937716263, + "grad_norm": 1.0179109573364258, + "learning_rate": 0.000153440984236832, + "loss": 1.2887327671051025, + "step": 1212 + }, + { + "epoch": 0.6995386389850058, + "grad_norm": 1.0065605640411377, + "learning_rate": 0.00015340253748558246, + "loss": 1.3018262386322021, + "step": 1213 + }, + { + "epoch": 0.7001153402537486, + "grad_norm": 0.7868698835372925, + "learning_rate": 0.00015336409073433296, + "loss": 1.0050418376922607, + "step": 1214 + }, + { + "epoch": 0.7006920415224913, + "grad_norm": 1.2052333354949951, + "learning_rate": 0.00015332564398308344, + "loss": 1.4229861497879028, + "step": 1215 + }, + { + "epoch": 0.7012687427912342, + "grad_norm": 0.7077322006225586, + "learning_rate": 0.00015328719723183392, + "loss": 0.6043359041213989, + "step": 1216 + }, + { + "epoch": 0.701845444059977, + "grad_norm": 0.587632417678833, + "learning_rate": 0.0001532487504805844, + "loss": 0.6483091115951538, + "step": 1217 + }, + { + "epoch": 0.7024221453287197, + "grad_norm": 0.5759986042976379, + "learning_rate": 0.00015321030372933487, + "loss": 0.8392894864082336, + "step": 1218 + }, + { + "epoch": 0.7029988465974625, + "grad_norm": 0.6800678372383118, + "learning_rate": 0.00015317185697808535, + "loss": 0.8921798467636108, + "step": 1219 + }, + { + "epoch": 0.7035755478662054, + "grad_norm": 0.7683438658714294, + "learning_rate": 0.00015313341022683586, + "loss": 0.9112846851348877, + "step": 1220 + }, + { + "epoch": 0.7041522491349481, + "grad_norm": 1.0117342472076416, + "learning_rate": 0.0001530949634755863, + "loss": 1.4151829481124878, + "step": 1221 + }, + { + "epoch": 0.7047289504036909, + "grad_norm": 0.889950156211853, + "learning_rate": 0.00015305651672433681, + "loss": 1.190742015838623, + "step": 1222 + }, + { + "epoch": 0.7053056516724336, + "grad_norm": 0.7858697772026062, + "learning_rate": 0.00015301806997308727, + "loss": 1.0679411888122559, + "step": 1223 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.894363522529602, + "learning_rate": 0.00015297962322183777, + "loss": 1.1472891569137573, + "step": 1224 + }, + { + "epoch": 0.7064590542099193, + "grad_norm": 0.7669128775596619, + "learning_rate": 0.00015294117647058822, + "loss": 1.1536177396774292, + "step": 1225 + }, + { + "epoch": 0.707035755478662, + "grad_norm": 0.6551662683486938, + "learning_rate": 0.00015290272971933873, + "loss": 1.1004867553710938, + "step": 1226 + }, + { + "epoch": 0.7076124567474048, + "grad_norm": 1.0020555257797241, + "learning_rate": 0.0001528642829680892, + "loss": 1.2485133409500122, + "step": 1227 + }, + { + "epoch": 0.7081891580161477, + "grad_norm": 0.725662887096405, + "learning_rate": 0.00015282583621683968, + "loss": 0.8090496063232422, + "step": 1228 + }, + { + "epoch": 0.7087658592848904, + "grad_norm": 0.8500173091888428, + "learning_rate": 0.00015278738946559016, + "loss": 1.1222527027130127, + "step": 1229 + }, + { + "epoch": 0.7093425605536332, + "grad_norm": 0.7580368518829346, + "learning_rate": 0.00015274894271434064, + "loss": 0.8194168210029602, + "step": 1230 + }, + { + "epoch": 0.709919261822376, + "grad_norm": 0.936622679233551, + "learning_rate": 0.00015271049596309112, + "loss": 0.9981272220611572, + "step": 1231 + }, + { + "epoch": 0.7104959630911188, + "grad_norm": 0.8283603191375732, + "learning_rate": 0.00015267204921184162, + "loss": 0.9328891634941101, + "step": 1232 + }, + { + "epoch": 0.7110726643598616, + "grad_norm": 1.0028311014175415, + "learning_rate": 0.00015263360246059208, + "loss": 0.9482144117355347, + "step": 1233 + }, + { + "epoch": 0.7116493656286044, + "grad_norm": 1.1841291189193726, + "learning_rate": 0.00015259515570934258, + "loss": 1.4021642208099365, + "step": 1234 + }, + { + "epoch": 0.7122260668973471, + "grad_norm": 1.0274176597595215, + "learning_rate": 0.00015255670895809303, + "loss": 1.1408722400665283, + "step": 1235 + }, + { + "epoch": 0.71280276816609, + "grad_norm": 0.8339233994483948, + "learning_rate": 0.00015251826220684354, + "loss": 1.2026294469833374, + "step": 1236 + }, + { + "epoch": 0.7133794694348328, + "grad_norm": 0.8232172727584839, + "learning_rate": 0.000152479815455594, + "loss": 1.0658057928085327, + "step": 1237 + }, + { + "epoch": 0.7139561707035755, + "grad_norm": 0.6768394708633423, + "learning_rate": 0.0001524413687043445, + "loss": 0.7539021968841553, + "step": 1238 + }, + { + "epoch": 0.7145328719723183, + "grad_norm": 1.0153294801712036, + "learning_rate": 0.00015240292195309497, + "loss": 1.1792476177215576, + "step": 1239 + }, + { + "epoch": 0.7151095732410612, + "grad_norm": 1.2099579572677612, + "learning_rate": 0.00015236447520184545, + "loss": 1.482499599456787, + "step": 1240 + }, + { + "epoch": 0.7156862745098039, + "grad_norm": 0.5826729536056519, + "learning_rate": 0.00015232602845059593, + "loss": 0.7845430374145508, + "step": 1241 + }, + { + "epoch": 0.7162629757785467, + "grad_norm": 0.7632762789726257, + "learning_rate": 0.0001522875816993464, + "loss": 0.8908877968788147, + "step": 1242 + }, + { + "epoch": 0.7168396770472895, + "grad_norm": 0.835464358329773, + "learning_rate": 0.00015224913494809689, + "loss": 1.0795903205871582, + "step": 1243 + }, + { + "epoch": 0.7174163783160323, + "grad_norm": 0.998972475528717, + "learning_rate": 0.0001522106881968474, + "loss": 0.9715967178344727, + "step": 1244 + }, + { + "epoch": 0.7179930795847751, + "grad_norm": 0.5176213383674622, + "learning_rate": 0.00015217224144559784, + "loss": 0.7307795286178589, + "step": 1245 + }, + { + "epoch": 0.7185697808535179, + "grad_norm": 1.0009640455245972, + "learning_rate": 0.00015213379469434835, + "loss": 1.253312587738037, + "step": 1246 + }, + { + "epoch": 0.7191464821222606, + "grad_norm": 1.1499648094177246, + "learning_rate": 0.0001520953479430988, + "loss": 1.2523915767669678, + "step": 1247 + }, + { + "epoch": 0.7197231833910035, + "grad_norm": 0.9233465790748596, + "learning_rate": 0.0001520569011918493, + "loss": 1.025418996810913, + "step": 1248 + }, + { + "epoch": 0.7202998846597463, + "grad_norm": 0.5469316840171814, + "learning_rate": 0.00015201845444059975, + "loss": 0.6671372652053833, + "step": 1249 + }, + { + "epoch": 0.720876585928489, + "grad_norm": 0.7743379473686218, + "learning_rate": 0.00015198000768935026, + "loss": 1.2212378978729248, + "step": 1250 + }, + { + "epoch": 0.7214532871972318, + "grad_norm": 0.971682608127594, + "learning_rate": 0.00015194156093810074, + "loss": 1.2435131072998047, + "step": 1251 + }, + { + "epoch": 0.7220299884659747, + "grad_norm": 0.9899376630783081, + "learning_rate": 0.00015190311418685122, + "loss": 1.2595231533050537, + "step": 1252 + }, + { + "epoch": 0.7226066897347174, + "grad_norm": 0.8441123962402344, + "learning_rate": 0.0001518646674356017, + "loss": 0.9278808832168579, + "step": 1253 + }, + { + "epoch": 0.7231833910034602, + "grad_norm": 0.5254001021385193, + "learning_rate": 0.00015182622068435217, + "loss": 0.786496102809906, + "step": 1254 + }, + { + "epoch": 0.723760092272203, + "grad_norm": 0.9715943932533264, + "learning_rate": 0.00015178777393310265, + "loss": 0.9957152605056763, + "step": 1255 + }, + { + "epoch": 0.7243367935409458, + "grad_norm": 0.9919838905334473, + "learning_rate": 0.00015174932718185316, + "loss": 1.3595893383026123, + "step": 1256 + }, + { + "epoch": 0.7249134948096886, + "grad_norm": 0.7739357352256775, + "learning_rate": 0.0001517108804306036, + "loss": 0.7901654839515686, + "step": 1257 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.996926486492157, + "learning_rate": 0.00015167243367935411, + "loss": 1.0908658504486084, + "step": 1258 + }, + { + "epoch": 0.7260668973471741, + "grad_norm": 0.6757825016975403, + "learning_rate": 0.00015163398692810456, + "loss": 0.7795881032943726, + "step": 1259 + }, + { + "epoch": 0.726643598615917, + "grad_norm": 0.9458150863647461, + "learning_rate": 0.00015159554017685507, + "loss": 1.0505211353302002, + "step": 1260 + }, + { + "epoch": 0.7272202998846597, + "grad_norm": 0.8086127638816833, + "learning_rate": 0.00015155709342560552, + "loss": 0.9041070938110352, + "step": 1261 + }, + { + "epoch": 0.7277970011534025, + "grad_norm": 0.6491602659225464, + "learning_rate": 0.00015151864667435603, + "loss": 0.9067816734313965, + "step": 1262 + }, + { + "epoch": 0.7283737024221453, + "grad_norm": 0.5835777521133423, + "learning_rate": 0.0001514801999231065, + "loss": 0.7853602170944214, + "step": 1263 + }, + { + "epoch": 0.7289504036908881, + "grad_norm": 0.8881536722183228, + "learning_rate": 0.00015144175317185698, + "loss": 1.2767361402511597, + "step": 1264 + }, + { + "epoch": 0.7295271049596309, + "grad_norm": 0.6160046458244324, + "learning_rate": 0.00015140330642060746, + "loss": 0.7595696449279785, + "step": 1265 + }, + { + "epoch": 0.7301038062283737, + "grad_norm": 0.7877328991889954, + "learning_rate": 0.00015136485966935794, + "loss": 0.9727606773376465, + "step": 1266 + }, + { + "epoch": 0.7306805074971164, + "grad_norm": 0.6233464479446411, + "learning_rate": 0.00015132641291810842, + "loss": 0.6097822785377502, + "step": 1267 + }, + { + "epoch": 0.7312572087658593, + "grad_norm": 0.8846599459648132, + "learning_rate": 0.00015128796616685892, + "loss": 1.314606785774231, + "step": 1268 + }, + { + "epoch": 0.7318339100346021, + "grad_norm": 0.6752328872680664, + "learning_rate": 0.00015124951941560937, + "loss": 0.9257625341415405, + "step": 1269 + }, + { + "epoch": 0.7324106113033448, + "grad_norm": 0.6147440075874329, + "learning_rate": 0.00015121107266435988, + "loss": 0.7304266691207886, + "step": 1270 + }, + { + "epoch": 0.7329873125720877, + "grad_norm": 0.8625065088272095, + "learning_rate": 0.00015117262591311033, + "loss": 1.2385823726654053, + "step": 1271 + }, + { + "epoch": 0.7335640138408305, + "grad_norm": 0.6224170923233032, + "learning_rate": 0.00015113417916186084, + "loss": 0.7687395215034485, + "step": 1272 + }, + { + "epoch": 0.7341407151095732, + "grad_norm": 0.839799165725708, + "learning_rate": 0.0001510957324106113, + "loss": 1.0231621265411377, + "step": 1273 + }, + { + "epoch": 0.734717416378316, + "grad_norm": 0.8609519600868225, + "learning_rate": 0.0001510572856593618, + "loss": 1.1030302047729492, + "step": 1274 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 0.8059080243110657, + "learning_rate": 0.00015101883890811227, + "loss": 1.307667851448059, + "step": 1275 + }, + { + "epoch": 0.7358708189158016, + "grad_norm": 0.7881230115890503, + "learning_rate": 0.00015098039215686275, + "loss": 0.8685023784637451, + "step": 1276 + }, + { + "epoch": 0.7364475201845444, + "grad_norm": 0.6535466909408569, + "learning_rate": 0.00015094194540561323, + "loss": 0.8849316835403442, + "step": 1277 + }, + { + "epoch": 0.7370242214532872, + "grad_norm": 0.664448082447052, + "learning_rate": 0.0001509034986543637, + "loss": 0.809040904045105, + "step": 1278 + }, + { + "epoch": 0.73760092272203, + "grad_norm": 0.9526609182357788, + "learning_rate": 0.00015086505190311418, + "loss": 1.2887682914733887, + "step": 1279 + }, + { + "epoch": 0.7381776239907728, + "grad_norm": 0.8947210907936096, + "learning_rate": 0.00015082660515186466, + "loss": 1.0613007545471191, + "step": 1280 + }, + { + "epoch": 0.7387543252595156, + "grad_norm": 0.9127343893051147, + "learning_rate": 0.00015078815840061514, + "loss": 0.9401702284812927, + "step": 1281 + }, + { + "epoch": 0.7393310265282583, + "grad_norm": 1.0288292169570923, + "learning_rate": 0.00015074971164936565, + "loss": 1.2102299928665161, + "step": 1282 + }, + { + "epoch": 0.7399077277970012, + "grad_norm": 0.6608892679214478, + "learning_rate": 0.0001507112648981161, + "loss": 0.7817317247390747, + "step": 1283 + }, + { + "epoch": 0.740484429065744, + "grad_norm": 0.5857222080230713, + "learning_rate": 0.0001506728181468666, + "loss": 0.7468012571334839, + "step": 1284 + }, + { + "epoch": 0.7410611303344867, + "grad_norm": 0.6499783992767334, + "learning_rate": 0.00015063437139561708, + "loss": 0.7113574147224426, + "step": 1285 + }, + { + "epoch": 0.7416378316032295, + "grad_norm": 0.718450129032135, + "learning_rate": 0.00015059592464436756, + "loss": 0.9823046326637268, + "step": 1286 + }, + { + "epoch": 0.7422145328719724, + "grad_norm": 0.7987701296806335, + "learning_rate": 0.00015055747789311804, + "loss": 0.9410796761512756, + "step": 1287 + }, + { + "epoch": 0.7427912341407151, + "grad_norm": 0.7227610349655151, + "learning_rate": 0.00015051903114186852, + "loss": 0.7366760969161987, + "step": 1288 + }, + { + "epoch": 0.7433679354094579, + "grad_norm": 0.9411056637763977, + "learning_rate": 0.000150480584390619, + "loss": 0.9475510120391846, + "step": 1289 + }, + { + "epoch": 0.7439446366782007, + "grad_norm": 0.5987991690635681, + "learning_rate": 0.00015044213763936947, + "loss": 0.8084846138954163, + "step": 1290 + }, + { + "epoch": 0.7445213379469435, + "grad_norm": 0.6214851140975952, + "learning_rate": 0.00015040369088811995, + "loss": 0.6952444911003113, + "step": 1291 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.7398913502693176, + "learning_rate": 0.00015036524413687043, + "loss": 0.8432753086090088, + "step": 1292 + }, + { + "epoch": 0.745674740484429, + "grad_norm": 0.8513553142547607, + "learning_rate": 0.0001503267973856209, + "loss": 0.8751744627952576, + "step": 1293 + }, + { + "epoch": 0.7462514417531718, + "grad_norm": 0.7704481482505798, + "learning_rate": 0.0001502883506343714, + "loss": 0.9727562665939331, + "step": 1294 + }, + { + "epoch": 0.7468281430219147, + "grad_norm": 0.6925477385520935, + "learning_rate": 0.0001502499038831219, + "loss": 1.044316291809082, + "step": 1295 + }, + { + "epoch": 0.7474048442906575, + "grad_norm": 0.8089653253555298, + "learning_rate": 0.00015021145713187237, + "loss": 0.9385859966278076, + "step": 1296 + }, + { + "epoch": 0.7479815455594002, + "grad_norm": 0.8045443296432495, + "learning_rate": 0.00015017301038062285, + "loss": 1.093725562095642, + "step": 1297 + }, + { + "epoch": 0.748558246828143, + "grad_norm": 0.8403393626213074, + "learning_rate": 0.00015013456362937333, + "loss": 0.7081382870674133, + "step": 1298 + }, + { + "epoch": 0.7491349480968859, + "grad_norm": 0.8455471992492676, + "learning_rate": 0.0001500961168781238, + "loss": 1.2357611656188965, + "step": 1299 + }, + { + "epoch": 0.7497116493656286, + "grad_norm": 0.8819023966789246, + "learning_rate": 0.00015005767012687428, + "loss": 1.2907012701034546, + "step": 1300 + }, + { + "epoch": 0.7502883506343714, + "grad_norm": 0.6467103362083435, + "learning_rate": 0.00015001922337562476, + "loss": 0.7991781830787659, + "step": 1301 + }, + { + "epoch": 0.7508650519031141, + "grad_norm": 1.0841728448867798, + "learning_rate": 0.00014998077662437524, + "loss": 1.156419038772583, + "step": 1302 + }, + { + "epoch": 0.751441753171857, + "grad_norm": 0.4863538146018982, + "learning_rate": 0.00014994232987312572, + "loss": 0.5481974482536316, + "step": 1303 + }, + { + "epoch": 0.7520184544405998, + "grad_norm": 0.631119966506958, + "learning_rate": 0.0001499038831218762, + "loss": 0.7421573996543884, + "step": 1304 + }, + { + "epoch": 0.7525951557093425, + "grad_norm": 0.6919093728065491, + "learning_rate": 0.0001498654363706267, + "loss": 0.6554936170578003, + "step": 1305 + }, + { + "epoch": 0.7531718569780853, + "grad_norm": 0.7746281027793884, + "learning_rate": 0.00014982698961937718, + "loss": 0.9226951599121094, + "step": 1306 + }, + { + "epoch": 0.7537485582468282, + "grad_norm": 0.821020245552063, + "learning_rate": 0.00014978854286812766, + "loss": 1.2231357097625732, + "step": 1307 + }, + { + "epoch": 0.754325259515571, + "grad_norm": 0.6167652606964111, + "learning_rate": 0.00014975009611687814, + "loss": 0.9597879648208618, + "step": 1308 + }, + { + "epoch": 0.7549019607843137, + "grad_norm": 0.6786548495292664, + "learning_rate": 0.00014971164936562861, + "loss": 0.8253003358840942, + "step": 1309 + }, + { + "epoch": 0.7554786620530565, + "grad_norm": 0.9683876037597656, + "learning_rate": 0.0001496732026143791, + "loss": 1.1294584274291992, + "step": 1310 + }, + { + "epoch": 0.7560553633217993, + "grad_norm": 0.8556981086730957, + "learning_rate": 0.00014963475586312957, + "loss": 1.009643316268921, + "step": 1311 + }, + { + "epoch": 0.7566320645905421, + "grad_norm": 0.7639108896255493, + "learning_rate": 0.00014959630911188005, + "loss": 0.8871880769729614, + "step": 1312 + }, + { + "epoch": 0.7572087658592849, + "grad_norm": 0.9662507176399231, + "learning_rate": 0.00014955786236063053, + "loss": 1.2890512943267822, + "step": 1313 + }, + { + "epoch": 0.7577854671280276, + "grad_norm": 0.7260032892227173, + "learning_rate": 0.000149519415609381, + "loss": 1.2696185111999512, + "step": 1314 + }, + { + "epoch": 0.7583621683967705, + "grad_norm": 1.0413408279418945, + "learning_rate": 0.0001494809688581315, + "loss": 1.2239567041397095, + "step": 1315 + }, + { + "epoch": 0.7589388696655133, + "grad_norm": 0.9003005623817444, + "learning_rate": 0.00014944252210688196, + "loss": 1.248561143875122, + "step": 1316 + }, + { + "epoch": 0.759515570934256, + "grad_norm": 0.9604087471961975, + "learning_rate": 0.00014940407535563247, + "loss": 1.2369884252548218, + "step": 1317 + }, + { + "epoch": 0.7600922722029988, + "grad_norm": 0.7198401093482971, + "learning_rate": 0.00014936562860438295, + "loss": 0.743487536907196, + "step": 1318 + }, + { + "epoch": 0.7606689734717417, + "grad_norm": 0.7526591420173645, + "learning_rate": 0.00014932718185313342, + "loss": 0.7714953422546387, + "step": 1319 + }, + { + "epoch": 0.7612456747404844, + "grad_norm": 1.1336771249771118, + "learning_rate": 0.0001492887351018839, + "loss": 1.1577683687210083, + "step": 1320 + }, + { + "epoch": 0.7618223760092272, + "grad_norm": 0.7607272267341614, + "learning_rate": 0.00014925028835063438, + "loss": 0.903020977973938, + "step": 1321 + }, + { + "epoch": 0.76239907727797, + "grad_norm": 0.7855517268180847, + "learning_rate": 0.00014921184159938486, + "loss": 0.9421197772026062, + "step": 1322 + }, + { + "epoch": 0.7629757785467128, + "grad_norm": 0.9380967020988464, + "learning_rate": 0.00014917339484813534, + "loss": 1.0594120025634766, + "step": 1323 + }, + { + "epoch": 0.7635524798154556, + "grad_norm": 0.9255303740501404, + "learning_rate": 0.00014913494809688582, + "loss": 1.1912791728973389, + "step": 1324 + }, + { + "epoch": 0.7641291810841984, + "grad_norm": 0.7085497379302979, + "learning_rate": 0.00014909650134563632, + "loss": 0.7702199816703796, + "step": 1325 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.8080468773841858, + "learning_rate": 0.00014905805459438677, + "loss": 0.9640858769416809, + "step": 1326 + }, + { + "epoch": 0.765282583621684, + "grad_norm": 0.8854598999023438, + "learning_rate": 0.00014901960784313728, + "loss": 1.0912519693374634, + "step": 1327 + }, + { + "epoch": 0.7658592848904268, + "grad_norm": 1.158070683479309, + "learning_rate": 0.00014898116109188773, + "loss": 1.259207010269165, + "step": 1328 + }, + { + "epoch": 0.7664359861591695, + "grad_norm": 0.7163742780685425, + "learning_rate": 0.00014894271434063823, + "loss": 0.9091912508010864, + "step": 1329 + }, + { + "epoch": 0.7670126874279123, + "grad_norm": 0.6578546762466431, + "learning_rate": 0.0001489042675893887, + "loss": 1.13603937625885, + "step": 1330 + }, + { + "epoch": 0.7675893886966552, + "grad_norm": 0.641118586063385, + "learning_rate": 0.0001488658208381392, + "loss": 0.6926564574241638, + "step": 1331 + }, + { + "epoch": 0.7681660899653979, + "grad_norm": 1.3342225551605225, + "learning_rate": 0.00014882737408688967, + "loss": 1.1259536743164062, + "step": 1332 + }, + { + "epoch": 0.7687427912341407, + "grad_norm": 0.6777533292770386, + "learning_rate": 0.00014878892733564015, + "loss": 0.8380722403526306, + "step": 1333 + }, + { + "epoch": 0.7693194925028836, + "grad_norm": 0.5475529432296753, + "learning_rate": 0.00014875048058439063, + "loss": 0.7194100618362427, + "step": 1334 + }, + { + "epoch": 0.7698961937716263, + "grad_norm": 0.7109413743019104, + "learning_rate": 0.0001487120338331411, + "loss": 0.7877069711685181, + "step": 1335 + }, + { + "epoch": 0.7704728950403691, + "grad_norm": 0.5451337099075317, + "learning_rate": 0.00014867358708189158, + "loss": 0.7354110479354858, + "step": 1336 + }, + { + "epoch": 0.7710495963091119, + "grad_norm": 0.7789444327354431, + "learning_rate": 0.0001486351403306421, + "loss": 0.9675291776657104, + "step": 1337 + }, + { + "epoch": 0.7716262975778547, + "grad_norm": 0.7246870398521423, + "learning_rate": 0.00014859669357939254, + "loss": 0.9592723846435547, + "step": 1338 + }, + { + "epoch": 0.7722029988465975, + "grad_norm": 0.7461789846420288, + "learning_rate": 0.00014855824682814304, + "loss": 1.062403678894043, + "step": 1339 + }, + { + "epoch": 0.7727797001153403, + "grad_norm": 0.6598569750785828, + "learning_rate": 0.0001485198000768935, + "loss": 0.959195077419281, + "step": 1340 + }, + { + "epoch": 0.773356401384083, + "grad_norm": 0.8688694834709167, + "learning_rate": 0.000148481353325644, + "loss": 1.3393487930297852, + "step": 1341 + }, + { + "epoch": 0.7739331026528259, + "grad_norm": 0.7083797454833984, + "learning_rate": 0.00014844290657439448, + "loss": 0.9515122175216675, + "step": 1342 + }, + { + "epoch": 0.7745098039215687, + "grad_norm": 0.7261124849319458, + "learning_rate": 0.00014840445982314496, + "loss": 1.048977017402649, + "step": 1343 + }, + { + "epoch": 0.7750865051903114, + "grad_norm": 0.9450129270553589, + "learning_rate": 0.00014836601307189544, + "loss": 1.1335430145263672, + "step": 1344 + }, + { + "epoch": 0.7756632064590542, + "grad_norm": 0.47535234689712524, + "learning_rate": 0.00014832756632064591, + "loss": 0.6887091398239136, + "step": 1345 + }, + { + "epoch": 0.776239907727797, + "grad_norm": 0.714235782623291, + "learning_rate": 0.0001482891195693964, + "loss": 0.9414650201797485, + "step": 1346 + }, + { + "epoch": 0.7768166089965398, + "grad_norm": 0.6094812750816345, + "learning_rate": 0.00014825067281814687, + "loss": 0.8214763402938843, + "step": 1347 + }, + { + "epoch": 0.7773933102652826, + "grad_norm": 0.7122801542282104, + "learning_rate": 0.00014821222606689735, + "loss": 0.9144871830940247, + "step": 1348 + }, + { + "epoch": 0.7779700115340253, + "grad_norm": 0.8147172927856445, + "learning_rate": 0.00014817377931564785, + "loss": 1.1212399005889893, + "step": 1349 + }, + { + "epoch": 0.7785467128027682, + "grad_norm": 0.5866456627845764, + "learning_rate": 0.0001481353325643983, + "loss": 0.6841553449630737, + "step": 1350 + }, + { + "epoch": 0.779123414071511, + "grad_norm": 1.2120155096054077, + "learning_rate": 0.0001480968858131488, + "loss": 1.1782194375991821, + "step": 1351 + }, + { + "epoch": 0.7797001153402537, + "grad_norm": 0.8661918640136719, + "learning_rate": 0.00014805843906189926, + "loss": 1.1883846521377563, + "step": 1352 + }, + { + "epoch": 0.7802768166089965, + "grad_norm": 1.2335827350616455, + "learning_rate": 0.00014801999231064977, + "loss": 1.199598789215088, + "step": 1353 + }, + { + "epoch": 0.7808535178777394, + "grad_norm": 0.8413060307502747, + "learning_rate": 0.00014798154555940025, + "loss": 1.0878143310546875, + "step": 1354 + }, + { + "epoch": 0.7814302191464821, + "grad_norm": 1.042397379875183, + "learning_rate": 0.00014794309880815072, + "loss": 1.5179508924484253, + "step": 1355 + }, + { + "epoch": 0.7820069204152249, + "grad_norm": 1.2029002904891968, + "learning_rate": 0.0001479046520569012, + "loss": 1.361120343208313, + "step": 1356 + }, + { + "epoch": 0.7825836216839677, + "grad_norm": 0.9056934714317322, + "learning_rate": 0.00014786620530565168, + "loss": 1.0812435150146484, + "step": 1357 + }, + { + "epoch": 0.7831603229527105, + "grad_norm": 0.7730829119682312, + "learning_rate": 0.00014782775855440216, + "loss": 1.0833256244659424, + "step": 1358 + }, + { + "epoch": 0.7837370242214533, + "grad_norm": 0.8789440393447876, + "learning_rate": 0.00014778931180315264, + "loss": 1.0179883241653442, + "step": 1359 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.775190532207489, + "learning_rate": 0.00014775086505190312, + "loss": 1.0584783554077148, + "step": 1360 + }, + { + "epoch": 0.7848904267589388, + "grad_norm": 0.7954389452934265, + "learning_rate": 0.00014771241830065362, + "loss": 1.1697866916656494, + "step": 1361 + }, + { + "epoch": 0.7854671280276817, + "grad_norm": 0.8194144368171692, + "learning_rate": 0.00014767397154940407, + "loss": 0.9788481593132019, + "step": 1362 + }, + { + "epoch": 0.7860438292964245, + "grad_norm": 0.7247309684753418, + "learning_rate": 0.00014763552479815458, + "loss": 0.9953986406326294, + "step": 1363 + }, + { + "epoch": 0.7866205305651672, + "grad_norm": 0.8735687136650085, + "learning_rate": 0.00014759707804690503, + "loss": 1.108184576034546, + "step": 1364 + }, + { + "epoch": 0.78719723183391, + "grad_norm": 0.8578454256057739, + "learning_rate": 0.00014755863129565553, + "loss": 1.0608623027801514, + "step": 1365 + }, + { + "epoch": 0.7877739331026529, + "grad_norm": 1.038670301437378, + "learning_rate": 0.000147520184544406, + "loss": 1.2398217916488647, + "step": 1366 + }, + { + "epoch": 0.7883506343713956, + "grad_norm": 0.832326352596283, + "learning_rate": 0.0001474817377931565, + "loss": 1.5559954643249512, + "step": 1367 + }, + { + "epoch": 0.7889273356401384, + "grad_norm": 0.5325842499732971, + "learning_rate": 0.00014744329104190697, + "loss": 0.6711868047714233, + "step": 1368 + }, + { + "epoch": 0.7895040369088812, + "grad_norm": 0.6845494508743286, + "learning_rate": 0.00014740484429065745, + "loss": 0.9054516553878784, + "step": 1369 + }, + { + "epoch": 0.790080738177624, + "grad_norm": 0.8053160309791565, + "learning_rate": 0.00014736639753940793, + "loss": 1.1551737785339355, + "step": 1370 + }, + { + "epoch": 0.7906574394463668, + "grad_norm": 0.9268645644187927, + "learning_rate": 0.0001473279507881584, + "loss": 0.9230217933654785, + "step": 1371 + }, + { + "epoch": 0.7912341407151096, + "grad_norm": 1.0553678274154663, + "learning_rate": 0.00014728950403690888, + "loss": 1.2223023176193237, + "step": 1372 + }, + { + "epoch": 0.7918108419838523, + "grad_norm": 0.6177469491958618, + "learning_rate": 0.0001472510572856594, + "loss": 0.8992686867713928, + "step": 1373 + }, + { + "epoch": 0.7923875432525952, + "grad_norm": 1.138965368270874, + "learning_rate": 0.00014721261053440984, + "loss": 0.8630029559135437, + "step": 1374 + }, + { + "epoch": 0.792964244521338, + "grad_norm": 0.5512900948524475, + "learning_rate": 0.00014717416378316034, + "loss": 0.8302984237670898, + "step": 1375 + }, + { + "epoch": 0.7935409457900807, + "grad_norm": 0.6091440916061401, + "learning_rate": 0.0001471357170319108, + "loss": 0.7380212545394897, + "step": 1376 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 0.909902811050415, + "learning_rate": 0.0001470972702806613, + "loss": 1.0644478797912598, + "step": 1377 + }, + { + "epoch": 0.7946943483275664, + "grad_norm": 0.9841009378433228, + "learning_rate": 0.00014705882352941178, + "loss": 1.5122861862182617, + "step": 1378 + }, + { + "epoch": 0.7952710495963091, + "grad_norm": 0.7682785391807556, + "learning_rate": 0.00014702037677816226, + "loss": 0.8122522830963135, + "step": 1379 + }, + { + "epoch": 0.7958477508650519, + "grad_norm": 0.8022129535675049, + "learning_rate": 0.00014698193002691274, + "loss": 0.7516300678253174, + "step": 1380 + }, + { + "epoch": 0.7964244521337946, + "grad_norm": 0.8423136472702026, + "learning_rate": 0.00014694348327566321, + "loss": 0.9571545124053955, + "step": 1381 + }, + { + "epoch": 0.7970011534025375, + "grad_norm": 0.61954665184021, + "learning_rate": 0.0001469050365244137, + "loss": 0.8543866872787476, + "step": 1382 + }, + { + "epoch": 0.7975778546712803, + "grad_norm": 0.5888648629188538, + "learning_rate": 0.00014686658977316417, + "loss": 0.6958523988723755, + "step": 1383 + }, + { + "epoch": 0.798154555940023, + "grad_norm": 0.9419842958450317, + "learning_rate": 0.00014682814302191465, + "loss": 1.3051813840866089, + "step": 1384 + }, + { + "epoch": 0.7987312572087658, + "grad_norm": 1.1472746133804321, + "learning_rate": 0.00014678969627066515, + "loss": 1.284635305404663, + "step": 1385 + }, + { + "epoch": 0.7993079584775087, + "grad_norm": 0.5858578681945801, + "learning_rate": 0.0001467512495194156, + "loss": 0.7809937596321106, + "step": 1386 + }, + { + "epoch": 0.7998846597462514, + "grad_norm": 0.7086213231086731, + "learning_rate": 0.0001467128027681661, + "loss": 0.6571354269981384, + "step": 1387 + }, + { + "epoch": 0.8004613610149942, + "grad_norm": 0.8438594341278076, + "learning_rate": 0.00014667435601691656, + "loss": 0.9461796283721924, + "step": 1388 + }, + { + "epoch": 0.801038062283737, + "grad_norm": 0.6701700687408447, + "learning_rate": 0.00014663590926566707, + "loss": 0.7518469095230103, + "step": 1389 + }, + { + "epoch": 0.8016147635524798, + "grad_norm": 0.7239779233932495, + "learning_rate": 0.00014659746251441755, + "loss": 0.98681640625, + "step": 1390 + }, + { + "epoch": 0.8021914648212226, + "grad_norm": 0.9055145382881165, + "learning_rate": 0.00014655901576316802, + "loss": 1.038681983947754, + "step": 1391 + }, + { + "epoch": 0.8027681660899654, + "grad_norm": 0.674439013004303, + "learning_rate": 0.0001465205690119185, + "loss": 0.7289140820503235, + "step": 1392 + }, + { + "epoch": 0.8033448673587081, + "grad_norm": 0.6101412773132324, + "learning_rate": 0.00014648212226066898, + "loss": 0.8470169901847839, + "step": 1393 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 1.0043631792068481, + "learning_rate": 0.00014644367550941946, + "loss": 0.9277285933494568, + "step": 1394 + }, + { + "epoch": 0.8044982698961938, + "grad_norm": 0.8795577883720398, + "learning_rate": 0.00014640522875816994, + "loss": 1.2433722019195557, + "step": 1395 + }, + { + "epoch": 0.8050749711649365, + "grad_norm": 0.469595730304718, + "learning_rate": 0.00014636678200692042, + "loss": 0.5572987794876099, + "step": 1396 + }, + { + "epoch": 0.8056516724336793, + "grad_norm": 0.8809022903442383, + "learning_rate": 0.00014632833525567092, + "loss": 1.1597031354904175, + "step": 1397 + }, + { + "epoch": 0.8062283737024222, + "grad_norm": 0.9675459861755371, + "learning_rate": 0.00014628988850442137, + "loss": 1.0070991516113281, + "step": 1398 + }, + { + "epoch": 0.8068050749711649, + "grad_norm": 0.8547102212905884, + "learning_rate": 0.00014625144175317188, + "loss": 0.9210143089294434, + "step": 1399 + }, + { + "epoch": 0.8073817762399077, + "grad_norm": 0.5635284185409546, + "learning_rate": 0.00014621299500192233, + "loss": 0.5849195122718811, + "step": 1400 + }, + { + "epoch": 0.8079584775086506, + "grad_norm": 0.8755897283554077, + "learning_rate": 0.00014617454825067283, + "loss": 1.014789342880249, + "step": 1401 + }, + { + "epoch": 0.8085351787773933, + "grad_norm": 0.6002927422523499, + "learning_rate": 0.00014613610149942328, + "loss": 0.8705483675003052, + "step": 1402 + }, + { + "epoch": 0.8091118800461361, + "grad_norm": 0.9547945857048035, + "learning_rate": 0.0001460976547481738, + "loss": 1.0433237552642822, + "step": 1403 + }, + { + "epoch": 0.8096885813148789, + "grad_norm": 0.8594508767127991, + "learning_rate": 0.00014605920799692427, + "loss": 0.857754111289978, + "step": 1404 + }, + { + "epoch": 0.8102652825836217, + "grad_norm": 0.632087230682373, + "learning_rate": 0.00014602076124567475, + "loss": 1.0932989120483398, + "step": 1405 + }, + { + "epoch": 0.8108419838523645, + "grad_norm": 0.6727497577667236, + "learning_rate": 0.00014598231449442523, + "loss": 1.1335169076919556, + "step": 1406 + }, + { + "epoch": 0.8114186851211073, + "grad_norm": 1.050377368927002, + "learning_rate": 0.0001459438677431757, + "loss": 1.1787501573562622, + "step": 1407 + }, + { + "epoch": 0.81199538638985, + "grad_norm": 0.624580442905426, + "learning_rate": 0.00014590542099192618, + "loss": 0.8040243983268738, + "step": 1408 + }, + { + "epoch": 0.8125720876585929, + "grad_norm": 0.644497275352478, + "learning_rate": 0.0001458669742406767, + "loss": 0.9769735336303711, + "step": 1409 + }, + { + "epoch": 0.8131487889273357, + "grad_norm": 0.8106479048728943, + "learning_rate": 0.00014582852748942714, + "loss": 1.2847563028335571, + "step": 1410 + }, + { + "epoch": 0.8137254901960784, + "grad_norm": 0.6234838962554932, + "learning_rate": 0.00014579008073817764, + "loss": 0.7418760061264038, + "step": 1411 + }, + { + "epoch": 0.8143021914648212, + "grad_norm": 0.7591360807418823, + "learning_rate": 0.0001457516339869281, + "loss": 1.0062642097473145, + "step": 1412 + }, + { + "epoch": 0.8148788927335641, + "grad_norm": 0.7684062123298645, + "learning_rate": 0.0001457131872356786, + "loss": 0.9963294267654419, + "step": 1413 + }, + { + "epoch": 0.8154555940023068, + "grad_norm": 0.8234810829162598, + "learning_rate": 0.00014567474048442905, + "loss": 0.9132286310195923, + "step": 1414 + }, + { + "epoch": 0.8160322952710496, + "grad_norm": 1.3752492666244507, + "learning_rate": 0.00014563629373317956, + "loss": 1.3458770513534546, + "step": 1415 + }, + { + "epoch": 0.8166089965397924, + "grad_norm": 0.8771060109138489, + "learning_rate": 0.00014559784698193004, + "loss": 0.9146612882614136, + "step": 1416 + }, + { + "epoch": 0.8171856978085352, + "grad_norm": 0.5799472332000732, + "learning_rate": 0.0001455594002306805, + "loss": 0.8132292032241821, + "step": 1417 + }, + { + "epoch": 0.817762399077278, + "grad_norm": 1.0692527294158936, + "learning_rate": 0.000145520953479431, + "loss": 1.0524235963821411, + "step": 1418 + }, + { + "epoch": 0.8183391003460208, + "grad_norm": 0.6880149245262146, + "learning_rate": 0.00014548250672818147, + "loss": 0.8549849987030029, + "step": 1419 + }, + { + "epoch": 0.8189158016147635, + "grad_norm": 0.9311429858207703, + "learning_rate": 0.00014544405997693195, + "loss": 1.2363505363464355, + "step": 1420 + }, + { + "epoch": 0.8194925028835064, + "grad_norm": 0.6105409860610962, + "learning_rate": 0.00014540561322568245, + "loss": 0.8256676197052002, + "step": 1421 + }, + { + "epoch": 0.8200692041522492, + "grad_norm": 0.9718572497367859, + "learning_rate": 0.0001453671664744329, + "loss": 1.349236249923706, + "step": 1422 + }, + { + "epoch": 0.8206459054209919, + "grad_norm": 0.9589305520057678, + "learning_rate": 0.0001453287197231834, + "loss": 0.8896529674530029, + "step": 1423 + }, + { + "epoch": 0.8212226066897347, + "grad_norm": 1.1475483179092407, + "learning_rate": 0.00014529027297193386, + "loss": 1.392863154411316, + "step": 1424 + }, + { + "epoch": 0.8217993079584776, + "grad_norm": 0.9420047402381897, + "learning_rate": 0.00014525182622068437, + "loss": 1.1920685768127441, + "step": 1425 + }, + { + "epoch": 0.8223760092272203, + "grad_norm": 0.584073007106781, + "learning_rate": 0.00014521337946943482, + "loss": 0.5488528609275818, + "step": 1426 + }, + { + "epoch": 0.8229527104959631, + "grad_norm": 0.6110360622406006, + "learning_rate": 0.00014517493271818532, + "loss": 0.7226777672767639, + "step": 1427 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.5320557355880737, + "learning_rate": 0.0001451364859669358, + "loss": 0.5602037906646729, + "step": 1428 + }, + { + "epoch": 0.8241061130334487, + "grad_norm": 0.5847785472869873, + "learning_rate": 0.00014509803921568628, + "loss": 0.632820725440979, + "step": 1429 + }, + { + "epoch": 0.8246828143021915, + "grad_norm": 1.1915888786315918, + "learning_rate": 0.00014505959246443676, + "loss": 1.2395484447479248, + "step": 1430 + }, + { + "epoch": 0.8252595155709342, + "grad_norm": 0.7745262980461121, + "learning_rate": 0.00014502114571318724, + "loss": 0.9293632507324219, + "step": 1431 + }, + { + "epoch": 0.825836216839677, + "grad_norm": 0.9716136455535889, + "learning_rate": 0.00014498269896193771, + "loss": 1.2587440013885498, + "step": 1432 + }, + { + "epoch": 0.8264129181084199, + "grad_norm": 0.6674740314483643, + "learning_rate": 0.00014494425221068822, + "loss": 0.9000645875930786, + "step": 1433 + }, + { + "epoch": 0.8269896193771626, + "grad_norm": 0.9345766305923462, + "learning_rate": 0.00014490580545943867, + "loss": 0.9881076812744141, + "step": 1434 + }, + { + "epoch": 0.8275663206459054, + "grad_norm": 0.8641346096992493, + "learning_rate": 0.00014486735870818918, + "loss": 1.0706219673156738, + "step": 1435 + }, + { + "epoch": 0.8281430219146482, + "grad_norm": 0.8997068405151367, + "learning_rate": 0.00014482891195693963, + "loss": 0.932431697845459, + "step": 1436 + }, + { + "epoch": 0.828719723183391, + "grad_norm": 0.7539141774177551, + "learning_rate": 0.00014479046520569013, + "loss": 0.8891205191612244, + "step": 1437 + }, + { + "epoch": 0.8292964244521338, + "grad_norm": 0.8675488233566284, + "learning_rate": 0.00014475201845444058, + "loss": 0.9973325729370117, + "step": 1438 + }, + { + "epoch": 0.8298731257208766, + "grad_norm": 0.7566542029380798, + "learning_rate": 0.0001447135717031911, + "loss": 1.1265358924865723, + "step": 1439 + }, + { + "epoch": 0.8304498269896193, + "grad_norm": 0.902654230594635, + "learning_rate": 0.00014467512495194157, + "loss": 1.0915746688842773, + "step": 1440 + }, + { + "epoch": 0.8310265282583622, + "grad_norm": 0.618813693523407, + "learning_rate": 0.00014463667820069205, + "loss": 0.6798044443130493, + "step": 1441 + }, + { + "epoch": 0.831603229527105, + "grad_norm": 0.6372320055961609, + "learning_rate": 0.00014459823144944252, + "loss": 0.8383584022521973, + "step": 1442 + }, + { + "epoch": 0.8321799307958477, + "grad_norm": 0.742468535900116, + "learning_rate": 0.000144559784698193, + "loss": 1.0003979206085205, + "step": 1443 + }, + { + "epoch": 0.8327566320645905, + "grad_norm": 0.9815142750740051, + "learning_rate": 0.00014452133794694348, + "loss": 1.2571461200714111, + "step": 1444 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.7362657785415649, + "learning_rate": 0.000144482891195694, + "loss": 0.9890142679214478, + "step": 1445 + }, + { + "epoch": 0.8339100346020761, + "grad_norm": 1.047896385192871, + "learning_rate": 0.00014444444444444444, + "loss": 0.7491689920425415, + "step": 1446 + }, + { + "epoch": 0.8344867358708189, + "grad_norm": 1.0869019031524658, + "learning_rate": 0.00014440599769319494, + "loss": 1.0598435401916504, + "step": 1447 + }, + { + "epoch": 0.8350634371395617, + "grad_norm": 0.8003841042518616, + "learning_rate": 0.0001443675509419454, + "loss": 0.7503578662872314, + "step": 1448 + }, + { + "epoch": 0.8356401384083045, + "grad_norm": 1.3352385759353638, + "learning_rate": 0.0001443291041906959, + "loss": 1.7147669792175293, + "step": 1449 + }, + { + "epoch": 0.8362168396770473, + "grad_norm": 0.7203720808029175, + "learning_rate": 0.00014429065743944635, + "loss": 0.7103738188743591, + "step": 1450 + }, + { + "epoch": 0.8367935409457901, + "grad_norm": 0.7292425036430359, + "learning_rate": 0.00014425221068819686, + "loss": 0.9089938402175903, + "step": 1451 + }, + { + "epoch": 0.8373702422145328, + "grad_norm": 1.5864981412887573, + "learning_rate": 0.00014421376393694733, + "loss": 1.2735176086425781, + "step": 1452 + }, + { + "epoch": 0.8379469434832757, + "grad_norm": 0.5966582298278809, + "learning_rate": 0.0001441753171856978, + "loss": 0.8211960196495056, + "step": 1453 + }, + { + "epoch": 0.8385236447520185, + "grad_norm": 0.6568999886512756, + "learning_rate": 0.0001441368704344483, + "loss": 0.9273509979248047, + "step": 1454 + }, + { + "epoch": 0.8391003460207612, + "grad_norm": 0.6672592163085938, + "learning_rate": 0.00014409842368319877, + "loss": 0.7854159474372864, + "step": 1455 + }, + { + "epoch": 0.839677047289504, + "grad_norm": 1.1119751930236816, + "learning_rate": 0.00014405997693194925, + "loss": 1.2850849628448486, + "step": 1456 + }, + { + "epoch": 0.8402537485582469, + "grad_norm": 0.8437113165855408, + "learning_rate": 0.00014402153018069975, + "loss": 0.9052360653877258, + "step": 1457 + }, + { + "epoch": 0.8408304498269896, + "grad_norm": 1.1120409965515137, + "learning_rate": 0.0001439830834294502, + "loss": 1.4261767864227295, + "step": 1458 + }, + { + "epoch": 0.8414071510957324, + "grad_norm": 0.6494320631027222, + "learning_rate": 0.0001439446366782007, + "loss": 0.8434788584709167, + "step": 1459 + }, + { + "epoch": 0.8419838523644751, + "grad_norm": 0.5622795820236206, + "learning_rate": 0.00014390618992695116, + "loss": 0.646868109703064, + "step": 1460 + }, + { + "epoch": 0.842560553633218, + "grad_norm": 0.8375677466392517, + "learning_rate": 0.00014386774317570167, + "loss": 1.0123827457427979, + "step": 1461 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.6013731956481934, + "learning_rate": 0.00014382929642445214, + "loss": 0.7129334211349487, + "step": 1462 + }, + { + "epoch": 0.8437139561707035, + "grad_norm": 0.7148757576942444, + "learning_rate": 0.00014379084967320262, + "loss": 0.7350738048553467, + "step": 1463 + }, + { + "epoch": 0.8442906574394463, + "grad_norm": 0.7380696535110474, + "learning_rate": 0.0001437524029219531, + "loss": 0.7962418794631958, + "step": 1464 + }, + { + "epoch": 0.8448673587081892, + "grad_norm": 0.6836022734642029, + "learning_rate": 0.00014371395617070358, + "loss": 1.0249385833740234, + "step": 1465 + }, + { + "epoch": 0.845444059976932, + "grad_norm": 0.8065418004989624, + "learning_rate": 0.00014367550941945406, + "loss": 1.0036308765411377, + "step": 1466 + }, + { + "epoch": 0.8460207612456747, + "grad_norm": 0.8336586356163025, + "learning_rate": 0.00014363706266820454, + "loss": 0.9442139863967896, + "step": 1467 + }, + { + "epoch": 0.8465974625144176, + "grad_norm": 0.9105651378631592, + "learning_rate": 0.00014359861591695501, + "loss": 1.198281168937683, + "step": 1468 + }, + { + "epoch": 0.8471741637831603, + "grad_norm": 0.6932002902030945, + "learning_rate": 0.0001435601691657055, + "loss": 0.76617431640625, + "step": 1469 + }, + { + "epoch": 0.8477508650519031, + "grad_norm": 0.6474612951278687, + "learning_rate": 0.00014352172241445597, + "loss": 0.9350631237030029, + "step": 1470 + }, + { + "epoch": 0.8483275663206459, + "grad_norm": 1.0232489109039307, + "learning_rate": 0.00014348327566320648, + "loss": 1.2790873050689697, + "step": 1471 + }, + { + "epoch": 0.8489042675893888, + "grad_norm": 0.5638800263404846, + "learning_rate": 0.00014344482891195695, + "loss": 0.6640872359275818, + "step": 1472 + }, + { + "epoch": 0.8494809688581315, + "grad_norm": 0.7060153484344482, + "learning_rate": 0.00014340638216070743, + "loss": 0.549694299697876, + "step": 1473 + }, + { + "epoch": 0.8500576701268743, + "grad_norm": 0.7553113698959351, + "learning_rate": 0.0001433679354094579, + "loss": 0.6748926639556885, + "step": 1474 + }, + { + "epoch": 0.850634371395617, + "grad_norm": 1.0750683546066284, + "learning_rate": 0.0001433294886582084, + "loss": 1.2567592859268188, + "step": 1475 + }, + { + "epoch": 0.8512110726643599, + "grad_norm": 0.8767377138137817, + "learning_rate": 0.00014329104190695887, + "loss": 0.8606712818145752, + "step": 1476 + }, + { + "epoch": 0.8517877739331027, + "grad_norm": 0.8583175539970398, + "learning_rate": 0.00014325259515570935, + "loss": 1.0961095094680786, + "step": 1477 + }, + { + "epoch": 0.8523644752018454, + "grad_norm": 0.8185640573501587, + "learning_rate": 0.00014321414840445982, + "loss": 0.9456279277801514, + "step": 1478 + }, + { + "epoch": 0.8529411764705882, + "grad_norm": 0.7922638058662415, + "learning_rate": 0.0001431757016532103, + "loss": 0.8527402281761169, + "step": 1479 + }, + { + "epoch": 0.8535178777393311, + "grad_norm": 0.8317216634750366, + "learning_rate": 0.00014313725490196078, + "loss": 1.0812233686447144, + "step": 1480 + }, + { + "epoch": 0.8540945790080738, + "grad_norm": 0.5592607855796814, + "learning_rate": 0.00014309880815071126, + "loss": 0.6856215000152588, + "step": 1481 + }, + { + "epoch": 0.8546712802768166, + "grad_norm": 0.6144684553146362, + "learning_rate": 0.00014306036139946174, + "loss": 0.8217105269432068, + "step": 1482 + }, + { + "epoch": 0.8552479815455594, + "grad_norm": 0.8721742630004883, + "learning_rate": 0.00014302191464821224, + "loss": 1.1268048286437988, + "step": 1483 + }, + { + "epoch": 0.8558246828143022, + "grad_norm": 0.7512510418891907, + "learning_rate": 0.00014298346789696272, + "loss": 0.7509297132492065, + "step": 1484 + }, + { + "epoch": 0.856401384083045, + "grad_norm": 0.7145662307739258, + "learning_rate": 0.0001429450211457132, + "loss": 0.787600040435791, + "step": 1485 + }, + { + "epoch": 0.8569780853517878, + "grad_norm": 0.5714643597602844, + "learning_rate": 0.00014290657439446368, + "loss": 0.5843244791030884, + "step": 1486 + }, + { + "epoch": 0.8575547866205305, + "grad_norm": 0.567432701587677, + "learning_rate": 0.00014286812764321416, + "loss": 0.5819793939590454, + "step": 1487 + }, + { + "epoch": 0.8581314878892734, + "grad_norm": 0.7957308888435364, + "learning_rate": 0.00014282968089196463, + "loss": 1.127239465713501, + "step": 1488 + }, + { + "epoch": 0.8587081891580162, + "grad_norm": 0.6828871369361877, + "learning_rate": 0.0001427912341407151, + "loss": 0.8339288234710693, + "step": 1489 + }, + { + "epoch": 0.8592848904267589, + "grad_norm": 0.6947774887084961, + "learning_rate": 0.0001427527873894656, + "loss": 0.8848856687545776, + "step": 1490 + }, + { + "epoch": 0.8598615916955017, + "grad_norm": 0.7703558802604675, + "learning_rate": 0.00014271434063821607, + "loss": 1.1964079141616821, + "step": 1491 + }, + { + "epoch": 0.8604382929642446, + "grad_norm": 0.9820204973220825, + "learning_rate": 0.00014267589388696655, + "loss": 1.3156203031539917, + "step": 1492 + }, + { + "epoch": 0.8610149942329873, + "grad_norm": 0.663357138633728, + "learning_rate": 0.00014263744713571703, + "loss": 1.1208245754241943, + "step": 1493 + }, + { + "epoch": 0.8615916955017301, + "grad_norm": 0.6204859018325806, + "learning_rate": 0.00014259900038446753, + "loss": 0.8412761688232422, + "step": 1494 + }, + { + "epoch": 0.8621683967704729, + "grad_norm": 0.8673816323280334, + "learning_rate": 0.000142560553633218, + "loss": 0.9236775040626526, + "step": 1495 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.6511439681053162, + "learning_rate": 0.0001425221068819685, + "loss": 0.8711351156234741, + "step": 1496 + }, + { + "epoch": 0.8633217993079585, + "grad_norm": 0.5167029500007629, + "learning_rate": 0.00014248366013071897, + "loss": 0.6116561889648438, + "step": 1497 + }, + { + "epoch": 0.8638985005767013, + "grad_norm": 0.6007522940635681, + "learning_rate": 0.00014244521337946944, + "loss": 0.7663001418113708, + "step": 1498 + }, + { + "epoch": 0.864475201845444, + "grad_norm": 0.5924880504608154, + "learning_rate": 0.00014240676662821992, + "loss": 0.6707437038421631, + "step": 1499 + }, + { + "epoch": 0.8650519031141869, + "grad_norm": 0.859641969203949, + "learning_rate": 0.0001423683198769704, + "loss": 1.0436668395996094, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 5202, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.361771665599693e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}