| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.998285910181694, |
| "eval_steps": 1000, |
| "global_step": 7290, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006856359273225917, |
| "grad_norm": 10.705459594726562, |
| "learning_rate": 2.7434842249657065e-06, |
| "loss": 4.058, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.013712718546451834, |
| "grad_norm": 5.851310729980469, |
| "learning_rate": 5.486968449931413e-06, |
| "loss": 4.049, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.02056907781967775, |
| "grad_norm": 10.28573226928711, |
| "learning_rate": 8.23045267489712e-06, |
| "loss": 3.9324, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.027425437092903668, |
| "grad_norm": 6.631351947784424, |
| "learning_rate": 1.0973936899862826e-05, |
| "loss": 3.3949, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.03428179636612959, |
| "grad_norm": 7.74021053314209, |
| "learning_rate": 1.3717421124828534e-05, |
| "loss": 2.5865, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0411381556393555, |
| "grad_norm": 4.802703380584717, |
| "learning_rate": 1.646090534979424e-05, |
| "loss": 1.9904, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04799451491258142, |
| "grad_norm": 3.307770252227783, |
| "learning_rate": 1.9204389574759944e-05, |
| "loss": 1.8117, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.054850874185807336, |
| "grad_norm": 2.1599233150482178, |
| "learning_rate": 2.1947873799725652e-05, |
| "loss": 1.5714, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.061707233459033256, |
| "grad_norm": 2.8519175052642822, |
| "learning_rate": 2.4691358024691357e-05, |
| "loss": 1.6103, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06856359273225918, |
| "grad_norm": 5.519439220428467, |
| "learning_rate": 2.7434842249657068e-05, |
| "loss": 1.5985, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07541995200548508, |
| "grad_norm": 2.3000030517578125, |
| "learning_rate": 3.017832647462277e-05, |
| "loss": 1.6145, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.082276311278711, |
| "grad_norm": 2.166654586791992, |
| "learning_rate": 3.292181069958848e-05, |
| "loss": 1.3969, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.08913267055193692, |
| "grad_norm": 2.6651406288146973, |
| "learning_rate": 3.566529492455419e-05, |
| "loss": 1.5422, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.09598902982516284, |
| "grad_norm": 3.262629747390747, |
| "learning_rate": 3.840877914951989e-05, |
| "loss": 1.5311, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.10284538909838875, |
| "grad_norm": 2.1092171669006348, |
| "learning_rate": 4.11522633744856e-05, |
| "loss": 1.3981, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10970174837161467, |
| "grad_norm": 2.9152212142944336, |
| "learning_rate": 4.3895747599451304e-05, |
| "loss": 1.5888, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.11655810764484059, |
| "grad_norm": 2.3841612339019775, |
| "learning_rate": 4.6639231824417016e-05, |
| "loss": 1.4813, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.12341446691806651, |
| "grad_norm": 2.5338547229766846, |
| "learning_rate": 4.938271604938271e-05, |
| "loss": 1.5353, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.13027082619129243, |
| "grad_norm": 3.3338496685028076, |
| "learning_rate": 5.2126200274348424e-05, |
| "loss": 1.4861, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.13712718546451835, |
| "grad_norm": 2.825850248336792, |
| "learning_rate": 5.4869684499314136e-05, |
| "loss": 1.2727, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.14398354473774425, |
| "grad_norm": 2.7089977264404297, |
| "learning_rate": 5.761316872427984e-05, |
| "loss": 1.4897, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.15083990401097017, |
| "grad_norm": 2.1353025436401367, |
| "learning_rate": 6.035665294924554e-05, |
| "loss": 1.5895, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.1576962632841961, |
| "grad_norm": 2.382019519805908, |
| "learning_rate": 6.310013717421126e-05, |
| "loss": 1.4928, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.164552622557422, |
| "grad_norm": 2.485421895980835, |
| "learning_rate": 6.584362139917696e-05, |
| "loss": 1.5546, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.17140898183064793, |
| "grad_norm": 2.1953341960906982, |
| "learning_rate": 6.858710562414266e-05, |
| "loss": 1.4226, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.17826534110387385, |
| "grad_norm": 2.0357184410095215, |
| "learning_rate": 7.133058984910838e-05, |
| "loss": 1.3597, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.18512170037709977, |
| "grad_norm": 3.600464344024658, |
| "learning_rate": 7.407407407407407e-05, |
| "loss": 1.4977, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.1919780596503257, |
| "grad_norm": 2.2848992347717285, |
| "learning_rate": 7.681755829903978e-05, |
| "loss": 1.4477, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.19883441892355158, |
| "grad_norm": 2.5611186027526855, |
| "learning_rate": 7.95610425240055e-05, |
| "loss": 1.4536, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.2056907781967775, |
| "grad_norm": 2.1254303455352783, |
| "learning_rate": 8.23045267489712e-05, |
| "loss": 1.4916, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.21254713747000342, |
| "grad_norm": 3.0583224296569824, |
| "learning_rate": 8.50480109739369e-05, |
| "loss": 1.6612, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.21940349674322934, |
| "grad_norm": 1.6011234521865845, |
| "learning_rate": 8.779149519890261e-05, |
| "loss": 1.489, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.22625985601645526, |
| "grad_norm": 2.642266273498535, |
| "learning_rate": 9.053497942386831e-05, |
| "loss": 1.3562, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.23311621528968118, |
| "grad_norm": 3.995382785797119, |
| "learning_rate": 9.327846364883403e-05, |
| "loss": 1.4473, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.2399725745629071, |
| "grad_norm": 2.1678707599639893, |
| "learning_rate": 9.602194787379974e-05, |
| "loss": 1.5002, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.24682893383613302, |
| "grad_norm": 3.0694093704223633, |
| "learning_rate": 9.876543209876543e-05, |
| "loss": 1.3842, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.25368529310935894, |
| "grad_norm": 2.040300130844116, |
| "learning_rate": 0.00010150891632373114, |
| "loss": 1.3354, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.26054165238258487, |
| "grad_norm": 2.9175169467926025, |
| "learning_rate": 0.00010425240054869685, |
| "loss": 1.4611, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2673980116558108, |
| "grad_norm": 2.048736095428467, |
| "learning_rate": 0.00010699588477366255, |
| "loss": 1.4053, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2742543709290367, |
| "grad_norm": 2.210230827331543, |
| "learning_rate": 0.00010973936899862827, |
| "loss": 1.4418, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.28111073020226257, |
| "grad_norm": 2.244452714920044, |
| "learning_rate": 0.00011248285322359398, |
| "loss": 1.4981, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2879670894754885, |
| "grad_norm": 2.487210273742676, |
| "learning_rate": 0.00011522633744855968, |
| "loss": 1.488, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.2948234487487144, |
| "grad_norm": 2.0524230003356934, |
| "learning_rate": 0.0001179698216735254, |
| "loss": 1.4526, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.30167980802194033, |
| "grad_norm": 2.3150599002838135, |
| "learning_rate": 0.00012071330589849108, |
| "loss": 1.2799, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.30853616729516625, |
| "grad_norm": 6.619499206542969, |
| "learning_rate": 0.0001234567901234568, |
| "loss": 1.4248, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.3153925265683922, |
| "grad_norm": 2.391008138656616, |
| "learning_rate": 0.0001262002743484225, |
| "loss": 1.4802, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.3222488858416181, |
| "grad_norm": 1.9163336753845215, |
| "learning_rate": 0.0001289437585733882, |
| "loss": 1.343, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.329105245114844, |
| "grad_norm": 2.3108503818511963, |
| "learning_rate": 0.00013168724279835392, |
| "loss": 1.4222, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.33596160438806993, |
| "grad_norm": 2.137388229370117, |
| "learning_rate": 0.00013443072702331964, |
| "loss": 1.3412, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.34281796366129585, |
| "grad_norm": 1.5437091588974, |
| "learning_rate": 0.00013717421124828533, |
| "loss": 1.5056, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3496743229345218, |
| "grad_norm": 2.1628546714782715, |
| "learning_rate": 0.00013991769547325105, |
| "loss": 1.3907, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.3565306822077477, |
| "grad_norm": 2.018361806869507, |
| "learning_rate": 0.00014266117969821676, |
| "loss": 1.4301, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.3633870414809736, |
| "grad_norm": 1.873982310295105, |
| "learning_rate": 0.00014540466392318243, |
| "loss": 1.3484, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.37024340075419954, |
| "grad_norm": 2.2962214946746826, |
| "learning_rate": 0.00014814814814814815, |
| "loss": 1.2669, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.37709976002742546, |
| "grad_norm": 1.6865073442459106, |
| "learning_rate": 0.00015089163237311386, |
| "loss": 1.4261, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3839561193006514, |
| "grad_norm": 2.0754506587982178, |
| "learning_rate": 0.00015363511659807956, |
| "loss": 1.2982, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3908124785738773, |
| "grad_norm": 1.6448793411254883, |
| "learning_rate": 0.00015637860082304527, |
| "loss": 1.3753, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.39766883784710316, |
| "grad_norm": 1.5936967134475708, |
| "learning_rate": 0.000159122085048011, |
| "loss": 1.4155, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.4045251971203291, |
| "grad_norm": 2.3004658222198486, |
| "learning_rate": 0.00016186556927297668, |
| "loss": 1.3321, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.411381556393555, |
| "grad_norm": 2.23530912399292, |
| "learning_rate": 0.0001646090534979424, |
| "loss": 1.385, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4182379156667809, |
| "grad_norm": 2.0479331016540527, |
| "learning_rate": 0.00016735253772290812, |
| "loss": 1.4551, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.42509427494000684, |
| "grad_norm": 2.0379133224487305, |
| "learning_rate": 0.0001700960219478738, |
| "loss": 1.338, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.43195063421323276, |
| "grad_norm": 2.908133029937744, |
| "learning_rate": 0.0001728395061728395, |
| "loss": 1.419, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.4388069934864587, |
| "grad_norm": 2.721883773803711, |
| "learning_rate": 0.00017558299039780522, |
| "loss": 1.3527, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.4456633527596846, |
| "grad_norm": 2.2164113521575928, |
| "learning_rate": 0.00017832647462277094, |
| "loss": 1.4272, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4525197120329105, |
| "grad_norm": 1.7344247102737427, |
| "learning_rate": 0.00018106995884773663, |
| "loss": 1.3838, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.45937607130613645, |
| "grad_norm": 1.705946922302246, |
| "learning_rate": 0.00018381344307270234, |
| "loss": 1.2844, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.46623243057936237, |
| "grad_norm": 1.6594369411468506, |
| "learning_rate": 0.00018655692729766806, |
| "loss": 1.5451, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4730887898525883, |
| "grad_norm": 1.326300859451294, |
| "learning_rate": 0.00018930041152263375, |
| "loss": 1.3065, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.4799451491258142, |
| "grad_norm": 2.301481008529663, |
| "learning_rate": 0.00019204389574759947, |
| "loss": 1.3551, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.48680150839904013, |
| "grad_norm": 1.6948672533035278, |
| "learning_rate": 0.0001947873799725652, |
| "loss": 1.3867, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.49365786767226605, |
| "grad_norm": 2.5889105796813965, |
| "learning_rate": 0.00019753086419753085, |
| "loss": 1.4339, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.500514226945492, |
| "grad_norm": 2.4985668659210205, |
| "learning_rate": 0.0001999999885361719, |
| "loss": 1.3048, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.5073705862187179, |
| "grad_norm": 1.8193042278289795, |
| "learning_rate": 0.00019999861287997797, |
| "loss": 1.2499, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5142269454919438, |
| "grad_norm": 3.7840700149536133, |
| "learning_rate": 0.00019999494449430045, |
| "loss": 1.3802, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5210833047651697, |
| "grad_norm": 1.7764334678649902, |
| "learning_rate": 0.00019998898346324667, |
| "loss": 1.482, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.5279396640383957, |
| "grad_norm": 3.376749038696289, |
| "learning_rate": 0.00019998072992348886, |
| "loss": 1.4751, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5347960233116216, |
| "grad_norm": 1.9762446880340576, |
| "learning_rate": 0.00019997018406426093, |
| "loss": 1.434, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.5416523825848475, |
| "grad_norm": 2.363563060760498, |
| "learning_rate": 0.00019995734612735427, |
| "loss": 1.3767, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5485087418580734, |
| "grad_norm": 2.0239980220794678, |
| "learning_rate": 0.00019994221640711222, |
| "loss": 1.3242, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5553651011312993, |
| "grad_norm": 4.290923118591309, |
| "learning_rate": 0.00019992479525042303, |
| "loss": 1.3456, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.5622214604045251, |
| "grad_norm": 1.7077864408493042, |
| "learning_rate": 0.00019990508305671228, |
| "loss": 1.4179, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5690778196777511, |
| "grad_norm": 2.9807496070861816, |
| "learning_rate": 0.0001998830802779335, |
| "loss": 1.4003, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.575934178950977, |
| "grad_norm": 1.4405466318130493, |
| "learning_rate": 0.00019985878741855793, |
| "loss": 1.3682, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5827905382242029, |
| "grad_norm": 2.661698341369629, |
| "learning_rate": 0.00019983220503556282, |
| "loss": 1.5405, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5896468974974288, |
| "grad_norm": 1.7290005683898926, |
| "learning_rate": 0.00019980333373841873, |
| "loss": 1.3195, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5965032567706547, |
| "grad_norm": 1.6726288795471191, |
| "learning_rate": 0.00019977217418907562, |
| "loss": 1.4031, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.6033596160438807, |
| "grad_norm": 1.7914421558380127, |
| "learning_rate": 0.00019973872710194756, |
| "loss": 1.5047, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.6102159753171066, |
| "grad_norm": 2.1812944412231445, |
| "learning_rate": 0.00019970299324389642, |
| "loss": 1.4172, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6170723345903325, |
| "grad_norm": 1.884153962135315, |
| "learning_rate": 0.0001996649734342143, |
| "loss": 1.3291, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6239286938635584, |
| "grad_norm": 1.7976921796798706, |
| "learning_rate": 0.00019962466854460458, |
| "loss": 1.4267, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6307850531367843, |
| "grad_norm": 1.628185510635376, |
| "learning_rate": 0.00019958207949916223, |
| "loss": 1.4677, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.6376414124100103, |
| "grad_norm": 1.7098406553268433, |
| "learning_rate": 0.00019953720727435242, |
| "loss": 1.4233, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6444977716832362, |
| "grad_norm": 2.8682682514190674, |
| "learning_rate": 0.0001994900528989881, |
| "loss": 1.2757, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6513541309564621, |
| "grad_norm": 2.8800859451293945, |
| "learning_rate": 0.00019944061745420655, |
| "loss": 1.3997, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.658210490229688, |
| "grad_norm": 1.1743911504745483, |
| "learning_rate": 0.00019938890207344453, |
| "loss": 1.4948, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.665066849502914, |
| "grad_norm": 3.9781527519226074, |
| "learning_rate": 0.00019933490794241224, |
| "loss": 1.349, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6719232087761399, |
| "grad_norm": 1.9682557582855225, |
| "learning_rate": 0.00019927863629906622, |
| "loss": 1.4381, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6787795680493658, |
| "grad_norm": 2.0713021755218506, |
| "learning_rate": 0.00019922008843358094, |
| "loss": 1.3814, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6856359273225917, |
| "grad_norm": 2.3139188289642334, |
| "learning_rate": 0.0001991592656883192, |
| "loss": 1.3592, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6856359273225917, |
| "eval_loss": 1.1938791275024414, |
| "eval_runtime": 29.6937, |
| "eval_samples_per_second": 82.745, |
| "eval_steps_per_second": 10.373, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6924922865958176, |
| "grad_norm": 1.9212145805358887, |
| "learning_rate": 0.00019909616945780134, |
| "loss": 1.4605, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6993486458690436, |
| "grad_norm": 1.236790657043457, |
| "learning_rate": 0.0001990308011886733, |
| "loss": 1.2371, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.7062050051422695, |
| "grad_norm": 4.3965373039245605, |
| "learning_rate": 0.00019896316237967343, |
| "loss": 1.5101, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.7130613644154954, |
| "grad_norm": 1.8042622804641724, |
| "learning_rate": 0.0001988932545815982, |
| "loss": 1.2647, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7199177236887213, |
| "grad_norm": 2.162903070449829, |
| "learning_rate": 0.00019882107939726655, |
| "loss": 1.4241, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7267740829619472, |
| "grad_norm": 1.7536650896072388, |
| "learning_rate": 0.00019874663848148312, |
| "loss": 1.3215, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7336304422351732, |
| "grad_norm": 1.6323691606521606, |
| "learning_rate": 0.00019866993354100042, |
| "loss": 1.3117, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7404868015083991, |
| "grad_norm": 2.1569364070892334, |
| "learning_rate": 0.00019859096633447965, |
| "loss": 1.4203, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.747343160781625, |
| "grad_norm": 2.549560546875, |
| "learning_rate": 0.00019850973867245036, |
| "loss": 1.3122, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.7541995200548509, |
| "grad_norm": 2.85105562210083, |
| "learning_rate": 0.00019842625241726892, |
| "loss": 1.3834, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7610558793280768, |
| "grad_norm": 2.235344648361206, |
| "learning_rate": 0.00019834050948307582, |
| "loss": 1.3419, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7679122386013028, |
| "grad_norm": 2.0601747035980225, |
| "learning_rate": 0.00019825251183575195, |
| "loss": 1.4033, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.7747685978745287, |
| "grad_norm": 1.8832818269729614, |
| "learning_rate": 0.00019816226149287324, |
| "loss": 1.442, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.7816249571477546, |
| "grad_norm": 1.8006055355072021, |
| "learning_rate": 0.00019806976052366465, |
| "loss": 1.3696, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7884813164209804, |
| "grad_norm": 2.5300374031066895, |
| "learning_rate": 0.00019797501104895258, |
| "loss": 1.3844, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7953376756942063, |
| "grad_norm": 2.987879991531372, |
| "learning_rate": 0.00019787801524111628, |
| "loss": 1.2814, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.8021940349674322, |
| "grad_norm": 1.7485980987548828, |
| "learning_rate": 0.00019777877532403814, |
| "loss": 1.3488, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.8090503942406582, |
| "grad_norm": 1.764123558998108, |
| "learning_rate": 0.0001976772935730525, |
| "loss": 1.4282, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8159067535138841, |
| "grad_norm": 2.458010673522949, |
| "learning_rate": 0.00019757357231489365, |
| "loss": 1.4672, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.82276311278711, |
| "grad_norm": 3.0731263160705566, |
| "learning_rate": 0.00019746761392764253, |
| "loss": 1.4038, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8296194720603359, |
| "grad_norm": 1.8123950958251953, |
| "learning_rate": 0.00019735942084067197, |
| "loss": 1.2516, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8364758313335618, |
| "grad_norm": 1.8038643598556519, |
| "learning_rate": 0.00019724899553459117, |
| "loss": 1.2599, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.8433321906067878, |
| "grad_norm": 1.6645665168762207, |
| "learning_rate": 0.0001971363405411888, |
| "loss": 1.4674, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.8501885498800137, |
| "grad_norm": 1.7630431652069092, |
| "learning_rate": 0.00019702145844337497, |
| "loss": 1.3191, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.8570449091532396, |
| "grad_norm": 2.408414840698242, |
| "learning_rate": 0.00019690435187512192, |
| "loss": 1.4237, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8639012684264655, |
| "grad_norm": 2.5194432735443115, |
| "learning_rate": 0.00019678502352140368, |
| "loss": 1.2969, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.8707576276996914, |
| "grad_norm": 2.1748664379119873, |
| "learning_rate": 0.0001966634761181346, |
| "loss": 1.2632, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.8776139869729174, |
| "grad_norm": 1.8696935176849365, |
| "learning_rate": 0.0001965397124521065, |
| "loss": 1.3904, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.8844703462461433, |
| "grad_norm": 1.6131293773651123, |
| "learning_rate": 0.00019641373536092473, |
| "loss": 1.3648, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.8913267055193692, |
| "grad_norm": 2.2704977989196777, |
| "learning_rate": 0.00019628554773294335, |
| "loss": 1.4555, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8981830647925951, |
| "grad_norm": 2.2356860637664795, |
| "learning_rate": 0.0001961551525071986, |
| "loss": 1.3464, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.905039424065821, |
| "grad_norm": 2.372926712036133, |
| "learning_rate": 0.00019602255267334179, |
| "loss": 1.2966, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.911895783339047, |
| "grad_norm": 1.4219205379486084, |
| "learning_rate": 0.00019588775127157054, |
| "loss": 1.4147, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.9187521426122729, |
| "grad_norm": 1.885373830795288, |
| "learning_rate": 0.00019575075139255922, |
| "loss": 1.3167, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9256085018854988, |
| "grad_norm": 3.269922971725464, |
| "learning_rate": 0.00019561155617738797, |
| "loss": 1.3325, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9324648611587247, |
| "grad_norm": 1.6605535745620728, |
| "learning_rate": 0.00019547016881747088, |
| "loss": 1.2398, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.9393212204319507, |
| "grad_norm": 1.9962818622589111, |
| "learning_rate": 0.00019532659255448257, |
| "loss": 1.3011, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.9461775797051766, |
| "grad_norm": 1.9618587493896484, |
| "learning_rate": 0.00019518083068028398, |
| "loss": 1.3274, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.9530339389784025, |
| "grad_norm": 2.1130402088165283, |
| "learning_rate": 0.000195032886536847, |
| "loss": 1.3072, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.9598902982516284, |
| "grad_norm": 2.2084875106811523, |
| "learning_rate": 0.00019488276351617762, |
| "loss": 1.336, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9667466575248543, |
| "grad_norm": 1.6304380893707275, |
| "learning_rate": 0.00019473046506023837, |
| "loss": 1.313, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.9736030167980803, |
| "grad_norm": 1.511129379272461, |
| "learning_rate": 0.00019457599466086927, |
| "loss": 1.369, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.9804593760713062, |
| "grad_norm": 1.7005033493041992, |
| "learning_rate": 0.00019441935585970784, |
| "loss": 1.4165, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.9873157353445321, |
| "grad_norm": 2.2424979209899902, |
| "learning_rate": 0.0001942605522481079, |
| "loss": 1.4495, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.994172094617758, |
| "grad_norm": 1.764769434928894, |
| "learning_rate": 0.0001940995874670571, |
| "loss": 1.3326, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.001028453890984, |
| "grad_norm": 1.098801851272583, |
| "learning_rate": 0.0001939364652070937, |
| "loss": 1.2684, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.0078848131642097, |
| "grad_norm": 1.260248064994812, |
| "learning_rate": 0.00019377118920822176, |
| "loss": 1.1724, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.0147411724374358, |
| "grad_norm": 2.705153465270996, |
| "learning_rate": 0.00019360376325982533, |
| "loss": 1.1531, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.0215975317106616, |
| "grad_norm": 2.205137252807617, |
| "learning_rate": 0.00019343419120058174, |
| "loss": 1.172, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.0284538909838876, |
| "grad_norm": 2.324617862701416, |
| "learning_rate": 0.00019326247691837356, |
| "loss": 1.1682, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.0353102502571134, |
| "grad_norm": 1.6484122276306152, |
| "learning_rate": 0.0001930886243501993, |
| "loss": 1.0909, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.0421666095303395, |
| "grad_norm": 2.0225424766540527, |
| "learning_rate": 0.00019291263748208345, |
| "loss": 1.2112, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.0490229688035653, |
| "grad_norm": 2.0471479892730713, |
| "learning_rate": 0.00019273452034898473, |
| "loss": 1.2315, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.0558793280767913, |
| "grad_norm": 1.5864293575286865, |
| "learning_rate": 0.00019255427703470377, |
| "loss": 1.1076, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.062735687350017, |
| "grad_norm": 1.960959792137146, |
| "learning_rate": 0.00019237191167178957, |
| "loss": 1.1633, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.0695920466232431, |
| "grad_norm": 1.6834721565246582, |
| "learning_rate": 0.00019218742844144456, |
| "loss": 1.2397, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.076448405896469, |
| "grad_norm": 2.398449182510376, |
| "learning_rate": 0.00019200083157342877, |
| "loss": 1.0861, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.083304765169695, |
| "grad_norm": 1.7399917840957642, |
| "learning_rate": 0.0001918121253459631, |
| "loss": 1.1532, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.0901611244429208, |
| "grad_norm": 1.8287321329116821, |
| "learning_rate": 0.0001916213140856307, |
| "loss": 1.1941, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.0970174837161468, |
| "grad_norm": 1.559448003768921, |
| "learning_rate": 0.00019142840216727835, |
| "loss": 1.1891, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.1038738429893726, |
| "grad_norm": 1.947444200515747, |
| "learning_rate": 0.00019123339401391589, |
| "loss": 1.2102, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.1107302022625984, |
| "grad_norm": 1.3778343200683594, |
| "learning_rate": 0.0001910362940966147, |
| "loss": 1.0902, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.1175865615358245, |
| "grad_norm": 1.4132441282272339, |
| "learning_rate": 0.00019083710693440536, |
| "loss": 1.1211, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.1244429208090505, |
| "grad_norm": 1.4806476831436157, |
| "learning_rate": 0.00019063583709417407, |
| "loss": 1.2051, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.1312992800822763, |
| "grad_norm": 1.5196651220321655, |
| "learning_rate": 0.00019043248919055778, |
| "loss": 1.1761, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.1381556393555021, |
| "grad_norm": 1.4891173839569092, |
| "learning_rate": 0.00019022706788583853, |
| "loss": 1.1754, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.1450119986287282, |
| "grad_norm": 3.816194772720337, |
| "learning_rate": 0.00019001957788983645, |
| "loss": 1.1501, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.151868357901954, |
| "grad_norm": 2.0599048137664795, |
| "learning_rate": 0.00018981002395980184, |
| "loss": 1.2955, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.15872471717518, |
| "grad_norm": 2.0258097648620605, |
| "learning_rate": 0.00018959841090030607, |
| "loss": 1.2163, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.1655810764484058, |
| "grad_norm": 2.2690117359161377, |
| "learning_rate": 0.00018938474356313146, |
| "loss": 1.1374, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.1724374357216318, |
| "grad_norm": 2.238801956176758, |
| "learning_rate": 0.00018916902684716004, |
| "loss": 1.0886, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.1792937949948576, |
| "grad_norm": 2.0089728832244873, |
| "learning_rate": 0.00018895126569826108, |
| "loss": 1.068, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.1861501542680837, |
| "grad_norm": 1.5372521877288818, |
| "learning_rate": 0.00018873146510917796, |
| "loss": 1.176, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.1930065135413095, |
| "grad_norm": 1.9092096090316772, |
| "learning_rate": 0.0001885096301194135, |
| "loss": 1.1167, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.1998628728145355, |
| "grad_norm": 1.670440673828125, |
| "learning_rate": 0.00018828576581511442, |
| "loss": 1.0562, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.2067192320877613, |
| "grad_norm": 3.970067024230957, |
| "learning_rate": 0.00018805987732895484, |
| "loss": 1.1368, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.2135755913609874, |
| "grad_norm": 2.8760297298431396, |
| "learning_rate": 0.00018783196984001855, |
| "loss": 1.1358, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.2204319506342132, |
| "grad_norm": 1.9676944017410278, |
| "learning_rate": 0.00018760204857368025, |
| "loss": 1.1217, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.2272883099074392, |
| "grad_norm": 1.7384953498840332, |
| "learning_rate": 0.00018737011880148562, |
| "loss": 1.1659, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.234144669180665, |
| "grad_norm": 2.452848196029663, |
| "learning_rate": 0.0001871361858410308, |
| "loss": 1.1873, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.241001028453891, |
| "grad_norm": 1.7921110391616821, |
| "learning_rate": 0.00018690025505584007, |
| "loss": 1.2562, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.2478573877271169, |
| "grad_norm": 1.934995412826538, |
| "learning_rate": 0.00018666233185524316, |
| "loss": 1.2777, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.2547137470003429, |
| "grad_norm": 2.868145227432251, |
| "learning_rate": 0.00018642242169425113, |
| "loss": 1.1826, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.2615701062735687, |
| "grad_norm": 1.9969412088394165, |
| "learning_rate": 0.00018618053007343126, |
| "loss": 1.2032, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.2684264655467947, |
| "grad_norm": 1.877845048904419, |
| "learning_rate": 0.00018593666253878096, |
| "loss": 1.1458, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.2752828248200205, |
| "grad_norm": 1.800827145576477, |
| "learning_rate": 0.0001856908246816007, |
| "loss": 1.0928, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.2821391840932466, |
| "grad_norm": 1.6568301916122437, |
| "learning_rate": 0.00018544302213836566, |
| "loss": 1.1258, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.2889955433664724, |
| "grad_norm": 3.1525049209594727, |
| "learning_rate": 0.00018519326059059665, |
| "loss": 1.2118, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.2958519026396984, |
| "grad_norm": 4.377742767333984, |
| "learning_rate": 0.00018494154576472976, |
| "loss": 1.1747, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.3027082619129242, |
| "grad_norm": 1.685784935951233, |
| "learning_rate": 0.000184687883431985, |
| "loss": 1.2139, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.30956462118615, |
| "grad_norm": 1.660079002380371, |
| "learning_rate": 0.00018443227940823423, |
| "loss": 1.2692, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.316420980459376, |
| "grad_norm": 1.9912021160125732, |
| "learning_rate": 0.00018417473955386745, |
| "loss": 1.2104, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.323277339732602, |
| "grad_norm": 2.156569242477417, |
| "learning_rate": 0.00018391526977365883, |
| "loss": 1.126, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.330133699005828, |
| "grad_norm": 1.768411636352539, |
| "learning_rate": 0.000183653876016631, |
| "loss": 1.107, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.3369900582790537, |
| "grad_norm": 2.667548179626465, |
| "learning_rate": 0.00018339056427591884, |
| "loss": 1.1562, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.3438464175522797, |
| "grad_norm": 1.7439193725585938, |
| "learning_rate": 0.00018312534058863194, |
| "loss": 1.1052, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.3507027768255058, |
| "grad_norm": 1.8108903169631958, |
| "learning_rate": 0.00018285821103571645, |
| "loss": 1.1605, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.3575591360987316, |
| "grad_norm": 2.2900757789611816, |
| "learning_rate": 0.00018258918174181526, |
| "loss": 1.214, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.3644154953719574, |
| "grad_norm": 3.963160514831543, |
| "learning_rate": 0.0001823182588751279, |
| "loss": 1.1504, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.3712718546451834, |
| "grad_norm": 1.6582880020141602, |
| "learning_rate": 0.00018204544864726895, |
| "loss": 1.0402, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.3712718546451834, |
| "eval_loss": 1.16560697555542, |
| "eval_runtime": 29.7405, |
| "eval_samples_per_second": 82.615, |
| "eval_steps_per_second": 10.356, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.3781282139184095, |
| "grad_norm": 2.182267904281616, |
| "learning_rate": 0.00018177075731312577, |
| "loss": 1.1344, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.3849845731916353, |
| "grad_norm": 1.5705207586288452, |
| "learning_rate": 0.00018149419117071482, |
| "loss": 1.0929, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.391840932464861, |
| "grad_norm": 2.289984941482544, |
| "learning_rate": 0.0001812157565610376, |
| "loss": 1.098, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.398697291738087, |
| "grad_norm": 2.600581645965576, |
| "learning_rate": 0.00018093545986793506, |
| "loss": 1.1924, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.405553651011313, |
| "grad_norm": 2.1132428646087646, |
| "learning_rate": 0.00018065330751794125, |
| "loss": 1.1127, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.412410010284539, |
| "grad_norm": 2.1575536727905273, |
| "learning_rate": 0.00018036930598013605, |
| "loss": 1.2272, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.4192663695577648, |
| "grad_norm": 1.6063116788864136, |
| "learning_rate": 0.00018008346176599674, |
| "loss": 1.0894, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.4261227288309908, |
| "grad_norm": 2.389429807662964, |
| "learning_rate": 0.00017979578142924885, |
| "loss": 1.1353, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.4329790881042166, |
| "grad_norm": 2.307987928390503, |
| "learning_rate": 0.0001795062715657157, |
| "loss": 1.1205, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.4398354473774426, |
| "grad_norm": 1.9370081424713135, |
| "learning_rate": 0.0001792149388131674, |
| "loss": 1.088, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.4466918066506684, |
| "grad_norm": 2.298769474029541, |
| "learning_rate": 0.0001789217898511685, |
| "loss": 1.1258, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.4535481659238945, |
| "grad_norm": 3.375627279281616, |
| "learning_rate": 0.00017862683140092497, |
| "loss": 1.0808, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.4604045251971203, |
| "grad_norm": 1.4899195432662964, |
| "learning_rate": 0.00017833007022512992, |
| "loss": 1.1043, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.4672608844703463, |
| "grad_norm": 2.4468886852264404, |
| "learning_rate": 0.0001780315131278087, |
| "loss": 1.2583, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.4741172437435721, |
| "grad_norm": 1.4145742654800415, |
| "learning_rate": 0.0001777311669541629, |
| "loss": 1.2913, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.4809736030167981, |
| "grad_norm": 1.8379381895065308, |
| "learning_rate": 0.00017742903859041325, |
| "loss": 1.1803, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.487829962290024, |
| "grad_norm": 2.4418864250183105, |
| "learning_rate": 0.00017712513496364197, |
| "loss": 1.1791, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.49468632156325, |
| "grad_norm": 2.0489251613616943, |
| "learning_rate": 0.00017681946304163372, |
| "loss": 1.193, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.5015426808364758, |
| "grad_norm": 1.6349157094955444, |
| "learning_rate": 0.00017651202983271603, |
| "loss": 1.1657, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.5083990401097016, |
| "grad_norm": 2.110250949859619, |
| "learning_rate": 0.00017620284238559848, |
| "loss": 1.2797, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.5152553993829276, |
| "grad_norm": 2.1508467197418213, |
| "learning_rate": 0.00017589190778921117, |
| "loss": 1.2165, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.5221117586561537, |
| "grad_norm": 1.8186125755310059, |
| "learning_rate": 0.00017557923317254213, |
| "loss": 1.2268, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.5289681179293795, |
| "grad_norm": 1.9265555143356323, |
| "learning_rate": 0.00017526482570447396, |
| "loss": 1.1894, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.5358244772026053, |
| "grad_norm": 2.7489798069000244, |
| "learning_rate": 0.00017494869259361933, |
| "loss": 1.217, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.5426808364758313, |
| "grad_norm": 1.585384726524353, |
| "learning_rate": 0.00017463084108815586, |
| "loss": 1.0137, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.5495371957490574, |
| "grad_norm": 3.09739089012146, |
| "learning_rate": 0.0001743112784756598, |
| "loss": 1.2187, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.5563935550222832, |
| "grad_norm": 2.081796884536743, |
| "learning_rate": 0.000173990012082939, |
| "loss": 1.0896, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.563249914295509, |
| "grad_norm": 2.9067137241363525, |
| "learning_rate": 0.00017366704927586498, |
| "loss": 1.1316, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.570106273568735, |
| "grad_norm": 2.1909544467926025, |
| "learning_rate": 0.00017334239745920394, |
| "loss": 1.1706, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.576962632841961, |
| "grad_norm": 2.680025815963745, |
| "learning_rate": 0.00017301606407644701, |
| "loss": 1.1753, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.5838189921151868, |
| "grad_norm": 3.4758894443511963, |
| "learning_rate": 0.0001726880566096397, |
| "loss": 1.1402, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.5906753513884127, |
| "grad_norm": 3.713744878768921, |
| "learning_rate": 0.0001723583825792102, |
| "loss": 1.1975, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.5975317106616387, |
| "grad_norm": 1.8732534646987915, |
| "learning_rate": 0.0001720270495437971, |
| "loss": 1.0603, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.6043880699348647, |
| "grad_norm": 1.7863500118255615, |
| "learning_rate": 0.0001716940651000759, |
| "loss": 1.1048, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.6112444292080905, |
| "grad_norm": 2.9769108295440674, |
| "learning_rate": 0.00017135943688258506, |
| "loss": 1.1211, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.6181007884813163, |
| "grad_norm": 3.001119375228882, |
| "learning_rate": 0.00017102317256355082, |
| "loss": 1.2261, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.6249571477545424, |
| "grad_norm": 2.4549243450164795, |
| "learning_rate": 0.00017068527985271125, |
| "loss": 1.3292, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.6318135070277684, |
| "grad_norm": 2.303267478942871, |
| "learning_rate": 0.00017034576649713965, |
| "loss": 1.1634, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.6386698663009942, |
| "grad_norm": 3.4060795307159424, |
| "learning_rate": 0.00017000464028106682, |
| "loss": 1.2278, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.64552622557422, |
| "grad_norm": 2.647289514541626, |
| "learning_rate": 0.00016966190902570257, |
| "loss": 1.1818, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.652382584847446, |
| "grad_norm": 2.2770307064056396, |
| "learning_rate": 0.00016931758058905642, |
| "loss": 1.106, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.659238944120672, |
| "grad_norm": 1.959615707397461, |
| "learning_rate": 0.00016897166286575747, |
| "loss": 1.0618, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.6660953033938979, |
| "grad_norm": 2.4052298069000244, |
| "learning_rate": 0.0001686241637868734, |
| "loss": 1.1638, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.6729516626671237, |
| "grad_norm": 1.4760863780975342, |
| "learning_rate": 0.00016827509131972848, |
| "loss": 1.126, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.6798080219403497, |
| "grad_norm": 2.8597095012664795, |
| "learning_rate": 0.0001679244534677212, |
| "loss": 1.1905, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.6866643812135755, |
| "grad_norm": 3.0205864906311035, |
| "learning_rate": 0.00016757225827014044, |
| "loss": 1.1128, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.6935207404868016, |
| "grad_norm": 3.0635123252868652, |
| "learning_rate": 0.00016721851380198136, |
| "loss": 1.2575, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.7003770997600274, |
| "grad_norm": 2.0744762420654297, |
| "learning_rate": 0.00016686322817376014, |
| "loss": 1.2229, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.7072334590332532, |
| "grad_norm": 2.0594987869262695, |
| "learning_rate": 0.0001665064095313282, |
| "loss": 1.1665, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.7140898183064792, |
| "grad_norm": 3.5101630687713623, |
| "learning_rate": 0.00016614806605568514, |
| "loss": 1.0664, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.7209461775797052, |
| "grad_norm": 2.1221086978912354, |
| "learning_rate": 0.0001657882059627915, |
| "loss": 1.2069, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.727802536852931, |
| "grad_norm": 2.5655086040496826, |
| "learning_rate": 0.0001654268375033802, |
| "loss": 1.2594, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.7346588961261569, |
| "grad_norm": 2.0883429050445557, |
| "learning_rate": 0.00016506396896276732, |
| "loss": 1.0933, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.741515255399383, |
| "grad_norm": 2.505929946899414, |
| "learning_rate": 0.00016469960866066235, |
| "loss": 1.1908, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.748371614672609, |
| "grad_norm": 1.4579987525939941, |
| "learning_rate": 0.00016433376495097717, |
| "loss": 1.0112, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.7552279739458347, |
| "grad_norm": 2.1962618827819824, |
| "learning_rate": 0.00016396644622163476, |
| "loss": 1.0926, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.7620843332190606, |
| "grad_norm": 1.6549545526504517, |
| "learning_rate": 0.00016359766089437677, |
| "loss": 1.1251, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.7689406924922866, |
| "grad_norm": 1.649786114692688, |
| "learning_rate": 0.0001632274174245704, |
| "loss": 1.1365, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.7757970517655126, |
| "grad_norm": 2.0461771488189697, |
| "learning_rate": 0.00016285572430101456, |
| "loss": 1.1217, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.7826534110387384, |
| "grad_norm": 2.474700450897217, |
| "learning_rate": 0.00016248259004574534, |
| "loss": 1.1719, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.7895097703119642, |
| "grad_norm": 2.6934568881988525, |
| "learning_rate": 0.00016210802321384046, |
| "loss": 1.2663, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.7963661295851903, |
| "grad_norm": 2.020193099975586, |
| "learning_rate": 0.00016173203239322327, |
| "loss": 1.3247, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.8032224888584163, |
| "grad_norm": 2.375579357147217, |
| "learning_rate": 0.0001613546262044657, |
| "loss": 1.1168, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.810078848131642, |
| "grad_norm": 2.353813648223877, |
| "learning_rate": 0.00016097581330059074, |
| "loss": 1.2081, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.816935207404868, |
| "grad_norm": 1.9840728044509888, |
| "learning_rate": 0.00016059560236687408, |
| "loss": 1.1065, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.823791566678094, |
| "grad_norm": 2.407522201538086, |
| "learning_rate": 0.00016021400212064472, |
| "loss": 1.14, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.83064792595132, |
| "grad_norm": 1.937991976737976, |
| "learning_rate": 0.00015983102131108545, |
| "loss": 1.0747, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.8375042852245458, |
| "grad_norm": 2.0098466873168945, |
| "learning_rate": 0.000159446668719032, |
| "loss": 1.3052, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.8443606444977716, |
| "grad_norm": 1.9577100276947021, |
| "learning_rate": 0.00015906095315677173, |
| "loss": 1.2056, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.8512170037709976, |
| "grad_norm": 1.6589945554733276, |
| "learning_rate": 0.0001586738834678418, |
| "loss": 1.1571, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.8580733630442237, |
| "grad_norm": 2.461854934692383, |
| "learning_rate": 0.00015828546852682615, |
| "loss": 1.1748, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.8649297223174495, |
| "grad_norm": 3.0385873317718506, |
| "learning_rate": 0.00015789571723915223, |
| "loss": 1.1237, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.8717860815906753, |
| "grad_norm": 1.846217155456543, |
| "learning_rate": 0.00015750463854088666, |
| "loss": 1.0674, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.8786424408639013, |
| "grad_norm": 2.5026347637176514, |
| "learning_rate": 0.00015711224139853042, |
| "loss": 1.1815, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.8854988001371273, |
| "grad_norm": 2.515775203704834, |
| "learning_rate": 0.00015671853480881328, |
| "loss": 1.1674, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.8923551594103531, |
| "grad_norm": 3.0344481468200684, |
| "learning_rate": 0.00015632352779848755, |
| "loss": 1.1975, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.899211518683579, |
| "grad_norm": 1.853408932685852, |
| "learning_rate": 0.00015592722942412102, |
| "loss": 1.209, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.906067877956805, |
| "grad_norm": 2.2342731952667236, |
| "learning_rate": 0.00015552964877188935, |
| "loss": 1.1296, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.9129242372300308, |
| "grad_norm": 1.8905003070831299, |
| "learning_rate": 0.00015513079495736788, |
| "loss": 1.1877, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.9197805965032568, |
| "grad_norm": 2.1061089038848877, |
| "learning_rate": 0.00015473067712532245, |
| "loss": 1.1134, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.9266369557764826, |
| "grad_norm": 2.1238293647766113, |
| "learning_rate": 0.00015432930444949982, |
| "loss": 1.1978, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.9334933150497084, |
| "grad_norm": 1.7509264945983887, |
| "learning_rate": 0.0001539266861324173, |
| "loss": 1.1469, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.9403496743229345, |
| "grad_norm": 2.364008665084839, |
| "learning_rate": 0.00015352283140515177, |
| "loss": 1.1545, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.9472060335961605, |
| "grad_norm": 1.9981671571731567, |
| "learning_rate": 0.00015311774952712814, |
| "loss": 1.2245, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.9540623928693863, |
| "grad_norm": 3.0137462615966797, |
| "learning_rate": 0.00015271144978590685, |
| "loss": 1.2361, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.9609187521426121, |
| "grad_norm": 2.985872268676758, |
| "learning_rate": 0.00015230394149697108, |
| "loss": 1.2297, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.9677751114158382, |
| "grad_norm": 1.7955970764160156, |
| "learning_rate": 0.00015189523400351314, |
| "loss": 1.2132, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.9746314706890642, |
| "grad_norm": 2.0052096843719482, |
| "learning_rate": 0.0001514853366762202, |
| "loss": 1.0715, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.98148782996229, |
| "grad_norm": 3.2730658054351807, |
| "learning_rate": 0.00015107425891305946, |
| "loss": 1.3002, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.9883441892355158, |
| "grad_norm": 2.5514261722564697, |
| "learning_rate": 0.00015066201013906277, |
| "loss": 1.216, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.9952005485087418, |
| "grad_norm": 3.393329381942749, |
| "learning_rate": 0.00015024859980611048, |
| "loss": 1.2525, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.002056907781968, |
| "grad_norm": 1.7734365463256836, |
| "learning_rate": 0.00014983403739271455, |
| "loss": 1.0946, |
| "step": 2920 |
| }, |
| { |
| "epoch": 2.0089132670551937, |
| "grad_norm": 1.6522287130355835, |
| "learning_rate": 0.0001494183324038016, |
| "loss": 0.9668, |
| "step": 2930 |
| }, |
| { |
| "epoch": 2.0157696263284195, |
| "grad_norm": 1.8900648355484009, |
| "learning_rate": 0.00014900149437049463, |
| "loss": 0.9247, |
| "step": 2940 |
| }, |
| { |
| "epoch": 2.0226259856016453, |
| "grad_norm": 1.4394400119781494, |
| "learning_rate": 0.00014858353284989467, |
| "loss": 0.8676, |
| "step": 2950 |
| }, |
| { |
| "epoch": 2.0294823448748716, |
| "grad_norm": 1.6402225494384766, |
| "learning_rate": 0.00014816445742486177, |
| "loss": 0.7732, |
| "step": 2960 |
| }, |
| { |
| "epoch": 2.0363387041480974, |
| "grad_norm": 2.301037073135376, |
| "learning_rate": 0.0001477442777037949, |
| "loss": 0.8462, |
| "step": 2970 |
| }, |
| { |
| "epoch": 2.043195063421323, |
| "grad_norm": 2.0403075218200684, |
| "learning_rate": 0.00014732300332041215, |
| "loss": 0.7681, |
| "step": 2980 |
| }, |
| { |
| "epoch": 2.050051422694549, |
| "grad_norm": 2.1729094982147217, |
| "learning_rate": 0.00014690064393352943, |
| "loss": 1.0043, |
| "step": 2990 |
| }, |
| { |
| "epoch": 2.0569077819677752, |
| "grad_norm": 1.7256251573562622, |
| "learning_rate": 0.0001464772092268393, |
| "loss": 0.8462, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.0569077819677752, |
| "eval_loss": 1.2205281257629395, |
| "eval_runtime": 29.6488, |
| "eval_samples_per_second": 82.87, |
| "eval_steps_per_second": 10.388, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.063764141241001, |
| "grad_norm": 2.0800058841705322, |
| "learning_rate": 0.00014605270890868873, |
| "loss": 0.7895, |
| "step": 3010 |
| }, |
| { |
| "epoch": 2.070620500514227, |
| "grad_norm": 1.9822988510131836, |
| "learning_rate": 0.00014562715271185673, |
| "loss": 0.8707, |
| "step": 3020 |
| }, |
| { |
| "epoch": 2.0774768597874527, |
| "grad_norm": 2.0999081134796143, |
| "learning_rate": 0.00014520055039333101, |
| "loss": 0.8167, |
| "step": 3030 |
| }, |
| { |
| "epoch": 2.084333219060679, |
| "grad_norm": 3.332972288131714, |
| "learning_rate": 0.0001447729117340844, |
| "loss": 0.8117, |
| "step": 3040 |
| }, |
| { |
| "epoch": 2.0911895783339047, |
| "grad_norm": 2.6177289485931396, |
| "learning_rate": 0.0001443442465388505, |
| "loss": 0.8042, |
| "step": 3050 |
| }, |
| { |
| "epoch": 2.0980459376071305, |
| "grad_norm": 2.5687131881713867, |
| "learning_rate": 0.000143914564635899, |
| "loss": 0.8638, |
| "step": 3060 |
| }, |
| { |
| "epoch": 2.1049022968803563, |
| "grad_norm": 2.9838380813598633, |
| "learning_rate": 0.00014348387587681018, |
| "loss": 0.9421, |
| "step": 3070 |
| }, |
| { |
| "epoch": 2.1117586561535826, |
| "grad_norm": 3.1572110652923584, |
| "learning_rate": 0.00014305219013624918, |
| "loss": 0.8763, |
| "step": 3080 |
| }, |
| { |
| "epoch": 2.1186150154268084, |
| "grad_norm": 2.1814610958099365, |
| "learning_rate": 0.00014261951731173956, |
| "loss": 0.9218, |
| "step": 3090 |
| }, |
| { |
| "epoch": 2.125471374700034, |
| "grad_norm": 2.19307279586792, |
| "learning_rate": 0.00014218586732343635, |
| "loss": 0.8691, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.13232773397326, |
| "grad_norm": 2.0345587730407715, |
| "learning_rate": 0.00014175125011389858, |
| "loss": 0.9038, |
| "step": 3110 |
| }, |
| { |
| "epoch": 2.1391840932464863, |
| "grad_norm": 2.5638718605041504, |
| "learning_rate": 0.0001413156756478614, |
| "loss": 0.8381, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.146040452519712, |
| "grad_norm": 2.1037793159484863, |
| "learning_rate": 0.00014087915391200747, |
| "loss": 0.9794, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.152896811792938, |
| "grad_norm": 2.0018515586853027, |
| "learning_rate": 0.0001404416949147383, |
| "loss": 0.8893, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.1597531710661637, |
| "grad_norm": 4.70350456237793, |
| "learning_rate": 0.00014000330868594427, |
| "loss": 0.8194, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.16660953033939, |
| "grad_norm": 2.2104270458221436, |
| "learning_rate": 0.00013956400527677523, |
| "loss": 0.9157, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.1734658896126158, |
| "grad_norm": 2.0395448207855225, |
| "learning_rate": 0.00013912379475940963, |
| "loss": 0.9017, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.1803222488858416, |
| "grad_norm": 2.4973316192626953, |
| "learning_rate": 0.0001386826872268238, |
| "loss": 0.9304, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.1871786081590674, |
| "grad_norm": 2.2849769592285156, |
| "learning_rate": 0.00013824069279256052, |
| "loss": 0.828, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.1940349674322936, |
| "grad_norm": 2.458329916000366, |
| "learning_rate": 0.000137797821590497, |
| "loss": 0.8158, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.2008913267055195, |
| "grad_norm": 1.5932427644729614, |
| "learning_rate": 0.00013735408377461275, |
| "loss": 0.8592, |
| "step": 3210 |
| }, |
| { |
| "epoch": 2.2077476859787453, |
| "grad_norm": 2.6613569259643555, |
| "learning_rate": 0.00013690948951875658, |
| "loss": 0.8317, |
| "step": 3220 |
| }, |
| { |
| "epoch": 2.214604045251971, |
| "grad_norm": 1.6658825874328613, |
| "learning_rate": 0.00013646404901641358, |
| "loss": 0.8648, |
| "step": 3230 |
| }, |
| { |
| "epoch": 2.221460404525197, |
| "grad_norm": 2.0496561527252197, |
| "learning_rate": 0.00013601777248047105, |
| "loss": 0.8589, |
| "step": 3240 |
| }, |
| { |
| "epoch": 2.228316763798423, |
| "grad_norm": 3.063122510910034, |
| "learning_rate": 0.0001355706701429847, |
| "loss": 0.9327, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.235173123071649, |
| "grad_norm": 2.60986590385437, |
| "learning_rate": 0.00013512275225494377, |
| "loss": 0.9661, |
| "step": 3260 |
| }, |
| { |
| "epoch": 2.2420294823448748, |
| "grad_norm": 2.036770820617676, |
| "learning_rate": 0.00013467402908603622, |
| "loss": 0.7925, |
| "step": 3270 |
| }, |
| { |
| "epoch": 2.248885841618101, |
| "grad_norm": 2.920668125152588, |
| "learning_rate": 0.0001342245109244132, |
| "loss": 0.8235, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.255742200891327, |
| "grad_norm": 2.1719000339508057, |
| "learning_rate": 0.000133774208076453, |
| "loss": 0.8847, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.2625985601645526, |
| "grad_norm": 2.4449331760406494, |
| "learning_rate": 0.00013332313086652516, |
| "loss": 0.8658, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.2694549194377784, |
| "grad_norm": 2.6060760021209717, |
| "learning_rate": 0.00013287128963675312, |
| "loss": 0.7972, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.2763112787110042, |
| "grad_norm": 2.1217901706695557, |
| "learning_rate": 0.00013241869474677783, |
| "loss": 0.8716, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.2831676379842305, |
| "grad_norm": 2.239027738571167, |
| "learning_rate": 0.00013196535657351957, |
| "loss": 0.8919, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.2900239972574563, |
| "grad_norm": 2.4673471450805664, |
| "learning_rate": 0.00013151128551094064, |
| "loss": 0.8553, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.296880356530682, |
| "grad_norm": 2.783276319503784, |
| "learning_rate": 0.00013105649196980647, |
| "loss": 0.8081, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.303736715803908, |
| "grad_norm": 2.2374160289764404, |
| "learning_rate": 0.00013060098637744733, |
| "loss": 0.8908, |
| "step": 3360 |
| }, |
| { |
| "epoch": 2.310593075077134, |
| "grad_norm": 2.431093215942383, |
| "learning_rate": 0.00013014477917751912, |
| "loss": 1.0646, |
| "step": 3370 |
| }, |
| { |
| "epoch": 2.31744943435036, |
| "grad_norm": 3.1827499866485596, |
| "learning_rate": 0.00012968788082976386, |
| "loss": 0.8314, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.324305793623586, |
| "grad_norm": 2.4208121299743652, |
| "learning_rate": 0.00012923030180977005, |
| "loss": 0.8218, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.3311621528968116, |
| "grad_norm": 1.9898459911346436, |
| "learning_rate": 0.0001287720526087323, |
| "loss": 0.8163, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.338018512170038, |
| "grad_norm": 2.327742099761963, |
| "learning_rate": 0.00012831314373321084, |
| "loss": 0.8621, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.3448748714432637, |
| "grad_norm": 3.3728277683258057, |
| "learning_rate": 0.00012785358570489077, |
| "loss": 0.8402, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.3517312307164895, |
| "grad_norm": 2.3549559116363525, |
| "learning_rate": 0.00012739338906034062, |
| "loss": 0.9521, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.3585875899897153, |
| "grad_norm": 2.086442232131958, |
| "learning_rate": 0.00012693256435077093, |
| "loss": 0.9513, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.3654439492629415, |
| "grad_norm": 2.5228612422943115, |
| "learning_rate": 0.00012647112214179222, |
| "loss": 0.9159, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.3723003085361674, |
| "grad_norm": 4.470077991485596, |
| "learning_rate": 0.00012600907301317285, |
| "loss": 0.8976, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.379156667809393, |
| "grad_norm": 2.9393067359924316, |
| "learning_rate": 0.00012554642755859628, |
| "loss": 0.9191, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.386013027082619, |
| "grad_norm": 2.242415428161621, |
| "learning_rate": 0.0001250831963854185, |
| "loss": 0.7794, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.3928693863558452, |
| "grad_norm": 2.3584256172180176, |
| "learning_rate": 0.00012461939011442446, |
| "loss": 0.9089, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.399725745629071, |
| "grad_norm": 1.7331100702285767, |
| "learning_rate": 0.00012415501937958478, |
| "loss": 0.9748, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.406582104902297, |
| "grad_norm": 3.369680404663086, |
| "learning_rate": 0.00012369009482781192, |
| "loss": 0.8841, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.4134384641755227, |
| "grad_norm": 2.561638593673706, |
| "learning_rate": 0.000123224627118716, |
| "loss": 0.8507, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.4202948234487485, |
| "grad_norm": 1.9922767877578735, |
| "learning_rate": 0.00012275862692436048, |
| "loss": 0.9133, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.4271511827219747, |
| "grad_norm": 3.621152400970459, |
| "learning_rate": 0.00012229210492901738, |
| "loss": 0.7956, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.4340075419952005, |
| "grad_norm": 2.8023221492767334, |
| "learning_rate": 0.00012182507182892244, |
| "loss": 0.8476, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.4408639012684263, |
| "grad_norm": 2.273455858230591, |
| "learning_rate": 0.00012135753833202973, |
| "loss": 0.9277, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.4477202605416526, |
| "grad_norm": 1.6278008222579956, |
| "learning_rate": 0.00012088951515776634, |
| "loss": 0.9194, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.4545766198148784, |
| "grad_norm": 2.1401588916778564, |
| "learning_rate": 0.00012042101303678636, |
| "loss": 0.8345, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.461432979088104, |
| "grad_norm": 1.6796125173568726, |
| "learning_rate": 0.00011995204271072509, |
| "loss": 0.9335, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.46828933836133, |
| "grad_norm": 2.8594868183135986, |
| "learning_rate": 0.00011948261493195256, |
| "loss": 0.913, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.475145697634556, |
| "grad_norm": 2.4989187717437744, |
| "learning_rate": 0.0001190127404633272, |
| "loss": 0.9408, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.482002056907782, |
| "grad_norm": 4.41836404800415, |
| "learning_rate": 0.00011854243007794891, |
| "loss": 0.9526, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.488858416181008, |
| "grad_norm": 2.835994005203247, |
| "learning_rate": 0.00011807169455891216, |
| "loss": 0.8953, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.4957147754542337, |
| "grad_norm": 1.9034545421600342, |
| "learning_rate": 0.00011760054469905868, |
| "loss": 0.8837, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.50257113472746, |
| "grad_norm": 1.9930311441421509, |
| "learning_rate": 0.00011712899130072999, |
| "loss": 0.8693, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.5094274940006858, |
| "grad_norm": 1.874711513519287, |
| "learning_rate": 0.00011665704517551995, |
| "loss": 0.8614, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.5162838532739116, |
| "grad_norm": 3.2593679428100586, |
| "learning_rate": 0.00011618471714402656, |
| "loss": 0.8577, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.5231402125471374, |
| "grad_norm": 1.9729576110839844, |
| "learning_rate": 0.0001157120180356041, |
| "loss": 0.7806, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.529996571820363, |
| "grad_norm": 2.616779327392578, |
| "learning_rate": 0.00011523895868811472, |
| "loss": 0.9526, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.5368529310935894, |
| "grad_norm": 3.430968761444092, |
| "learning_rate": 0.00011476554994768001, |
| "loss": 0.8698, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.5437092903668153, |
| "grad_norm": 1.9257500171661377, |
| "learning_rate": 0.0001142918026684323, |
| "loss": 0.8879, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.550565649640041, |
| "grad_norm": 3.2250068187713623, |
| "learning_rate": 0.00011381772771226577, |
| "loss": 0.9508, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.5574220089132673, |
| "grad_norm": 2.567389726638794, |
| "learning_rate": 0.00011334333594858755, |
| "loss": 0.8863, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.564278368186493, |
| "grad_norm": 2.4031014442443848, |
| "learning_rate": 0.00011286863825406831, |
| "loss": 0.8951, |
| "step": 3740 |
| }, |
| { |
| "epoch": 2.571134727459719, |
| "grad_norm": 3.276573896408081, |
| "learning_rate": 0.000112393645512393, |
| "loss": 0.8829, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.5779910867329447, |
| "grad_norm": 3.0637712478637695, |
| "learning_rate": 0.00011191836861401137, |
| "loss": 0.985, |
| "step": 3760 |
| }, |
| { |
| "epoch": 2.5848474460061706, |
| "grad_norm": 2.7773642539978027, |
| "learning_rate": 0.00011144281845588811, |
| "loss": 0.9017, |
| "step": 3770 |
| }, |
| { |
| "epoch": 2.591703805279397, |
| "grad_norm": 2.187830686569214, |
| "learning_rate": 0.00011096700594125318, |
| "loss": 0.7401, |
| "step": 3780 |
| }, |
| { |
| "epoch": 2.5985601645526226, |
| "grad_norm": 3.42225980758667, |
| "learning_rate": 0.00011049094197935165, |
| "loss": 0.9513, |
| "step": 3790 |
| }, |
| { |
| "epoch": 2.6054165238258484, |
| "grad_norm": 2.064603567123413, |
| "learning_rate": 0.00011001463748519383, |
| "loss": 0.8678, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.6122728830990742, |
| "grad_norm": 2.707390308380127, |
| "learning_rate": 0.00010953810337930468, |
| "loss": 0.8812, |
| "step": 3810 |
| }, |
| { |
| "epoch": 2.6191292423723, |
| "grad_norm": 3.8909173011779785, |
| "learning_rate": 0.00010906135058747376, |
| "loss": 0.855, |
| "step": 3820 |
| }, |
| { |
| "epoch": 2.6259856016455263, |
| "grad_norm": 3.4854836463928223, |
| "learning_rate": 0.0001085843900405045, |
| "loss": 0.7692, |
| "step": 3830 |
| }, |
| { |
| "epoch": 2.632841960918752, |
| "grad_norm": 2.343348503112793, |
| "learning_rate": 0.00010810723267396366, |
| "loss": 0.8362, |
| "step": 3840 |
| }, |
| { |
| "epoch": 2.639698320191978, |
| "grad_norm": 2.7717478275299072, |
| "learning_rate": 0.00010762988942793065, |
| "loss": 1.0403, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.646554679465204, |
| "grad_norm": 3.4452064037323, |
| "learning_rate": 0.00010715237124674658, |
| "loss": 0.8948, |
| "step": 3860 |
| }, |
| { |
| "epoch": 2.65341103873843, |
| "grad_norm": 2.542163133621216, |
| "learning_rate": 0.00010667468907876348, |
| "loss": 0.8332, |
| "step": 3870 |
| }, |
| { |
| "epoch": 2.660267398011656, |
| "grad_norm": 3.4537532329559326, |
| "learning_rate": 0.00010619685387609313, |
| "loss": 0.9012, |
| "step": 3880 |
| }, |
| { |
| "epoch": 2.6671237572848816, |
| "grad_norm": 2.38268780708313, |
| "learning_rate": 0.00010571887659435614, |
| "loss": 0.8836, |
| "step": 3890 |
| }, |
| { |
| "epoch": 2.6739801165581074, |
| "grad_norm": 2.2005438804626465, |
| "learning_rate": 0.00010524076819243051, |
| "loss": 0.928, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.6808364758313337, |
| "grad_norm": 2.7791340351104736, |
| "learning_rate": 0.00010476253963220062, |
| "loss": 0.8545, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.6876928351045595, |
| "grad_norm": 1.3345112800598145, |
| "learning_rate": 0.00010428420187830581, |
| "loss": 0.757, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.6945491943777853, |
| "grad_norm": 2.2857022285461426, |
| "learning_rate": 0.00010380576589788884, |
| "loss": 0.7812, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.7014055536510115, |
| "grad_norm": 2.7016501426696777, |
| "learning_rate": 0.00010332724266034472, |
| "loss": 0.7387, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.7082619129242373, |
| "grad_norm": 2.9767794609069824, |
| "learning_rate": 0.00010284864313706894, |
| "loss": 0.9737, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.715118272197463, |
| "grad_norm": 2.667555570602417, |
| "learning_rate": 0.00010236997830120614, |
| "loss": 0.8329, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.721974631470689, |
| "grad_norm": 1.9413018226623535, |
| "learning_rate": 0.00010189125912739832, |
| "loss": 0.8278, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.7288309907439148, |
| "grad_norm": 2.892408609390259, |
| "learning_rate": 0.0001014124965915334, |
| "loss": 0.8695, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.735687350017141, |
| "grad_norm": 2.4925520420074463, |
| "learning_rate": 0.00010093370167049343, |
| "loss": 0.8573, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.742543709290367, |
| "grad_norm": 2.353090524673462, |
| "learning_rate": 0.00010045488534190303, |
| "loss": 0.8322, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.742543709290367, |
| "eval_loss": 1.2436245679855347, |
| "eval_runtime": 29.4642, |
| "eval_samples_per_second": 83.389, |
| "eval_steps_per_second": 10.453, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.7494000685635926, |
| "grad_norm": 2.932257890701294, |
| "learning_rate": 9.997605858387764e-05, |
| "loss": 0.9329, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.756256427836819, |
| "grad_norm": 1.6092489957809448, |
| "learning_rate": 9.949723237477173e-05, |
| "loss": 0.8408, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.7631127871100447, |
| "grad_norm": 2.919813871383667, |
| "learning_rate": 9.901841769292733e-05, |
| "loss": 0.8821, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.7699691463832705, |
| "grad_norm": 2.4160959720611572, |
| "learning_rate": 9.853962551642204e-05, |
| "loss": 0.8638, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.7768255056564963, |
| "grad_norm": 2.036405563354492, |
| "learning_rate": 9.806086682281758e-05, |
| "loss": 0.8286, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.783681864929722, |
| "grad_norm": 2.90258526802063, |
| "learning_rate": 9.758215258890787e-05, |
| "loss": 0.8771, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.7905382242029484, |
| "grad_norm": 2.936398506164551, |
| "learning_rate": 9.710349379046762e-05, |
| "loss": 0.8782, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.797394583476174, |
| "grad_norm": 2.3748672008514404, |
| "learning_rate": 9.662490140200038e-05, |
| "loss": 0.9046, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.8042509427494, |
| "grad_norm": 2.2428438663482666, |
| "learning_rate": 9.614638639648719e-05, |
| "loss": 0.8763, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.811107302022626, |
| "grad_norm": 2.7683987617492676, |
| "learning_rate": 9.566795974513489e-05, |
| "loss": 0.8338, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.8179636612958516, |
| "grad_norm": 3.3809077739715576, |
| "learning_rate": 9.518963241712445e-05, |
| "loss": 0.9071, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.824820020569078, |
| "grad_norm": 3.275303602218628, |
| "learning_rate": 9.471141537935974e-05, |
| "loss": 0.969, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.8316763798423037, |
| "grad_norm": 2.0883727073669434, |
| "learning_rate": 9.423331959621582e-05, |
| "loss": 0.8391, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.8385327391155295, |
| "grad_norm": 2.7466342449188232, |
| "learning_rate": 9.375535602928776e-05, |
| "loss": 0.9003, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.8453890983887558, |
| "grad_norm": 4.680114269256592, |
| "learning_rate": 9.327753563713913e-05, |
| "loss": 0.8568, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.8522454576619816, |
| "grad_norm": 1.9148244857788086, |
| "learning_rate": 9.279986937505096e-05, |
| "loss": 0.8435, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.8591018169352074, |
| "grad_norm": 2.7942795753479004, |
| "learning_rate": 9.232236819477038e-05, |
| "loss": 0.8624, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.865958176208433, |
| "grad_norm": 2.342534065246582, |
| "learning_rate": 9.184504304425958e-05, |
| "loss": 0.9329, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.872814535481659, |
| "grad_norm": 2.486449718475342, |
| "learning_rate": 9.136790486744482e-05, |
| "loss": 0.9163, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.8796708947548852, |
| "grad_norm": 2.3326776027679443, |
| "learning_rate": 9.089096460396552e-05, |
| "loss": 0.7974, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.886527254028111, |
| "grad_norm": 2.1666274070739746, |
| "learning_rate": 9.041423318892339e-05, |
| "loss": 0.9513, |
| "step": 4210 |
| }, |
| { |
| "epoch": 2.893383613301337, |
| "grad_norm": 2.0120041370391846, |
| "learning_rate": 8.993772155263175e-05, |
| "loss": 0.8523, |
| "step": 4220 |
| }, |
| { |
| "epoch": 2.900239972574563, |
| "grad_norm": 3.034512996673584, |
| "learning_rate": 8.946144062036496e-05, |
| "loss": 0.904, |
| "step": 4230 |
| }, |
| { |
| "epoch": 2.907096331847789, |
| "grad_norm": 2.084458589553833, |
| "learning_rate": 8.89854013121078e-05, |
| "loss": 0.8582, |
| "step": 4240 |
| }, |
| { |
| "epoch": 2.9139526911210147, |
| "grad_norm": 2.5334362983703613, |
| "learning_rate": 8.850961454230526e-05, |
| "loss": 0.9028, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.9208090503942405, |
| "grad_norm": 2.835369348526001, |
| "learning_rate": 8.803409121961226e-05, |
| "loss": 0.8264, |
| "step": 4260 |
| }, |
| { |
| "epoch": 2.9276654096674664, |
| "grad_norm": 2.492717742919922, |
| "learning_rate": 8.755884224664342e-05, |
| "loss": 0.8943, |
| "step": 4270 |
| }, |
| { |
| "epoch": 2.9345217689406926, |
| "grad_norm": 1.9058958292007446, |
| "learning_rate": 8.708387851972313e-05, |
| "loss": 0.7474, |
| "step": 4280 |
| }, |
| { |
| "epoch": 2.9413781282139184, |
| "grad_norm": 3.058417558670044, |
| "learning_rate": 8.660921092863596e-05, |
| "loss": 0.8995, |
| "step": 4290 |
| }, |
| { |
| "epoch": 2.9482344874871442, |
| "grad_norm": 2.5806403160095215, |
| "learning_rate": 8.613485035637662e-05, |
| "loss": 0.7901, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.9550908467603705, |
| "grad_norm": 3.59805965423584, |
| "learning_rate": 8.566080767890069e-05, |
| "loss": 0.7556, |
| "step": 4310 |
| }, |
| { |
| "epoch": 2.9619472060335963, |
| "grad_norm": 1.853975772857666, |
| "learning_rate": 8.518709376487515e-05, |
| "loss": 0.9284, |
| "step": 4320 |
| }, |
| { |
| "epoch": 2.968803565306822, |
| "grad_norm": 3.123701810836792, |
| "learning_rate": 8.471371947542924e-05, |
| "loss": 0.8234, |
| "step": 4330 |
| }, |
| { |
| "epoch": 2.975659924580048, |
| "grad_norm": 2.0518481731414795, |
| "learning_rate": 8.424069566390541e-05, |
| "loss": 0.9438, |
| "step": 4340 |
| }, |
| { |
| "epoch": 2.9825162838532737, |
| "grad_norm": 1.894871473312378, |
| "learning_rate": 8.376803317561048e-05, |
| "loss": 0.9597, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.9893726431265, |
| "grad_norm": 2.3247461318969727, |
| "learning_rate": 8.329574284756704e-05, |
| "loss": 0.7713, |
| "step": 4360 |
| }, |
| { |
| "epoch": 2.996229002399726, |
| "grad_norm": 1.6081085205078125, |
| "learning_rate": 8.282383550826483e-05, |
| "loss": 0.9166, |
| "step": 4370 |
| }, |
| { |
| "epoch": 3.0030853616729516, |
| "grad_norm": 1.5519185066223145, |
| "learning_rate": 8.23523219774127e-05, |
| "loss": 0.8393, |
| "step": 4380 |
| }, |
| { |
| "epoch": 3.0099417209461774, |
| "grad_norm": 2.066254138946533, |
| "learning_rate": 8.188121306569028e-05, |
| "loss": 0.6205, |
| "step": 4390 |
| }, |
| { |
| "epoch": 3.0167980802194037, |
| "grad_norm": 2.129615306854248, |
| "learning_rate": 8.141051957450039e-05, |
| "loss": 0.5888, |
| "step": 4400 |
| }, |
| { |
| "epoch": 3.0236544394926295, |
| "grad_norm": 2.3542020320892334, |
| "learning_rate": 8.09402522957211e-05, |
| "loss": 0.5694, |
| "step": 4410 |
| }, |
| { |
| "epoch": 3.0305107987658553, |
| "grad_norm": 2.1178841590881348, |
| "learning_rate": 8.04704220114586e-05, |
| "loss": 0.6532, |
| "step": 4420 |
| }, |
| { |
| "epoch": 3.037367158039081, |
| "grad_norm": 2.2024478912353516, |
| "learning_rate": 8.00010394937997e-05, |
| "loss": 0.5173, |
| "step": 4430 |
| }, |
| { |
| "epoch": 3.0442235173123073, |
| "grad_norm": 3.5857062339782715, |
| "learning_rate": 7.953211550456507e-05, |
| "loss": 0.6258, |
| "step": 4440 |
| }, |
| { |
| "epoch": 3.051079876585533, |
| "grad_norm": 2.69157338142395, |
| "learning_rate": 7.906366079506244e-05, |
| "loss": 0.5143, |
| "step": 4450 |
| }, |
| { |
| "epoch": 3.057936235858759, |
| "grad_norm": 2.82023286819458, |
| "learning_rate": 7.859568610583998e-05, |
| "loss": 0.5554, |
| "step": 4460 |
| }, |
| { |
| "epoch": 3.0647925951319848, |
| "grad_norm": 2.121466636657715, |
| "learning_rate": 7.812820216644024e-05, |
| "loss": 0.621, |
| "step": 4470 |
| }, |
| { |
| "epoch": 3.071648954405211, |
| "grad_norm": 2.0588443279266357, |
| "learning_rate": 7.766121969515397e-05, |
| "loss": 0.5973, |
| "step": 4480 |
| }, |
| { |
| "epoch": 3.078505313678437, |
| "grad_norm": 2.1448562145233154, |
| "learning_rate": 7.719474939877451e-05, |
| "loss": 0.5669, |
| "step": 4490 |
| }, |
| { |
| "epoch": 3.0853616729516626, |
| "grad_norm": 2.030393123626709, |
| "learning_rate": 7.672880197235222e-05, |
| "loss": 0.805, |
| "step": 4500 |
| }, |
| { |
| "epoch": 3.0922180322248884, |
| "grad_norm": 3.153280019760132, |
| "learning_rate": 7.626338809894932e-05, |
| "loss": 0.7532, |
| "step": 4510 |
| }, |
| { |
| "epoch": 3.0990743914981147, |
| "grad_norm": 2.932528257369995, |
| "learning_rate": 7.579851844939491e-05, |
| "loss": 0.5241, |
| "step": 4520 |
| }, |
| { |
| "epoch": 3.1059307507713405, |
| "grad_norm": 2.2017295360565186, |
| "learning_rate": 7.533420368204036e-05, |
| "loss": 0.5952, |
| "step": 4530 |
| }, |
| { |
| "epoch": 3.1127871100445663, |
| "grad_norm": 4.451969146728516, |
| "learning_rate": 7.487045444251493e-05, |
| "loss": 0.6306, |
| "step": 4540 |
| }, |
| { |
| "epoch": 3.119643469317792, |
| "grad_norm": 2.1478543281555176, |
| "learning_rate": 7.440728136348158e-05, |
| "loss": 0.7007, |
| "step": 4550 |
| }, |
| { |
| "epoch": 3.126499828591018, |
| "grad_norm": 2.6645448207855225, |
| "learning_rate": 7.394469506439346e-05, |
| "loss": 0.6055, |
| "step": 4560 |
| }, |
| { |
| "epoch": 3.133356187864244, |
| "grad_norm": 2.741107702255249, |
| "learning_rate": 7.348270615125006e-05, |
| "loss": 0.5782, |
| "step": 4570 |
| }, |
| { |
| "epoch": 3.14021254713747, |
| "grad_norm": 2.3291351795196533, |
| "learning_rate": 7.302132521635438e-05, |
| "loss": 0.6248, |
| "step": 4580 |
| }, |
| { |
| "epoch": 3.147068906410696, |
| "grad_norm": 2.323695421218872, |
| "learning_rate": 7.256056283806986e-05, |
| "loss": 0.6929, |
| "step": 4590 |
| }, |
| { |
| "epoch": 3.153925265683922, |
| "grad_norm": 3.022197723388672, |
| "learning_rate": 7.210042958057794e-05, |
| "loss": 0.6514, |
| "step": 4600 |
| }, |
| { |
| "epoch": 3.160781624957148, |
| "grad_norm": 2.103457450866699, |
| "learning_rate": 7.164093599363585e-05, |
| "loss": 0.6308, |
| "step": 4610 |
| }, |
| { |
| "epoch": 3.1676379842303737, |
| "grad_norm": 3.9690616130828857, |
| "learning_rate": 7.118209261233461e-05, |
| "loss": 0.6485, |
| "step": 4620 |
| }, |
| { |
| "epoch": 3.1744943435035995, |
| "grad_norm": 3.117914915084839, |
| "learning_rate": 7.072390995685769e-05, |
| "loss": 0.6061, |
| "step": 4630 |
| }, |
| { |
| "epoch": 3.1813507027768253, |
| "grad_norm": 2.1254525184631348, |
| "learning_rate": 7.026639853223958e-05, |
| "loss": 0.6515, |
| "step": 4640 |
| }, |
| { |
| "epoch": 3.1882070620500516, |
| "grad_norm": 3.2843899726867676, |
| "learning_rate": 6.980956882812515e-05, |
| "loss": 0.6282, |
| "step": 4650 |
| }, |
| { |
| "epoch": 3.1950634213232774, |
| "grad_norm": 2.0985405445098877, |
| "learning_rate": 6.935343131852899e-05, |
| "loss": 0.6912, |
| "step": 4660 |
| }, |
| { |
| "epoch": 3.201919780596503, |
| "grad_norm": 2.0823545455932617, |
| "learning_rate": 6.889799646159534e-05, |
| "loss": 0.6898, |
| "step": 4670 |
| }, |
| { |
| "epoch": 3.208776139869729, |
| "grad_norm": 2.333568811416626, |
| "learning_rate": 6.844327469935827e-05, |
| "loss": 0.5982, |
| "step": 4680 |
| }, |
| { |
| "epoch": 3.2156324991429552, |
| "grad_norm": 3.6117570400238037, |
| "learning_rate": 6.79892764575023e-05, |
| "loss": 0.6153, |
| "step": 4690 |
| }, |
| { |
| "epoch": 3.222488858416181, |
| "grad_norm": 2.7832589149475098, |
| "learning_rate": 6.753601214512343e-05, |
| "loss": 0.6015, |
| "step": 4700 |
| }, |
| { |
| "epoch": 3.229345217689407, |
| "grad_norm": 2.1461801528930664, |
| "learning_rate": 6.708349215449025e-05, |
| "loss": 0.6104, |
| "step": 4710 |
| }, |
| { |
| "epoch": 3.2362015769626327, |
| "grad_norm": 2.373319625854492, |
| "learning_rate": 6.6631726860806e-05, |
| "loss": 0.5886, |
| "step": 4720 |
| }, |
| { |
| "epoch": 3.243057936235859, |
| "grad_norm": 3.069390058517456, |
| "learning_rate": 6.618072662197039e-05, |
| "loss": 0.5351, |
| "step": 4730 |
| }, |
| { |
| "epoch": 3.2499142955090847, |
| "grad_norm": 2.87705135345459, |
| "learning_rate": 6.573050177834233e-05, |
| "loss": 0.6533, |
| "step": 4740 |
| }, |
| { |
| "epoch": 3.2567706547823105, |
| "grad_norm": 2.443903923034668, |
| "learning_rate": 6.528106265250271e-05, |
| "loss": 0.6517, |
| "step": 4750 |
| }, |
| { |
| "epoch": 3.2636270140555363, |
| "grad_norm": 1.977824091911316, |
| "learning_rate": 6.483241954901785e-05, |
| "loss": 0.5984, |
| "step": 4760 |
| }, |
| { |
| "epoch": 3.2704833733287626, |
| "grad_norm": 3.083531379699707, |
| "learning_rate": 6.438458275420309e-05, |
| "loss": 0.6077, |
| "step": 4770 |
| }, |
| { |
| "epoch": 3.2773397326019884, |
| "grad_norm": 3.2755494117736816, |
| "learning_rate": 6.393756253588714e-05, |
| "loss": 0.5889, |
| "step": 4780 |
| }, |
| { |
| "epoch": 3.284196091875214, |
| "grad_norm": 2.7298457622528076, |
| "learning_rate": 6.349136914317652e-05, |
| "loss": 0.5221, |
| "step": 4790 |
| }, |
| { |
| "epoch": 3.29105245114844, |
| "grad_norm": 2.45281720161438, |
| "learning_rate": 6.304601280622055e-05, |
| "loss": 0.6113, |
| "step": 4800 |
| }, |
| { |
| "epoch": 3.2979088104216663, |
| "grad_norm": 3.0578701496124268, |
| "learning_rate": 6.260150373597697e-05, |
| "loss": 0.6029, |
| "step": 4810 |
| }, |
| { |
| "epoch": 3.304765169694892, |
| "grad_norm": 3.420334577560425, |
| "learning_rate": 6.21578521239776e-05, |
| "loss": 0.4964, |
| "step": 4820 |
| }, |
| { |
| "epoch": 3.311621528968118, |
| "grad_norm": 2.0759449005126953, |
| "learning_rate": 6.171506814209489e-05, |
| "loss": 0.6309, |
| "step": 4830 |
| }, |
| { |
| "epoch": 3.3184778882413437, |
| "grad_norm": 3.347933530807495, |
| "learning_rate": 6.127316194230854e-05, |
| "loss": 0.5931, |
| "step": 4840 |
| }, |
| { |
| "epoch": 3.3253342475145695, |
| "grad_norm": 2.531506061553955, |
| "learning_rate": 6.083214365647285e-05, |
| "loss": 0.616, |
| "step": 4850 |
| }, |
| { |
| "epoch": 3.3321906067877958, |
| "grad_norm": 2.149945020675659, |
| "learning_rate": 6.039202339608432e-05, |
| "loss": 0.5663, |
| "step": 4860 |
| }, |
| { |
| "epoch": 3.3390469660610216, |
| "grad_norm": 2.5064194202423096, |
| "learning_rate": 5.99528112520499e-05, |
| "loss": 0.7054, |
| "step": 4870 |
| }, |
| { |
| "epoch": 3.3459033253342474, |
| "grad_norm": 2.7665531635284424, |
| "learning_rate": 5.951451729445563e-05, |
| "loss": 0.5341, |
| "step": 4880 |
| }, |
| { |
| "epoch": 3.3527596846074736, |
| "grad_norm": 2.899704694747925, |
| "learning_rate": 5.907715157233563e-05, |
| "loss": 0.6004, |
| "step": 4890 |
| }, |
| { |
| "epoch": 3.3596160438806995, |
| "grad_norm": 2.8619532585144043, |
| "learning_rate": 5.8640724113441925e-05, |
| "loss": 0.657, |
| "step": 4900 |
| }, |
| { |
| "epoch": 3.3664724031539253, |
| "grad_norm": 2.754093647003174, |
| "learning_rate": 5.820524492401428e-05, |
| "loss": 0.6386, |
| "step": 4910 |
| }, |
| { |
| "epoch": 3.373328762427151, |
| "grad_norm": 2.501052141189575, |
| "learning_rate": 5.777072398855101e-05, |
| "loss": 0.6552, |
| "step": 4920 |
| }, |
| { |
| "epoch": 3.380185121700377, |
| "grad_norm": 4.3751983642578125, |
| "learning_rate": 5.7337171269579895e-05, |
| "loss": 0.6642, |
| "step": 4930 |
| }, |
| { |
| "epoch": 3.387041480973603, |
| "grad_norm": 3.558741331100464, |
| "learning_rate": 5.690459670742977e-05, |
| "loss": 0.5393, |
| "step": 4940 |
| }, |
| { |
| "epoch": 3.393897840246829, |
| "grad_norm": 2.205996036529541, |
| "learning_rate": 5.647301022000284e-05, |
| "loss": 0.5735, |
| "step": 4950 |
| }, |
| { |
| "epoch": 3.4007541995200548, |
| "grad_norm": 3.4519693851470947, |
| "learning_rate": 5.6042421702546956e-05, |
| "loss": 0.6545, |
| "step": 4960 |
| }, |
| { |
| "epoch": 3.407610558793281, |
| "grad_norm": 2.279060125350952, |
| "learning_rate": 5.561284102742892e-05, |
| "loss": 0.5327, |
| "step": 4970 |
| }, |
| { |
| "epoch": 3.414466918066507, |
| "grad_norm": 2.359433174133301, |
| "learning_rate": 5.51842780439082e-05, |
| "loss": 0.587, |
| "step": 4980 |
| }, |
| { |
| "epoch": 3.4213232773397326, |
| "grad_norm": 2.9181036949157715, |
| "learning_rate": 5.475674257791097e-05, |
| "loss": 0.6117, |
| "step": 4990 |
| }, |
| { |
| "epoch": 3.4281796366129584, |
| "grad_norm": 3.4551374912261963, |
| "learning_rate": 5.433024443180486e-05, |
| "loss": 0.5623, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.4281796366129584, |
| "eval_loss": 1.4529434442520142, |
| "eval_runtime": 29.3836, |
| "eval_samples_per_second": 83.618, |
| "eval_steps_per_second": 10.482, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.4350359958861842, |
| "grad_norm": 2.4626495838165283, |
| "learning_rate": 5.3904793384174226e-05, |
| "loss": 0.6439, |
| "step": 5010 |
| }, |
| { |
| "epoch": 3.4418923551594105, |
| "grad_norm": 4.081802845001221, |
| "learning_rate": 5.348039918959604e-05, |
| "loss": 0.633, |
| "step": 5020 |
| }, |
| { |
| "epoch": 3.4487487144326363, |
| "grad_norm": 2.2953267097473145, |
| "learning_rate": 5.30570715784161e-05, |
| "loss": 0.6224, |
| "step": 5030 |
| }, |
| { |
| "epoch": 3.455605073705862, |
| "grad_norm": 2.835130214691162, |
| "learning_rate": 5.263482025652591e-05, |
| "loss": 0.6399, |
| "step": 5040 |
| }, |
| { |
| "epoch": 3.462461432979088, |
| "grad_norm": 3.4341225624084473, |
| "learning_rate": 5.221365490514041e-05, |
| "loss": 0.5624, |
| "step": 5050 |
| }, |
| { |
| "epoch": 3.469317792252314, |
| "grad_norm": 3.069215774536133, |
| "learning_rate": 5.1793585180575685e-05, |
| "loss": 0.7112, |
| "step": 5060 |
| }, |
| { |
| "epoch": 3.47617415152554, |
| "grad_norm": 2.27093768119812, |
| "learning_rate": 5.137462071402778e-05, |
| "loss": 0.6722, |
| "step": 5070 |
| }, |
| { |
| "epoch": 3.483030510798766, |
| "grad_norm": 2.0508460998535156, |
| "learning_rate": 5.095677111135172e-05, |
| "loss": 0.6451, |
| "step": 5080 |
| }, |
| { |
| "epoch": 3.4898868700719916, |
| "grad_norm": 2.6624417304992676, |
| "learning_rate": 5.054004595284153e-05, |
| "loss": 0.5707, |
| "step": 5090 |
| }, |
| { |
| "epoch": 3.496743229345218, |
| "grad_norm": 2.8987631797790527, |
| "learning_rate": 5.012445479301027e-05, |
| "loss": 0.6265, |
| "step": 5100 |
| }, |
| { |
| "epoch": 3.5035995886184437, |
| "grad_norm": 2.9133450984954834, |
| "learning_rate": 4.971000716037116e-05, |
| "loss": 0.5917, |
| "step": 5110 |
| }, |
| { |
| "epoch": 3.5104559478916695, |
| "grad_norm": 2.526829481124878, |
| "learning_rate": 4.929671255721906e-05, |
| "loss": 0.5988, |
| "step": 5120 |
| }, |
| { |
| "epoch": 3.5173123071648953, |
| "grad_norm": 3.506739854812622, |
| "learning_rate": 4.888458045941269e-05, |
| "loss": 0.5997, |
| "step": 5130 |
| }, |
| { |
| "epoch": 3.524168666438121, |
| "grad_norm": 2.5815718173980713, |
| "learning_rate": 4.84736203161572e-05, |
| "loss": 0.6613, |
| "step": 5140 |
| }, |
| { |
| "epoch": 3.5310250257113474, |
| "grad_norm": 1.7911795377731323, |
| "learning_rate": 4.806384154978766e-05, |
| "loss": 0.6121, |
| "step": 5150 |
| }, |
| { |
| "epoch": 3.537881384984573, |
| "grad_norm": 3.8519790172576904, |
| "learning_rate": 4.7655253555553e-05, |
| "loss": 0.5567, |
| "step": 5160 |
| }, |
| { |
| "epoch": 3.544737744257799, |
| "grad_norm": 3.618427038192749, |
| "learning_rate": 4.724786570140056e-05, |
| "loss": 0.6715, |
| "step": 5170 |
| }, |
| { |
| "epoch": 3.5515941035310252, |
| "grad_norm": 2.0794076919555664, |
| "learning_rate": 4.684168732776132e-05, |
| "loss": 0.5609, |
| "step": 5180 |
| }, |
| { |
| "epoch": 3.558450462804251, |
| "grad_norm": 2.582932472229004, |
| "learning_rate": 4.6436727747335864e-05, |
| "loss": 0.6279, |
| "step": 5190 |
| }, |
| { |
| "epoch": 3.565306822077477, |
| "grad_norm": 2.6992735862731934, |
| "learning_rate": 4.6032996244880634e-05, |
| "loss": 0.639, |
| "step": 5200 |
| }, |
| { |
| "epoch": 3.5721631813507027, |
| "grad_norm": 3.3372395038604736, |
| "learning_rate": 4.563050207699519e-05, |
| "loss": 0.7179, |
| "step": 5210 |
| }, |
| { |
| "epoch": 3.5790195406239285, |
| "grad_norm": 1.9892611503601074, |
| "learning_rate": 4.522925447191005e-05, |
| "loss": 0.583, |
| "step": 5220 |
| }, |
| { |
| "epoch": 3.5858758998971547, |
| "grad_norm": 1.765522837638855, |
| "learning_rate": 4.4829262629274956e-05, |
| "loss": 0.5503, |
| "step": 5230 |
| }, |
| { |
| "epoch": 3.5927322591703805, |
| "grad_norm": 2.380490303039551, |
| "learning_rate": 4.443053571994803e-05, |
| "loss": 0.555, |
| "step": 5240 |
| }, |
| { |
| "epoch": 3.5995886184436063, |
| "grad_norm": 2.7402896881103516, |
| "learning_rate": 4.403308288578544e-05, |
| "loss": 0.5897, |
| "step": 5250 |
| }, |
| { |
| "epoch": 3.6064449777168326, |
| "grad_norm": 2.248931407928467, |
| "learning_rate": 4.3636913239431966e-05, |
| "loss": 0.5743, |
| "step": 5260 |
| }, |
| { |
| "epoch": 3.6133013369900584, |
| "grad_norm": 2.618868112564087, |
| "learning_rate": 4.324203586411186e-05, |
| "loss": 0.6093, |
| "step": 5270 |
| }, |
| { |
| "epoch": 3.620157696263284, |
| "grad_norm": 2.2212753295898438, |
| "learning_rate": 4.2848459813420724e-05, |
| "loss": 0.6123, |
| "step": 5280 |
| }, |
| { |
| "epoch": 3.62701405553651, |
| "grad_norm": 2.1903445720672607, |
| "learning_rate": 4.245619411111785e-05, |
| "loss": 0.6425, |
| "step": 5290 |
| }, |
| { |
| "epoch": 3.633870414809736, |
| "grad_norm": 2.3697597980499268, |
| "learning_rate": 4.2065247750919455e-05, |
| "loss": 0.7007, |
| "step": 5300 |
| }, |
| { |
| "epoch": 3.640726774082962, |
| "grad_norm": 2.1647536754608154, |
| "learning_rate": 4.167562969629233e-05, |
| "loss": 0.5604, |
| "step": 5310 |
| }, |
| { |
| "epoch": 3.647583133356188, |
| "grad_norm": 3.097696542739868, |
| "learning_rate": 4.128734888024833e-05, |
| "loss": 0.6153, |
| "step": 5320 |
| }, |
| { |
| "epoch": 3.6544394926294137, |
| "grad_norm": 3.1014564037323, |
| "learning_rate": 4.090041420513978e-05, |
| "loss": 0.5866, |
| "step": 5330 |
| }, |
| { |
| "epoch": 3.66129585190264, |
| "grad_norm": 3.541137456893921, |
| "learning_rate": 4.0514834542455085e-05, |
| "loss": 0.7697, |
| "step": 5340 |
| }, |
| { |
| "epoch": 3.6681522111758658, |
| "grad_norm": 2.912750244140625, |
| "learning_rate": 4.0130618732615467e-05, |
| "loss": 0.6285, |
| "step": 5350 |
| }, |
| { |
| "epoch": 3.6750085704490916, |
| "grad_norm": 3.2789480686187744, |
| "learning_rate": 3.974777558477224e-05, |
| "loss": 0.5937, |
| "step": 5360 |
| }, |
| { |
| "epoch": 3.6818649297223174, |
| "grad_norm": 2.6167523860931396, |
| "learning_rate": 3.9366313876604966e-05, |
| "loss": 0.6394, |
| "step": 5370 |
| }, |
| { |
| "epoch": 3.688721288995543, |
| "grad_norm": 2.452730178833008, |
| "learning_rate": 3.898624235411997e-05, |
| "loss": 0.634, |
| "step": 5380 |
| }, |
| { |
| "epoch": 3.6955776482687694, |
| "grad_norm": 2.2829294204711914, |
| "learning_rate": 3.860756973144996e-05, |
| "loss": 0.6577, |
| "step": 5390 |
| }, |
| { |
| "epoch": 3.7024340075419953, |
| "grad_norm": 2.3391597270965576, |
| "learning_rate": 3.8230304690654304e-05, |
| "loss": 0.6229, |
| "step": 5400 |
| }, |
| { |
| "epoch": 3.709290366815221, |
| "grad_norm": 2.8178772926330566, |
| "learning_rate": 3.7854455881519757e-05, |
| "loss": 0.6546, |
| "step": 5410 |
| }, |
| { |
| "epoch": 3.716146726088447, |
| "grad_norm": 2.5491912364959717, |
| "learning_rate": 3.7480031921362316e-05, |
| "loss": 0.5136, |
| "step": 5420 |
| }, |
| { |
| "epoch": 3.7230030853616727, |
| "grad_norm": 3.2480504512786865, |
| "learning_rate": 3.7107041394829556e-05, |
| "loss": 0.6712, |
| "step": 5430 |
| }, |
| { |
| "epoch": 3.729859444634899, |
| "grad_norm": 2.2545714378356934, |
| "learning_rate": 3.673549285370395e-05, |
| "loss": 0.6131, |
| "step": 5440 |
| }, |
| { |
| "epoch": 3.7367158039081247, |
| "grad_norm": 2.1797900199890137, |
| "learning_rate": 3.636539481670656e-05, |
| "loss": 0.5145, |
| "step": 5450 |
| }, |
| { |
| "epoch": 3.7435721631813506, |
| "grad_norm": 2.1412277221679688, |
| "learning_rate": 3.5996755769301904e-05, |
| "loss": 0.6453, |
| "step": 5460 |
| }, |
| { |
| "epoch": 3.750428522454577, |
| "grad_norm": 2.3049139976501465, |
| "learning_rate": 3.562958416350334e-05, |
| "loss": 0.5419, |
| "step": 5470 |
| }, |
| { |
| "epoch": 3.7572848817278026, |
| "grad_norm": 2.415832281112671, |
| "learning_rate": 3.526388841767934e-05, |
| "loss": 0.6422, |
| "step": 5480 |
| }, |
| { |
| "epoch": 3.7641412410010284, |
| "grad_norm": 1.7117339372634888, |
| "learning_rate": 3.489967691636038e-05, |
| "loss": 0.6086, |
| "step": 5490 |
| }, |
| { |
| "epoch": 3.7709976002742542, |
| "grad_norm": 3.3377764225006104, |
| "learning_rate": 3.4536958010046715e-05, |
| "loss": 0.6449, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.77785395954748, |
| "grad_norm": 2.1959517002105713, |
| "learning_rate": 3.417574001501709e-05, |
| "loss": 0.6297, |
| "step": 5510 |
| }, |
| { |
| "epoch": 3.7847103188207063, |
| "grad_norm": 2.526855945587158, |
| "learning_rate": 3.381603121313781e-05, |
| "loss": 0.6232, |
| "step": 5520 |
| }, |
| { |
| "epoch": 3.791566678093932, |
| "grad_norm": 2.3273513317108154, |
| "learning_rate": 3.3457839851673045e-05, |
| "loss": 0.5473, |
| "step": 5530 |
| }, |
| { |
| "epoch": 3.798423037367158, |
| "grad_norm": 3.1892895698547363, |
| "learning_rate": 3.310117414309563e-05, |
| "loss": 0.6174, |
| "step": 5540 |
| }, |
| { |
| "epoch": 3.805279396640384, |
| "grad_norm": 3.0987548828125, |
| "learning_rate": 3.2746042264898905e-05, |
| "loss": 0.677, |
| "step": 5550 |
| }, |
| { |
| "epoch": 3.81213575591361, |
| "grad_norm": 2.421818256378174, |
| "learning_rate": 3.2392452359409064e-05, |
| "loss": 0.5264, |
| "step": 5560 |
| }, |
| { |
| "epoch": 3.818992115186836, |
| "grad_norm": 2.0628252029418945, |
| "learning_rate": 3.2040412533598554e-05, |
| "loss": 0.6114, |
| "step": 5570 |
| }, |
| { |
| "epoch": 3.8258484744600616, |
| "grad_norm": 3.190624475479126, |
| "learning_rate": 3.1689930858900263e-05, |
| "loss": 0.6122, |
| "step": 5580 |
| }, |
| { |
| "epoch": 3.8327048337332874, |
| "grad_norm": 2.274360418319702, |
| "learning_rate": 3.134101537102232e-05, |
| "loss": 0.7138, |
| "step": 5590 |
| }, |
| { |
| "epoch": 3.8395611930065137, |
| "grad_norm": 2.9021377563476562, |
| "learning_rate": 3.099367406976397e-05, |
| "loss": 0.4782, |
| "step": 5600 |
| }, |
| { |
| "epoch": 3.8464175522797395, |
| "grad_norm": 3.5629074573516846, |
| "learning_rate": 3.0647914918832054e-05, |
| "loss": 0.605, |
| "step": 5610 |
| }, |
| { |
| "epoch": 3.8532739115529653, |
| "grad_norm": 3.650761604309082, |
| "learning_rate": 3.0303745845658595e-05, |
| "loss": 0.6043, |
| "step": 5620 |
| }, |
| { |
| "epoch": 3.8601302708261915, |
| "grad_norm": 2.938598394393921, |
| "learning_rate": 2.9961174741218833e-05, |
| "loss": 0.5623, |
| "step": 5630 |
| }, |
| { |
| "epoch": 3.8669866300994173, |
| "grad_norm": 2.986330032348633, |
| "learning_rate": 2.9620209459850412e-05, |
| "loss": 0.6794, |
| "step": 5640 |
| }, |
| { |
| "epoch": 3.873842989372643, |
| "grad_norm": 2.9300005435943604, |
| "learning_rate": 2.9280857819073347e-05, |
| "loss": 0.6184, |
| "step": 5650 |
| }, |
| { |
| "epoch": 3.880699348645869, |
| "grad_norm": 3.191838264465332, |
| "learning_rate": 2.894312759941068e-05, |
| "loss": 0.5835, |
| "step": 5660 |
| }, |
| { |
| "epoch": 3.8875557079190948, |
| "grad_norm": 3.251615285873413, |
| "learning_rate": 2.8607026544210114e-05, |
| "loss": 0.482, |
| "step": 5670 |
| }, |
| { |
| "epoch": 3.894412067192321, |
| "grad_norm": 2.271155595779419, |
| "learning_rate": 2.8272562359466502e-05, |
| "loss": 0.5164, |
| "step": 5680 |
| }, |
| { |
| "epoch": 3.901268426465547, |
| "grad_norm": 2.547264575958252, |
| "learning_rate": 2.793974271364528e-05, |
| "loss": 0.6159, |
| "step": 5690 |
| }, |
| { |
| "epoch": 3.9081247857387726, |
| "grad_norm": 2.5156540870666504, |
| "learning_rate": 2.760857523750637e-05, |
| "loss": 0.6175, |
| "step": 5700 |
| }, |
| { |
| "epoch": 3.914981145011999, |
| "grad_norm": 2.334170341491699, |
| "learning_rate": 2.7279067523929493e-05, |
| "loss": 0.6351, |
| "step": 5710 |
| }, |
| { |
| "epoch": 3.9218375042852247, |
| "grad_norm": 2.4579522609710693, |
| "learning_rate": 2.6951227127739898e-05, |
| "loss": 0.606, |
| "step": 5720 |
| }, |
| { |
| "epoch": 3.9286938635584505, |
| "grad_norm": 2.738943576812744, |
| "learning_rate": 2.6625061565535337e-05, |
| "loss": 0.5695, |
| "step": 5730 |
| }, |
| { |
| "epoch": 3.9355502228316763, |
| "grad_norm": 4.373673439025879, |
| "learning_rate": 2.630057831551351e-05, |
| "loss": 0.563, |
| "step": 5740 |
| }, |
| { |
| "epoch": 3.942406582104902, |
| "grad_norm": 2.3918118476867676, |
| "learning_rate": 2.5977784817300742e-05, |
| "loss": 0.612, |
| "step": 5750 |
| }, |
| { |
| "epoch": 3.9492629413781284, |
| "grad_norm": 3.0279042720794678, |
| "learning_rate": 2.5656688471781453e-05, |
| "loss": 0.6402, |
| "step": 5760 |
| }, |
| { |
| "epoch": 3.956119300651354, |
| "grad_norm": 2.2601287364959717, |
| "learning_rate": 2.533729664092831e-05, |
| "loss": 0.5693, |
| "step": 5770 |
| }, |
| { |
| "epoch": 3.96297565992458, |
| "grad_norm": 4.802177906036377, |
| "learning_rate": 2.501961664763357e-05, |
| "loss": 0.5653, |
| "step": 5780 |
| }, |
| { |
| "epoch": 3.969832019197806, |
| "grad_norm": 1.8669755458831787, |
| "learning_rate": 2.4703655775541102e-05, |
| "loss": 0.5914, |
| "step": 5790 |
| }, |
| { |
| "epoch": 3.9766883784710316, |
| "grad_norm": 2.633484125137329, |
| "learning_rate": 2.438942126887953e-05, |
| "loss": 0.4817, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.983544737744258, |
| "grad_norm": 2.3888065814971924, |
| "learning_rate": 2.407692033229594e-05, |
| "loss": 0.5817, |
| "step": 5810 |
| }, |
| { |
| "epoch": 3.9904010970174837, |
| "grad_norm": 2.9784138202667236, |
| "learning_rate": 2.3766160130690784e-05, |
| "loss": 0.602, |
| "step": 5820 |
| }, |
| { |
| "epoch": 3.9972574562907095, |
| "grad_norm": 2.4055368900299072, |
| "learning_rate": 2.3457147789053747e-05, |
| "loss": 0.6197, |
| "step": 5830 |
| }, |
| { |
| "epoch": 4.004113815563936, |
| "grad_norm": 2.228529214859009, |
| "learning_rate": 2.314989039230011e-05, |
| "loss": 0.4612, |
| "step": 5840 |
| }, |
| { |
| "epoch": 4.010970174837161, |
| "grad_norm": 2.3445403575897217, |
| "learning_rate": 2.284439498510854e-05, |
| "loss": 0.4838, |
| "step": 5850 |
| }, |
| { |
| "epoch": 4.017826534110387, |
| "grad_norm": 2.2850747108459473, |
| "learning_rate": 2.2540668571759428e-05, |
| "loss": 0.4838, |
| "step": 5860 |
| }, |
| { |
| "epoch": 4.024682893383614, |
| "grad_norm": 2.4510273933410645, |
| "learning_rate": 2.2238718115974454e-05, |
| "loss": 0.385, |
| "step": 5870 |
| }, |
| { |
| "epoch": 4.031539252656839, |
| "grad_norm": 2.711827278137207, |
| "learning_rate": 2.193855054075674e-05, |
| "loss": 0.3388, |
| "step": 5880 |
| }, |
| { |
| "epoch": 4.038395611930065, |
| "grad_norm": 4.566797733306885, |
| "learning_rate": 2.1640172728232267e-05, |
| "loss": 0.4607, |
| "step": 5890 |
| }, |
| { |
| "epoch": 4.045251971203291, |
| "grad_norm": 3.350759506225586, |
| "learning_rate": 2.1343591519491966e-05, |
| "loss": 0.4704, |
| "step": 5900 |
| }, |
| { |
| "epoch": 4.052108330476517, |
| "grad_norm": 2.5083415508270264, |
| "learning_rate": 2.104881371443502e-05, |
| "loss": 0.4961, |
| "step": 5910 |
| }, |
| { |
| "epoch": 4.058964689749743, |
| "grad_norm": 2.662445068359375, |
| "learning_rate": 2.075584607161283e-05, |
| "loss": 0.4776, |
| "step": 5920 |
| }, |
| { |
| "epoch": 4.0658210490229685, |
| "grad_norm": 1.9160922765731812, |
| "learning_rate": 2.0464695308074032e-05, |
| "loss": 0.4301, |
| "step": 5930 |
| }, |
| { |
| "epoch": 4.072677408296195, |
| "grad_norm": 1.7301816940307617, |
| "learning_rate": 2.01753680992107e-05, |
| "loss": 0.4956, |
| "step": 5940 |
| }, |
| { |
| "epoch": 4.079533767569421, |
| "grad_norm": 2.6820240020751953, |
| "learning_rate": 1.9887871078605037e-05, |
| "loss": 0.4724, |
| "step": 5950 |
| }, |
| { |
| "epoch": 4.086390126842646, |
| "grad_norm": 2.909573793411255, |
| "learning_rate": 1.9602210837877423e-05, |
| "loss": 0.4075, |
| "step": 5960 |
| }, |
| { |
| "epoch": 4.093246486115873, |
| "grad_norm": 2.7761423587799072, |
| "learning_rate": 1.931839392653525e-05, |
| "loss": 0.4034, |
| "step": 5970 |
| }, |
| { |
| "epoch": 4.100102845389098, |
| "grad_norm": 2.786170244216919, |
| "learning_rate": 1.903642685182283e-05, |
| "loss": 0.4334, |
| "step": 5980 |
| }, |
| { |
| "epoch": 4.106959204662324, |
| "grad_norm": 2.5752928256988525, |
| "learning_rate": 1.875631607857209e-05, |
| "loss": 0.5167, |
| "step": 5990 |
| }, |
| { |
| "epoch": 4.1138155639355505, |
| "grad_norm": 5.471198081970215, |
| "learning_rate": 1.8478068029054386e-05, |
| "loss": 0.4546, |
| "step": 6000 |
| }, |
| { |
| "epoch": 4.1138155639355505, |
| "eval_loss": 1.6329894065856934, |
| "eval_runtime": 29.3853, |
| "eval_samples_per_second": 83.613, |
| "eval_steps_per_second": 10.481, |
| "step": 6000 |
| }, |
| { |
| "epoch": 4.120671923208776, |
| "grad_norm": 3.2511181831359863, |
| "learning_rate": 1.8201689082833272e-05, |
| "loss": 0.4688, |
| "step": 6010 |
| }, |
| { |
| "epoch": 4.127528282482002, |
| "grad_norm": 2.027602195739746, |
| "learning_rate": 1.7927185576618244e-05, |
| "loss": 0.457, |
| "step": 6020 |
| }, |
| { |
| "epoch": 4.134384641755228, |
| "grad_norm": 3.83060622215271, |
| "learning_rate": 1.7654563804119396e-05, |
| "loss": 0.4621, |
| "step": 6030 |
| }, |
| { |
| "epoch": 4.141241001028454, |
| "grad_norm": 1.915252447128296, |
| "learning_rate": 1.7383830015903223e-05, |
| "loss": 0.5149, |
| "step": 6040 |
| }, |
| { |
| "epoch": 4.14809736030168, |
| "grad_norm": 3.230914831161499, |
| "learning_rate": 1.711499041924921e-05, |
| "loss": 0.4142, |
| "step": 6050 |
| }, |
| { |
| "epoch": 4.154953719574905, |
| "grad_norm": 2.2559401988983154, |
| "learning_rate": 1.684805117800755e-05, |
| "loss": 0.4578, |
| "step": 6060 |
| }, |
| { |
| "epoch": 4.161810078848132, |
| "grad_norm": 3.0432651042938232, |
| "learning_rate": 1.6583018412457784e-05, |
| "loss": 0.4685, |
| "step": 6070 |
| }, |
| { |
| "epoch": 4.168666438121358, |
| "grad_norm": 2.882570743560791, |
| "learning_rate": 1.6319898199168627e-05, |
| "loss": 0.4859, |
| "step": 6080 |
| }, |
| { |
| "epoch": 4.175522797394583, |
| "grad_norm": 2.090360164642334, |
| "learning_rate": 1.6058696570858422e-05, |
| "loss": 0.5028, |
| "step": 6090 |
| }, |
| { |
| "epoch": 4.1823791566678095, |
| "grad_norm": 2.042056083679199, |
| "learning_rate": 1.5799419516256985e-05, |
| "loss": 0.4483, |
| "step": 6100 |
| }, |
| { |
| "epoch": 4.189235515941036, |
| "grad_norm": 1.4510776996612549, |
| "learning_rate": 1.5542072979968268e-05, |
| "loss": 0.4544, |
| "step": 6110 |
| }, |
| { |
| "epoch": 4.196091875214261, |
| "grad_norm": 2.745123863220215, |
| "learning_rate": 1.5286662862334035e-05, |
| "loss": 0.5091, |
| "step": 6120 |
| }, |
| { |
| "epoch": 4.202948234487487, |
| "grad_norm": 2.3903136253356934, |
| "learning_rate": 1.5033195019298563e-05, |
| "loss": 0.4537, |
| "step": 6130 |
| }, |
| { |
| "epoch": 4.209804593760713, |
| "grad_norm": 2.7773876190185547, |
| "learning_rate": 1.4781675262274419e-05, |
| "loss": 0.4708, |
| "step": 6140 |
| }, |
| { |
| "epoch": 4.216660953033939, |
| "grad_norm": 2.1511118412017822, |
| "learning_rate": 1.4532109358009272e-05, |
| "loss": 0.4282, |
| "step": 6150 |
| }, |
| { |
| "epoch": 4.223517312307165, |
| "grad_norm": 2.3747787475585938, |
| "learning_rate": 1.4284503028453522e-05, |
| "loss": 0.3366, |
| "step": 6160 |
| }, |
| { |
| "epoch": 4.230373671580391, |
| "grad_norm": 2.423283576965332, |
| "learning_rate": 1.4038861950629234e-05, |
| "loss": 0.4201, |
| "step": 6170 |
| }, |
| { |
| "epoch": 4.237230030853617, |
| "grad_norm": 3.218165159225464, |
| "learning_rate": 1.379519175649997e-05, |
| "loss": 0.386, |
| "step": 6180 |
| }, |
| { |
| "epoch": 4.244086390126842, |
| "grad_norm": 2.6405014991760254, |
| "learning_rate": 1.3553498032841605e-05, |
| "loss": 0.4341, |
| "step": 6190 |
| }, |
| { |
| "epoch": 4.250942749400068, |
| "grad_norm": 3.0675742626190186, |
| "learning_rate": 1.3313786321114252e-05, |
| "loss": 0.476, |
| "step": 6200 |
| }, |
| { |
| "epoch": 4.257799108673295, |
| "grad_norm": 2.122243642807007, |
| "learning_rate": 1.307606211733522e-05, |
| "loss": 0.456, |
| "step": 6210 |
| }, |
| { |
| "epoch": 4.26465546794652, |
| "grad_norm": 2.534996509552002, |
| "learning_rate": 1.2840330871953077e-05, |
| "loss": 0.3959, |
| "step": 6220 |
| }, |
| { |
| "epoch": 4.271511827219746, |
| "grad_norm": 2.619480848312378, |
| "learning_rate": 1.2606597989722524e-05, |
| "loss": 0.4781, |
| "step": 6230 |
| }, |
| { |
| "epoch": 4.278368186492973, |
| "grad_norm": 2.5350911617279053, |
| "learning_rate": 1.237486882958061e-05, |
| "loss": 0.4387, |
| "step": 6240 |
| }, |
| { |
| "epoch": 4.285224545766198, |
| "grad_norm": 3.2731032371520996, |
| "learning_rate": 1.2145148704523779e-05, |
| "loss": 0.4438, |
| "step": 6250 |
| }, |
| { |
| "epoch": 4.292080905039424, |
| "grad_norm": 2.3770089149475098, |
| "learning_rate": 1.1917442881486174e-05, |
| "loss": 0.5292, |
| "step": 6260 |
| }, |
| { |
| "epoch": 4.29893726431265, |
| "grad_norm": 3.8733415603637695, |
| "learning_rate": 1.1691756581218726e-05, |
| "loss": 0.4823, |
| "step": 6270 |
| }, |
| { |
| "epoch": 4.305793623585876, |
| "grad_norm": 4.391140460968018, |
| "learning_rate": 1.1468094978169553e-05, |
| "loss": 0.4197, |
| "step": 6280 |
| }, |
| { |
| "epoch": 4.312649982859102, |
| "grad_norm": 2.645474910736084, |
| "learning_rate": 1.124646320036532e-05, |
| "loss": 0.5782, |
| "step": 6290 |
| }, |
| { |
| "epoch": 4.319506342132327, |
| "grad_norm": 1.8884179592132568, |
| "learning_rate": 1.1026866329293628e-05, |
| "loss": 0.4514, |
| "step": 6300 |
| }, |
| { |
| "epoch": 4.326362701405554, |
| "grad_norm": 2.595188617706299, |
| "learning_rate": 1.0809309399786527e-05, |
| "loss": 0.47, |
| "step": 6310 |
| }, |
| { |
| "epoch": 4.33321906067878, |
| "grad_norm": 3.458259344100952, |
| "learning_rate": 1.0593797399905037e-05, |
| "loss": 0.4688, |
| "step": 6320 |
| }, |
| { |
| "epoch": 4.340075419952005, |
| "grad_norm": 1.9553050994873047, |
| "learning_rate": 1.0380335270824904e-05, |
| "loss": 0.4013, |
| "step": 6330 |
| }, |
| { |
| "epoch": 4.3469317792252316, |
| "grad_norm": 1.8735579252243042, |
| "learning_rate": 1.0168927906723168e-05, |
| "loss": 0.4051, |
| "step": 6340 |
| }, |
| { |
| "epoch": 4.353788138498457, |
| "grad_norm": 3.0255327224731445, |
| "learning_rate": 9.959580154666015e-06, |
| "loss": 0.4805, |
| "step": 6350 |
| }, |
| { |
| "epoch": 4.360644497771683, |
| "grad_norm": 2.6736433506011963, |
| "learning_rate": 9.752296814497697e-06, |
| "loss": 0.4719, |
| "step": 6360 |
| }, |
| { |
| "epoch": 4.367500857044909, |
| "grad_norm": 2.2465081214904785, |
| "learning_rate": 9.547082638730376e-06, |
| "loss": 0.4025, |
| "step": 6370 |
| }, |
| { |
| "epoch": 4.374357216318135, |
| "grad_norm": 2.689293622970581, |
| "learning_rate": 9.343942332435218e-06, |
| "loss": 0.4303, |
| "step": 6380 |
| }, |
| { |
| "epoch": 4.381213575591361, |
| "grad_norm": 2.96524977684021, |
| "learning_rate": 9.142880553134514e-06, |
| "loss": 0.4317, |
| "step": 6390 |
| }, |
| { |
| "epoch": 4.388069934864587, |
| "grad_norm": 2.312793254852295, |
| "learning_rate": 8.943901910694941e-06, |
| "loss": 0.4613, |
| "step": 6400 |
| }, |
| { |
| "epoch": 4.394926294137813, |
| "grad_norm": 1.9500372409820557, |
| "learning_rate": 8.747010967221747e-06, |
| "loss": 0.535, |
| "step": 6410 |
| }, |
| { |
| "epoch": 4.401782653411039, |
| "grad_norm": 2.1753222942352295, |
| "learning_rate": 8.552212236954293e-06, |
| "loss": 0.4357, |
| "step": 6420 |
| }, |
| { |
| "epoch": 4.408639012684264, |
| "grad_norm": 2.4783313274383545, |
| "learning_rate": 8.3595101861624e-06, |
| "loss": 0.4186, |
| "step": 6430 |
| }, |
| { |
| "epoch": 4.4154953719574905, |
| "grad_norm": 1.5100256204605103, |
| "learning_rate": 8.168909233044153e-06, |
| "loss": 0.4332, |
| "step": 6440 |
| }, |
| { |
| "epoch": 4.422351731230717, |
| "grad_norm": 3.4225423336029053, |
| "learning_rate": 7.980413747624383e-06, |
| "loss": 0.4127, |
| "step": 6450 |
| }, |
| { |
| "epoch": 4.429208090503942, |
| "grad_norm": 2.8790810108184814, |
| "learning_rate": 7.7940280516546e-06, |
| "loss": 0.4899, |
| "step": 6460 |
| }, |
| { |
| "epoch": 4.436064449777168, |
| "grad_norm": 2.543872833251953, |
| "learning_rate": 7.609756418513914e-06, |
| "loss": 0.5136, |
| "step": 6470 |
| }, |
| { |
| "epoch": 4.442920809050394, |
| "grad_norm": 1.3648442029953003, |
| "learning_rate": 7.427603073110967e-06, |
| "loss": 0.5212, |
| "step": 6480 |
| }, |
| { |
| "epoch": 4.44977716832362, |
| "grad_norm": 1.9477663040161133, |
| "learning_rate": 7.247572191787167e-06, |
| "loss": 0.4124, |
| "step": 6490 |
| }, |
| { |
| "epoch": 4.456633527596846, |
| "grad_norm": 3.6821651458740234, |
| "learning_rate": 7.069667902220822e-06, |
| "loss": 0.4763, |
| "step": 6500 |
| }, |
| { |
| "epoch": 4.463489886870072, |
| "grad_norm": 2.303973913192749, |
| "learning_rate": 6.8938942833326695e-06, |
| "loss": 0.4455, |
| "step": 6510 |
| }, |
| { |
| "epoch": 4.470346246143298, |
| "grad_norm": 2.560413360595703, |
| "learning_rate": 6.720255365192163e-06, |
| "loss": 0.4055, |
| "step": 6520 |
| }, |
| { |
| "epoch": 4.477202605416524, |
| "grad_norm": 3.7052018642425537, |
| "learning_rate": 6.548755128925188e-06, |
| "loss": 0.4169, |
| "step": 6530 |
| }, |
| { |
| "epoch": 4.4840589646897495, |
| "grad_norm": 1.845646619796753, |
| "learning_rate": 6.379397506622808e-06, |
| "loss": 0.3946, |
| "step": 6540 |
| }, |
| { |
| "epoch": 4.490915323962976, |
| "grad_norm": 2.254162549972534, |
| "learning_rate": 6.212186381250984e-06, |
| "loss": 0.441, |
| "step": 6550 |
| }, |
| { |
| "epoch": 4.497771683236202, |
| "grad_norm": 2.616757392883301, |
| "learning_rate": 6.047125586561686e-06, |
| "loss": 0.5491, |
| "step": 6560 |
| }, |
| { |
| "epoch": 4.504628042509427, |
| "grad_norm": 2.068566083908081, |
| "learning_rate": 5.884218907004901e-06, |
| "loss": 0.4361, |
| "step": 6570 |
| }, |
| { |
| "epoch": 4.511484401782654, |
| "grad_norm": 3.0345873832702637, |
| "learning_rate": 5.723470077641924e-06, |
| "loss": 0.4673, |
| "step": 6580 |
| }, |
| { |
| "epoch": 4.518340761055879, |
| "grad_norm": 2.5010392665863037, |
| "learning_rate": 5.564882784059689e-06, |
| "loss": 0.4171, |
| "step": 6590 |
| }, |
| { |
| "epoch": 4.525197120329105, |
| "grad_norm": 1.6839686632156372, |
| "learning_rate": 5.408460662286241e-06, |
| "loss": 0.4807, |
| "step": 6600 |
| }, |
| { |
| "epoch": 4.5320534796023315, |
| "grad_norm": 1.956069827079773, |
| "learning_rate": 5.2542072987074695e-06, |
| "loss": 0.3572, |
| "step": 6610 |
| }, |
| { |
| "epoch": 4.538909838875557, |
| "grad_norm": 2.861846446990967, |
| "learning_rate": 5.1021262299847495e-06, |
| "loss": 0.4342, |
| "step": 6620 |
| }, |
| { |
| "epoch": 4.545766198148783, |
| "grad_norm": 4.325196743011475, |
| "learning_rate": 4.952220942973973e-06, |
| "loss": 0.4651, |
| "step": 6630 |
| }, |
| { |
| "epoch": 4.5526225574220085, |
| "grad_norm": 2.3949451446533203, |
| "learning_rate": 4.8044948746454935e-06, |
| "loss": 0.4429, |
| "step": 6640 |
| }, |
| { |
| "epoch": 4.559478916695235, |
| "grad_norm": 2.8822643756866455, |
| "learning_rate": 4.6589514120054525e-06, |
| "loss": 0.3782, |
| "step": 6650 |
| }, |
| { |
| "epoch": 4.566335275968461, |
| "grad_norm": 3.471057653427124, |
| "learning_rate": 4.515593892017999e-06, |
| "loss": 0.415, |
| "step": 6660 |
| }, |
| { |
| "epoch": 4.573191635241686, |
| "grad_norm": 3.1017813682556152, |
| "learning_rate": 4.3744256015288645e-06, |
| "loss": 0.5172, |
| "step": 6670 |
| }, |
| { |
| "epoch": 4.580047994514913, |
| "grad_norm": 2.956613779067993, |
| "learning_rate": 4.235449777189937e-06, |
| "loss": 0.491, |
| "step": 6680 |
| }, |
| { |
| "epoch": 4.586904353788139, |
| "grad_norm": 2.5809433460235596, |
| "learning_rate": 4.098669605385142e-06, |
| "loss": 0.4805, |
| "step": 6690 |
| }, |
| { |
| "epoch": 4.593760713061364, |
| "grad_norm": 2.229520082473755, |
| "learning_rate": 3.964088222157303e-06, |
| "loss": 0.4809, |
| "step": 6700 |
| }, |
| { |
| "epoch": 4.6006170723345905, |
| "grad_norm": 2.391052722930908, |
| "learning_rate": 3.83170871313625e-06, |
| "loss": 0.4123, |
| "step": 6710 |
| }, |
| { |
| "epoch": 4.607473431607816, |
| "grad_norm": 2.391326904296875, |
| "learning_rate": 3.7015341134681526e-06, |
| "loss": 0.3621, |
| "step": 6720 |
| }, |
| { |
| "epoch": 4.614329790881042, |
| "grad_norm": 3.428799867630005, |
| "learning_rate": 3.573567407745826e-06, |
| "loss": 0.4297, |
| "step": 6730 |
| }, |
| { |
| "epoch": 4.621186150154268, |
| "grad_norm": 2.7340753078460693, |
| "learning_rate": 3.447811529940348e-06, |
| "loss": 0.4269, |
| "step": 6740 |
| }, |
| { |
| "epoch": 4.628042509427494, |
| "grad_norm": 2.597050666809082, |
| "learning_rate": 3.3242693633337983e-06, |
| "loss": 0.3771, |
| "step": 6750 |
| }, |
| { |
| "epoch": 4.63489886870072, |
| "grad_norm": 2.1940290927886963, |
| "learning_rate": 3.2029437404531683e-06, |
| "loss": 0.3715, |
| "step": 6760 |
| }, |
| { |
| "epoch": 4.641755227973945, |
| "grad_norm": 3.0640647411346436, |
| "learning_rate": 3.083837443005355e-06, |
| "loss": 0.4535, |
| "step": 6770 |
| }, |
| { |
| "epoch": 4.648611587247172, |
| "grad_norm": 2.0457632541656494, |
| "learning_rate": 2.966953201813427e-06, |
| "loss": 0.4653, |
| "step": 6780 |
| }, |
| { |
| "epoch": 4.655467946520398, |
| "grad_norm": 4.1095733642578125, |
| "learning_rate": 2.8522936967540383e-06, |
| "loss": 0.3734, |
| "step": 6790 |
| }, |
| { |
| "epoch": 4.662324305793623, |
| "grad_norm": 1.960434079170227, |
| "learning_rate": 2.739861556695933e-06, |
| "loss": 0.4308, |
| "step": 6800 |
| }, |
| { |
| "epoch": 4.6691806650668495, |
| "grad_norm": 2.4154977798461914, |
| "learning_rate": 2.6296593594396733e-06, |
| "loss": 0.4377, |
| "step": 6810 |
| }, |
| { |
| "epoch": 4.676037024340076, |
| "grad_norm": 4.1492486000061035, |
| "learning_rate": 2.5216896316585746e-06, |
| "loss": 0.4988, |
| "step": 6820 |
| }, |
| { |
| "epoch": 4.682893383613301, |
| "grad_norm": 1.40762460231781, |
| "learning_rate": 2.4159548488407733e-06, |
| "loss": 0.5145, |
| "step": 6830 |
| }, |
| { |
| "epoch": 4.689749742886527, |
| "grad_norm": 2.824812889099121, |
| "learning_rate": 2.31245743523244e-06, |
| "loss": 0.4712, |
| "step": 6840 |
| }, |
| { |
| "epoch": 4.696606102159754, |
| "grad_norm": 6.674090385437012, |
| "learning_rate": 2.2111997637821792e-06, |
| "loss": 0.4395, |
| "step": 6850 |
| }, |
| { |
| "epoch": 4.703462461432979, |
| "grad_norm": 2.4539730548858643, |
| "learning_rate": 2.1121841560867273e-06, |
| "loss": 0.3955, |
| "step": 6860 |
| }, |
| { |
| "epoch": 4.710318820706205, |
| "grad_norm": 1.873875379562378, |
| "learning_rate": 2.015412882337564e-06, |
| "loss": 0.4142, |
| "step": 6870 |
| }, |
| { |
| "epoch": 4.717175179979431, |
| "grad_norm": 11.428731918334961, |
| "learning_rate": 1.9208881612689967e-06, |
| "loss": 0.5176, |
| "step": 6880 |
| }, |
| { |
| "epoch": 4.724031539252657, |
| "grad_norm": 2.4507086277008057, |
| "learning_rate": 1.828612160107257e-06, |
| "loss": 0.4799, |
| "step": 6890 |
| }, |
| { |
| "epoch": 4.730887898525883, |
| "grad_norm": 2.5675251483917236, |
| "learning_rate": 1.7385869945207523e-06, |
| "loss": 0.4605, |
| "step": 6900 |
| }, |
| { |
| "epoch": 4.7377442577991085, |
| "grad_norm": 2.8171119689941406, |
| "learning_rate": 1.650814728571648e-06, |
| "loss": 0.4743, |
| "step": 6910 |
| }, |
| { |
| "epoch": 4.744600617072335, |
| "grad_norm": 2.8307461738586426, |
| "learning_rate": 1.565297374668473e-06, |
| "loss": 0.4494, |
| "step": 6920 |
| }, |
| { |
| "epoch": 4.75145697634556, |
| "grad_norm": 2.4572560787200928, |
| "learning_rate": 1.4820368935200002e-06, |
| "loss": 0.4678, |
| "step": 6930 |
| }, |
| { |
| "epoch": 4.758313335618786, |
| "grad_norm": 2.9318461418151855, |
| "learning_rate": 1.4010351940903276e-06, |
| "loss": 0.3936, |
| "step": 6940 |
| }, |
| { |
| "epoch": 4.765169694892013, |
| "grad_norm": 2.561730146408081, |
| "learning_rate": 1.3222941335550353e-06, |
| "loss": 0.4745, |
| "step": 6950 |
| }, |
| { |
| "epoch": 4.772026054165238, |
| "grad_norm": 2.51572847366333, |
| "learning_rate": 1.2458155172587083e-06, |
| "loss": 0.4616, |
| "step": 6960 |
| }, |
| { |
| "epoch": 4.778882413438464, |
| "grad_norm": 2.1606862545013428, |
| "learning_rate": 1.171601098673436e-06, |
| "loss": 0.4764, |
| "step": 6970 |
| }, |
| { |
| "epoch": 4.7857387727116905, |
| "grad_norm": 2.5997395515441895, |
| "learning_rate": 1.0996525793586677e-06, |
| "loss": 0.497, |
| "step": 6980 |
| }, |
| { |
| "epoch": 4.792595131984916, |
| "grad_norm": 2.7361416816711426, |
| "learning_rate": 1.02997160892222e-06, |
| "loss": 0.3996, |
| "step": 6990 |
| }, |
| { |
| "epoch": 4.799451491258142, |
| "grad_norm": 2.5793089866638184, |
| "learning_rate": 9.625597849823976e-07, |
| "loss": 0.448, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.799451491258142, |
| "eval_loss": 1.6523703336715698, |
| "eval_runtime": 29.3476, |
| "eval_samples_per_second": 83.721, |
| "eval_steps_per_second": 10.495, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.806307850531368, |
| "grad_norm": 2.204183340072632, |
| "learning_rate": 8.974186531313988e-07, |
| "loss": 0.4577, |
| "step": 7010 |
| }, |
| { |
| "epoch": 4.813164209804594, |
| "grad_norm": 2.363396406173706, |
| "learning_rate": 8.345497068998897e-07, |
| "loss": 0.4684, |
| "step": 7020 |
| }, |
| { |
| "epoch": 4.82002056907782, |
| "grad_norm": 2.908766269683838, |
| "learning_rate": 7.739543877227196e-07, |
| "loss": 0.4612, |
| "step": 7030 |
| }, |
| { |
| "epoch": 4.826876928351045, |
| "grad_norm": 2.48903489112854, |
| "learning_rate": 7.15634084905914e-07, |
| "loss": 0.4656, |
| "step": 7040 |
| }, |
| { |
| "epoch": 4.833733287624272, |
| "grad_norm": 2.835102081298828, |
| "learning_rate": 6.595901355947898e-07, |
| "loss": 0.3777, |
| "step": 7050 |
| }, |
| { |
| "epoch": 4.840589646897497, |
| "grad_norm": 2.326117992401123, |
| "learning_rate": 6.058238247433234e-07, |
| "loss": 0.4727, |
| "step": 7060 |
| }, |
| { |
| "epoch": 4.847446006170723, |
| "grad_norm": 2.406785488128662, |
| "learning_rate": 5.543363850846972e-07, |
| "loss": 0.3892, |
| "step": 7070 |
| }, |
| { |
| "epoch": 4.854302365443949, |
| "grad_norm": 2.84722900390625, |
| "learning_rate": 5.05128997102966e-07, |
| "loss": 0.4482, |
| "step": 7080 |
| }, |
| { |
| "epoch": 4.861158724717175, |
| "grad_norm": 2.8080146312713623, |
| "learning_rate": 4.582027890060792e-07, |
| "loss": 0.3906, |
| "step": 7090 |
| }, |
| { |
| "epoch": 4.868015083990401, |
| "grad_norm": 2.620560884475708, |
| "learning_rate": 4.1355883669997873e-07, |
| "loss": 0.5222, |
| "step": 7100 |
| }, |
| { |
| "epoch": 4.874871443263627, |
| "grad_norm": 2.826660394668579, |
| "learning_rate": 3.7119816376390836e-07, |
| "loss": 0.4543, |
| "step": 7110 |
| }, |
| { |
| "epoch": 4.881727802536853, |
| "grad_norm": 2.5080008506774902, |
| "learning_rate": 3.311217414269874e-07, |
| "loss": 0.4995, |
| "step": 7120 |
| }, |
| { |
| "epoch": 4.888584161810079, |
| "grad_norm": 2.6868581771850586, |
| "learning_rate": 2.933304885459065e-07, |
| "loss": 0.4136, |
| "step": 7130 |
| }, |
| { |
| "epoch": 4.895440521083305, |
| "grad_norm": 2.1843154430389404, |
| "learning_rate": 2.5782527158388916e-07, |
| "loss": 0.4089, |
| "step": 7140 |
| }, |
| { |
| "epoch": 4.9022968803565306, |
| "grad_norm": 2.803866386413574, |
| "learning_rate": 2.2460690459079615e-07, |
| "loss": 0.4824, |
| "step": 7150 |
| }, |
| { |
| "epoch": 4.909153239629757, |
| "grad_norm": 4.336270332336426, |
| "learning_rate": 1.9367614918449627e-07, |
| "loss": 0.5094, |
| "step": 7160 |
| }, |
| { |
| "epoch": 4.916009598902982, |
| "grad_norm": 5.751471996307373, |
| "learning_rate": 1.6503371453335803e-07, |
| "loss": 0.3751, |
| "step": 7170 |
| }, |
| { |
| "epoch": 4.922865958176208, |
| "grad_norm": 1.8664264678955078, |
| "learning_rate": 1.386802573400514e-07, |
| "loss": 0.4913, |
| "step": 7180 |
| }, |
| { |
| "epoch": 4.929722317449435, |
| "grad_norm": 2.173042058944702, |
| "learning_rate": 1.1461638182643786e-07, |
| "loss": 0.429, |
| "step": 7190 |
| }, |
| { |
| "epoch": 4.93657867672266, |
| "grad_norm": 3.0668938159942627, |
| "learning_rate": 9.284263971972573e-08, |
| "loss": 0.4568, |
| "step": 7200 |
| }, |
| { |
| "epoch": 4.943435035995886, |
| "grad_norm": 7.5888352394104, |
| "learning_rate": 7.33595302398582e-08, |
| "loss": 0.427, |
| "step": 7210 |
| }, |
| { |
| "epoch": 4.950291395269112, |
| "grad_norm": 2.1892218589782715, |
| "learning_rate": 5.616750008803351e-08, |
| "loss": 0.4677, |
| "step": 7220 |
| }, |
| { |
| "epoch": 4.957147754542338, |
| "grad_norm": 2.2843613624572754, |
| "learning_rate": 4.126694343644655e-08, |
| "loss": 0.4085, |
| "step": 7230 |
| }, |
| { |
| "epoch": 4.964004113815564, |
| "grad_norm": 2.025791883468628, |
| "learning_rate": 2.8658201919296023e-08, |
| "loss": 0.4369, |
| "step": 7240 |
| }, |
| { |
| "epoch": 4.9708604730887895, |
| "grad_norm": 1.981285810470581, |
| "learning_rate": 1.8341564624935194e-08, |
| "loss": 0.4147, |
| "step": 7250 |
| }, |
| { |
| "epoch": 4.977716832362016, |
| "grad_norm": 3.0213518142700195, |
| "learning_rate": 1.031726808921052e-08, |
| "loss": 0.434, |
| "step": 7260 |
| }, |
| { |
| "epoch": 4.984573191635242, |
| "grad_norm": 2.9700491428375244, |
| "learning_rate": 4.585496290110403e-09, |
| "loss": 0.4067, |
| "step": 7270 |
| }, |
| { |
| "epoch": 4.991429550908467, |
| "grad_norm": 1.8337976932525635, |
| "learning_rate": 1.1463806434686143e-09, |
| "loss": 0.5116, |
| "step": 7280 |
| }, |
| { |
| "epoch": 4.998285910181694, |
| "grad_norm": 4.141937732696533, |
| "learning_rate": 0.0, |
| "loss": 0.5268, |
| "step": 7290 |
| }, |
| { |
| "epoch": 4.998285910181694, |
| "step": 7290, |
| "total_flos": 2.0655422352392192e+18, |
| "train_loss": 0.9170206386202484, |
| "train_runtime": 7769.9064, |
| "train_samples_per_second": 30.03, |
| "train_steps_per_second": 0.938 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 7290, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.0655422352392192e+18, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|