| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.118375, |
| "eval_steps": 500, |
| "global_step": 2900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00125, |
| "grad_norm": 1.0962235927581787, |
| "learning_rate": 1.3499999999999998e-05, |
| "loss": 4.542208099365235, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 0.6401466727256775, |
| "learning_rate": 2.8499999999999998e-05, |
| "loss": 4.2099754333496096, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 0.5453175902366638, |
| "learning_rate": 4.3499999999999993e-05, |
| "loss": 4.047785949707031, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 0.48525792360305786, |
| "learning_rate": 5.85e-05, |
| "loss": 3.964551544189453, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 0.4704461395740509, |
| "learning_rate": 7.35e-05, |
| "loss": 3.8913909912109377, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 0.46138691902160645, |
| "learning_rate": 8.849999999999998e-05, |
| "loss": 3.830010986328125, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 0.4441743493080139, |
| "learning_rate": 0.00010349999999999998, |
| "loss": 3.7685489654541016, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.45079436898231506, |
| "learning_rate": 0.0001185, |
| "loss": 3.7476680755615233, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 0.4830528497695923, |
| "learning_rate": 0.0001335, |
| "loss": 3.6839160919189453, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 0.44532763957977295, |
| "learning_rate": 0.00014849999999999998, |
| "loss": 3.6974281311035155, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 0.4425051808357239, |
| "learning_rate": 0.0001635, |
| "loss": 3.6414600372314454, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 0.44826197624206543, |
| "learning_rate": 0.00017849999999999997, |
| "loss": 3.678114318847656, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 0.4449610412120819, |
| "learning_rate": 0.0001935, |
| "loss": 3.620401382446289, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 0.4469984173774719, |
| "learning_rate": 0.00020849999999999997, |
| "loss": 3.5873218536376954, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 0.43191787600517273, |
| "learning_rate": 0.00022349999999999998, |
| "loss": 3.6034412384033203, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.42065128684043884, |
| "learning_rate": 0.0002385, |
| "loss": 3.5601234436035156, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 0.4359514117240906, |
| "learning_rate": 0.0002535, |
| "loss": 3.569548797607422, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 0.4924817681312561, |
| "learning_rate": 0.00026849999999999997, |
| "loss": 3.5676345825195312, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 0.45101380348205566, |
| "learning_rate": 0.00028349999999999995, |
| "loss": 3.548577880859375, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 0.43757978081703186, |
| "learning_rate": 0.0002985, |
| "loss": 3.542619323730469, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 0.42400574684143066, |
| "learning_rate": 0.00029999901450063963, |
| "loss": 3.519863510131836, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 0.4135052263736725, |
| "learning_rate": 0.00029999560785280927, |
| "loss": 3.489061737060547, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 0.42550331354141235, |
| "learning_rate": 0.0002999897679451004, |
| "loss": 3.5228813171386717, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.4192274808883667, |
| "learning_rate": 0.0002999814948722491, |
| "loss": 3.500740814208984, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 0.4208814799785614, |
| "learning_rate": 0.00029997078876846286, |
| "loss": 3.4928520202636717, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 0.4207195043563843, |
| "learning_rate": 0.00029995764980741843, |
| "loss": 3.455915069580078, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 0.4257533550262451, |
| "learning_rate": 0.00029994207820225867, |
| "loss": 3.4429149627685547, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 0.4169400930404663, |
| "learning_rate": 0.0002999240742055895, |
| "loss": 3.423783874511719, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 0.4206733703613281, |
| "learning_rate": 0.0002999036381094753, |
| "loss": 3.463421630859375, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 0.4171896278858185, |
| "learning_rate": 0.0002998807702454349, |
| "loss": 3.4295799255371096, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 0.4365955889225006, |
| "learning_rate": 0.00029985547098443534, |
| "loss": 3.395406723022461, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.405113160610199, |
| "learning_rate": 0.00029982774073688656, |
| "loss": 3.4092079162597657, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 0.4027758836746216, |
| "learning_rate": 0.0002997975799526344, |
| "loss": 3.3900100708007814, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 0.36929693818092346, |
| "learning_rate": 0.0002997649891209534, |
| "loss": 3.3978004455566406, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 0.36560943722724915, |
| "learning_rate": 0.00029972996877053866, |
| "loss": 3.389325714111328, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 0.381977915763855, |
| "learning_rate": 0.0002996925194694977, |
| "loss": 3.3937278747558595, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 0.4034512937068939, |
| "learning_rate": 0.0002996526418253408, |
| "loss": 3.3755035400390625, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 0.40442460775375366, |
| "learning_rate": 0.00029961033648497136, |
| "loss": 3.3367355346679686, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 0.4017426073551178, |
| "learning_rate": 0.00029956560413467545, |
| "loss": 3.358184051513672, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.39038756489753723, |
| "learning_rate": 0.00029951844550011046, |
| "loss": 3.343119430541992, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 0.3888174295425415, |
| "learning_rate": 0.00029946886134629366, |
| "loss": 3.3283355712890623, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 0.37037158012390137, |
| "learning_rate": 0.0002994168524775894, |
| "loss": 3.3299789428710938, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 0.3683609366416931, |
| "learning_rate": 0.0002993624197376964, |
| "loss": 3.3183124542236326, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 0.38251495361328125, |
| "learning_rate": 0.00029930556400963374, |
| "loss": 3.3277027130126955, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 0.3685538172721863, |
| "learning_rate": 0.00029924628621572693, |
| "loss": 3.3209506988525392, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 0.40296629071235657, |
| "learning_rate": 0.0002991845873175927, |
| "loss": 3.328449249267578, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 0.38991594314575195, |
| "learning_rate": 0.0002991204683161233, |
| "loss": 3.3068389892578125, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.39050525426864624, |
| "learning_rate": 0.00029905393025147044, |
| "loss": 3.3071823120117188, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 0.3846362829208374, |
| "learning_rate": 0.00029898497420302855, |
| "loss": 3.280712127685547, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 0.3796476125717163, |
| "learning_rate": 0.0002989136012894168, |
| "loss": 3.2889366149902344, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 0.36522796750068665, |
| "learning_rate": 0.0002988398126684615, |
| "loss": 3.291548156738281, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 0.3771977722644806, |
| "learning_rate": 0.000298763609537177, |
| "loss": 3.2864120483398436, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 0.35490748286247253, |
| "learning_rate": 0.00029868499313174624, |
| "loss": 3.285882568359375, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 0.37846773862838745, |
| "learning_rate": 0.00029860396472750083, |
| "loss": 3.279650115966797, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 0.38366758823394775, |
| "learning_rate": 0.0002985205256389005, |
| "loss": 3.2765472412109373, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.3582383990287781, |
| "learning_rate": 0.0002984346772195113, |
| "loss": 3.283465576171875, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 0.35587790608406067, |
| "learning_rate": 0.00029834642086198427, |
| "loss": 3.271703338623047, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 0.37970319390296936, |
| "learning_rate": 0.0002982557579980322, |
| "loss": 3.245616912841797, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 0.3639717400074005, |
| "learning_rate": 0.000298162690098407, |
| "loss": 3.258502960205078, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.36604151129722595, |
| "learning_rate": 0.0002980672186728754, |
| "loss": 3.2504783630371095, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 0.3567405343055725, |
| "learning_rate": 0.0002979693452701947, |
| "loss": 3.2632408142089844, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 0.3561687171459198, |
| "learning_rate": 0.0002978690714780875, |
| "loss": 3.2412956237792967, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 0.3695826232433319, |
| "learning_rate": 0.00029776639892321606, |
| "loss": 3.2321800231933593, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.3597981035709381, |
| "learning_rate": 0.000297661329271156, |
| "loss": 3.205059814453125, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 0.3703027367591858, |
| "learning_rate": 0.00029755386422636884, |
| "loss": 3.2197303771972656, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 0.34924793243408203, |
| "learning_rate": 0.000297444005532175, |
| "loss": 3.1991270065307615, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 0.3698681592941284, |
| "learning_rate": 0.0002973317549707249, |
| "loss": 3.2529460906982424, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.3563082814216614, |
| "learning_rate": 0.0002972171143629705, |
| "loss": 3.2158493041992187, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 0.34349241852760315, |
| "learning_rate": 0.0002971000855686355, |
| "loss": 3.1953773498535156, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 0.35246145725250244, |
| "learning_rate": 0.00029698067048618536, |
| "loss": 3.1936809539794924, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 0.372384637594223, |
| "learning_rate": 0.00029685887105279624, |
| "loss": 3.1743907928466797, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.36444413661956787, |
| "learning_rate": 0.0002967346892443239, |
| "loss": 3.1915206909179688, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 0.351936012506485, |
| "learning_rate": 0.00029660812707527133, |
| "loss": 3.2071929931640626, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 0.34430351853370667, |
| "learning_rate": 0.00029647918659875635, |
| "loss": 3.181209182739258, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 0.35013672709465027, |
| "learning_rate": 0.0002963478699064781, |
| "loss": 3.1906646728515624, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 0.34257224202156067, |
| "learning_rate": 0.00029621417912868323, |
| "loss": 3.228814697265625, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 0.3684031367301941, |
| "learning_rate": 0.00029607811643413135, |
| "loss": 3.1923717498779296, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 0.3443935513496399, |
| "learning_rate": 0.0002959396840300596, |
| "loss": 3.19796085357666, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 0.35859978199005127, |
| "learning_rate": 0.0002957988841621472, |
| "loss": 3.2059059143066406, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.35764068365097046, |
| "learning_rate": 0.00029565571911447893, |
| "loss": 3.197886276245117, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 0.34071439504623413, |
| "learning_rate": 0.0002955101912095078, |
| "loss": 3.1616336822509767, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 0.3420080244541168, |
| "learning_rate": 0.00029536230280801767, |
| "loss": 3.1601337432861327, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 0.33847373723983765, |
| "learning_rate": 0.000295212056309085, |
| "loss": 3.1570825576782227, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 0.33834919333457947, |
| "learning_rate": 0.00029505945415003954, |
| "loss": 3.1625667572021485, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 0.33340954780578613, |
| "learning_rate": 0.0002949044988064253, |
| "loss": 3.180934524536133, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 0.36460039019584656, |
| "learning_rate": 0.0002947471927919599, |
| "loss": 3.157451629638672, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 0.3350942134857178, |
| "learning_rate": 0.00029458753865849424, |
| "loss": 3.16195068359375, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.3496924936771393, |
| "learning_rate": 0.00029442553899597075, |
| "loss": 3.151328468322754, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 0.33595219254493713, |
| "learning_rate": 0.0002942611964323817, |
| "loss": 3.1474483489990233, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 0.34623855352401733, |
| "learning_rate": 0.00029409451363372605, |
| "loss": 3.151190757751465, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 0.3318001925945282, |
| "learning_rate": 0.00029392549330396696, |
| "loss": 3.166637420654297, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.3412299156188965, |
| "learning_rate": 0.0002937541381849872, |
| "loss": 3.1225704193115233, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 0.3505077064037323, |
| "learning_rate": 0.000293580451056545, |
| "loss": 3.1353290557861326, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 0.36463502049446106, |
| "learning_rate": 0.00029340443473622915, |
| "loss": 3.136840057373047, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 0.3351852297782898, |
| "learning_rate": 0.00029322609207941283, |
| "loss": 3.13529109954834, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.34219565987586975, |
| "learning_rate": 0.00029304542597920766, |
| "loss": 3.158438873291016, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 0.32809194922447205, |
| "learning_rate": 0.0002928624393664166, |
| "loss": 3.1299543380737305, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 0.3327496647834778, |
| "learning_rate": 0.00029267713520948643, |
| "loss": 3.127825164794922, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 0.34798717498779297, |
| "learning_rate": 0.00029248951651445973, |
| "loss": 3.1128513336181642, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.3409072756767273, |
| "learning_rate": 0.0002922995863249258, |
| "loss": 3.0872947692871096, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12625, |
| "grad_norm": 0.3241552412509918, |
| "learning_rate": 0.00029210734772197166, |
| "loss": 3.135254669189453, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 0.33775585889816284, |
| "learning_rate": 0.0002919128038241318, |
| "loss": 3.1159934997558594, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.12875, |
| "grad_norm": 0.3477814197540283, |
| "learning_rate": 0.0002917159577873377, |
| "loss": 3.1170124053955077, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.34592533111572266, |
| "learning_rate": 0.00029151681280486656, |
| "loss": 3.108189010620117, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.13125, |
| "grad_norm": 0.3351367712020874, |
| "learning_rate": 0.00029131537210728975, |
| "loss": 3.1113187789916994, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 0.354574054479599, |
| "learning_rate": 0.00029111163896241996, |
| "loss": 3.114427947998047, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.13375, |
| "grad_norm": 0.3495411276817322, |
| "learning_rate": 0.0002909056166752586, |
| "loss": 3.099652862548828, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 0.32834145426750183, |
| "learning_rate": 0.0002906973085879419, |
| "loss": 3.1386039733886717, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.13625, |
| "grad_norm": 0.3264714181423187, |
| "learning_rate": 0.000290486718079687, |
| "loss": 3.0912006378173826, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 0.3366532623767853, |
| "learning_rate": 0.0002902738485667367, |
| "loss": 3.066594123840332, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.13875, |
| "grad_norm": 0.3517299294471741, |
| "learning_rate": 0.00029005870350230453, |
| "loss": 3.1176836013793947, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.3267839550971985, |
| "learning_rate": 0.00028984128637651825, |
| "loss": 3.0968740463256834, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.14125, |
| "grad_norm": 0.3284844160079956, |
| "learning_rate": 0.0002896216007163637, |
| "loss": 3.0969438552856445, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 0.32528674602508545, |
| "learning_rate": 0.0002893996500856272, |
| "loss": 3.1029247283935546, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.14375, |
| "grad_norm": 0.3221900463104248, |
| "learning_rate": 0.00028917543808483796, |
| "loss": 3.1135093688964846, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.33406203985214233, |
| "learning_rate": 0.0002889489683512096, |
| "loss": 3.0768909454345703, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.14625, |
| "grad_norm": 0.3397602140903473, |
| "learning_rate": 0.0002887202445585811, |
| "loss": 3.0875574111938477, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 0.33265799283981323, |
| "learning_rate": 0.0002884892704173573, |
| "loss": 3.0731714248657225, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.14875, |
| "grad_norm": 0.32758811116218567, |
| "learning_rate": 0.00028825604967444866, |
| "loss": 3.12725772857666, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.3414333164691925, |
| "learning_rate": 0.0002880205861132105, |
| "loss": 3.0664628982543944, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.15125, |
| "grad_norm": 0.33994749188423157, |
| "learning_rate": 0.00028778288355338144, |
| "loss": 3.0672843933105467, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 0.33031970262527466, |
| "learning_rate": 0.0002875429458510219, |
| "loss": 3.1195865631103517, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.15375, |
| "grad_norm": 0.3435764014720917, |
| "learning_rate": 0.0002873007768984511, |
| "loss": 3.0530136108398436, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 0.32233837246894836, |
| "learning_rate": 0.00028705638062418386, |
| "loss": 3.076219177246094, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 0.32135722041130066, |
| "learning_rate": 0.0002868097609928674, |
| "loss": 3.051331329345703, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 0.3371255099773407, |
| "learning_rate": 0.0002865609220052165, |
| "loss": 3.060790252685547, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.15875, |
| "grad_norm": 0.344938188791275, |
| "learning_rate": 0.000286309867697949, |
| "loss": 3.0386356353759765, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.3418465256690979, |
| "learning_rate": 0.0002860566021437197, |
| "loss": 3.0650426864624025, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.16125, |
| "grad_norm": 0.3343369960784912, |
| "learning_rate": 0.0002858011294510552, |
| "loss": 3.04056396484375, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 0.34760987758636475, |
| "learning_rate": 0.0002855434537642865, |
| "loss": 3.057343292236328, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.16375, |
| "grad_norm": 0.3561476767063141, |
| "learning_rate": 0.000285283579263482, |
| "loss": 3.0452644348144533, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 0.3393990993499756, |
| "learning_rate": 0.00028502151016437986, |
| "loss": 3.0280082702636717, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.16625, |
| "grad_norm": 0.34054502844810486, |
| "learning_rate": 0.0002847572507183193, |
| "loss": 3.05167293548584, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 0.33443859219551086, |
| "learning_rate": 0.000284490805212172, |
| "loss": 3.090429496765137, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.16875, |
| "grad_norm": 0.33417847752571106, |
| "learning_rate": 0.00028422217796827216, |
| "loss": 3.064510726928711, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.33783411979675293, |
| "learning_rate": 0.00028395137334434676, |
| "loss": 3.03808479309082, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.17125, |
| "grad_norm": 0.3298597037792206, |
| "learning_rate": 0.00028367839573344454, |
| "loss": 3.0556320190429687, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 0.3494487702846527, |
| "learning_rate": 0.000283403249563865, |
| "loss": 3.039119338989258, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.17375, |
| "grad_norm": 0.3314499258995056, |
| "learning_rate": 0.0002831259392990864, |
| "loss": 3.0571495056152345, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.32631996273994446, |
| "learning_rate": 0.00028284646943769337, |
| "loss": 3.0531475067138674, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.17625, |
| "grad_norm": 0.32792332768440247, |
| "learning_rate": 0.00028256484451330403, |
| "loss": 3.026413917541504, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 0.31503939628601074, |
| "learning_rate": 0.0002822810690944963, |
| "loss": 3.052714538574219, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.17875, |
| "grad_norm": 0.33248430490493774, |
| "learning_rate": 0.000281995147784734, |
| "loss": 3.050276184082031, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.3251739740371704, |
| "learning_rate": 0.0002817070852222918, |
| "loss": 3.057173156738281, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.18125, |
| "grad_norm": 0.32668808102607727, |
| "learning_rate": 0.0002814168860801806, |
| "loss": 3.0323795318603515, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 0.3308984339237213, |
| "learning_rate": 0.0002811245550660709, |
| "loss": 3.0549407958984376, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.18375, |
| "grad_norm": 0.33381035923957825, |
| "learning_rate": 0.0002808300969222172, |
| "loss": 3.038156509399414, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.3257412016391754, |
| "learning_rate": 0.0002805335164253806, |
| "loss": 2.9872032165527345, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.18625, |
| "grad_norm": 0.33056777715682983, |
| "learning_rate": 0.0002802348183867514, |
| "loss": 3.0133747100830077, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 0.3441354036331177, |
| "learning_rate": 0.00027993400765187124, |
| "loss": 2.9739742279052734, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.18875, |
| "grad_norm": 0.33665338158607483, |
| "learning_rate": 0.0002796310891005542, |
| "loss": 3.0187236785888674, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.3456818163394928, |
| "learning_rate": 0.00027932606764680796, |
| "loss": 3.000716781616211, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.19125, |
| "grad_norm": 0.31569328904151917, |
| "learning_rate": 0.00027901894823875387, |
| "loss": 3.003590774536133, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 0.33296287059783936, |
| "learning_rate": 0.00027870973585854665, |
| "loss": 3.010061264038086, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.19375, |
| "grad_norm": 0.3256262242794037, |
| "learning_rate": 0.0002783984355222937, |
| "loss": 3.009804534912109, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 0.33614271879196167, |
| "learning_rate": 0.0002780850522799737, |
| "loss": 3.0124454498291016, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.19625, |
| "grad_norm": 0.3145756721496582, |
| "learning_rate": 0.00027776959121535464, |
| "loss": 3.0317821502685547, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 0.33354440331459045, |
| "learning_rate": 0.0002774520574459113, |
| "loss": 2.9961336135864256, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.19875, |
| "grad_norm": 0.33014026284217834, |
| "learning_rate": 0.00027713245612274247, |
| "loss": 2.996552658081055, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.3215552866458893, |
| "learning_rate": 0.00027681079243048717, |
| "loss": 3.0088550567626955, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.20125, |
| "grad_norm": 0.33168256282806396, |
| "learning_rate": 0.0002764870715872405, |
| "loss": 3.017764663696289, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 0.3189021646976471, |
| "learning_rate": 0.00027616129884446916, |
| "loss": 3.013848876953125, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.20375, |
| "grad_norm": 0.3378245532512665, |
| "learning_rate": 0.0002758334794869262, |
| "loss": 2.9824432373046874, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 0.3281719386577606, |
| "learning_rate": 0.00027550361883256535, |
| "loss": 2.998569679260254, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.20625, |
| "grad_norm": 0.3160180151462555, |
| "learning_rate": 0.00027517172223245445, |
| "loss": 2.9816024780273436, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 0.33755186200141907, |
| "learning_rate": 0.00027483779507068913, |
| "loss": 2.9858730316162108, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.20875, |
| "grad_norm": 0.3391866683959961, |
| "learning_rate": 0.0002745018427643051, |
| "loss": 3.0096504211425783, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.31814366579055786, |
| "learning_rate": 0.00027416387076319035, |
| "loss": 3.0215930938720703, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.21125, |
| "grad_norm": 0.33236604928970337, |
| "learning_rate": 0.00027382388454999686, |
| "loss": 2.9959911346435546, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 0.31699317693710327, |
| "learning_rate": 0.00027348188964005147, |
| "loss": 2.9976850509643556, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.21375, |
| "grad_norm": 0.3221769630908966, |
| "learning_rate": 0.00027313789158126667, |
| "loss": 2.992999267578125, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 0.3359137773513794, |
| "learning_rate": 0.00027279189595405036, |
| "loss": 3.005726623535156, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.21625, |
| "grad_norm": 0.3230113685131073, |
| "learning_rate": 0.0002724439083712153, |
| "loss": 2.9472639083862306, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 0.3287331461906433, |
| "learning_rate": 0.00027209393447788835, |
| "loss": 2.979531478881836, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 0.33660778403282166, |
| "learning_rate": 0.00027174197995141866, |
| "loss": 2.972854232788086, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.3239246904850006, |
| "learning_rate": 0.0002713880505012855, |
| "loss": 2.9944473266601563, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.22125, |
| "grad_norm": 0.33757129311561584, |
| "learning_rate": 0.00027103215186900597, |
| "loss": 2.984470748901367, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 0.3297254741191864, |
| "learning_rate": 0.0002706742898280415, |
| "loss": 2.9779335021972657, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.22375, |
| "grad_norm": 0.3173220753669739, |
| "learning_rate": 0.0002703144701837044, |
| "loss": 2.989591598510742, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.3255234658718109, |
| "learning_rate": 0.00026995269877306356, |
| "loss": 2.9630172729492186, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.22625, |
| "grad_norm": 0.31826311349868774, |
| "learning_rate": 0.0002695889814648499, |
| "loss": 2.949393463134766, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 0.3174227774143219, |
| "learning_rate": 0.00026922332415936116, |
| "loss": 2.962572479248047, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.22875, |
| "grad_norm": 0.3316509425640106, |
| "learning_rate": 0.0002688557327883659, |
| "loss": 2.940341567993164, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.3224658668041229, |
| "learning_rate": 0.00026848621331500766, |
| "loss": 2.96124267578125, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.23125, |
| "grad_norm": 0.32638388872146606, |
| "learning_rate": 0.00026811477173370815, |
| "loss": 2.967144775390625, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 0.3130776286125183, |
| "learning_rate": 0.0002677414140700696, |
| "loss": 2.946596145629883, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.23375, |
| "grad_norm": 0.3233889937400818, |
| "learning_rate": 0.0002673661463807776, |
| "loss": 2.967226028442383, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 0.32231318950653076, |
| "learning_rate": 0.00026698897475350254, |
| "loss": 2.9710044860839844, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.23625, |
| "grad_norm": 0.32844141125679016, |
| "learning_rate": 0.0002666099053068007, |
| "loss": 2.98897762298584, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 0.32306230068206787, |
| "learning_rate": 0.00026622894419001537, |
| "loss": 2.978479766845703, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.23875, |
| "grad_norm": 0.3146902620792389, |
| "learning_rate": 0.0002658460975831769, |
| "loss": 2.9550348281860352, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.3284965753555298, |
| "learning_rate": 0.00026546137169690235, |
| "loss": 2.954052734375, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.24125, |
| "grad_norm": 0.3169645071029663, |
| "learning_rate": 0.00026507477277229496, |
| "loss": 2.9700557708740236, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 0.32831114530563354, |
| "learning_rate": 0.0002646863070808425, |
| "loss": 2.965903091430664, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.24375, |
| "grad_norm": 0.3175643980503082, |
| "learning_rate": 0.0002642959809243163, |
| "loss": 2.9406291961669924, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.000875, |
| "grad_norm": 0.3207930624485016, |
| "learning_rate": 0.00026390380063466806, |
| "loss": 3.2728614807128906, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.002125, |
| "grad_norm": 0.33733275532722473, |
| "learning_rate": 0.000263509772573928, |
| "loss": 2.9439430236816406, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.003375, |
| "grad_norm": 0.3284999132156372, |
| "learning_rate": 0.00026311390313410097, |
| "loss": 2.9343544006347657, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.004625, |
| "grad_norm": 0.31731417775154114, |
| "learning_rate": 0.0002627161987370632, |
| "loss": 2.9394100189208983, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.005875, |
| "grad_norm": 0.3323110044002533, |
| "learning_rate": 0.000262316665834458, |
| "loss": 2.932010269165039, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.007125, |
| "grad_norm": 0.3103737533092499, |
| "learning_rate": 0.000261915310907591, |
| "loss": 2.9386672973632812, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.008375, |
| "grad_norm": 0.311427503824234, |
| "learning_rate": 0.0002615121404673251, |
| "loss": 2.893377113342285, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.009625, |
| "grad_norm": 0.31239134073257446, |
| "learning_rate": 0.00026110716105397485, |
| "loss": 2.9058563232421877, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.010875, |
| "grad_norm": 0.31634724140167236, |
| "learning_rate": 0.0002607003792372004, |
| "loss": 2.9108051300048827, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.012125, |
| "grad_norm": 0.3243110775947571, |
| "learning_rate": 0.00026029180161590067, |
| "loss": 2.919489288330078, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.013375, |
| "grad_norm": 0.3139530420303345, |
| "learning_rate": 0.0002598814348181068, |
| "loss": 2.901732635498047, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.014625, |
| "grad_norm": 0.33199912309646606, |
| "learning_rate": 0.000259469285500874, |
| "loss": 2.9169565200805665, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.015875, |
| "grad_norm": 0.31367039680480957, |
| "learning_rate": 0.0002590553603501741, |
| "loss": 2.8976207733154298, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.017125, |
| "grad_norm": 0.3230586349964142, |
| "learning_rate": 0.00025863966608078673, |
| "loss": 2.882274055480957, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.018375, |
| "grad_norm": 0.313128262758255, |
| "learning_rate": 0.0002582222094361907, |
| "loss": 2.8793052673339843, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.019625, |
| "grad_norm": 0.31868046522140503, |
| "learning_rate": 0.00025780299718845416, |
| "loss": 2.8837726593017576, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.020875, |
| "grad_norm": 0.32915839552879333, |
| "learning_rate": 0.00025738203613812543, |
| "loss": 2.8813737869262694, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.022125, |
| "grad_norm": 0.3112785220146179, |
| "learning_rate": 0.0002569593331141218, |
| "loss": 2.8455078125, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.023375, |
| "grad_norm": 0.29719340801239014, |
| "learning_rate": 0.0002565348949736196, |
| "loss": 2.8642330169677734, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.024625, |
| "grad_norm": 0.3277027904987335, |
| "learning_rate": 0.0002561087286019424, |
| "loss": 2.8876218795776367, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.025875, |
| "grad_norm": 0.3183266520500183, |
| "learning_rate": 0.0002556808409124494, |
| "loss": 2.883536720275879, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.027125, |
| "grad_norm": 0.3155848979949951, |
| "learning_rate": 0.00025525123884642366, |
| "loss": 2.862305450439453, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.028375, |
| "grad_norm": 0.3132951855659485, |
| "learning_rate": 0.000254819929372959, |
| "loss": 2.856226348876953, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.029625, |
| "grad_norm": 0.3311610221862793, |
| "learning_rate": 0.0002543869194888471, |
| "loss": 2.854077911376953, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.030875, |
| "grad_norm": 0.31400758028030396, |
| "learning_rate": 0.00025395221621846435, |
| "loss": 2.8614845275878906, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.032125, |
| "grad_norm": 0.30809491872787476, |
| "learning_rate": 0.00025351582661365724, |
| "loss": 2.849281883239746, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.033375, |
| "grad_norm": 0.3170127272605896, |
| "learning_rate": 0.00025307775775362855, |
| "loss": 2.8551118850708006, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.034625, |
| "grad_norm": 0.33024269342422485, |
| "learning_rate": 0.0002526380167448223, |
| "loss": 2.867789459228516, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.035875, |
| "grad_norm": 0.31417202949523926, |
| "learning_rate": 0.0002521966107208084, |
| "loss": 2.841164779663086, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.037125, |
| "grad_norm": 0.31379812955856323, |
| "learning_rate": 0.0002517535468421669, |
| "loss": 2.8253313064575196, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.038375, |
| "grad_norm": 0.32265710830688477, |
| "learning_rate": 0.00025130883229637196, |
| "loss": 2.7978553771972656, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.039625, |
| "grad_norm": 0.3156164288520813, |
| "learning_rate": 0.0002508624742976753, |
| "loss": 2.8533695220947264, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.040875, |
| "grad_norm": 0.31064850091934204, |
| "learning_rate": 0.000250414480086989, |
| "loss": 2.8257659912109374, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.042125, |
| "grad_norm": 0.31829559803009033, |
| "learning_rate": 0.00024996485693176815, |
| "loss": 2.8458267211914063, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.043375, |
| "grad_norm": 0.3130020499229431, |
| "learning_rate": 0.0002495136121258928, |
| "loss": 2.823675537109375, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.044625, |
| "grad_norm": 0.31633248925209045, |
| "learning_rate": 0.0002490607529895499, |
| "loss": 2.831635093688965, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.045875, |
| "grad_norm": 0.3220139741897583, |
| "learning_rate": 0.00024860628686911436, |
| "loss": 2.8292957305908204, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.047125, |
| "grad_norm": 0.3184780180454254, |
| "learning_rate": 0.0002481502211370298, |
| "loss": 2.813566207885742, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.048375, |
| "grad_norm": 0.3212302625179291, |
| "learning_rate": 0.00024769256319168923, |
| "loss": 2.820956993103027, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.049625, |
| "grad_norm": 0.31229448318481445, |
| "learning_rate": 0.00024723332045731484, |
| "loss": 2.8484216690063477, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.050875, |
| "grad_norm": 0.3293566107749939, |
| "learning_rate": 0.0002467725003838375, |
| "loss": 2.822405242919922, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.052125, |
| "grad_norm": 0.32199931144714355, |
| "learning_rate": 0.00024631011044677615, |
| "loss": 2.8012535095214846, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.053375, |
| "grad_norm": 0.3443029820919037, |
| "learning_rate": 0.00024584615814711626, |
| "loss": 2.8285900115966798, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.054625, |
| "grad_norm": 0.31065329909324646, |
| "learning_rate": 0.00024538065101118833, |
| "loss": 2.8233985900878906, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.055875, |
| "grad_norm": 0.31999537348747253, |
| "learning_rate": 0.0002449135965905457, |
| "loss": 2.819304656982422, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.057125, |
| "grad_norm": 0.3119924068450928, |
| "learning_rate": 0.0002444450024618422, |
| "loss": 2.794045257568359, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.058375, |
| "grad_norm": 0.30880260467529297, |
| "learning_rate": 0.00024397487622670894, |
| "loss": 2.8043006896972655, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.059625, |
| "grad_norm": 0.33332186937332153, |
| "learning_rate": 0.0002435032255116313, |
| "loss": 2.795323371887207, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.060875, |
| "grad_norm": 0.313965767621994, |
| "learning_rate": 0.00024303005796782508, |
| "loss": 2.823124313354492, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.062125, |
| "grad_norm": 0.3164885938167572, |
| "learning_rate": 0.0002425553812711123, |
| "loss": 2.8160400390625, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.063375, |
| "grad_norm": 0.3221667408943176, |
| "learning_rate": 0.00024207920312179686, |
| "loss": 2.7901626586914063, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.064625, |
| "grad_norm": 0.31535276770591736, |
| "learning_rate": 0.0002416015312445396, |
| "loss": 2.7759862899780274, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.065875, |
| "grad_norm": 0.3354146480560303, |
| "learning_rate": 0.0002411223733882328, |
| "loss": 2.7960500717163086, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.067125, |
| "grad_norm": 0.36151498556137085, |
| "learning_rate": 0.0002406417373258746, |
| "loss": 2.787026596069336, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.068375, |
| "grad_norm": 0.32125601172447205, |
| "learning_rate": 0.00024015963085444297, |
| "loss": 2.8015857696533204, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.069625, |
| "grad_norm": 0.31707149744033813, |
| "learning_rate": 0.00023967606179476914, |
| "loss": 2.7936481475830077, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.070875, |
| "grad_norm": 0.3259861171245575, |
| "learning_rate": 0.00023919103799141078, |
| "loss": 2.7787807464599608, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.072125, |
| "grad_norm": 0.32442429661750793, |
| "learning_rate": 0.00023870456731252466, |
| "loss": 2.7599460601806642, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.073375, |
| "grad_norm": 0.3134000599384308, |
| "learning_rate": 0.0002382166576497391, |
| "loss": 2.802600288391113, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.074625, |
| "grad_norm": 0.31214284896850586, |
| "learning_rate": 0.00023772731691802583, |
| "loss": 2.783060073852539, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.075875, |
| "grad_norm": 0.31292805075645447, |
| "learning_rate": 0.00023723655305557187, |
| "loss": 2.766581153869629, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.077125, |
| "grad_norm": 0.3185879588127136, |
| "learning_rate": 0.0002367443740236504, |
| "loss": 2.7777988433837892, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.078375, |
| "grad_norm": 0.33435672521591187, |
| "learning_rate": 0.00023625078780649178, |
| "loss": 2.783607292175293, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.079625, |
| "grad_norm": 0.32837480306625366, |
| "learning_rate": 0.00023575580241115408, |
| "loss": 2.778690719604492, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.080875, |
| "grad_norm": 0.3214893639087677, |
| "learning_rate": 0.00023525942586739309, |
| "loss": 2.7992782592773438, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.082125, |
| "grad_norm": 0.31656068563461304, |
| "learning_rate": 0.00023476166622753212, |
| "loss": 2.7780582427978517, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.083375, |
| "grad_norm": 0.32466331124305725, |
| "learning_rate": 0.0002342625315663314, |
| "loss": 2.756012535095215, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.084625, |
| "grad_norm": 0.33582648634910583, |
| "learning_rate": 0.0002337620299808569, |
| "loss": 2.791756820678711, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.085875, |
| "grad_norm": 0.320222944021225, |
| "learning_rate": 0.00023326016959034922, |
| "loss": 2.7860565185546875, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.087125, |
| "grad_norm": 0.30881232023239136, |
| "learning_rate": 0.00023275695853609184, |
| "loss": 2.765717315673828, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.088375, |
| "grad_norm": 0.32089343667030334, |
| "learning_rate": 0.00023225240498127883, |
| "loss": 2.798817825317383, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.089625, |
| "grad_norm": 0.3162356913089752, |
| "learning_rate": 0.00023174651711088272, |
| "loss": 2.7743175506591795, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.090875, |
| "grad_norm": 0.3252812623977661, |
| "learning_rate": 0.0002312393031315215, |
| "loss": 2.74603271484375, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.092125, |
| "grad_norm": 0.32010287046432495, |
| "learning_rate": 0.00023073077127132562, |
| "loss": 2.7529132843017576, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.093375, |
| "grad_norm": 0.31325286626815796, |
| "learning_rate": 0.00023022092977980442, |
| "loss": 2.759819984436035, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.094625, |
| "grad_norm": 0.31815576553344727, |
| "learning_rate": 0.00022970978692771242, |
| "loss": 2.762770080566406, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.095875, |
| "grad_norm": 0.3144858181476593, |
| "learning_rate": 0.00022919735100691504, |
| "loss": 2.7560394287109373, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.097125, |
| "grad_norm": 0.3316778540611267, |
| "learning_rate": 0.00022868363033025406, |
| "loss": 2.7469573974609376, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.098375, |
| "grad_norm": 0.3257002830505371, |
| "learning_rate": 0.000228168633231413, |
| "loss": 2.743033218383789, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.099625, |
| "grad_norm": 0.31909677386283875, |
| "learning_rate": 0.00022765236806478154, |
| "loss": 2.7673528671264647, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.100875, |
| "grad_norm": 0.31668204069137573, |
| "learning_rate": 0.00022713484320532055, |
| "loss": 2.7211772918701174, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.102125, |
| "grad_norm": 0.3331785202026367, |
| "learning_rate": 0.00022661606704842558, |
| "loss": 2.722859191894531, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.103375, |
| "grad_norm": 0.33396124839782715, |
| "learning_rate": 0.00022609604800979111, |
| "loss": 2.7449823379516602, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.104625, |
| "grad_norm": 0.31729063391685486, |
| "learning_rate": 0.00022557479452527392, |
| "loss": 2.725127601623535, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.105875, |
| "grad_norm": 0.3257978558540344, |
| "learning_rate": 0.00022505231505075613, |
| "loss": 2.7355205535888674, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.107125, |
| "grad_norm": 0.33255940675735474, |
| "learning_rate": 0.00022452861806200838, |
| "loss": 2.7257015228271486, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.108375, |
| "grad_norm": 0.32612425088882446, |
| "learning_rate": 0.00022400371205455176, |
| "loss": 2.767654800415039, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.109625, |
| "grad_norm": 0.3252048194408417, |
| "learning_rate": 0.0002234776055435205, |
| "loss": 2.7518121719360353, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.110875, |
| "grad_norm": 0.31889963150024414, |
| "learning_rate": 0.00022295030706352356, |
| "loss": 2.7312660217285156, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.112125, |
| "grad_norm": 0.319558709859848, |
| "learning_rate": 0.00022242182516850635, |
| "loss": 2.740167236328125, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.113375, |
| "grad_norm": 0.3091806173324585, |
| "learning_rate": 0.0002218921684316119, |
| "loss": 2.748370552062988, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.114625, |
| "grad_norm": 0.33689746260643005, |
| "learning_rate": 0.00022136134544504163, |
| "loss": 2.787190628051758, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.115875, |
| "grad_norm": 0.31660985946655273, |
| "learning_rate": 0.0002208293648199162, |
| "loss": 2.7415428161621094, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.117125, |
| "grad_norm": 0.325989305973053, |
| "learning_rate": 0.0002202962351861357, |
| "loss": 2.7200296401977537, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.118375, |
| "grad_norm": 0.3281361758708954, |
| "learning_rate": 0.0002197619651922397, |
| "loss": 2.7388254165649415, |
| "step": 2900 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 8000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.8847708770441626e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|