{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996599690880989, "eval_steps": 500, "global_step": 2424, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012364760432766615, "grad_norm": 0.04910803958773613, "learning_rate": 0.00029876237623762373, "loss": 1.0084, "step": 10 }, { "epoch": 0.02472952086553323, "grad_norm": 0.037698596715927124, "learning_rate": 0.0002975247524752475, "loss": 0.9609, "step": 20 }, { "epoch": 0.03709428129829984, "grad_norm": 0.03782161325216293, "learning_rate": 0.00029628712871287126, "loss": 0.9033, "step": 30 }, { "epoch": 0.04945904173106646, "grad_norm": 0.04191258177161217, "learning_rate": 0.000295049504950495, "loss": 0.9154, "step": 40 }, { "epoch": 0.061823802163833076, "grad_norm": 0.0391441248357296, "learning_rate": 0.0002938118811881188, "loss": 0.9001, "step": 50 }, { "epoch": 0.07418856259659969, "grad_norm": 0.04036989435553551, "learning_rate": 0.00029257425742574254, "loss": 0.9069, "step": 60 }, { "epoch": 0.0865533230293663, "grad_norm": 0.036793895065784454, "learning_rate": 0.0002913366336633663, "loss": 0.8903, "step": 70 }, { "epoch": 0.09891808346213292, "grad_norm": 0.04941694810986519, "learning_rate": 0.00029009900990099006, "loss": 0.928, "step": 80 }, { "epoch": 0.11128284389489954, "grad_norm": 0.03952586278319359, "learning_rate": 0.0002888613861386138, "loss": 0.9121, "step": 90 }, { "epoch": 0.12364760432766615, "grad_norm": 0.04417801648378372, "learning_rate": 0.0002876237623762376, "loss": 0.8918, "step": 100 }, { "epoch": 0.13601236476043277, "grad_norm": 0.03843015059828758, "learning_rate": 0.00028638613861386135, "loss": 0.8988, "step": 110 }, { "epoch": 0.14837712519319937, "grad_norm": 0.035807665437459946, "learning_rate": 0.0002851485148514851, "loss": 0.899, "step": 120 }, { "epoch": 0.160741885625966, "grad_norm": 0.041015319526195526, "learning_rate": 0.00028391089108910887, "loss": 0.8937, "step": 130 }, { "epoch": 0.1731066460587326, "grad_norm": 0.04108859598636627, "learning_rate": 0.00028267326732673263, "loss": 0.894, "step": 140 }, { "epoch": 0.18547140649149924, "grad_norm": 0.03874868154525757, "learning_rate": 0.0002814356435643564, "loss": 0.9088, "step": 150 }, { "epoch": 0.19783616692426584, "grad_norm": 0.03931848704814911, "learning_rate": 0.00028019801980198015, "loss": 0.9079, "step": 160 }, { "epoch": 0.21020092735703247, "grad_norm": 0.04638601467013359, "learning_rate": 0.0002789603960396039, "loss": 0.9057, "step": 170 }, { "epoch": 0.22256568778979907, "grad_norm": 0.05649528279900551, "learning_rate": 0.00027772277227722773, "loss": 0.872, "step": 180 }, { "epoch": 0.23493044822256567, "grad_norm": 0.043013814836740494, "learning_rate": 0.00027648514851485144, "loss": 0.9102, "step": 190 }, { "epoch": 0.2472952086553323, "grad_norm": 0.04763510450720787, "learning_rate": 0.0002752475247524752, "loss": 0.8894, "step": 200 }, { "epoch": 0.2596599690880989, "grad_norm": 0.03904448449611664, "learning_rate": 0.000274009900990099, "loss": 0.8856, "step": 210 }, { "epoch": 0.27202472952086554, "grad_norm": 0.04328664019703865, "learning_rate": 0.0002727722772277227, "loss": 0.8967, "step": 220 }, { "epoch": 0.28438948995363217, "grad_norm": 0.04718885198235512, "learning_rate": 0.0002715346534653465, "loss": 0.892, "step": 230 }, { "epoch": 0.29675425038639874, "grad_norm": 0.048305947333574295, "learning_rate": 0.0002702970297029703, "loss": 0.8896, "step": 240 }, { "epoch": 0.3091190108191654, "grad_norm": 0.04656008258461952, "learning_rate": 0.000269059405940594, "loss": 0.8942, "step": 250 }, { "epoch": 0.321483771251932, "grad_norm": 0.041436970233917236, "learning_rate": 0.00026782178217821777, "loss": 0.892, "step": 260 }, { "epoch": 0.33384853168469864, "grad_norm": 0.043510112911462784, "learning_rate": 0.0002665841584158416, "loss": 0.8813, "step": 270 }, { "epoch": 0.3462132921174652, "grad_norm": 0.039642177522182465, "learning_rate": 0.00026534653465346534, "loss": 0.8884, "step": 280 }, { "epoch": 0.35857805255023184, "grad_norm": 0.043365489691495895, "learning_rate": 0.00026410891089108905, "loss": 0.8963, "step": 290 }, { "epoch": 0.37094281298299847, "grad_norm": 0.04440496489405632, "learning_rate": 0.00026287128712871287, "loss": 0.8865, "step": 300 }, { "epoch": 0.38330757341576505, "grad_norm": 0.04222091659903526, "learning_rate": 0.00026163366336633663, "loss": 0.8935, "step": 310 }, { "epoch": 0.3956723338485317, "grad_norm": 0.04118286073207855, "learning_rate": 0.00026039603960396033, "loss": 0.8742, "step": 320 }, { "epoch": 0.4080370942812983, "grad_norm": 0.04324512556195259, "learning_rate": 0.00025915841584158415, "loss": 0.8918, "step": 330 }, { "epoch": 0.42040185471406494, "grad_norm": 0.04111215099692345, "learning_rate": 0.0002579207920792079, "loss": 0.8795, "step": 340 }, { "epoch": 0.4327666151468315, "grad_norm": 0.04449688270688057, "learning_rate": 0.0002566831683168316, "loss": 0.8889, "step": 350 }, { "epoch": 0.44513137557959814, "grad_norm": 0.04658028110861778, "learning_rate": 0.00025544554455445543, "loss": 0.8798, "step": 360 }, { "epoch": 0.4574961360123648, "grad_norm": 0.037669096142053604, "learning_rate": 0.0002542079207920792, "loss": 0.8809, "step": 370 }, { "epoch": 0.46986089644513135, "grad_norm": 0.04158737137913704, "learning_rate": 0.00025297029702970296, "loss": 0.8922, "step": 380 }, { "epoch": 0.482225656877898, "grad_norm": 0.047567520290613174, "learning_rate": 0.0002517326732673267, "loss": 0.8949, "step": 390 }, { "epoch": 0.4945904173106646, "grad_norm": 0.057194869965314865, "learning_rate": 0.0002504950495049505, "loss": 0.8774, "step": 400 }, { "epoch": 0.5069551777434312, "grad_norm": 0.04181217402219772, "learning_rate": 0.00024925742574257424, "loss": 0.8907, "step": 410 }, { "epoch": 0.5193199381761978, "grad_norm": 0.045876242220401764, "learning_rate": 0.000248019801980198, "loss": 0.8841, "step": 420 }, { "epoch": 0.5316846986089645, "grad_norm": 0.041932158172130585, "learning_rate": 0.00024678217821782176, "loss": 0.8582, "step": 430 }, { "epoch": 0.5440494590417311, "grad_norm": 0.044740475714206696, "learning_rate": 0.0002455445544554455, "loss": 0.9204, "step": 440 }, { "epoch": 0.5564142194744977, "grad_norm": 0.04608389362692833, "learning_rate": 0.0002443069306930693, "loss": 0.8859, "step": 450 }, { "epoch": 0.5687789799072643, "grad_norm": 0.04750910773873329, "learning_rate": 0.00024306930693069305, "loss": 0.8786, "step": 460 }, { "epoch": 0.5811437403400309, "grad_norm": 0.0407898910343647, "learning_rate": 0.0002418316831683168, "loss": 0.8731, "step": 470 }, { "epoch": 0.5935085007727975, "grad_norm": 0.047790151089429855, "learning_rate": 0.0002405940594059406, "loss": 0.8984, "step": 480 }, { "epoch": 0.6058732612055642, "grad_norm": 0.04182444140315056, "learning_rate": 0.00023935643564356433, "loss": 0.8865, "step": 490 }, { "epoch": 0.6182380216383307, "grad_norm": 0.04374885559082031, "learning_rate": 0.0002381188118811881, "loss": 0.8736, "step": 500 }, { "epoch": 0.6306027820710973, "grad_norm": 0.04540247470140457, "learning_rate": 0.00023688118811881188, "loss": 0.8977, "step": 510 }, { "epoch": 0.642967542503864, "grad_norm": 0.039125751703977585, "learning_rate": 0.00023564356435643561, "loss": 0.8955, "step": 520 }, { "epoch": 0.6553323029366306, "grad_norm": 0.04842868447303772, "learning_rate": 0.00023440594059405938, "loss": 0.8979, "step": 530 }, { "epoch": 0.6676970633693973, "grad_norm": 0.04414287582039833, "learning_rate": 0.00023316831683168316, "loss": 0.9063, "step": 540 }, { "epoch": 0.6800618238021638, "grad_norm": 0.05018250271677971, "learning_rate": 0.0002319306930693069, "loss": 0.894, "step": 550 }, { "epoch": 0.6924265842349304, "grad_norm": 0.04726792126893997, "learning_rate": 0.00023069306930693066, "loss": 0.8716, "step": 560 }, { "epoch": 0.7047913446676971, "grad_norm": 0.049401551485061646, "learning_rate": 0.00022945544554455445, "loss": 0.9016, "step": 570 }, { "epoch": 0.7171561051004637, "grad_norm": 0.049783241003751755, "learning_rate": 0.0002282178217821782, "loss": 0.8774, "step": 580 }, { "epoch": 0.7295208655332303, "grad_norm": 0.04755168408155441, "learning_rate": 0.00022698019801980194, "loss": 0.8814, "step": 590 }, { "epoch": 0.7418856259659969, "grad_norm": 0.04885553568601608, "learning_rate": 0.00022574257425742573, "loss": 0.8744, "step": 600 }, { "epoch": 0.7542503863987635, "grad_norm": 0.04771718755364418, "learning_rate": 0.0002245049504950495, "loss": 0.9001, "step": 610 }, { "epoch": 0.7666151468315301, "grad_norm": 0.04642605781555176, "learning_rate": 0.00022326732673267323, "loss": 0.88, "step": 620 }, { "epoch": 0.7789799072642968, "grad_norm": 0.047350749373435974, "learning_rate": 0.00022202970297029702, "loss": 0.8928, "step": 630 }, { "epoch": 0.7913446676970634, "grad_norm": 0.04467844218015671, "learning_rate": 0.00022079207920792078, "loss": 0.885, "step": 640 }, { "epoch": 0.80370942812983, "grad_norm": 0.04457986727356911, "learning_rate": 0.0002195544554455445, "loss": 0.888, "step": 650 }, { "epoch": 0.8160741885625966, "grad_norm": 0.04410697519779205, "learning_rate": 0.0002183168316831683, "loss": 0.8888, "step": 660 }, { "epoch": 0.8284389489953632, "grad_norm": 0.0475030243396759, "learning_rate": 0.00021707920792079206, "loss": 0.9009, "step": 670 }, { "epoch": 0.8408037094281299, "grad_norm": 0.043028101325035095, "learning_rate": 0.00021584158415841585, "loss": 0.8735, "step": 680 }, { "epoch": 0.8531684698608965, "grad_norm": 0.04463913291692734, "learning_rate": 0.00021460396039603958, "loss": 0.8742, "step": 690 }, { "epoch": 0.865533230293663, "grad_norm": 0.04648848995566368, "learning_rate": 0.00021336633663366334, "loss": 0.8899, "step": 700 }, { "epoch": 0.8778979907264297, "grad_norm": 0.04463621601462364, "learning_rate": 0.00021212871287128713, "loss": 0.8887, "step": 710 }, { "epoch": 0.8902627511591963, "grad_norm": 0.04241452366113663, "learning_rate": 0.00021089108910891087, "loss": 0.8749, "step": 720 }, { "epoch": 0.9026275115919629, "grad_norm": 0.04464114084839821, "learning_rate": 0.00020965346534653463, "loss": 0.8774, "step": 730 }, { "epoch": 0.9149922720247295, "grad_norm": 0.04345027729868889, "learning_rate": 0.00020841584158415842, "loss": 0.8753, "step": 740 }, { "epoch": 0.9273570324574961, "grad_norm": 0.048532094806432724, "learning_rate": 0.00020717821782178215, "loss": 0.8946, "step": 750 }, { "epoch": 0.9397217928902627, "grad_norm": 0.04126739129424095, "learning_rate": 0.0002059405940594059, "loss": 0.903, "step": 760 }, { "epoch": 0.9520865533230294, "grad_norm": 0.04423375427722931, "learning_rate": 0.0002047029702970297, "loss": 0.8843, "step": 770 }, { "epoch": 0.964451313755796, "grad_norm": 0.04136930778622627, "learning_rate": 0.00020346534653465346, "loss": 0.8757, "step": 780 }, { "epoch": 0.9768160741885626, "grad_norm": 0.05331163853406906, "learning_rate": 0.0002022277227722772, "loss": 0.8842, "step": 790 }, { "epoch": 0.9891808346213292, "grad_norm": 0.04790889099240303, "learning_rate": 0.00020099009900990098, "loss": 0.8814, "step": 800 }, { "epoch": 1.0012364760432766, "grad_norm": 0.05177275091409683, "learning_rate": 0.00019975247524752475, "loss": 0.8858, "step": 810 }, { "epoch": 1.0136012364760432, "grad_norm": 0.0411980040371418, "learning_rate": 0.00019851485148514848, "loss": 0.8461, "step": 820 }, { "epoch": 1.02596599690881, "grad_norm": 0.04518349468708038, "learning_rate": 0.00019727722772277227, "loss": 0.8547, "step": 830 }, { "epoch": 1.0383307573415765, "grad_norm": 0.047048419713974, "learning_rate": 0.00019603960396039603, "loss": 0.8502, "step": 840 }, { "epoch": 1.0506955177743431, "grad_norm": 0.04998902231454849, "learning_rate": 0.00019480198019801976, "loss": 0.8584, "step": 850 }, { "epoch": 1.0630602782071097, "grad_norm": 0.05004483088850975, "learning_rate": 0.00019356435643564355, "loss": 0.8787, "step": 860 }, { "epoch": 1.0754250386398763, "grad_norm": 0.0483798012137413, "learning_rate": 0.0001923267326732673, "loss": 0.8732, "step": 870 }, { "epoch": 1.087789799072643, "grad_norm": 0.048114124685525894, "learning_rate": 0.00019108910891089107, "loss": 0.8773, "step": 880 }, { "epoch": 1.1001545595054096, "grad_norm": 0.04553611949086189, "learning_rate": 0.00018985148514851484, "loss": 0.8646, "step": 890 }, { "epoch": 1.1125193199381762, "grad_norm": 0.052288319915533066, "learning_rate": 0.0001886138613861386, "loss": 0.8592, "step": 900 }, { "epoch": 1.1248840803709428, "grad_norm": 0.05070117861032486, "learning_rate": 0.00018737623762376236, "loss": 0.8565, "step": 910 }, { "epoch": 1.1372488408037094, "grad_norm": 0.049008361995220184, "learning_rate": 0.00018613861386138612, "loss": 0.8783, "step": 920 }, { "epoch": 1.1496136012364762, "grad_norm": 0.04916449636220932, "learning_rate": 0.00018490099009900988, "loss": 0.8668, "step": 930 }, { "epoch": 1.1619783616692427, "grad_norm": 0.05646826699376106, "learning_rate": 0.00018366336633663364, "loss": 0.858, "step": 940 }, { "epoch": 1.1743431221020093, "grad_norm": 0.05039024353027344, "learning_rate": 0.0001824257425742574, "loss": 0.8687, "step": 950 }, { "epoch": 1.1867078825347759, "grad_norm": 0.052257779985666275, "learning_rate": 0.00018118811881188116, "loss": 0.8731, "step": 960 }, { "epoch": 1.1990726429675425, "grad_norm": 0.04960246384143829, "learning_rate": 0.00017995049504950493, "loss": 0.8346, "step": 970 }, { "epoch": 1.211437403400309, "grad_norm": 0.05193152651190758, "learning_rate": 0.00017871287128712871, "loss": 0.8656, "step": 980 }, { "epoch": 1.2238021638330758, "grad_norm": 0.05180949717760086, "learning_rate": 0.00017747524752475245, "loss": 0.8542, "step": 990 }, { "epoch": 1.2361669242658424, "grad_norm": 0.05225878953933716, "learning_rate": 0.0001762376237623762, "loss": 0.8628, "step": 1000 }, { "epoch": 1.248531684698609, "grad_norm": 0.05485387519001961, "learning_rate": 0.000175, "loss": 0.8746, "step": 1010 }, { "epoch": 1.2608964451313756, "grad_norm": 0.06754795461893082, "learning_rate": 0.00017376237623762373, "loss": 0.8702, "step": 1020 }, { "epoch": 1.2732612055641421, "grad_norm": 0.05525548383593559, "learning_rate": 0.00017252475247524752, "loss": 0.863, "step": 1030 }, { "epoch": 1.2856259659969087, "grad_norm": 0.05193280428647995, "learning_rate": 0.00017128712871287128, "loss": 0.8389, "step": 1040 }, { "epoch": 1.2979907264296755, "grad_norm": 0.04822159186005592, "learning_rate": 0.00017004950495049502, "loss": 0.8665, "step": 1050 }, { "epoch": 1.310355486862442, "grad_norm": 0.05497356876730919, "learning_rate": 0.0001688118811881188, "loss": 0.8635, "step": 1060 }, { "epoch": 1.3227202472952087, "grad_norm": 0.05118054896593094, "learning_rate": 0.00016757425742574257, "loss": 0.8483, "step": 1070 }, { "epoch": 1.3350850077279752, "grad_norm": 0.051902711391448975, "learning_rate": 0.00016633663366336633, "loss": 0.8478, "step": 1080 }, { "epoch": 1.3474497681607418, "grad_norm": 0.049953706562519073, "learning_rate": 0.0001650990099009901, "loss": 0.8569, "step": 1090 }, { "epoch": 1.3598145285935086, "grad_norm": 0.09028486907482147, "learning_rate": 0.00016386138613861385, "loss": 0.8465, "step": 1100 }, { "epoch": 1.3721792890262752, "grad_norm": 0.05248475819826126, "learning_rate": 0.0001626237623762376, "loss": 0.8707, "step": 1110 }, { "epoch": 1.3845440494590417, "grad_norm": 0.05470622703433037, "learning_rate": 0.00016138613861386137, "loss": 0.8581, "step": 1120 }, { "epoch": 1.3969088098918083, "grad_norm": 0.051429346203804016, "learning_rate": 0.00016014851485148513, "loss": 0.867, "step": 1130 }, { "epoch": 1.409273570324575, "grad_norm": 0.05353890359401703, "learning_rate": 0.0001589108910891089, "loss": 0.8489, "step": 1140 }, { "epoch": 1.4216383307573417, "grad_norm": 0.0630929172039032, "learning_rate": 0.00015767326732673266, "loss": 0.8575, "step": 1150 }, { "epoch": 1.4340030911901083, "grad_norm": 0.0524783730506897, "learning_rate": 0.00015643564356435642, "loss": 0.8527, "step": 1160 }, { "epoch": 1.4463678516228748, "grad_norm": 0.05413209646940231, "learning_rate": 0.00015519801980198018, "loss": 0.8786, "step": 1170 }, { "epoch": 1.4587326120556414, "grad_norm": 0.055751536041498184, "learning_rate": 0.00015396039603960397, "loss": 0.872, "step": 1180 }, { "epoch": 1.471097372488408, "grad_norm": 0.05271457880735397, "learning_rate": 0.0001527227722772277, "loss": 0.8734, "step": 1190 }, { "epoch": 1.4834621329211746, "grad_norm": 0.04827325418591499, "learning_rate": 0.00015148514851485146, "loss": 0.8488, "step": 1200 }, { "epoch": 1.4958268933539411, "grad_norm": 0.05717690661549568, "learning_rate": 0.00015024752475247525, "loss": 0.8732, "step": 1210 }, { "epoch": 1.508191653786708, "grad_norm": 0.055509038269519806, "learning_rate": 0.000149009900990099, "loss": 0.8675, "step": 1220 }, { "epoch": 1.5205564142194745, "grad_norm": 0.05562078580260277, "learning_rate": 0.00014777227722772275, "loss": 0.8644, "step": 1230 }, { "epoch": 1.532921174652241, "grad_norm": 0.046674925833940506, "learning_rate": 0.00014653465346534653, "loss": 0.8429, "step": 1240 }, { "epoch": 1.545285935085008, "grad_norm": 0.053251732140779495, "learning_rate": 0.0001452970297029703, "loss": 0.849, "step": 1250 }, { "epoch": 1.5576506955177742, "grad_norm": 0.05253510922193527, "learning_rate": 0.00014405940594059403, "loss": 0.8445, "step": 1260 }, { "epoch": 1.570015455950541, "grad_norm": 0.05021601915359497, "learning_rate": 0.00014282178217821782, "loss": 0.8668, "step": 1270 }, { "epoch": 1.5823802163833076, "grad_norm": 0.052446555346250534, "learning_rate": 0.00014158415841584158, "loss": 0.8733, "step": 1280 }, { "epoch": 1.5947449768160742, "grad_norm": 0.056364450603723526, "learning_rate": 0.00014034653465346534, "loss": 0.8823, "step": 1290 }, { "epoch": 1.6071097372488408, "grad_norm": 0.05288272723555565, "learning_rate": 0.0001391089108910891, "loss": 0.8678, "step": 1300 }, { "epoch": 1.6194744976816073, "grad_norm": 0.054042939096689224, "learning_rate": 0.00013787128712871286, "loss": 0.8439, "step": 1310 }, { "epoch": 1.6318392581143741, "grad_norm": 0.051554158329963684, "learning_rate": 0.00013663366336633662, "loss": 0.8514, "step": 1320 }, { "epoch": 1.6442040185471405, "grad_norm": 0.04892382398247719, "learning_rate": 0.00013539603960396039, "loss": 0.834, "step": 1330 }, { "epoch": 1.6565687789799073, "grad_norm": 0.05448554828763008, "learning_rate": 0.00013415841584158415, "loss": 0.8484, "step": 1340 }, { "epoch": 1.6689335394126739, "grad_norm": 0.056680306792259216, "learning_rate": 0.0001329207920792079, "loss": 0.8495, "step": 1350 }, { "epoch": 1.6812982998454404, "grad_norm": 0.05566761642694473, "learning_rate": 0.00013168316831683167, "loss": 0.856, "step": 1360 }, { "epoch": 1.6936630602782072, "grad_norm": 0.04952670633792877, "learning_rate": 0.00013044554455445543, "loss": 0.8405, "step": 1370 }, { "epoch": 1.7060278207109736, "grad_norm": 0.05578543245792389, "learning_rate": 0.0001292079207920792, "loss": 0.8555, "step": 1380 }, { "epoch": 1.7183925811437404, "grad_norm": 0.05533617362380028, "learning_rate": 0.00012797029702970295, "loss": 0.8558, "step": 1390 }, { "epoch": 1.730757341576507, "grad_norm": 0.05991559103131294, "learning_rate": 0.00012673267326732672, "loss": 0.8636, "step": 1400 }, { "epoch": 1.7431221020092735, "grad_norm": 0.054518427699804306, "learning_rate": 0.00012549504950495048, "loss": 0.8471, "step": 1410 }, { "epoch": 1.7554868624420403, "grad_norm": 0.04764275252819061, "learning_rate": 0.00012425742574257426, "loss": 0.8449, "step": 1420 }, { "epoch": 1.7678516228748067, "grad_norm": 0.058475952595472336, "learning_rate": 0.000123019801980198, "loss": 0.8523, "step": 1430 }, { "epoch": 1.7802163833075735, "grad_norm": 0.05991446226835251, "learning_rate": 0.00012178217821782177, "loss": 0.8564, "step": 1440 }, { "epoch": 1.79258114374034, "grad_norm": 0.0623490996658802, "learning_rate": 0.00012054455445544554, "loss": 0.8404, "step": 1450 }, { "epoch": 1.8049459041731066, "grad_norm": 0.04905753955245018, "learning_rate": 0.0001193069306930693, "loss": 0.8474, "step": 1460 }, { "epoch": 1.8173106646058734, "grad_norm": 0.05426807701587677, "learning_rate": 0.00011806930693069306, "loss": 0.8766, "step": 1470 }, { "epoch": 1.8296754250386398, "grad_norm": 0.0476132333278656, "learning_rate": 0.00011683168316831682, "loss": 0.815, "step": 1480 }, { "epoch": 1.8420401854714066, "grad_norm": 0.05849111080169678, "learning_rate": 0.0001155940594059406, "loss": 0.854, "step": 1490 }, { "epoch": 1.8544049459041732, "grad_norm": 0.05493124946951866, "learning_rate": 0.00011435643564356434, "loss": 0.8563, "step": 1500 }, { "epoch": 1.8667697063369397, "grad_norm": 0.05999801307916641, "learning_rate": 0.0001131188118811881, "loss": 0.8498, "step": 1510 }, { "epoch": 1.8791344667697063, "grad_norm": 0.058151423931121826, "learning_rate": 0.00011188118811881188, "loss": 0.8645, "step": 1520 }, { "epoch": 1.8914992272024729, "grad_norm": 0.05524227395653725, "learning_rate": 0.00011064356435643564, "loss": 0.8624, "step": 1530 }, { "epoch": 1.9038639876352397, "grad_norm": 0.06369632482528687, "learning_rate": 0.00010940594059405939, "loss": 0.8695, "step": 1540 }, { "epoch": 1.916228748068006, "grad_norm": 0.057092998176813126, "learning_rate": 0.00010816831683168316, "loss": 0.844, "step": 1550 }, { "epoch": 1.9285935085007728, "grad_norm": 0.05554778128862381, "learning_rate": 0.00010693069306930692, "loss": 0.8543, "step": 1560 }, { "epoch": 1.9409582689335394, "grad_norm": 0.05691225454211235, "learning_rate": 0.00010569306930693068, "loss": 0.8714, "step": 1570 }, { "epoch": 1.953323029366306, "grad_norm": 0.0564524307847023, "learning_rate": 0.00010445544554455445, "loss": 0.8574, "step": 1580 }, { "epoch": 1.9656877897990728, "grad_norm": 0.0588836595416069, "learning_rate": 0.0001032178217821782, "loss": 0.8558, "step": 1590 }, { "epoch": 1.9780525502318391, "grad_norm": 0.05634515732526779, "learning_rate": 0.00010198019801980197, "loss": 0.8444, "step": 1600 }, { "epoch": 1.990417310664606, "grad_norm": 0.055482737720012665, "learning_rate": 0.00010074257425742573, "loss": 0.8672, "step": 1610 }, { "epoch": 2.002472952086553, "grad_norm": 0.054257094860076904, "learning_rate": 9.95049504950495e-05, "loss": 0.8579, "step": 1620 }, { "epoch": 2.01483771251932, "grad_norm": 0.05709832161664963, "learning_rate": 9.826732673267325e-05, "loss": 0.8414, "step": 1630 }, { "epoch": 2.0272024729520863, "grad_norm": 0.05785168707370758, "learning_rate": 9.702970297029701e-05, "loss": 0.8223, "step": 1640 }, { "epoch": 2.039567233384853, "grad_norm": 0.060052480548620224, "learning_rate": 9.579207920792079e-05, "loss": 0.8371, "step": 1650 }, { "epoch": 2.05193199381762, "grad_norm": 0.06388446688652039, "learning_rate": 9.455445544554454e-05, "loss": 0.8308, "step": 1660 }, { "epoch": 2.0642967542503863, "grad_norm": 0.05495399236679077, "learning_rate": 9.331683168316831e-05, "loss": 0.8582, "step": 1670 }, { "epoch": 2.076661514683153, "grad_norm": 0.0544477179646492, "learning_rate": 9.207920792079207e-05, "loss": 0.8383, "step": 1680 }, { "epoch": 2.0890262751159194, "grad_norm": 0.06450890749692917, "learning_rate": 9.084158415841582e-05, "loss": 0.8359, "step": 1690 }, { "epoch": 2.1013910355486862, "grad_norm": 0.054119642823934555, "learning_rate": 8.96039603960396e-05, "loss": 0.8363, "step": 1700 }, { "epoch": 2.113755795981453, "grad_norm": 0.05726737529039383, "learning_rate": 8.836633663366336e-05, "loss": 0.8169, "step": 1710 }, { "epoch": 2.1261205564142194, "grad_norm": 0.0577755868434906, "learning_rate": 8.712871287128713e-05, "loss": 0.8586, "step": 1720 }, { "epoch": 2.138485316846986, "grad_norm": 0.06451012194156647, "learning_rate": 8.589108910891088e-05, "loss": 0.834, "step": 1730 }, { "epoch": 2.1508500772797525, "grad_norm": 0.06303463876247406, "learning_rate": 8.465346534653464e-05, "loss": 0.8333, "step": 1740 }, { "epoch": 2.1632148377125193, "grad_norm": 0.058561887592077255, "learning_rate": 8.341584158415841e-05, "loss": 0.8321, "step": 1750 }, { "epoch": 2.175579598145286, "grad_norm": 0.05364146828651428, "learning_rate": 8.217821782178216e-05, "loss": 0.8428, "step": 1760 }, { "epoch": 2.1879443585780525, "grad_norm": 0.063669353723526, "learning_rate": 8.094059405940594e-05, "loss": 0.8527, "step": 1770 }, { "epoch": 2.2003091190108193, "grad_norm": 0.05790480971336365, "learning_rate": 7.97029702970297e-05, "loss": 0.8261, "step": 1780 }, { "epoch": 2.2126738794435856, "grad_norm": 0.06101266294717789, "learning_rate": 7.846534653465345e-05, "loss": 0.8075, "step": 1790 }, { "epoch": 2.2250386398763524, "grad_norm": 0.06296826899051666, "learning_rate": 7.722772277227722e-05, "loss": 0.8284, "step": 1800 }, { "epoch": 2.237403400309119, "grad_norm": 0.0548894926905632, "learning_rate": 7.599009900990098e-05, "loss": 0.8385, "step": 1810 }, { "epoch": 2.2497681607418856, "grad_norm": 0.06245751306414604, "learning_rate": 7.475247524752474e-05, "loss": 0.8188, "step": 1820 }, { "epoch": 2.2621329211746524, "grad_norm": 0.06896353513002396, "learning_rate": 7.35148514851485e-05, "loss": 0.8229, "step": 1830 }, { "epoch": 2.2744976816074187, "grad_norm": 0.06569264829158783, "learning_rate": 7.227722772277227e-05, "loss": 0.8398, "step": 1840 }, { "epoch": 2.2868624420401855, "grad_norm": 0.06732139736413956, "learning_rate": 7.103960396039604e-05, "loss": 0.8439, "step": 1850 }, { "epoch": 2.2992272024729523, "grad_norm": 0.06835715472698212, "learning_rate": 6.98019801980198e-05, "loss": 0.821, "step": 1860 }, { "epoch": 2.3115919629057187, "grad_norm": 0.05850212648510933, "learning_rate": 6.856435643564355e-05, "loss": 0.8235, "step": 1870 }, { "epoch": 2.3239567233384855, "grad_norm": 0.06048553064465523, "learning_rate": 6.732673267326732e-05, "loss": 0.844, "step": 1880 }, { "epoch": 2.336321483771252, "grad_norm": 0.05443299934267998, "learning_rate": 6.608910891089109e-05, "loss": 0.8173, "step": 1890 }, { "epoch": 2.3486862442040186, "grad_norm": 0.06576599180698395, "learning_rate": 6.485148514851485e-05, "loss": 0.826, "step": 1900 }, { "epoch": 2.361051004636785, "grad_norm": 0.06261160224676132, "learning_rate": 6.361386138613861e-05, "loss": 0.8571, "step": 1910 }, { "epoch": 2.3734157650695518, "grad_norm": 0.05812652036547661, "learning_rate": 6.237623762376237e-05, "loss": 0.8227, "step": 1920 }, { "epoch": 2.3857805255023186, "grad_norm": 0.06309802830219269, "learning_rate": 6.113861386138613e-05, "loss": 0.8412, "step": 1930 }, { "epoch": 2.398145285935085, "grad_norm": 0.06207476556301117, "learning_rate": 5.99009900990099e-05, "loss": 0.8386, "step": 1940 }, { "epoch": 2.4105100463678517, "grad_norm": 0.05841566249728203, "learning_rate": 5.866336633663366e-05, "loss": 0.828, "step": 1950 }, { "epoch": 2.422874806800618, "grad_norm": 0.05857423320412636, "learning_rate": 5.742574257425742e-05, "loss": 0.8198, "step": 1960 }, { "epoch": 2.435239567233385, "grad_norm": 0.06476933509111404, "learning_rate": 5.618811881188118e-05, "loss": 0.84, "step": 1970 }, { "epoch": 2.4476043276661517, "grad_norm": 0.06856492906808853, "learning_rate": 5.4950495049504944e-05, "loss": 0.8386, "step": 1980 }, { "epoch": 2.459969088098918, "grad_norm": 0.0675152987241745, "learning_rate": 5.371287128712871e-05, "loss": 0.8603, "step": 1990 }, { "epoch": 2.472333848531685, "grad_norm": 0.059057943522930145, "learning_rate": 5.247524752475247e-05, "loss": 0.8254, "step": 2000 }, { "epoch": 2.484698608964451, "grad_norm": 0.06778612732887268, "learning_rate": 5.1237623762376234e-05, "loss": 0.829, "step": 2010 }, { "epoch": 2.497063369397218, "grad_norm": 0.0652635246515274, "learning_rate": 4.9999999999999996e-05, "loss": 0.8321, "step": 2020 }, { "epoch": 2.5094281298299848, "grad_norm": 0.0605316124856472, "learning_rate": 4.876237623762376e-05, "loss": 0.8458, "step": 2030 }, { "epoch": 2.521792890262751, "grad_norm": 0.06351178884506226, "learning_rate": 4.752475247524752e-05, "loss": 0.8199, "step": 2040 }, { "epoch": 2.534157650695518, "grad_norm": 0.0644257590174675, "learning_rate": 4.6287128712871286e-05, "loss": 0.8311, "step": 2050 }, { "epoch": 2.5465224111282843, "grad_norm": 0.06502491235733032, "learning_rate": 4.504950495049505e-05, "loss": 0.8443, "step": 2060 }, { "epoch": 2.558887171561051, "grad_norm": 0.07183568179607391, "learning_rate": 4.38118811881188e-05, "loss": 0.825, "step": 2070 }, { "epoch": 2.5712519319938174, "grad_norm": 0.06714395433664322, "learning_rate": 4.257425742574257e-05, "loss": 0.8283, "step": 2080 }, { "epoch": 2.583616692426584, "grad_norm": 0.07098986953496933, "learning_rate": 4.133663366336633e-05, "loss": 0.8376, "step": 2090 }, { "epoch": 2.595981452859351, "grad_norm": 0.0671941488981247, "learning_rate": 4.00990099009901e-05, "loss": 0.8457, "step": 2100 }, { "epoch": 2.6083462132921174, "grad_norm": 0.07306034862995148, "learning_rate": 3.886138613861386e-05, "loss": 0.843, "step": 2110 }, { "epoch": 2.620710973724884, "grad_norm": 0.06762495636940002, "learning_rate": 3.7623762376237615e-05, "loss": 0.8188, "step": 2120 }, { "epoch": 2.633075734157651, "grad_norm": 0.06061069294810295, "learning_rate": 3.638613861386138e-05, "loss": 0.8059, "step": 2130 }, { "epoch": 2.6454404945904173, "grad_norm": 0.0667000338435173, "learning_rate": 3.5148514851485144e-05, "loss": 0.8364, "step": 2140 }, { "epoch": 2.6578052550231837, "grad_norm": 0.058926161378622055, "learning_rate": 3.3910891089108906e-05, "loss": 0.8267, "step": 2150 }, { "epoch": 2.6701700154559505, "grad_norm": 0.05975179746747017, "learning_rate": 3.267326732673267e-05, "loss": 0.8018, "step": 2160 }, { "epoch": 2.6825347758887172, "grad_norm": 0.06300190091133118, "learning_rate": 3.1435643564356435e-05, "loss": 0.8306, "step": 2170 }, { "epoch": 2.6948995363214836, "grad_norm": 0.06579259783029556, "learning_rate": 3.0198019801980193e-05, "loss": 0.8385, "step": 2180 }, { "epoch": 2.7072642967542504, "grad_norm": 0.07062911242246628, "learning_rate": 2.8960396039603958e-05, "loss": 0.8422, "step": 2190 }, { "epoch": 2.719629057187017, "grad_norm": 0.06216396763920784, "learning_rate": 2.772277227722772e-05, "loss": 0.8292, "step": 2200 }, { "epoch": 2.7319938176197835, "grad_norm": 0.06445206701755524, "learning_rate": 2.6485148514851484e-05, "loss": 0.8406, "step": 2210 }, { "epoch": 2.7443585780525503, "grad_norm": 0.06448670476675034, "learning_rate": 2.5247524752475248e-05, "loss": 0.8171, "step": 2220 }, { "epoch": 2.7567233384853167, "grad_norm": 0.05858496576547623, "learning_rate": 2.4009900990099006e-05, "loss": 0.8383, "step": 2230 }, { "epoch": 2.7690880989180835, "grad_norm": 0.07208121567964554, "learning_rate": 2.277227722772277e-05, "loss": 0.8145, "step": 2240 }, { "epoch": 2.78145285935085, "grad_norm": 0.06663426011800766, "learning_rate": 2.1534653465346532e-05, "loss": 0.8293, "step": 2250 }, { "epoch": 2.7938176197836166, "grad_norm": 0.06585463881492615, "learning_rate": 2.0297029702970297e-05, "loss": 0.8303, "step": 2260 }, { "epoch": 2.8061823802163834, "grad_norm": 0.06423688679933548, "learning_rate": 1.9059405940594058e-05, "loss": 0.8172, "step": 2270 }, { "epoch": 2.81854714064915, "grad_norm": 0.06450697034597397, "learning_rate": 1.782178217821782e-05, "loss": 0.8355, "step": 2280 }, { "epoch": 2.8309119010819166, "grad_norm": 0.05580071732401848, "learning_rate": 1.6584158415841584e-05, "loss": 0.8136, "step": 2290 }, { "epoch": 2.8432766615146834, "grad_norm": 0.06626173853874207, "learning_rate": 1.5346534653465345e-05, "loss": 0.8238, "step": 2300 }, { "epoch": 2.8556414219474497, "grad_norm": 0.061952993273735046, "learning_rate": 1.4108910891089108e-05, "loss": 0.8179, "step": 2310 }, { "epoch": 2.8680061823802165, "grad_norm": 0.07288029789924622, "learning_rate": 1.287128712871287e-05, "loss": 0.8194, "step": 2320 }, { "epoch": 2.880370942812983, "grad_norm": 0.06706374138593674, "learning_rate": 1.1633663366336632e-05, "loss": 0.8611, "step": 2330 }, { "epoch": 2.8927357032457497, "grad_norm": 0.06370951235294342, "learning_rate": 1.0396039603960395e-05, "loss": 0.848, "step": 2340 }, { "epoch": 2.905100463678516, "grad_norm": 0.061200667172670364, "learning_rate": 9.158415841584158e-06, "loss": 0.817, "step": 2350 }, { "epoch": 2.917465224111283, "grad_norm": 0.07799932360649109, "learning_rate": 7.92079207920792e-06, "loss": 0.8401, "step": 2360 }, { "epoch": 2.9298299845440496, "grad_norm": 0.0740487277507782, "learning_rate": 6.683168316831683e-06, "loss": 0.8447, "step": 2370 }, { "epoch": 2.942194744976816, "grad_norm": 0.062499478459358215, "learning_rate": 5.445544554455446e-06, "loss": 0.8446, "step": 2380 }, { "epoch": 2.954559505409583, "grad_norm": 0.06899666786193848, "learning_rate": 4.207920792079208e-06, "loss": 0.8411, "step": 2390 }, { "epoch": 2.966924265842349, "grad_norm": 0.0634492039680481, "learning_rate": 2.97029702970297e-06, "loss": 0.8422, "step": 2400 }, { "epoch": 2.979289026275116, "grad_norm": 0.0700407549738884, "learning_rate": 1.7326732673267324e-06, "loss": 0.8468, "step": 2410 }, { "epoch": 2.9916537867078823, "grad_norm": 0.061774324625730515, "learning_rate": 4.95049504950495e-07, "loss": 0.8337, "step": 2420 } ], "logging_steps": 10, "max_steps": 2424, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.723181741683245e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }