| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.10005503026664665, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0005002751513332333, |
| "grad_norm": 0.9514128565788269, |
| "learning_rate": 0.00019999999999999998, |
| "loss": 2.6758, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0010005503026664665, |
| "grad_norm": 0.24659623205661774, |
| "learning_rate": 0.0002, |
| "loss": 3.1472, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0015008254539997, |
| "grad_norm": 0.15325957536697388, |
| "learning_rate": 0.0002, |
| "loss": 3.3443, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.002001100605332933, |
| "grad_norm": 0.13286100327968597, |
| "learning_rate": 0.0002, |
| "loss": 3.1872, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0025013757566661665, |
| "grad_norm": 0.13320517539978027, |
| "learning_rate": 0.0002, |
| "loss": 3.1423, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0030016509079994, |
| "grad_norm": 0.15094240009784698, |
| "learning_rate": 0.0002, |
| "loss": 3.2816, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.003501926059332633, |
| "grad_norm": 0.11190259456634521, |
| "learning_rate": 0.0002, |
| "loss": 3.1771, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.004002201210665866, |
| "grad_norm": 0.14013367891311646, |
| "learning_rate": 0.0002, |
| "loss": 3.292, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.004502476361999099, |
| "grad_norm": 0.10220601409673691, |
| "learning_rate": 0.0002, |
| "loss": 3.2377, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.005002751513332333, |
| "grad_norm": 0.11760038137435913, |
| "learning_rate": 0.0002, |
| "loss": 3.0844, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.005503026664665566, |
| "grad_norm": 0.11736863106489182, |
| "learning_rate": 0.0002, |
| "loss": 3.2212, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.0060033018159988, |
| "grad_norm": 0.13063907623291016, |
| "learning_rate": 0.0002, |
| "loss": 3.3429, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.006503576967332033, |
| "grad_norm": 0.10621850937604904, |
| "learning_rate": 0.0002, |
| "loss": 3.2445, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.007003852118665266, |
| "grad_norm": 0.11594289541244507, |
| "learning_rate": 0.0002, |
| "loss": 3.2772, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0075041272699984994, |
| "grad_norm": 0.12541130185127258, |
| "learning_rate": 0.0002, |
| "loss": 3.3111, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.008004402421331732, |
| "grad_norm": 0.09993274509906769, |
| "learning_rate": 0.0002, |
| "loss": 3.1917, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.008504677572664966, |
| "grad_norm": 0.15057022869586945, |
| "learning_rate": 0.0002, |
| "loss": 3.4024, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.009004952723998198, |
| "grad_norm": 0.2080872654914856, |
| "learning_rate": 0.0002, |
| "loss": 3.3374, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.009505227875331432, |
| "grad_norm": 0.09899163246154785, |
| "learning_rate": 0.0002, |
| "loss": 3.316, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.010005503026664666, |
| "grad_norm": 0.10972588509321213, |
| "learning_rate": 0.0002, |
| "loss": 3.2684, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.010505778177997898, |
| "grad_norm": 0.12223390489816666, |
| "learning_rate": 0.0002, |
| "loss": 3.3524, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.011006053329331132, |
| "grad_norm": 0.11201239377260208, |
| "learning_rate": 0.0002, |
| "loss": 3.4964, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.011506328480664366, |
| "grad_norm": 0.11129321157932281, |
| "learning_rate": 0.0002, |
| "loss": 3.3762, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.0120066036319976, |
| "grad_norm": 0.11999399214982986, |
| "learning_rate": 0.0002, |
| "loss": 3.475, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.012506878783330832, |
| "grad_norm": 0.12480571120977402, |
| "learning_rate": 0.0002, |
| "loss": 3.3755, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.013007153934664065, |
| "grad_norm": 0.10506771504878998, |
| "learning_rate": 0.0002, |
| "loss": 3.3847, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0135074290859973, |
| "grad_norm": 0.08484470844268799, |
| "learning_rate": 0.0002, |
| "loss": 3.3913, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.014007704237330531, |
| "grad_norm": 0.09486206620931625, |
| "learning_rate": 0.0002, |
| "loss": 3.3096, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.014507979388663765, |
| "grad_norm": 0.1048346683382988, |
| "learning_rate": 0.0002, |
| "loss": 3.3356, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.015008254539996999, |
| "grad_norm": 0.09623505920171738, |
| "learning_rate": 0.0002, |
| "loss": 3.357, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.015508529691330231, |
| "grad_norm": 0.09872445464134216, |
| "learning_rate": 0.0002, |
| "loss": 3.5141, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.016008804842663465, |
| "grad_norm": 0.10426618158817291, |
| "learning_rate": 0.0002, |
| "loss": 3.4722, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.016509079993996697, |
| "grad_norm": 0.10077104717493057, |
| "learning_rate": 0.0002, |
| "loss": 3.3915, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.017009355145329932, |
| "grad_norm": 0.11084343492984772, |
| "learning_rate": 0.0002, |
| "loss": 3.3849, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.017509630296663165, |
| "grad_norm": 0.08321600407361984, |
| "learning_rate": 0.0002, |
| "loss": 3.3542, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.018009905447996397, |
| "grad_norm": 0.08220788836479187, |
| "learning_rate": 0.0002, |
| "loss": 3.4381, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.018510180599329632, |
| "grad_norm": 0.10560263693332672, |
| "learning_rate": 0.0002, |
| "loss": 3.2399, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.019010455750662864, |
| "grad_norm": 0.07301683723926544, |
| "learning_rate": 0.0002, |
| "loss": 3.4645, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.019510730901996096, |
| "grad_norm": 0.10291752964258194, |
| "learning_rate": 0.0002, |
| "loss": 3.491, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.020011006053329332, |
| "grad_norm": 0.09107044339179993, |
| "learning_rate": 0.0002, |
| "loss": 3.2015, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.020511281204662564, |
| "grad_norm": 0.08597932755947113, |
| "learning_rate": 0.0002, |
| "loss": 3.5352, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.021011556355995796, |
| "grad_norm": 0.080569788813591, |
| "learning_rate": 0.0002, |
| "loss": 3.3541, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.02151183150732903, |
| "grad_norm": 0.08738499134778976, |
| "learning_rate": 0.0002, |
| "loss": 3.2724, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.022012106658662264, |
| "grad_norm": 0.07786229997873306, |
| "learning_rate": 0.0002, |
| "loss": 3.4347, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0225123818099955, |
| "grad_norm": 0.07400281727313995, |
| "learning_rate": 0.0002, |
| "loss": 3.2489, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.02301265696132873, |
| "grad_norm": 0.08507199585437775, |
| "learning_rate": 0.0002, |
| "loss": 3.3711, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.023512932112661963, |
| "grad_norm": 0.07275331765413284, |
| "learning_rate": 0.0002, |
| "loss": 3.3241, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.0240132072639952, |
| "grad_norm": 0.0792601928114891, |
| "learning_rate": 0.0002, |
| "loss": 3.4097, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.02451348241532843, |
| "grad_norm": 0.0831415206193924, |
| "learning_rate": 0.0002, |
| "loss": 3.1606, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.025013757566661663, |
| "grad_norm": 0.1146463081240654, |
| "learning_rate": 0.0002, |
| "loss": 3.424, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0255140327179949, |
| "grad_norm": 0.11574945598840714, |
| "learning_rate": 0.0002, |
| "loss": 3.4287, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.02601430786932813, |
| "grad_norm": 0.06718364357948303, |
| "learning_rate": 0.0002, |
| "loss": 3.2169, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.026514583020661363, |
| "grad_norm": 0.0657232478260994, |
| "learning_rate": 0.0002, |
| "loss": 3.2824, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0270148581719946, |
| "grad_norm": 0.06566020846366882, |
| "learning_rate": 0.0002, |
| "loss": 3.29, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.02751513332332783, |
| "grad_norm": 0.07870512455701828, |
| "learning_rate": 0.0002, |
| "loss": 3.3454, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.028015408474661063, |
| "grad_norm": 0.06590110063552856, |
| "learning_rate": 0.0002, |
| "loss": 3.2222, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.028515683625994298, |
| "grad_norm": 0.08713185787200928, |
| "learning_rate": 0.0002, |
| "loss": 3.3677, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.02901595877732753, |
| "grad_norm": 0.06802317500114441, |
| "learning_rate": 0.0002, |
| "loss": 3.3194, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.029516233928660762, |
| "grad_norm": 0.07123348116874695, |
| "learning_rate": 0.0002, |
| "loss": 3.1211, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.030016509079993998, |
| "grad_norm": 0.09105300158262253, |
| "learning_rate": 0.0002, |
| "loss": 3.3034, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.03051678423132723, |
| "grad_norm": 0.09690599888563156, |
| "learning_rate": 0.0002, |
| "loss": 3.2923, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.031017059382660462, |
| "grad_norm": 0.06459871679544449, |
| "learning_rate": 0.0002, |
| "loss": 3.1854, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.0315173345339937, |
| "grad_norm": 0.0907784029841423, |
| "learning_rate": 0.0002, |
| "loss": 3.2063, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.03201760968532693, |
| "grad_norm": 0.07171762734651566, |
| "learning_rate": 0.0002, |
| "loss": 3.4051, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.03251788483666016, |
| "grad_norm": 0.0999717265367508, |
| "learning_rate": 0.0002, |
| "loss": 3.3135, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.033018159987993394, |
| "grad_norm": 0.0716700628399849, |
| "learning_rate": 0.0002, |
| "loss": 3.2707, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.03351843513932663, |
| "grad_norm": 0.06543900072574615, |
| "learning_rate": 0.0002, |
| "loss": 3.2032, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.034018710290659865, |
| "grad_norm": 0.05876084417104721, |
| "learning_rate": 0.0002, |
| "loss": 3.332, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.0345189854419931, |
| "grad_norm": 0.5182152390480042, |
| "learning_rate": 0.0002, |
| "loss": 3.3611, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.03501926059332633, |
| "grad_norm": 0.07951213419437408, |
| "learning_rate": 0.0002, |
| "loss": 3.2924, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.03551953574465956, |
| "grad_norm": 0.08420588076114655, |
| "learning_rate": 0.0002, |
| "loss": 3.1806, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.03601981089599279, |
| "grad_norm": 0.08514729142189026, |
| "learning_rate": 0.0002, |
| "loss": 3.2537, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.03652008604732603, |
| "grad_norm": 0.06989168375730515, |
| "learning_rate": 0.0002, |
| "loss": 3.4618, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.037020361198659264, |
| "grad_norm": 0.07098263502120972, |
| "learning_rate": 0.0002, |
| "loss": 3.3092, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.037520636349992496, |
| "grad_norm": 0.06842508912086487, |
| "learning_rate": 0.0002, |
| "loss": 3.3168, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.03802091150132573, |
| "grad_norm": 0.09867072105407715, |
| "learning_rate": 0.0002, |
| "loss": 3.194, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.03852118665265896, |
| "grad_norm": 0.06236390769481659, |
| "learning_rate": 0.0002, |
| "loss": 3.3323, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.03902146180399219, |
| "grad_norm": 0.07258310914039612, |
| "learning_rate": 0.0002, |
| "loss": 3.2518, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.03952173695532543, |
| "grad_norm": 0.060556840151548386, |
| "learning_rate": 0.0002, |
| "loss": 3.3152, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.040022012106658664, |
| "grad_norm": 0.07364658266305923, |
| "learning_rate": 0.0002, |
| "loss": 3.2153, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.040522287257991896, |
| "grad_norm": 0.08476244658231735, |
| "learning_rate": 0.0002, |
| "loss": 3.2618, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.04102256240932513, |
| "grad_norm": 0.06534284353256226, |
| "learning_rate": 0.0002, |
| "loss": 3.2049, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.04152283756065836, |
| "grad_norm": 0.07897084951400757, |
| "learning_rate": 0.0002, |
| "loss": 3.4232, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.04202311271199159, |
| "grad_norm": 0.09437014162540436, |
| "learning_rate": 0.0002, |
| "loss": 3.2636, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.04252338786332483, |
| "grad_norm": 0.06484173983335495, |
| "learning_rate": 0.0002, |
| "loss": 3.1474, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.04302366301465806, |
| "grad_norm": 0.05979447439312935, |
| "learning_rate": 0.0002, |
| "loss": 3.2571, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.043523938165991295, |
| "grad_norm": 0.09203090518712997, |
| "learning_rate": 0.0002, |
| "loss": 3.1984, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.04402421331732453, |
| "grad_norm": 0.1513832062482834, |
| "learning_rate": 0.0002, |
| "loss": 3.3668, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.04452448846865776, |
| "grad_norm": 0.06712643057107925, |
| "learning_rate": 0.0002, |
| "loss": 3.2346, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.045024763619991, |
| "grad_norm": 0.07149570435285568, |
| "learning_rate": 0.0002, |
| "loss": 3.2834, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.04552503877132423, |
| "grad_norm": 0.0681491494178772, |
| "learning_rate": 0.0002, |
| "loss": 3.1853, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.04602531392265746, |
| "grad_norm": 0.06924117356538773, |
| "learning_rate": 0.0002, |
| "loss": 3.2217, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.046525589073990695, |
| "grad_norm": 0.07459249347448349, |
| "learning_rate": 0.0002, |
| "loss": 3.2059, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.04702586422532393, |
| "grad_norm": 0.06532080471515656, |
| "learning_rate": 0.0002, |
| "loss": 3.1643, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.04752613937665716, |
| "grad_norm": 0.07453737407922745, |
| "learning_rate": 0.0002, |
| "loss": 3.3668, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.0480264145279904, |
| "grad_norm": 0.07038157433271408, |
| "learning_rate": 0.0002, |
| "loss": 3.3452, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.04852668967932363, |
| "grad_norm": 0.05873151868581772, |
| "learning_rate": 0.0002, |
| "loss": 3.3277, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.04902696483065686, |
| "grad_norm": 0.05833908170461655, |
| "learning_rate": 0.0002, |
| "loss": 3.2321, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.049527239981990094, |
| "grad_norm": 0.07476246356964111, |
| "learning_rate": 0.0002, |
| "loss": 3.3013, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.050027515133323326, |
| "grad_norm": 0.06895654648542404, |
| "learning_rate": 0.0002, |
| "loss": 3.2011, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.05052779028465656, |
| "grad_norm": 0.06574366986751556, |
| "learning_rate": 0.0002, |
| "loss": 3.2071, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.0510280654359898, |
| "grad_norm": 0.05364847928285599, |
| "learning_rate": 0.0002, |
| "loss": 3.1855, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.05152834058732303, |
| "grad_norm": 0.0581735335290432, |
| "learning_rate": 0.0002, |
| "loss": 3.2567, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.05202861573865626, |
| "grad_norm": 0.09195020794868469, |
| "learning_rate": 0.0002, |
| "loss": 3.29, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.052528890889989494, |
| "grad_norm": 0.059362176805734634, |
| "learning_rate": 0.0002, |
| "loss": 3.2145, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.053029166041322726, |
| "grad_norm": 0.06778449565172195, |
| "learning_rate": 0.0002, |
| "loss": 3.2591, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.05352944119265596, |
| "grad_norm": 0.054793521761894226, |
| "learning_rate": 0.0002, |
| "loss": 3.1621, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.0540297163439892, |
| "grad_norm": 0.05222785100340843, |
| "learning_rate": 0.0002, |
| "loss": 3.1684, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.05452999149532243, |
| "grad_norm": 0.05583691596984863, |
| "learning_rate": 0.0002, |
| "loss": 3.1624, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.05503026664665566, |
| "grad_norm": 0.0779917985200882, |
| "learning_rate": 0.0002, |
| "loss": 3.2844, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.05553054179798889, |
| "grad_norm": 0.058885641396045685, |
| "learning_rate": 0.0002, |
| "loss": 3.3956, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.056030816949322125, |
| "grad_norm": 0.053786501288414, |
| "learning_rate": 0.0002, |
| "loss": 3.2465, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.05653109210065536, |
| "grad_norm": 0.06709844619035721, |
| "learning_rate": 0.0002, |
| "loss": 3.1812, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.057031367251988596, |
| "grad_norm": 0.053172528743743896, |
| "learning_rate": 0.0002, |
| "loss": 3.1945, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.05753164240332183, |
| "grad_norm": 0.0449419841170311, |
| "learning_rate": 0.0002, |
| "loss": 3.2653, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.05803191755465506, |
| "grad_norm": 0.07608778029680252, |
| "learning_rate": 0.0002, |
| "loss": 3.2171, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.05853219270598829, |
| "grad_norm": 0.05426677316427231, |
| "learning_rate": 0.0002, |
| "loss": 3.1954, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.059032467857321524, |
| "grad_norm": 0.07974937558174133, |
| "learning_rate": 0.0002, |
| "loss": 3.2709, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.059532743008654763, |
| "grad_norm": 0.07222287356853485, |
| "learning_rate": 0.0002, |
| "loss": 3.1398, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.060033018159987996, |
| "grad_norm": 0.05869804322719574, |
| "learning_rate": 0.0002, |
| "loss": 3.2176, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.06053329331132123, |
| "grad_norm": 0.053768135607242584, |
| "learning_rate": 0.0002, |
| "loss": 3.1623, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.06103356846265446, |
| "grad_norm": 0.0641162171959877, |
| "learning_rate": 0.0002, |
| "loss": 3.15, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.06153384361398769, |
| "grad_norm": 0.0566398985683918, |
| "learning_rate": 0.0002, |
| "loss": 3.1254, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.062034118765320924, |
| "grad_norm": 0.055305738002061844, |
| "learning_rate": 0.0002, |
| "loss": 3.2141, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.06253439391665416, |
| "grad_norm": 0.059914641082286835, |
| "learning_rate": 0.0002, |
| "loss": 3.1913, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.0630346690679874, |
| "grad_norm": 0.05172060430049896, |
| "learning_rate": 0.0002, |
| "loss": 3.191, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.06353494421932063, |
| "grad_norm": 0.07252514362335205, |
| "learning_rate": 0.0002, |
| "loss": 3.0212, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.06403521937065386, |
| "grad_norm": 0.07096508145332336, |
| "learning_rate": 0.0002, |
| "loss": 3.2677, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.06453549452198709, |
| "grad_norm": 0.058548085391521454, |
| "learning_rate": 0.0002, |
| "loss": 3.1471, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.06503576967332032, |
| "grad_norm": 0.053336966782808304, |
| "learning_rate": 0.0002, |
| "loss": 3.234, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.06553604482465356, |
| "grad_norm": 0.051933031529188156, |
| "learning_rate": 0.0002, |
| "loss": 3.2554, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.06603631997598679, |
| "grad_norm": 0.05643808841705322, |
| "learning_rate": 0.0002, |
| "loss": 3.3304, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.06653659512732002, |
| "grad_norm": 0.057230204343795776, |
| "learning_rate": 0.0002, |
| "loss": 3.1517, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.06703687027865327, |
| "grad_norm": 0.07095087319612503, |
| "learning_rate": 0.0002, |
| "loss": 3.0802, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.0675371454299865, |
| "grad_norm": 0.06798629462718964, |
| "learning_rate": 0.0002, |
| "loss": 3.0827, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.06803742058131973, |
| "grad_norm": 0.0938129797577858, |
| "learning_rate": 0.0002, |
| "loss": 3.3157, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.06853769573265296, |
| "grad_norm": 0.06188824027776718, |
| "learning_rate": 0.0002, |
| "loss": 3.1655, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0690379708839862, |
| "grad_norm": 0.05734292417764664, |
| "learning_rate": 0.0002, |
| "loss": 3.2153, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.06953824603531943, |
| "grad_norm": 0.05722649022936821, |
| "learning_rate": 0.0002, |
| "loss": 3.164, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.07003852118665266, |
| "grad_norm": 0.058131471276283264, |
| "learning_rate": 0.0002, |
| "loss": 3.2015, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.07053879633798589, |
| "grad_norm": 0.07024545222520828, |
| "learning_rate": 0.0002, |
| "loss": 3.1281, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.07103907148931912, |
| "grad_norm": 0.06485693156719208, |
| "learning_rate": 0.0002, |
| "loss": 3.1668, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.07153934664065235, |
| "grad_norm": 0.056724123656749725, |
| "learning_rate": 0.0002, |
| "loss": 3.2405, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.07203962179198559, |
| "grad_norm": 0.05593548342585564, |
| "learning_rate": 0.0002, |
| "loss": 3.3399, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.07253989694331882, |
| "grad_norm": 0.06867067515850067, |
| "learning_rate": 0.0002, |
| "loss": 3.3871, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.07304017209465206, |
| "grad_norm": 0.09758540242910385, |
| "learning_rate": 0.0002, |
| "loss": 3.2629, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.0735404472459853, |
| "grad_norm": 0.0622124969959259, |
| "learning_rate": 0.0002, |
| "loss": 3.1782, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.07404072239731853, |
| "grad_norm": 0.05847143009305, |
| "learning_rate": 0.0002, |
| "loss": 3.2197, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.07454099754865176, |
| "grad_norm": 0.0578547939658165, |
| "learning_rate": 0.0002, |
| "loss": 3.2242, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.07504127269998499, |
| "grad_norm": 0.057039618492126465, |
| "learning_rate": 0.0002, |
| "loss": 3.1362, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.07554154785131822, |
| "grad_norm": 0.05607955902814865, |
| "learning_rate": 0.0002, |
| "loss": 3.1812, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.07604182300265146, |
| "grad_norm": 0.06097773090004921, |
| "learning_rate": 0.0002, |
| "loss": 3.1074, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.07654209815398469, |
| "grad_norm": 0.08337036520242691, |
| "learning_rate": 0.0002, |
| "loss": 3.1852, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.07704237330531792, |
| "grad_norm": 0.05237165838479996, |
| "learning_rate": 0.0002, |
| "loss": 3.1786, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.07754264845665115, |
| "grad_norm": 0.07054440677165985, |
| "learning_rate": 0.0002, |
| "loss": 3.1673, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.07804292360798439, |
| "grad_norm": 0.04526256397366524, |
| "learning_rate": 0.0002, |
| "loss": 3.2436, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.07854319875931763, |
| "grad_norm": 0.042845677584409714, |
| "learning_rate": 0.0002, |
| "loss": 3.1739, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.07904347391065086, |
| "grad_norm": 0.0452456995844841, |
| "learning_rate": 0.0002, |
| "loss": 3.224, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.0795437490619841, |
| "grad_norm": 0.055119115859270096, |
| "learning_rate": 0.0002, |
| "loss": 3.2443, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.08004402421331733, |
| "grad_norm": 0.06564844399690628, |
| "learning_rate": 0.0002, |
| "loss": 3.3013, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.08054429936465056, |
| "grad_norm": 0.08553501963615417, |
| "learning_rate": 0.0002, |
| "loss": 3.1565, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.08104457451598379, |
| "grad_norm": 0.057848136872053146, |
| "learning_rate": 0.0002, |
| "loss": 3.192, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.08154484966731702, |
| "grad_norm": 0.05926649644970894, |
| "learning_rate": 0.0002, |
| "loss": 3.288, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.08204512481865026, |
| "grad_norm": 0.05204610154032707, |
| "learning_rate": 0.0002, |
| "loss": 3.357, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.08254539996998349, |
| "grad_norm": 0.06709768623113632, |
| "learning_rate": 0.0002, |
| "loss": 3.2767, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.08304567512131672, |
| "grad_norm": 0.05840866640210152, |
| "learning_rate": 0.0002, |
| "loss": 3.3075, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.08354595027264995, |
| "grad_norm": 0.06196371465921402, |
| "learning_rate": 0.0002, |
| "loss": 3.2186, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.08404622542398318, |
| "grad_norm": 0.06448955088853836, |
| "learning_rate": 0.0002, |
| "loss": 3.0591, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.08454650057531643, |
| "grad_norm": 0.06016537919640541, |
| "learning_rate": 0.0002, |
| "loss": 3.1313, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.08504677572664966, |
| "grad_norm": 0.04336397349834442, |
| "learning_rate": 0.0002, |
| "loss": 3.2306, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0855470508779829, |
| "grad_norm": 0.05283171683549881, |
| "learning_rate": 0.0002, |
| "loss": 3.2208, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.08604732602931613, |
| "grad_norm": 0.05544983223080635, |
| "learning_rate": 0.0002, |
| "loss": 3.0558, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.08654760118064936, |
| "grad_norm": 0.09242791682481766, |
| "learning_rate": 0.0002, |
| "loss": 3.1988, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.08704787633198259, |
| "grad_norm": 0.07003988325595856, |
| "learning_rate": 0.0002, |
| "loss": 3.282, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.08754815148331582, |
| "grad_norm": 0.05473213270306587, |
| "learning_rate": 0.0002, |
| "loss": 3.0874, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.08804842663464905, |
| "grad_norm": 0.05522087588906288, |
| "learning_rate": 0.0002, |
| "loss": 3.2302, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.08854870178598229, |
| "grad_norm": 0.06576565653085709, |
| "learning_rate": 0.0002, |
| "loss": 3.1786, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.08904897693731552, |
| "grad_norm": 0.05434967949986458, |
| "learning_rate": 0.0002, |
| "loss": 3.1149, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.08954925208864875, |
| "grad_norm": 0.05906340479850769, |
| "learning_rate": 0.0002, |
| "loss": 3.0553, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.090049527239982, |
| "grad_norm": 0.05728009715676308, |
| "learning_rate": 0.0002, |
| "loss": 3.0531, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.09054980239131523, |
| "grad_norm": 0.03979711979627609, |
| "learning_rate": 0.0002, |
| "loss": 3.1394, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.09105007754264846, |
| "grad_norm": 0.07336313277482986, |
| "learning_rate": 0.0002, |
| "loss": 3.2452, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.0915503526939817, |
| "grad_norm": 0.055571481585502625, |
| "learning_rate": 0.0002, |
| "loss": 3.123, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.09205062784531493, |
| "grad_norm": 0.048019275069236755, |
| "learning_rate": 0.0002, |
| "loss": 3.1898, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.09255090299664816, |
| "grad_norm": 0.04113614931702614, |
| "learning_rate": 0.0002, |
| "loss": 3.187, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.09305117814798139, |
| "grad_norm": 0.058914463967084885, |
| "learning_rate": 0.0002, |
| "loss": 3.0886, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.09355145329931462, |
| "grad_norm": 0.0579225979745388, |
| "learning_rate": 0.0002, |
| "loss": 3.113, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.09405172845064785, |
| "grad_norm": 0.05847308784723282, |
| "learning_rate": 0.0002, |
| "loss": 3.2902, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.09455200360198109, |
| "grad_norm": 0.04670713469386101, |
| "learning_rate": 0.0002, |
| "loss": 3.0735, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.09505227875331432, |
| "grad_norm": 0.058696549385786057, |
| "learning_rate": 0.0002, |
| "loss": 3.2043, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.09555255390464755, |
| "grad_norm": 0.0533798448741436, |
| "learning_rate": 0.0002, |
| "loss": 3.0289, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.0960528290559808, |
| "grad_norm": 0.04985165223479271, |
| "learning_rate": 0.0002, |
| "loss": 3.2249, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.09655310420731403, |
| "grad_norm": 0.06083301082253456, |
| "learning_rate": 0.0002, |
| "loss": 3.1653, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.09705337935864726, |
| "grad_norm": 0.055274877697229385, |
| "learning_rate": 0.0002, |
| "loss": 3.1489, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.09755365450998049, |
| "grad_norm": 0.03868628293275833, |
| "learning_rate": 0.0002, |
| "loss": 3.0902, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.09805392966131372, |
| "grad_norm": 0.05481928586959839, |
| "learning_rate": 0.0002, |
| "loss": 3.1383, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.09855420481264696, |
| "grad_norm": 0.05562729388475418, |
| "learning_rate": 0.0002, |
| "loss": 3.2148, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.09905447996398019, |
| "grad_norm": 0.04779260233044624, |
| "learning_rate": 0.0002, |
| "loss": 3.2358, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.09955475511531342, |
| "grad_norm": 0.04606562480330467, |
| "learning_rate": 0.0002, |
| "loss": 3.172, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.10005503026664665, |
| "grad_norm": 0.045945361256599426, |
| "learning_rate": 0.0002, |
| "loss": 3.1456, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.5664155394048e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|