| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.2558087074216107, |
| "eval_steps": 500, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0022558087074216106, |
| "grad_norm": 2.141827344894409, |
| "learning_rate": 0.0002998781863297992, |
| "loss": 0.7581, |
| "mean_token_accuracy": 0.8328841328620911, |
| "num_tokens": 20380.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.004511617414843221, |
| "grad_norm": 1.2108855247497559, |
| "learning_rate": 0.0002997428378073539, |
| "loss": 0.5535, |
| "mean_token_accuracy": 0.8761807084083557, |
| "num_tokens": 30607.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006767426122264832, |
| "grad_norm": 1.2379798889160156, |
| "learning_rate": 0.0002996074892849086, |
| "loss": 0.5122, |
| "mean_token_accuracy": 0.8828346908092499, |
| "num_tokens": 40842.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.009023234829686443, |
| "grad_norm": 1.1645982265472412, |
| "learning_rate": 0.0002994721407624633, |
| "loss": 0.5342, |
| "mean_token_accuracy": 0.8773131012916565, |
| "num_tokens": 51063.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.011279043537108053, |
| "grad_norm": 0.6872125267982483, |
| "learning_rate": 0.000299336792240018, |
| "loss": 0.4112, |
| "mean_token_accuracy": 0.9047324001789093, |
| "num_tokens": 61285.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013534852244529664, |
| "grad_norm": 0.7911831736564636, |
| "learning_rate": 0.00029920144371757275, |
| "loss": 0.449, |
| "mean_token_accuracy": 0.8905096411705017, |
| "num_tokens": 71440.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.015790660951951276, |
| "grad_norm": 0.8042737245559692, |
| "learning_rate": 0.00029906609519512746, |
| "loss": 0.5045, |
| "mean_token_accuracy": 0.8851543009281159, |
| "num_tokens": 81592.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.018046469659372885, |
| "grad_norm": 0.8313478827476501, |
| "learning_rate": 0.00029893074667268217, |
| "loss": 0.4416, |
| "mean_token_accuracy": 0.8926478564739228, |
| "num_tokens": 91800.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.020302278366794498, |
| "grad_norm": 0.7176661491394043, |
| "learning_rate": 0.0002987953981502368, |
| "loss": 0.4081, |
| "mean_token_accuracy": 0.8967904210090637, |
| "num_tokens": 102030.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.022558087074216106, |
| "grad_norm": 0.6425905227661133, |
| "learning_rate": 0.00029866004962779153, |
| "loss": 0.445, |
| "mean_token_accuracy": 0.8905546844005585, |
| "num_tokens": 112153.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02481389578163772, |
| "grad_norm": 0.4888269305229187, |
| "learning_rate": 0.00029852470110534624, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.8825848281383515, |
| "num_tokens": 122331.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.027069704489059328, |
| "grad_norm": 0.5978611707687378, |
| "learning_rate": 0.00029838935258290095, |
| "loss": 0.362, |
| "mean_token_accuracy": 0.9085110425949097, |
| "num_tokens": 132562.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.02932551319648094, |
| "grad_norm": 0.5825199484825134, |
| "learning_rate": 0.0002982540040604556, |
| "loss": 0.322, |
| "mean_token_accuracy": 0.911745023727417, |
| "num_tokens": 142724.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03158132190390255, |
| "grad_norm": 0.5778432488441467, |
| "learning_rate": 0.0002981186555380103, |
| "loss": 0.3264, |
| "mean_token_accuracy": 0.9135629117488862, |
| "num_tokens": 152950.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03383713061132416, |
| "grad_norm": 0.8815613389015198, |
| "learning_rate": 0.0002979833070155651, |
| "loss": 0.3902, |
| "mean_token_accuracy": 0.9044237017631531, |
| "num_tokens": 162418.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03609293931874577, |
| "grad_norm": 0.5860086679458618, |
| "learning_rate": 0.0002978479584931198, |
| "loss": 0.3639, |
| "mean_token_accuracy": 0.903467881679535, |
| "num_tokens": 172644.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03834874802616738, |
| "grad_norm": 0.8777345418930054, |
| "learning_rate": 0.0002977126099706745, |
| "loss": 0.3447, |
| "mean_token_accuracy": 0.9112663745880127, |
| "num_tokens": 182800.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.040604556733588995, |
| "grad_norm": 0.5808679461479187, |
| "learning_rate": 0.00029757726144822916, |
| "loss": 0.3847, |
| "mean_token_accuracy": 0.9011166810989379, |
| "num_tokens": 192976.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0428603654410106, |
| "grad_norm": 0.8659303784370422, |
| "learning_rate": 0.00029744191292578387, |
| "loss": 0.3495, |
| "mean_token_accuracy": 0.9079599380493164, |
| "num_tokens": 203148.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04511617414843221, |
| "grad_norm": 0.6106019616127014, |
| "learning_rate": 0.0002973065644033386, |
| "loss": 0.3652, |
| "mean_token_accuracy": 0.9020362496376038, |
| "num_tokens": 213312.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.047371982855853825, |
| "grad_norm": 0.4957314431667328, |
| "learning_rate": 0.0002971712158808933, |
| "loss": 0.4095, |
| "mean_token_accuracy": 0.8981463789939881, |
| "num_tokens": 223516.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.04962779156327544, |
| "grad_norm": 0.7348946928977966, |
| "learning_rate": 0.00029703586735844794, |
| "loss": 0.3619, |
| "mean_token_accuracy": 0.9077386081218719, |
| "num_tokens": 233691.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.05188360027069704, |
| "grad_norm": 1.0192636251449585, |
| "learning_rate": 0.00029690051883600265, |
| "loss": 0.3664, |
| "mean_token_accuracy": 0.9049350261688233, |
| "num_tokens": 243886.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.054139408978118655, |
| "grad_norm": 0.6710547804832458, |
| "learning_rate": 0.00029676517031355736, |
| "loss": 0.4232, |
| "mean_token_accuracy": 0.8985056221485138, |
| "num_tokens": 254069.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.05639521768554027, |
| "grad_norm": 0.49837666749954224, |
| "learning_rate": 0.0002966298217911121, |
| "loss": 0.391, |
| "mean_token_accuracy": 0.8973807156085968, |
| "num_tokens": 264303.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.05865102639296188, |
| "grad_norm": 0.5633454918861389, |
| "learning_rate": 0.0002964944732686668, |
| "loss": 0.3754, |
| "mean_token_accuracy": 0.9069048821926117, |
| "num_tokens": 274483.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.060906835100383486, |
| "grad_norm": 0.4087347686290741, |
| "learning_rate": 0.0002963591247462215, |
| "loss": 0.2673, |
| "mean_token_accuracy": 0.9232019662857056, |
| "num_tokens": 284690.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.0631626438078051, |
| "grad_norm": 0.5431217551231384, |
| "learning_rate": 0.0002962237762237762, |
| "loss": 0.4413, |
| "mean_token_accuracy": 0.895687735080719, |
| "num_tokens": 294929.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.06541845251522671, |
| "grad_norm": 0.646186351776123, |
| "learning_rate": 0.0002960884277013309, |
| "loss": 0.3284, |
| "mean_token_accuracy": 0.9093968510627747, |
| "num_tokens": 305144.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.06767426122264832, |
| "grad_norm": 0.8424251079559326, |
| "learning_rate": 0.00029595307917888557, |
| "loss": 0.3412, |
| "mean_token_accuracy": 0.9124035537242889, |
| "num_tokens": 315322.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06993006993006994, |
| "grad_norm": 0.4986512362957001, |
| "learning_rate": 0.0002958177306564403, |
| "loss": 0.3524, |
| "mean_token_accuracy": 0.9054706990718842, |
| "num_tokens": 325526.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.07218587863749154, |
| "grad_norm": 0.3398238718509674, |
| "learning_rate": 0.000295682382133995, |
| "loss": 0.3065, |
| "mean_token_accuracy": 0.9175658822059631, |
| "num_tokens": 335687.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.07444168734491315, |
| "grad_norm": 0.5017867088317871, |
| "learning_rate": 0.0002955470336115497, |
| "loss": 0.409, |
| "mean_token_accuracy": 0.8957060396671295, |
| "num_tokens": 345870.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.07669749605233477, |
| "grad_norm": 0.6093907356262207, |
| "learning_rate": 0.00029541168508910446, |
| "loss": 0.2775, |
| "mean_token_accuracy": 0.9198772490024567, |
| "num_tokens": 356074.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.07895330475975637, |
| "grad_norm": 0.6241074800491333, |
| "learning_rate": 0.0002952763365666591, |
| "loss": 0.3153, |
| "mean_token_accuracy": 0.9163550198078155, |
| "num_tokens": 366305.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08120911346717799, |
| "grad_norm": 0.5702779293060303, |
| "learning_rate": 0.0002951409880442138, |
| "loss": 0.2969, |
| "mean_token_accuracy": 0.9146891295909881, |
| "num_tokens": 376503.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.0834649221745996, |
| "grad_norm": 0.9562923908233643, |
| "learning_rate": 0.00029500563952176853, |
| "loss": 0.3551, |
| "mean_token_accuracy": 0.9118620991706848, |
| "num_tokens": 386698.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0857207308820212, |
| "grad_norm": 0.4826742112636566, |
| "learning_rate": 0.00029487029099932324, |
| "loss": 0.2599, |
| "mean_token_accuracy": 0.9280467808246613, |
| "num_tokens": 396740.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.08797653958944282, |
| "grad_norm": 0.4703806936740875, |
| "learning_rate": 0.0002947349424768779, |
| "loss": 0.3232, |
| "mean_token_accuracy": 0.9132184386253357, |
| "num_tokens": 406940.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.09023234829686443, |
| "grad_norm": 0.6047152876853943, |
| "learning_rate": 0.0002945995939544326, |
| "loss": 0.3714, |
| "mean_token_accuracy": 0.9036615669727326, |
| "num_tokens": 417166.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09248815700428603, |
| "grad_norm": 0.49187320470809937, |
| "learning_rate": 0.0002944642454319873, |
| "loss": 0.3322, |
| "mean_token_accuracy": 0.9098089098930359, |
| "num_tokens": 427377.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.09474396571170765, |
| "grad_norm": 0.3881978988647461, |
| "learning_rate": 0.00029432889690954203, |
| "loss": 0.3955, |
| "mean_token_accuracy": 0.9027588307857514, |
| "num_tokens": 437556.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.09699977441912926, |
| "grad_norm": 0.6978868842124939, |
| "learning_rate": 0.00029419354838709674, |
| "loss": 0.3128, |
| "mean_token_accuracy": 0.9132681012153625, |
| "num_tokens": 447662.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.09925558312655088, |
| "grad_norm": 0.47630035877227783, |
| "learning_rate": 0.00029405819986465145, |
| "loss": 0.3218, |
| "mean_token_accuracy": 0.9099625885486603, |
| "num_tokens": 457829.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.10151139183397248, |
| "grad_norm": 0.6838335990905762, |
| "learning_rate": 0.00029392285134220616, |
| "loss": 0.3311, |
| "mean_token_accuracy": 0.9098457574844361, |
| "num_tokens": 468031.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.10376720054139409, |
| "grad_norm": 0.49004867672920227, |
| "learning_rate": 0.00029378750281976087, |
| "loss": 0.3187, |
| "mean_token_accuracy": 0.910044276714325, |
| "num_tokens": 478268.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.1060230092488157, |
| "grad_norm": 0.4654693007469177, |
| "learning_rate": 0.0002936521542973156, |
| "loss": 0.3103, |
| "mean_token_accuracy": 0.9164334952831268, |
| "num_tokens": 488472.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.10827881795623731, |
| "grad_norm": 0.45917677879333496, |
| "learning_rate": 0.00029351680577487023, |
| "loss": 0.2829, |
| "mean_token_accuracy": 0.9221859157085419, |
| "num_tokens": 498643.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.11053462666365892, |
| "grad_norm": 0.5772504210472107, |
| "learning_rate": 0.00029338145725242494, |
| "loss": 0.2643, |
| "mean_token_accuracy": 0.9243384599685669, |
| "num_tokens": 508807.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.11279043537108054, |
| "grad_norm": 0.39301231503486633, |
| "learning_rate": 0.00029324610872997965, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.9116944551467896, |
| "num_tokens": 519035.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.11504624407850214, |
| "grad_norm": 0.5303699374198914, |
| "learning_rate": 0.00029311076020753436, |
| "loss": 0.2946, |
| "mean_token_accuracy": 0.9144574999809265, |
| "num_tokens": 529274.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.11730205278592376, |
| "grad_norm": 0.7598997354507446, |
| "learning_rate": 0.0002929754116850891, |
| "loss": 0.2854, |
| "mean_token_accuracy": 0.916255134344101, |
| "num_tokens": 539479.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.11955786149334537, |
| "grad_norm": 0.5954500436782837, |
| "learning_rate": 0.0002928400631626438, |
| "loss": 0.2921, |
| "mean_token_accuracy": 0.9176213085651398, |
| "num_tokens": 549688.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.12181367020076697, |
| "grad_norm": 0.4145963191986084, |
| "learning_rate": 0.0002927047146401985, |
| "loss": 0.3326, |
| "mean_token_accuracy": 0.9075924575328826, |
| "num_tokens": 559924.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.12406947890818859, |
| "grad_norm": 0.4656633138656616, |
| "learning_rate": 0.0002925693661177532, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.915722268819809, |
| "num_tokens": 570150.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.1263252876156102, |
| "grad_norm": 0.5974048972129822, |
| "learning_rate": 0.00029243401759530786, |
| "loss": 0.3155, |
| "mean_token_accuracy": 0.9142574846744538, |
| "num_tokens": 580375.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.1285810963230318, |
| "grad_norm": 0.6624695658683777, |
| "learning_rate": 0.00029229866907286257, |
| "loss": 0.2879, |
| "mean_token_accuracy": 0.9158149421215057, |
| "num_tokens": 590551.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.13083690503045342, |
| "grad_norm": 0.3690856099128723, |
| "learning_rate": 0.0002921633205504173, |
| "loss": 0.2915, |
| "mean_token_accuracy": 0.9185068488121033, |
| "num_tokens": 600656.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.13309271373787504, |
| "grad_norm": 0.45435959100723267, |
| "learning_rate": 0.000292027972027972, |
| "loss": 0.3514, |
| "mean_token_accuracy": 0.9071869254112244, |
| "num_tokens": 610856.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.13534852244529663, |
| "grad_norm": 0.9437525868415833, |
| "learning_rate": 0.0002918926235055267, |
| "loss": 0.2643, |
| "mean_token_accuracy": 0.9242019116878509, |
| "num_tokens": 621079.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.13760433115271825, |
| "grad_norm": 0.6915507912635803, |
| "learning_rate": 0.0002917572749830814, |
| "loss": 0.3138, |
| "mean_token_accuracy": 0.9152492702007293, |
| "num_tokens": 631319.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.13986013986013987, |
| "grad_norm": 0.5025702714920044, |
| "learning_rate": 0.0002916219264606361, |
| "loss": 0.3617, |
| "mean_token_accuracy": 0.9049429833889008, |
| "num_tokens": 641505.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.14211594856756146, |
| "grad_norm": 0.46047383546829224, |
| "learning_rate": 0.00029148657793819083, |
| "loss": 0.3169, |
| "mean_token_accuracy": 0.9166115164756775, |
| "num_tokens": 651730.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.14437175727498308, |
| "grad_norm": 0.7841493487358093, |
| "learning_rate": 0.00029135122941574554, |
| "loss": 0.3393, |
| "mean_token_accuracy": 0.9131287157535553, |
| "num_tokens": 661844.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.1466275659824047, |
| "grad_norm": 0.402266263961792, |
| "learning_rate": 0.0002912158808933002, |
| "loss": 0.3463, |
| "mean_token_accuracy": 0.9074052631855011, |
| "num_tokens": 672009.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1488833746898263, |
| "grad_norm": 0.49512675404548645, |
| "learning_rate": 0.0002910805323708549, |
| "loss": 0.2876, |
| "mean_token_accuracy": 0.9215861678123474, |
| "num_tokens": 682162.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1511391833972479, |
| "grad_norm": 0.3438393175601959, |
| "learning_rate": 0.0002909451838484096, |
| "loss": 0.2377, |
| "mean_token_accuracy": 0.9303798198699951, |
| "num_tokens": 692399.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.15339499210466953, |
| "grad_norm": 0.29958051443099976, |
| "learning_rate": 0.0002908098353259643, |
| "loss": 0.296, |
| "mean_token_accuracy": 0.9132875919342041, |
| "num_tokens": 702578.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.15565080081209112, |
| "grad_norm": 0.4435781240463257, |
| "learning_rate": 0.00029067448680351903, |
| "loss": 0.3736, |
| "mean_token_accuracy": 0.8985945582389832, |
| "num_tokens": 712750.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.15790660951951274, |
| "grad_norm": 0.8465012311935425, |
| "learning_rate": 0.00029053913828107374, |
| "loss": 0.2601, |
| "mean_token_accuracy": 0.9246440231800079, |
| "num_tokens": 722979.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.16016241822693436, |
| "grad_norm": 0.6590988636016846, |
| "learning_rate": 0.00029040378975862845, |
| "loss": 0.3242, |
| "mean_token_accuracy": 0.9071028470993042, |
| "num_tokens": 733197.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.16241822693435598, |
| "grad_norm": 0.6681052446365356, |
| "learning_rate": 0.00029026844123618316, |
| "loss": 0.2906, |
| "mean_token_accuracy": 0.9161325991153717, |
| "num_tokens": 743426.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.16467403564177757, |
| "grad_norm": 0.3675503134727478, |
| "learning_rate": 0.0002901330927137378, |
| "loss": 0.2886, |
| "mean_token_accuracy": 0.9143774032592773, |
| "num_tokens": 753654.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.1669298443491992, |
| "grad_norm": 0.38154202699661255, |
| "learning_rate": 0.00028999774419129253, |
| "loss": 0.389, |
| "mean_token_accuracy": 0.9036036729812622, |
| "num_tokens": 763516.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.1691856530566208, |
| "grad_norm": 0.86027991771698, |
| "learning_rate": 0.00028986239566884724, |
| "loss": 0.288, |
| "mean_token_accuracy": 0.918215936422348, |
| "num_tokens": 773748.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.1714414617640424, |
| "grad_norm": 0.33669188618659973, |
| "learning_rate": 0.00028972704714640195, |
| "loss": 0.2693, |
| "mean_token_accuracy": 0.9215926826000214, |
| "num_tokens": 783906.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.17369727047146402, |
| "grad_norm": 0.534418523311615, |
| "learning_rate": 0.00028959169862395666, |
| "loss": 0.2914, |
| "mean_token_accuracy": 0.9151590466499329, |
| "num_tokens": 794110.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.17595307917888564, |
| "grad_norm": 0.4096381366252899, |
| "learning_rate": 0.00028945635010151137, |
| "loss": 0.3095, |
| "mean_token_accuracy": 0.9127452373504639, |
| "num_tokens": 804320.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.17820888788630723, |
| "grad_norm": 0.4493885636329651, |
| "learning_rate": 0.0002893210015790661, |
| "loss": 0.3061, |
| "mean_token_accuracy": 0.9174495875835419, |
| "num_tokens": 814498.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.18046469659372885, |
| "grad_norm": 0.3796924948692322, |
| "learning_rate": 0.0002891856530566208, |
| "loss": 0.3325, |
| "mean_token_accuracy": 0.9142964720726013, |
| "num_tokens": 824645.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.18272050530115047, |
| "grad_norm": 0.43683764338493347, |
| "learning_rate": 0.0002890503045341755, |
| "loss": 0.3365, |
| "mean_token_accuracy": 0.9071278512477875, |
| "num_tokens": 834842.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.18497631400857206, |
| "grad_norm": 0.7502022385597229, |
| "learning_rate": 0.00028891495601173015, |
| "loss": 0.2807, |
| "mean_token_accuracy": 0.9220738291740418, |
| "num_tokens": 845039.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.18723212271599368, |
| "grad_norm": 0.5301753282546997, |
| "learning_rate": 0.00028877960748928486, |
| "loss": 0.3563, |
| "mean_token_accuracy": 0.9055928111076355, |
| "num_tokens": 855269.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.1894879314234153, |
| "grad_norm": 0.32157281041145325, |
| "learning_rate": 0.0002886442589668396, |
| "loss": 0.2889, |
| "mean_token_accuracy": 0.9182112574577331, |
| "num_tokens": 865472.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.1917437401308369, |
| "grad_norm": 0.4322583079338074, |
| "learning_rate": 0.0002885089104443943, |
| "loss": 0.2695, |
| "mean_token_accuracy": 0.9204015076160431, |
| "num_tokens": 875623.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1939995488382585, |
| "grad_norm": 1.213575839996338, |
| "learning_rate": 0.000288373561921949, |
| "loss": 0.3, |
| "mean_token_accuracy": 0.9166274607181549, |
| "num_tokens": 885852.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.19625535754568013, |
| "grad_norm": 0.5853558778762817, |
| "learning_rate": 0.0002882382133995037, |
| "loss": 0.3285, |
| "mean_token_accuracy": 0.9095144391059875, |
| "num_tokens": 896040.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.19851116625310175, |
| "grad_norm": 0.6323179602622986, |
| "learning_rate": 0.0002881028648770584, |
| "loss": 0.2662, |
| "mean_token_accuracy": 0.9212876856327057, |
| "num_tokens": 906277.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.20076697496052334, |
| "grad_norm": 0.3301967680454254, |
| "learning_rate": 0.0002879675163546131, |
| "loss": 0.2902, |
| "mean_token_accuracy": 0.9195581197738647, |
| "num_tokens": 915505.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.20302278366794496, |
| "grad_norm": 0.4250761866569519, |
| "learning_rate": 0.0002878321678321678, |
| "loss": 0.2529, |
| "mean_token_accuracy": 0.9280913352966309, |
| "num_tokens": 925676.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.20527859237536658, |
| "grad_norm": 0.4531536102294922, |
| "learning_rate": 0.0002876968193097225, |
| "loss": 0.3005, |
| "mean_token_accuracy": 0.9147605180740357, |
| "num_tokens": 935916.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.20753440108278817, |
| "grad_norm": 0.5399945974349976, |
| "learning_rate": 0.0002875614707872772, |
| "loss": 0.2359, |
| "mean_token_accuracy": 0.9297860860824585, |
| "num_tokens": 946142.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.2097902097902098, |
| "grad_norm": 0.4450409412384033, |
| "learning_rate": 0.0002874261222648319, |
| "loss": 0.2989, |
| "mean_token_accuracy": 0.9163881063461303, |
| "num_tokens": 956351.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.2120460184976314, |
| "grad_norm": 0.3771935999393463, |
| "learning_rate": 0.0002872907737423866, |
| "loss": 0.3236, |
| "mean_token_accuracy": 0.9066317915916443, |
| "num_tokens": 966521.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.214301827205053, |
| "grad_norm": 0.6111851930618286, |
| "learning_rate": 0.00028715542521994133, |
| "loss": 0.2397, |
| "mean_token_accuracy": 0.9319941163063049, |
| "num_tokens": 975920.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.21655763591247462, |
| "grad_norm": 0.7245665788650513, |
| "learning_rate": 0.00028702007669749604, |
| "loss": 0.3517, |
| "mean_token_accuracy": 0.9068562746047973, |
| "num_tokens": 986149.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.21881344461989624, |
| "grad_norm": 0.7466909289360046, |
| "learning_rate": 0.00028688472817505075, |
| "loss": 0.2991, |
| "mean_token_accuracy": 0.9244312167167663, |
| "num_tokens": 996354.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.22106925332731783, |
| "grad_norm": 0.7455071210861206, |
| "learning_rate": 0.00028674937965260546, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.9157386660575867, |
| "num_tokens": 1006580.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.22332506203473945, |
| "grad_norm": 0.5593414902687073, |
| "learning_rate": 0.0002866140311301601, |
| "loss": 0.2942, |
| "mean_token_accuracy": 0.9155135810375213, |
| "num_tokens": 1016754.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.22558087074216107, |
| "grad_norm": 0.3225398659706116, |
| "learning_rate": 0.0002864786826077148, |
| "loss": 0.2647, |
| "mean_token_accuracy": 0.9190112292766571, |
| "num_tokens": 1026969.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.22783667944958266, |
| "grad_norm": 1.3003923892974854, |
| "learning_rate": 0.00028634333408526953, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.9123725950717926, |
| "num_tokens": 1037080.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.23009248815700428, |
| "grad_norm": 0.4882226884365082, |
| "learning_rate": 0.00028620798556282424, |
| "loss": 0.2961, |
| "mean_token_accuracy": 0.9176577150821685, |
| "num_tokens": 1047291.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.2323482968644259, |
| "grad_norm": 0.3756118714809418, |
| "learning_rate": 0.00028607263704037895, |
| "loss": 0.278, |
| "mean_token_accuracy": 0.9258804321289062, |
| "num_tokens": 1057514.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.23460410557184752, |
| "grad_norm": 0.5116491317749023, |
| "learning_rate": 0.00028593728851793366, |
| "loss": 0.2741, |
| "mean_token_accuracy": 0.9172999203205109, |
| "num_tokens": 1067690.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.2368599142792691, |
| "grad_norm": 0.6103722453117371, |
| "learning_rate": 0.00028580193999548837, |
| "loss": 0.3095, |
| "mean_token_accuracy": 0.9097223103046417, |
| "num_tokens": 1077902.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.23911572298669073, |
| "grad_norm": 0.36436164379119873, |
| "learning_rate": 0.0002856665914730431, |
| "loss": 0.2427, |
| "mean_token_accuracy": 0.9301242768764496, |
| "num_tokens": 1088129.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.24137153169411235, |
| "grad_norm": 0.4570798873901367, |
| "learning_rate": 0.00028553124295059774, |
| "loss": 0.2629, |
| "mean_token_accuracy": 0.9264809966087342, |
| "num_tokens": 1098350.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.24362734040153394, |
| "grad_norm": 0.36314597725868225, |
| "learning_rate": 0.00028539589442815245, |
| "loss": 0.3147, |
| "mean_token_accuracy": 0.916249018907547, |
| "num_tokens": 1108454.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.24588314910895556, |
| "grad_norm": 0.4478132724761963, |
| "learning_rate": 0.00028526054590570716, |
| "loss": 0.2765, |
| "mean_token_accuracy": 0.9221442401409149, |
| "num_tokens": 1118688.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.24813895781637718, |
| "grad_norm": 0.42659443616867065, |
| "learning_rate": 0.00028512519738326187, |
| "loss": 0.2727, |
| "mean_token_accuracy": 0.917762154340744, |
| "num_tokens": 1128861.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2503947665237988, |
| "grad_norm": 0.4264533221721649, |
| "learning_rate": 0.0002849898488608166, |
| "loss": 0.2285, |
| "mean_token_accuracy": 0.9330006301403045, |
| "num_tokens": 1139094.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.2526505752312204, |
| "grad_norm": 0.6633393168449402, |
| "learning_rate": 0.0002848545003383713, |
| "loss": 0.3234, |
| "mean_token_accuracy": 0.914995151758194, |
| "num_tokens": 1149311.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.254906383938642, |
| "grad_norm": 0.4123522937297821, |
| "learning_rate": 0.000284719151815926, |
| "loss": 0.3043, |
| "mean_token_accuracy": 0.9156239628791809, |
| "num_tokens": 1159539.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.2571621926460636, |
| "grad_norm": 0.5609915256500244, |
| "learning_rate": 0.0002845838032934807, |
| "loss": 0.2797, |
| "mean_token_accuracy": 0.9212962448596954, |
| "num_tokens": 1169703.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.25941800135348525, |
| "grad_norm": 0.4521436393260956, |
| "learning_rate": 0.0002844484547710354, |
| "loss": 0.2507, |
| "mean_token_accuracy": 0.9307784140110016, |
| "num_tokens": 1179907.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.26167381006090684, |
| "grad_norm": 0.45429497957229614, |
| "learning_rate": 0.00028431310624859007, |
| "loss": 0.258, |
| "mean_token_accuracy": 0.9245835185050965, |
| "num_tokens": 1190130.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.26392961876832843, |
| "grad_norm": 0.677976131439209, |
| "learning_rate": 0.0002841777577261448, |
| "loss": 0.3474, |
| "mean_token_accuracy": 0.914616483449936, |
| "num_tokens": 1200295.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.2661854274757501, |
| "grad_norm": 0.7482908368110657, |
| "learning_rate": 0.0002840424092036995, |
| "loss": 0.2255, |
| "mean_token_accuracy": 0.9334000170230865, |
| "num_tokens": 1210496.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.26844123618317167, |
| "grad_norm": 0.6767547726631165, |
| "learning_rate": 0.0002839070606812542, |
| "loss": 0.2856, |
| "mean_token_accuracy": 0.9159020364284516, |
| "num_tokens": 1220649.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.27069704489059326, |
| "grad_norm": 0.5904352068901062, |
| "learning_rate": 0.0002837717121588089, |
| "loss": 0.3042, |
| "mean_token_accuracy": 0.9098004937171936, |
| "num_tokens": 1230803.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.2729528535980149, |
| "grad_norm": 0.5992192625999451, |
| "learning_rate": 0.0002836363636363636, |
| "loss": 0.2964, |
| "mean_token_accuracy": 0.9183099627494812, |
| "num_tokens": 1240929.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.2752086623054365, |
| "grad_norm": 1.1047067642211914, |
| "learning_rate": 0.00028350101511391833, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.9382818222045899, |
| "num_tokens": 1251084.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.2774644710128581, |
| "grad_norm": 0.30373415350914, |
| "learning_rate": 0.00028336566659147304, |
| "loss": 0.2873, |
| "mean_token_accuracy": 0.9165784835815429, |
| "num_tokens": 1261319.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.27972027972027974, |
| "grad_norm": 0.5601250529289246, |
| "learning_rate": 0.0002832303180690277, |
| "loss": 0.3025, |
| "mean_token_accuracy": 0.915376091003418, |
| "num_tokens": 1271463.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.28197608842770133, |
| "grad_norm": 0.3787858784198761, |
| "learning_rate": 0.0002830949695465824, |
| "loss": 0.2926, |
| "mean_token_accuracy": 0.9160200178623199, |
| "num_tokens": 1281613.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.2842318971351229, |
| "grad_norm": 0.41434407234191895, |
| "learning_rate": 0.0002829596210241371, |
| "loss": 0.2365, |
| "mean_token_accuracy": 0.9311029613018036, |
| "num_tokens": 1291786.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.28648770584254457, |
| "grad_norm": 0.5667926073074341, |
| "learning_rate": 0.0002828242725016918, |
| "loss": 0.231, |
| "mean_token_accuracy": 0.9344726800918579, |
| "num_tokens": 1301555.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.28874351454996616, |
| "grad_norm": 0.5918124318122864, |
| "learning_rate": 0.00028268892397924654, |
| "loss": 0.2174, |
| "mean_token_accuracy": 0.9305562138557434, |
| "num_tokens": 1311787.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.29099932325738775, |
| "grad_norm": 0.3638257682323456, |
| "learning_rate": 0.00028255357545680125, |
| "loss": 0.2193, |
| "mean_token_accuracy": 0.9338454186916352, |
| "num_tokens": 1321926.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.2932551319648094, |
| "grad_norm": 0.3877502977848053, |
| "learning_rate": 0.00028241822693435596, |
| "loss": 0.2489, |
| "mean_token_accuracy": 0.92558473944664, |
| "num_tokens": 1332162.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.295510940672231, |
| "grad_norm": 0.7278009653091431, |
| "learning_rate": 0.00028228287841191067, |
| "loss": 0.2429, |
| "mean_token_accuracy": 0.9280037820339203, |
| "num_tokens": 1342350.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.2977667493796526, |
| "grad_norm": 0.43354156613349915, |
| "learning_rate": 0.0002821475298894654, |
| "loss": 0.2416, |
| "mean_token_accuracy": 0.9291642665863037, |
| "num_tokens": 1352516.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.30002255808707423, |
| "grad_norm": 0.5411070585250854, |
| "learning_rate": 0.00028201218136702003, |
| "loss": 0.263, |
| "mean_token_accuracy": 0.92619389295578, |
| "num_tokens": 1362735.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.3022783667944958, |
| "grad_norm": 0.40434572100639343, |
| "learning_rate": 0.00028187683284457474, |
| "loss": 0.2217, |
| "mean_token_accuracy": 0.9316042363643646, |
| "num_tokens": 1372866.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.3045341755019174, |
| "grad_norm": 0.5762608051300049, |
| "learning_rate": 0.00028174148432212945, |
| "loss": 0.282, |
| "mean_token_accuracy": 0.9198409378528595, |
| "num_tokens": 1383044.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.30678998420933906, |
| "grad_norm": 0.35077670216560364, |
| "learning_rate": 0.00028160613579968416, |
| "loss": 0.2627, |
| "mean_token_accuracy": 0.9248356401920319, |
| "num_tokens": 1392552.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.30904579291676065, |
| "grad_norm": 0.4673321843147278, |
| "learning_rate": 0.00028147078727723887, |
| "loss": 0.2734, |
| "mean_token_accuracy": 0.9211951434612274, |
| "num_tokens": 1402739.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.31130160162418224, |
| "grad_norm": 0.68485426902771, |
| "learning_rate": 0.0002813354387547936, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.9316923320293427, |
| "num_tokens": 1412889.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3135574103316039, |
| "grad_norm": 0.48916900157928467, |
| "learning_rate": 0.0002812000902323483, |
| "loss": 0.2509, |
| "mean_token_accuracy": 0.9270589649677277, |
| "num_tokens": 1423073.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3158132190390255, |
| "grad_norm": 0.49239906668663025, |
| "learning_rate": 0.000281064741709903, |
| "loss": 0.2615, |
| "mean_token_accuracy": 0.9245356857776642, |
| "num_tokens": 1433313.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3180690277464471, |
| "grad_norm": 0.5933843851089478, |
| "learning_rate": 0.00028092939318745766, |
| "loss": 0.2958, |
| "mean_token_accuracy": 0.9148936092853546, |
| "num_tokens": 1443495.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3203248364538687, |
| "grad_norm": 0.410168319940567, |
| "learning_rate": 0.00028079404466501237, |
| "loss": 0.2521, |
| "mean_token_accuracy": 0.9283298313617706, |
| "num_tokens": 1453715.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 0.5238193273544312, |
| "learning_rate": 0.0002806586961425671, |
| "loss": 0.3188, |
| "mean_token_accuracy": 0.9118652939796448, |
| "num_tokens": 1463900.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.32483645386871196, |
| "grad_norm": 0.347003310918808, |
| "learning_rate": 0.0002805233476201218, |
| "loss": 0.2707, |
| "mean_token_accuracy": 0.921840351819992, |
| "num_tokens": 1474039.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.32709226257613355, |
| "grad_norm": 0.49886128306388855, |
| "learning_rate": 0.0002803879990976765, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.9320064127445221, |
| "num_tokens": 1484218.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.32934807128355514, |
| "grad_norm": 0.7472530603408813, |
| "learning_rate": 0.0002802526505752312, |
| "loss": 0.2504, |
| "mean_token_accuracy": 0.9290566742420197, |
| "num_tokens": 1494448.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.3316038799909768, |
| "grad_norm": 0.39901986718177795, |
| "learning_rate": 0.0002801173020527859, |
| "loss": 0.3014, |
| "mean_token_accuracy": 0.9148159503936768, |
| "num_tokens": 1504582.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.3338596886983984, |
| "grad_norm": 0.6466052532196045, |
| "learning_rate": 0.0002799819535303406, |
| "loss": 0.2645, |
| "mean_token_accuracy": 0.9249064564704895, |
| "num_tokens": 1514759.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.33611549740582, |
| "grad_norm": 0.5997007489204407, |
| "learning_rate": 0.00027984660500789534, |
| "loss": 0.246, |
| "mean_token_accuracy": 0.9261491954326629, |
| "num_tokens": 1524964.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.3383713061132416, |
| "grad_norm": 0.8568662405014038, |
| "learning_rate": 0.00027971125648545, |
| "loss": 0.2607, |
| "mean_token_accuracy": 0.9246239781379699, |
| "num_tokens": 1535168.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3406271148206632, |
| "grad_norm": 0.4184776544570923, |
| "learning_rate": 0.0002795759079630047, |
| "loss": 0.2996, |
| "mean_token_accuracy": 0.9106807947158814, |
| "num_tokens": 1545312.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.3428829235280848, |
| "grad_norm": 0.7694135308265686, |
| "learning_rate": 0.0002794405594405594, |
| "loss": 0.2432, |
| "mean_token_accuracy": 0.9285914778709412, |
| "num_tokens": 1555532.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.34513873223550645, |
| "grad_norm": 0.5472086071968079, |
| "learning_rate": 0.0002793052109181141, |
| "loss": 0.2343, |
| "mean_token_accuracy": 0.935352087020874, |
| "num_tokens": 1565702.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.34739454094292804, |
| "grad_norm": 0.5112503170967102, |
| "learning_rate": 0.00027916986239566883, |
| "loss": 0.3255, |
| "mean_token_accuracy": 0.9154317140579223, |
| "num_tokens": 1574947.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.34965034965034963, |
| "grad_norm": 0.3371566832065582, |
| "learning_rate": 0.00027903451387322354, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.9288936614990234, |
| "num_tokens": 1585140.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.3519061583577713, |
| "grad_norm": 0.6675296425819397, |
| "learning_rate": 0.00027889916535077825, |
| "loss": 0.2715, |
| "mean_token_accuracy": 0.9202642977237702, |
| "num_tokens": 1595358.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.3541619670651929, |
| "grad_norm": 0.921124279499054, |
| "learning_rate": 0.00027876381682833296, |
| "loss": 0.2739, |
| "mean_token_accuracy": 0.9199114978313446, |
| "num_tokens": 1605534.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.35641777577261446, |
| "grad_norm": 0.3380034565925598, |
| "learning_rate": 0.0002786284683058876, |
| "loss": 0.2683, |
| "mean_token_accuracy": 0.9206540703773498, |
| "num_tokens": 1615751.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.3586735844800361, |
| "grad_norm": 0.6300131678581238, |
| "learning_rate": 0.0002784931197834423, |
| "loss": 0.2902, |
| "mean_token_accuracy": 0.9209059238433838, |
| "num_tokens": 1625928.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3609293931874577, |
| "grad_norm": 0.6872547268867493, |
| "learning_rate": 0.00027835777126099704, |
| "loss": 0.3625, |
| "mean_token_accuracy": 0.9109048128128052, |
| "num_tokens": 1636144.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3631852018948793, |
| "grad_norm": 0.4150646924972534, |
| "learning_rate": 0.00027822242273855174, |
| "loss": 0.2687, |
| "mean_token_accuracy": 0.9247509896755218, |
| "num_tokens": 1646331.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.36544101060230094, |
| "grad_norm": 0.44850772619247437, |
| "learning_rate": 0.00027808707421610645, |
| "loss": 0.2995, |
| "mean_token_accuracy": 0.9172296404838562, |
| "num_tokens": 1656544.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.36769681930972253, |
| "grad_norm": 0.48656392097473145, |
| "learning_rate": 0.00027795172569366116, |
| "loss": 0.277, |
| "mean_token_accuracy": 0.9188252389431, |
| "num_tokens": 1666778.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.3699526280171441, |
| "grad_norm": 0.5935309529304504, |
| "learning_rate": 0.0002778163771712159, |
| "loss": 0.2761, |
| "mean_token_accuracy": 0.9251180648803711, |
| "num_tokens": 1676959.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.37220843672456577, |
| "grad_norm": 0.7823290228843689, |
| "learning_rate": 0.0002776810286487706, |
| "loss": 0.2672, |
| "mean_token_accuracy": 0.922256076335907, |
| "num_tokens": 1687115.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.37446424543198736, |
| "grad_norm": 0.8532506227493286, |
| "learning_rate": 0.0002775456801263253, |
| "loss": 0.2739, |
| "mean_token_accuracy": 0.9220530927181244, |
| "num_tokens": 1697239.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.37672005413940896, |
| "grad_norm": 0.5606103539466858, |
| "learning_rate": 0.00027741033160387995, |
| "loss": 0.2311, |
| "mean_token_accuracy": 0.9349769711494446, |
| "num_tokens": 1707409.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.3789758628468306, |
| "grad_norm": 0.4320582151412964, |
| "learning_rate": 0.00027727498308143466, |
| "loss": 0.3131, |
| "mean_token_accuracy": 0.916411018371582, |
| "num_tokens": 1717583.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.3812316715542522, |
| "grad_norm": 0.5167767405509949, |
| "learning_rate": 0.00027713963455898937, |
| "loss": 0.2443, |
| "mean_token_accuracy": 0.9284303724765778, |
| "num_tokens": 1727785.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.3834874802616738, |
| "grad_norm": 0.40260276198387146, |
| "learning_rate": 0.0002770042860365441, |
| "loss": 0.254, |
| "mean_token_accuracy": 0.9235418915748597, |
| "num_tokens": 1738023.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.38574328896909543, |
| "grad_norm": 0.2824370563030243, |
| "learning_rate": 0.0002768689375140988, |
| "loss": 0.2086, |
| "mean_token_accuracy": 0.939811784029007, |
| "num_tokens": 1748222.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.387999097676517, |
| "grad_norm": 0.37908050417900085, |
| "learning_rate": 0.0002767335889916535, |
| "loss": 0.2753, |
| "mean_token_accuracy": 0.923250812292099, |
| "num_tokens": 1758431.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.3902549063839386, |
| "grad_norm": 0.5015047192573547, |
| "learning_rate": 0.0002765982404692082, |
| "loss": 0.2278, |
| "mean_token_accuracy": 0.9315616250038147, |
| "num_tokens": 1768555.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.39251071509136026, |
| "grad_norm": 1.0540778636932373, |
| "learning_rate": 0.0002764628919467629, |
| "loss": 0.2969, |
| "mean_token_accuracy": 0.9173891127109528, |
| "num_tokens": 1778765.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.39476652379878185, |
| "grad_norm": 0.37756413221359253, |
| "learning_rate": 0.00027632754342431763, |
| "loss": 0.2314, |
| "mean_token_accuracy": 0.9306451320648194, |
| "num_tokens": 1788880.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.3970223325062035, |
| "grad_norm": 0.3676840662956238, |
| "learning_rate": 0.0002761921949018723, |
| "loss": 0.2115, |
| "mean_token_accuracy": 0.9343382716178894, |
| "num_tokens": 1799070.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.3992781412136251, |
| "grad_norm": 0.33806851506233215, |
| "learning_rate": 0.000276056846379427, |
| "loss": 0.2772, |
| "mean_token_accuracy": 0.9242539584636689, |
| "num_tokens": 1809273.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.4015339499210467, |
| "grad_norm": 0.8772180676460266, |
| "learning_rate": 0.0002759214978569817, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.9347168564796448, |
| "num_tokens": 1819458.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.40378975862846833, |
| "grad_norm": 0.8381493091583252, |
| "learning_rate": 0.0002757861493345364, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9324899137020111, |
| "num_tokens": 1829636.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.4060455673358899, |
| "grad_norm": 0.3281577527523041, |
| "learning_rate": 0.0002756508008120911, |
| "loss": 0.2424, |
| "mean_token_accuracy": 0.9290665745735168, |
| "num_tokens": 1839857.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.4083013760433115, |
| "grad_norm": 0.4396291971206665, |
| "learning_rate": 0.00027551545228964583, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.919403862953186, |
| "num_tokens": 1850052.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.41055718475073316, |
| "grad_norm": 0.3047076165676117, |
| "learning_rate": 0.00027538010376720054, |
| "loss": 0.208, |
| "mean_token_accuracy": 0.937256783246994, |
| "num_tokens": 1860236.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.41281299345815475, |
| "grad_norm": 0.592738151550293, |
| "learning_rate": 0.00027524475524475525, |
| "loss": 0.2222, |
| "mean_token_accuracy": 0.9334645926952362, |
| "num_tokens": 1870414.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.41506880216557634, |
| "grad_norm": 0.30050572752952576, |
| "learning_rate": 0.0002751094067223099, |
| "loss": 0.2439, |
| "mean_token_accuracy": 0.9300739288330078, |
| "num_tokens": 1880626.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.417324610872998, |
| "grad_norm": 0.39272746443748474, |
| "learning_rate": 0.0002749740581998646, |
| "loss": 0.2569, |
| "mean_token_accuracy": 0.9248918652534485, |
| "num_tokens": 1890861.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4195804195804196, |
| "grad_norm": 0.3860650062561035, |
| "learning_rate": 0.00027483870967741933, |
| "loss": 0.2459, |
| "mean_token_accuracy": 0.9251980602741241, |
| "num_tokens": 1901049.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.4218362282878412, |
| "grad_norm": 0.36546534299850464, |
| "learning_rate": 0.00027470336115497404, |
| "loss": 0.2296, |
| "mean_token_accuracy": 0.9312162160873413, |
| "num_tokens": 1911268.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.4240920369952628, |
| "grad_norm": 0.469930499792099, |
| "learning_rate": 0.00027456801263252875, |
| "loss": 0.2448, |
| "mean_token_accuracy": 0.9255866289138794, |
| "num_tokens": 1921450.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.4263478457026844, |
| "grad_norm": 0.43246912956237793, |
| "learning_rate": 0.00027443266411008346, |
| "loss": 0.2746, |
| "mean_token_accuracy": 0.9213305950164795, |
| "num_tokens": 1931629.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.428603654410106, |
| "grad_norm": 0.5429182648658752, |
| "learning_rate": 0.00027429731558763817, |
| "loss": 0.29, |
| "mean_token_accuracy": 0.9243273913860321, |
| "num_tokens": 1941815.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.43085946311752765, |
| "grad_norm": 0.2941581904888153, |
| "learning_rate": 0.0002741619670651929, |
| "loss": 0.2088, |
| "mean_token_accuracy": 0.9382153749465942, |
| "num_tokens": 1952040.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.43311527182494924, |
| "grad_norm": 0.3362172544002533, |
| "learning_rate": 0.0002740266185427476, |
| "loss": 0.2576, |
| "mean_token_accuracy": 0.9279570162296296, |
| "num_tokens": 1962222.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.43537108053237084, |
| "grad_norm": 0.6598260998725891, |
| "learning_rate": 0.00027389127002030224, |
| "loss": 0.2667, |
| "mean_token_accuracy": 0.9221050620079041, |
| "num_tokens": 1972369.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.4376268892397925, |
| "grad_norm": 0.5338302254676819, |
| "learning_rate": 0.00027375592149785695, |
| "loss": 0.2525, |
| "mean_token_accuracy": 0.9305463373661041, |
| "num_tokens": 1982601.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.4398826979472141, |
| "grad_norm": 0.5087529420852661, |
| "learning_rate": 0.00027362057297541166, |
| "loss": 0.3157, |
| "mean_token_accuracy": 0.9138010561466217, |
| "num_tokens": 1992715.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.44213850665463567, |
| "grad_norm": 0.9174799919128418, |
| "learning_rate": 0.0002734852244529664, |
| "loss": 0.3144, |
| "mean_token_accuracy": 0.9178781092166901, |
| "num_tokens": 2002936.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.4443943153620573, |
| "grad_norm": 0.6057170033454895, |
| "learning_rate": 0.00027334987593052103, |
| "loss": 0.2902, |
| "mean_token_accuracy": 0.9211130678653717, |
| "num_tokens": 2013103.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.4466501240694789, |
| "grad_norm": 0.2543405592441559, |
| "learning_rate": 0.0002732145274080758, |
| "loss": 0.2468, |
| "mean_token_accuracy": 0.9278612732887268, |
| "num_tokens": 2023318.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.4489059327769005, |
| "grad_norm": 0.7571674585342407, |
| "learning_rate": 0.0002730791788856305, |
| "loss": 0.2915, |
| "mean_token_accuracy": 0.9226119935512542, |
| "num_tokens": 2033414.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.45116174148432214, |
| "grad_norm": 0.2644379436969757, |
| "learning_rate": 0.0002729438303631852, |
| "loss": 0.2211, |
| "mean_token_accuracy": 0.9352829337120057, |
| "num_tokens": 2043581.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.45341755019174373, |
| "grad_norm": 0.39117881655693054, |
| "learning_rate": 0.00027280848184073987, |
| "loss": 0.2923, |
| "mean_token_accuracy": 0.9223450303077698, |
| "num_tokens": 2053806.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.4556733588991653, |
| "grad_norm": 0.46938055753707886, |
| "learning_rate": 0.0002726731333182946, |
| "loss": 0.2391, |
| "mean_token_accuracy": 0.932051545381546, |
| "num_tokens": 2064004.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.457929167606587, |
| "grad_norm": 0.5239212512969971, |
| "learning_rate": 0.0002725377847958493, |
| "loss": 0.2686, |
| "mean_token_accuracy": 0.922615134716034, |
| "num_tokens": 2074201.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.46018497631400856, |
| "grad_norm": 0.5799335241317749, |
| "learning_rate": 0.000272402436273404, |
| "loss": 0.28, |
| "mean_token_accuracy": 0.9241182446479798, |
| "num_tokens": 2084366.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.46244078502143016, |
| "grad_norm": 0.38910114765167236, |
| "learning_rate": 0.00027226708775095865, |
| "loss": 0.2841, |
| "mean_token_accuracy": 0.9211917340755462, |
| "num_tokens": 2094603.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4646965937288518, |
| "grad_norm": 0.5675938725471497, |
| "learning_rate": 0.00027213173922851336, |
| "loss": 0.268, |
| "mean_token_accuracy": 0.9224383175373078, |
| "num_tokens": 2104772.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.4669524024362734, |
| "grad_norm": 0.3569573163986206, |
| "learning_rate": 0.00027199639070606813, |
| "loss": 0.2722, |
| "mean_token_accuracy": 0.9221753597259521, |
| "num_tokens": 2114988.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.46920821114369504, |
| "grad_norm": 0.7027791142463684, |
| "learning_rate": 0.00027186104218362284, |
| "loss": 0.2583, |
| "mean_token_accuracy": 0.926697039604187, |
| "num_tokens": 2125174.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.47146401985111663, |
| "grad_norm": 0.35335201025009155, |
| "learning_rate": 0.00027172569366117755, |
| "loss": 0.2886, |
| "mean_token_accuracy": 0.9233569622039794, |
| "num_tokens": 2135341.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.4737198285585382, |
| "grad_norm": 0.28648829460144043, |
| "learning_rate": 0.0002715903451387322, |
| "loss": 0.2232, |
| "mean_token_accuracy": 0.9318738996982574, |
| "num_tokens": 2145570.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.47597563726595987, |
| "grad_norm": 0.264739990234375, |
| "learning_rate": 0.0002714549966162869, |
| "loss": 0.271, |
| "mean_token_accuracy": 0.9208465099334717, |
| "num_tokens": 2155801.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.47823144597338146, |
| "grad_norm": 0.7098460793495178, |
| "learning_rate": 0.0002713196480938416, |
| "loss": 0.2683, |
| "mean_token_accuracy": 0.9297817826271058, |
| "num_tokens": 2166020.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.48048725468080306, |
| "grad_norm": 0.4726192355155945, |
| "learning_rate": 0.00027118429957139633, |
| "loss": 0.242, |
| "mean_token_accuracy": 0.9297101378440857, |
| "num_tokens": 2176220.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.4827430633882247, |
| "grad_norm": 0.42030471563339233, |
| "learning_rate": 0.000271048951048951, |
| "loss": 0.2867, |
| "mean_token_accuracy": 0.9191558003425598, |
| "num_tokens": 2186379.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.4849988720956463, |
| "grad_norm": 0.2791607677936554, |
| "learning_rate": 0.0002709136025265057, |
| "loss": 0.2437, |
| "mean_token_accuracy": 0.9270163774490356, |
| "num_tokens": 2196596.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.4872546808030679, |
| "grad_norm": 0.5415903329849243, |
| "learning_rate": 0.00027077825400406046, |
| "loss": 0.232, |
| "mean_token_accuracy": 0.9309392213821411, |
| "num_tokens": 2206772.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.48951048951048953, |
| "grad_norm": 0.446532666683197, |
| "learning_rate": 0.00027064290548161517, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.9335504353046418, |
| "num_tokens": 2216969.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.4917662982179111, |
| "grad_norm": 0.5093636512756348, |
| "learning_rate": 0.00027050755695916983, |
| "loss": 0.2644, |
| "mean_token_accuracy": 0.9249256730079651, |
| "num_tokens": 2227148.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.4940221069253327, |
| "grad_norm": 0.33594754338264465, |
| "learning_rate": 0.00027037220843672454, |
| "loss": 0.2651, |
| "mean_token_accuracy": 0.9275108754634858, |
| "num_tokens": 2237357.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.49627791563275436, |
| "grad_norm": 0.3259807229042053, |
| "learning_rate": 0.00027023685991427925, |
| "loss": 0.2641, |
| "mean_token_accuracy": 0.9278050780296325, |
| "num_tokens": 2247548.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.49853372434017595, |
| "grad_norm": 0.5676048398017883, |
| "learning_rate": 0.00027010151139183396, |
| "loss": 0.1965, |
| "mean_token_accuracy": 0.9407685935497284, |
| "num_tokens": 2257703.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.5007895330475975, |
| "grad_norm": 0.5289788246154785, |
| "learning_rate": 0.0002699661628693886, |
| "loss": 0.2754, |
| "mean_token_accuracy": 0.9240131139755249, |
| "num_tokens": 2267831.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5030453417550191, |
| "grad_norm": 0.8352831602096558, |
| "learning_rate": 0.0002698308143469433, |
| "loss": 0.2304, |
| "mean_token_accuracy": 0.9294693827629089, |
| "num_tokens": 2277973.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5053011504624408, |
| "grad_norm": 0.5197424292564392, |
| "learning_rate": 0.00026969546582449803, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.9289329349994659, |
| "num_tokens": 2288198.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5075569591698624, |
| "grad_norm": 0.4816972613334656, |
| "learning_rate": 0.00026956011730205274, |
| "loss": 0.243, |
| "mean_token_accuracy": 0.9350267231464386, |
| "num_tokens": 2298386.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.509812767877284, |
| "grad_norm": 0.40002062916755676, |
| "learning_rate": 0.0002694247687796075, |
| "loss": 0.2129, |
| "mean_token_accuracy": 0.9361107170581817, |
| "num_tokens": 2307935.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5120685765847056, |
| "grad_norm": 0.3949548602104187, |
| "learning_rate": 0.00026928942025716216, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9292949497699737, |
| "num_tokens": 2318169.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5143243852921272, |
| "grad_norm": 0.6020212173461914, |
| "learning_rate": 0.00026915407173471687, |
| "loss": 0.2293, |
| "mean_token_accuracy": 0.9293237924575806, |
| "num_tokens": 2328312.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.5165801939995488, |
| "grad_norm": 0.460500031709671, |
| "learning_rate": 0.0002690187232122716, |
| "loss": 0.2328, |
| "mean_token_accuracy": 0.9300384640693664, |
| "num_tokens": 2338522.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.5188360027069705, |
| "grad_norm": 0.37255415320396423, |
| "learning_rate": 0.0002688833746898263, |
| "loss": 0.213, |
| "mean_token_accuracy": 0.9355834662914276, |
| "num_tokens": 2348691.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5210918114143921, |
| "grad_norm": 0.3450946807861328, |
| "learning_rate": 0.00026874802616738095, |
| "loss": 0.2733, |
| "mean_token_accuracy": 0.9193537712097168, |
| "num_tokens": 2358900.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.5233476201218137, |
| "grad_norm": 0.5672959685325623, |
| "learning_rate": 0.00026861267764493566, |
| "loss": 0.2934, |
| "mean_token_accuracy": 0.9169221520423889, |
| "num_tokens": 2369048.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.5256034288292353, |
| "grad_norm": 0.349274218082428, |
| "learning_rate": 0.00026847732912249037, |
| "loss": 0.2575, |
| "mean_token_accuracy": 0.9255650520324707, |
| "num_tokens": 2379209.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.5278592375366569, |
| "grad_norm": 0.7018752694129944, |
| "learning_rate": 0.0002683419806000451, |
| "loss": 0.2404, |
| "mean_token_accuracy": 0.9304228484630584, |
| "num_tokens": 2389369.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.5301150462440785, |
| "grad_norm": 0.6136831045150757, |
| "learning_rate": 0.0002682066320775998, |
| "loss": 0.2828, |
| "mean_token_accuracy": 0.9221972823143005, |
| "num_tokens": 2399599.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5323708549515002, |
| "grad_norm": 0.4122675955295563, |
| "learning_rate": 0.0002680712835551545, |
| "loss": 0.2509, |
| "mean_token_accuracy": 0.9257995843887329, |
| "num_tokens": 2409765.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.5346266636589218, |
| "grad_norm": 0.6374993324279785, |
| "learning_rate": 0.0002679359350327092, |
| "loss": 0.233, |
| "mean_token_accuracy": 0.9314781188964844, |
| "num_tokens": 2419954.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.5368824723663433, |
| "grad_norm": 0.5488554239273071, |
| "learning_rate": 0.0002678005865102639, |
| "loss": 0.238, |
| "mean_token_accuracy": 0.930390727519989, |
| "num_tokens": 2430192.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.5391382810737649, |
| "grad_norm": 0.40995532274246216, |
| "learning_rate": 0.0002676652379878186, |
| "loss": 0.2214, |
| "mean_token_accuracy": 0.9314346790313721, |
| "num_tokens": 2440278.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.5413940897811865, |
| "grad_norm": 0.3758867084980011, |
| "learning_rate": 0.0002675298894653733, |
| "loss": 0.3038, |
| "mean_token_accuracy": 0.918562513589859, |
| "num_tokens": 2450475.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5436498984886081, |
| "grad_norm": 0.3616684675216675, |
| "learning_rate": 0.000267394540942928, |
| "loss": 0.2434, |
| "mean_token_accuracy": 0.9236611008644104, |
| "num_tokens": 2460701.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.5459057071960298, |
| "grad_norm": 0.3775900602340698, |
| "learning_rate": 0.0002672591924204827, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9434400200843811, |
| "num_tokens": 2470851.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.5481615159034514, |
| "grad_norm": 0.40669533610343933, |
| "learning_rate": 0.0002671238438980374, |
| "loss": 0.2112, |
| "mean_token_accuracy": 0.9340478003025054, |
| "num_tokens": 2481053.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.550417324610873, |
| "grad_norm": 0.3886403739452362, |
| "learning_rate": 0.0002669884953755921, |
| "loss": 0.2396, |
| "mean_token_accuracy": 0.9314120352268219, |
| "num_tokens": 2491228.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.5526731333182946, |
| "grad_norm": 0.36737385392189026, |
| "learning_rate": 0.00026685314685314683, |
| "loss": 0.2247, |
| "mean_token_accuracy": 0.9330944359302521, |
| "num_tokens": 2501462.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5549289420257162, |
| "grad_norm": 0.3836795389652252, |
| "learning_rate": 0.00026671779833070154, |
| "loss": 0.2207, |
| "mean_token_accuracy": 0.9301238179206848, |
| "num_tokens": 2511691.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.5571847507331378, |
| "grad_norm": 0.41037148237228394, |
| "learning_rate": 0.00026658244980825625, |
| "loss": 0.2079, |
| "mean_token_accuracy": 0.9370009183883667, |
| "num_tokens": 2521883.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.5594405594405595, |
| "grad_norm": 0.5383064150810242, |
| "learning_rate": 0.0002664471012858109, |
| "loss": 0.2509, |
| "mean_token_accuracy": 0.9259045839309692, |
| "num_tokens": 2532053.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.5616963681479811, |
| "grad_norm": 0.4217754900455475, |
| "learning_rate": 0.0002663117527633656, |
| "loss": 0.2151, |
| "mean_token_accuracy": 0.9337125539779663, |
| "num_tokens": 2542249.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.5639521768554027, |
| "grad_norm": 0.3590957820415497, |
| "learning_rate": 0.0002661764042409203, |
| "loss": 0.2406, |
| "mean_token_accuracy": 0.9339839398860932, |
| "num_tokens": 2552448.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.5662079855628243, |
| "grad_norm": 0.643915593624115, |
| "learning_rate": 0.00026604105571847504, |
| "loss": 0.2362, |
| "mean_token_accuracy": 0.9307669878005982, |
| "num_tokens": 2562490.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.5684637942702458, |
| "grad_norm": 0.3068816661834717, |
| "learning_rate": 0.00026590570719602975, |
| "loss": 0.1945, |
| "mean_token_accuracy": 0.9392113566398621, |
| "num_tokens": 2572707.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.5707196029776674, |
| "grad_norm": 0.42321914434432983, |
| "learning_rate": 0.00026577035867358446, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.9326376616954803, |
| "num_tokens": 2582925.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.5729754116850891, |
| "grad_norm": 0.6707413196563721, |
| "learning_rate": 0.00026563501015113917, |
| "loss": 0.2324, |
| "mean_token_accuracy": 0.9275128602981567, |
| "num_tokens": 2593099.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.5752312203925107, |
| "grad_norm": 0.4198610186576843, |
| "learning_rate": 0.0002654996616286939, |
| "loss": 0.2607, |
| "mean_token_accuracy": 0.9212849795818329, |
| "num_tokens": 2603310.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.5774870290999323, |
| "grad_norm": 0.9983633160591125, |
| "learning_rate": 0.0002653643131062486, |
| "loss": 0.2259, |
| "mean_token_accuracy": 0.9339877903461457, |
| "num_tokens": 2613439.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.5797428378073539, |
| "grad_norm": 0.6598926782608032, |
| "learning_rate": 0.00026522896458380324, |
| "loss": 0.2703, |
| "mean_token_accuracy": 0.9226635038852692, |
| "num_tokens": 2623574.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.5819986465147755, |
| "grad_norm": 0.307011216878891, |
| "learning_rate": 0.00026509361606135795, |
| "loss": 0.2233, |
| "mean_token_accuracy": 0.9347078561782837, |
| "num_tokens": 2633755.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.5842544552221972, |
| "grad_norm": 0.49420446157455444, |
| "learning_rate": 0.00026495826753891266, |
| "loss": 0.2208, |
| "mean_token_accuracy": 0.9309853732585907, |
| "num_tokens": 2643927.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.5865102639296188, |
| "grad_norm": 0.46068477630615234, |
| "learning_rate": 0.00026482291901646737, |
| "loss": 0.257, |
| "mean_token_accuracy": 0.9284857034683227, |
| "num_tokens": 2654118.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.5887660726370404, |
| "grad_norm": 0.5037050247192383, |
| "learning_rate": 0.0002646875704940221, |
| "loss": 0.2189, |
| "mean_token_accuracy": 0.9315983355045319, |
| "num_tokens": 2664170.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.591021881344462, |
| "grad_norm": 0.47586536407470703, |
| "learning_rate": 0.0002645522219715768, |
| "loss": 0.2318, |
| "mean_token_accuracy": 0.9364044010639191, |
| "num_tokens": 2674349.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.5932776900518836, |
| "grad_norm": 0.9741955399513245, |
| "learning_rate": 0.0002644168734491315, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9385083496570588, |
| "num_tokens": 2684419.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.5955334987593052, |
| "grad_norm": 0.5099119544029236, |
| "learning_rate": 0.0002642815249266862, |
| "loss": 0.2197, |
| "mean_token_accuracy": 0.9376397907733918, |
| "num_tokens": 2694553.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.5977893074667269, |
| "grad_norm": 0.30093830823898315, |
| "learning_rate": 0.00026414617640424087, |
| "loss": 0.2122, |
| "mean_token_accuracy": 0.9366639375686645, |
| "num_tokens": 2704780.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6000451161741485, |
| "grad_norm": 0.6657422780990601, |
| "learning_rate": 0.0002640108278817956, |
| "loss": 0.2269, |
| "mean_token_accuracy": 0.9306385815143585, |
| "num_tokens": 2714979.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.60230092488157, |
| "grad_norm": 0.37251630425453186, |
| "learning_rate": 0.0002638754793593503, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.9465994536876678, |
| "num_tokens": 2724781.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.6045567335889916, |
| "grad_norm": 0.5274510979652405, |
| "learning_rate": 0.000263740130836905, |
| "loss": 0.2289, |
| "mean_token_accuracy": 0.9338556706905365, |
| "num_tokens": 2734993.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.6068125422964132, |
| "grad_norm": 0.5593155026435852, |
| "learning_rate": 0.0002636047823144597, |
| "loss": 0.2237, |
| "mean_token_accuracy": 0.9336115419864655, |
| "num_tokens": 2744872.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.6090683510038348, |
| "grad_norm": 0.7985104918479919, |
| "learning_rate": 0.0002634694337920144, |
| "loss": 0.2949, |
| "mean_token_accuracy": 0.9198081076145173, |
| "num_tokens": 2755075.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6113241597112565, |
| "grad_norm": 0.41699644923210144, |
| "learning_rate": 0.0002633340852695691, |
| "loss": 0.1981, |
| "mean_token_accuracy": 0.9422951698303222, |
| "num_tokens": 2765210.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.6135799684186781, |
| "grad_norm": 0.5942256450653076, |
| "learning_rate": 0.00026319873674712384, |
| "loss": 0.251, |
| "mean_token_accuracy": 0.9282591760158538, |
| "num_tokens": 2775369.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6158357771260997, |
| "grad_norm": 0.5558121800422668, |
| "learning_rate": 0.00026306338822467855, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.9309241354465485, |
| "num_tokens": 2785569.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.6180915858335213, |
| "grad_norm": 0.3139027953147888, |
| "learning_rate": 0.0002629280397022332, |
| "loss": 0.2244, |
| "mean_token_accuracy": 0.9360554933547973, |
| "num_tokens": 2795773.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.6203473945409429, |
| "grad_norm": 0.44965052604675293, |
| "learning_rate": 0.0002627926911797879, |
| "loss": 0.2224, |
| "mean_token_accuracy": 0.9316644787788391, |
| "num_tokens": 2806012.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.6226032032483645, |
| "grad_norm": 0.5226691365242004, |
| "learning_rate": 0.0002626573426573426, |
| "loss": 0.2398, |
| "mean_token_accuracy": 0.9320449590682983, |
| "num_tokens": 2816212.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.6248590119557862, |
| "grad_norm": 0.5641390681266785, |
| "learning_rate": 0.00026252199413489733, |
| "loss": 0.2383, |
| "mean_token_accuracy": 0.9291798174381256, |
| "num_tokens": 2826418.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.6271148206632078, |
| "grad_norm": 0.6037927865982056, |
| "learning_rate": 0.00026238664561245204, |
| "loss": 0.2289, |
| "mean_token_accuracy": 0.9318684577941895, |
| "num_tokens": 2836641.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.6293706293706294, |
| "grad_norm": 0.47394031286239624, |
| "learning_rate": 0.00026225129709000675, |
| "loss": 0.2258, |
| "mean_token_accuracy": 0.9298848390579224, |
| "num_tokens": 2846865.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.631626438078051, |
| "grad_norm": 0.6290483474731445, |
| "learning_rate": 0.00026211594856756146, |
| "loss": 0.2514, |
| "mean_token_accuracy": 0.9306814074516296, |
| "num_tokens": 2857102.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6338822467854726, |
| "grad_norm": 0.6657432317733765, |
| "learning_rate": 0.00026198060004511617, |
| "loss": 0.2262, |
| "mean_token_accuracy": 0.9337584257125855, |
| "num_tokens": 2867299.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.6361380554928941, |
| "grad_norm": 0.53950434923172, |
| "learning_rate": 0.0002618452515226708, |
| "loss": 0.2341, |
| "mean_token_accuracy": 0.9332832813262939, |
| "num_tokens": 2877485.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.6383938642003159, |
| "grad_norm": 0.5629047155380249, |
| "learning_rate": 0.00026170990300022554, |
| "loss": 0.1989, |
| "mean_token_accuracy": 0.9396433293819427, |
| "num_tokens": 2887688.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.6406496729077374, |
| "grad_norm": 0.4131242334842682, |
| "learning_rate": 0.00026157455447778025, |
| "loss": 0.2266, |
| "mean_token_accuracy": 0.9329676389694214, |
| "num_tokens": 2897887.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.642905481615159, |
| "grad_norm": 0.5747202634811401, |
| "learning_rate": 0.00026143920595533496, |
| "loss": 0.2601, |
| "mean_token_accuracy": 0.9348642468452454, |
| "num_tokens": 2908059.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.6643345952033997, |
| "learning_rate": 0.00026130385743288967, |
| "loss": 0.2004, |
| "mean_token_accuracy": 0.9394226372241974, |
| "num_tokens": 2918269.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.6474170990300022, |
| "grad_norm": 0.5732499361038208, |
| "learning_rate": 0.0002611685089104444, |
| "loss": 0.2398, |
| "mean_token_accuracy": 0.9273908972740174, |
| "num_tokens": 2928456.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.6496729077374239, |
| "grad_norm": 0.4198378622531891, |
| "learning_rate": 0.0002610331603879991, |
| "loss": 0.239, |
| "mean_token_accuracy": 0.9306033670902252, |
| "num_tokens": 2938625.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.6519287164448455, |
| "grad_norm": 0.8750539422035217, |
| "learning_rate": 0.0002608978118655538, |
| "loss": 0.2565, |
| "mean_token_accuracy": 0.9291310131549835, |
| "num_tokens": 2948751.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.6541845251522671, |
| "grad_norm": 0.5238725543022156, |
| "learning_rate": 0.0002607624633431085, |
| "loss": 0.2972, |
| "mean_token_accuracy": 0.9199288547039032, |
| "num_tokens": 2958976.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6564403338596887, |
| "grad_norm": 0.6253301501274109, |
| "learning_rate": 0.00026062711482066316, |
| "loss": 0.2486, |
| "mean_token_accuracy": 0.9288740932941437, |
| "num_tokens": 2969180.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.6586961425671103, |
| "grad_norm": 0.5708739161491394, |
| "learning_rate": 0.00026049176629821787, |
| "loss": 0.2126, |
| "mean_token_accuracy": 0.9343720078468323, |
| "num_tokens": 2979355.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.6609519512745319, |
| "grad_norm": 0.5066877603530884, |
| "learning_rate": 0.0002603564177757726, |
| "loss": 0.2085, |
| "mean_token_accuracy": 0.9371103644371033, |
| "num_tokens": 2989482.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.6632077599819536, |
| "grad_norm": 0.71226966381073, |
| "learning_rate": 0.0002602210692533273, |
| "loss": 0.2329, |
| "mean_token_accuracy": 0.9393065094947814, |
| "num_tokens": 2999682.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.6654635686893752, |
| "grad_norm": 0.6169939637184143, |
| "learning_rate": 0.000260085720730882, |
| "loss": 0.2863, |
| "mean_token_accuracy": 0.9209560215473175, |
| "num_tokens": 3009855.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.6677193773967968, |
| "grad_norm": 0.41732147336006165, |
| "learning_rate": 0.0002599503722084367, |
| "loss": 0.2062, |
| "mean_token_accuracy": 0.9381140649318696, |
| "num_tokens": 3020060.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.6699751861042184, |
| "grad_norm": 0.5298788547515869, |
| "learning_rate": 0.0002598150236859914, |
| "loss": 0.2941, |
| "mean_token_accuracy": 0.9189365446567536, |
| "num_tokens": 3030266.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.67223099481164, |
| "grad_norm": 0.500662088394165, |
| "learning_rate": 0.00025967967516354613, |
| "loss": 0.2187, |
| "mean_token_accuracy": 0.9356793701648712, |
| "num_tokens": 3040506.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.6744868035190615, |
| "grad_norm": 0.9615169763565063, |
| "learning_rate": 0.0002595443266411008, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.9331699669361114, |
| "num_tokens": 3050615.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.6767426122264832, |
| "grad_norm": 0.7886420488357544, |
| "learning_rate": 0.0002594089781186555, |
| "loss": 0.3148, |
| "mean_token_accuracy": 0.9215950310230255, |
| "num_tokens": 3060815.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.6789984209339048, |
| "grad_norm": 0.5439404845237732, |
| "learning_rate": 0.0002592736295962102, |
| "loss": 0.2323, |
| "mean_token_accuracy": 0.9314812004566193, |
| "num_tokens": 3071015.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.6812542296413264, |
| "grad_norm": 0.4758981764316559, |
| "learning_rate": 0.0002591382810737649, |
| "loss": 0.2825, |
| "mean_token_accuracy": 0.9206907093524933, |
| "num_tokens": 3081209.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.683510038348748, |
| "grad_norm": 0.48459869623184204, |
| "learning_rate": 0.0002590029325513196, |
| "loss": 0.2391, |
| "mean_token_accuracy": 0.9332017719745636, |
| "num_tokens": 3091395.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.6857658470561696, |
| "grad_norm": 0.5378035306930542, |
| "learning_rate": 0.00025886758402887433, |
| "loss": 0.2443, |
| "mean_token_accuracy": 0.9284430742263794, |
| "num_tokens": 3101565.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.6880216557635912, |
| "grad_norm": 0.45487913489341736, |
| "learning_rate": 0.00025873223550642904, |
| "loss": 0.2181, |
| "mean_token_accuracy": 0.9427953720092773, |
| "num_tokens": 3111730.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.6902774644710129, |
| "grad_norm": 0.9604068398475647, |
| "learning_rate": 0.00025859688698398375, |
| "loss": 0.2565, |
| "mean_token_accuracy": 0.9298881590366364, |
| "num_tokens": 3121965.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.6925332731784345, |
| "grad_norm": 0.5915318131446838, |
| "learning_rate": 0.00025846153846153846, |
| "loss": 0.2816, |
| "mean_token_accuracy": 0.9280205249786377, |
| "num_tokens": 3132199.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.6947890818858561, |
| "grad_norm": 0.47584712505340576, |
| "learning_rate": 0.0002583261899390931, |
| "loss": 0.1796, |
| "mean_token_accuracy": 0.945578533411026, |
| "num_tokens": 3142379.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.6970448905932777, |
| "grad_norm": 0.4383523464202881, |
| "learning_rate": 0.00025819084141664783, |
| "loss": 0.2353, |
| "mean_token_accuracy": 0.9329768180847168, |
| "num_tokens": 3152556.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.6993006993006993, |
| "grad_norm": 0.9148581624031067, |
| "learning_rate": 0.00025805549289420254, |
| "loss": 0.2627, |
| "mean_token_accuracy": 0.9258742690086365, |
| "num_tokens": 3162716.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7015565080081209, |
| "grad_norm": 0.667614758014679, |
| "learning_rate": 0.00025792014437175725, |
| "loss": 0.2331, |
| "mean_token_accuracy": 0.9343131303787231, |
| "num_tokens": 3172920.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.7038123167155426, |
| "grad_norm": 0.33134734630584717, |
| "learning_rate": 0.00025778479584931196, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.938484913110733, |
| "num_tokens": 3183152.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.7060681254229642, |
| "grad_norm": 0.3356286883354187, |
| "learning_rate": 0.00025764944732686667, |
| "loss": 0.2183, |
| "mean_token_accuracy": 0.932779735326767, |
| "num_tokens": 3193354.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.7083239341303857, |
| "grad_norm": 0.3519227206707001, |
| "learning_rate": 0.0002575140988044214, |
| "loss": 0.2357, |
| "mean_token_accuracy": 0.9315909683704376, |
| "num_tokens": 3203523.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.7105797428378073, |
| "grad_norm": 0.844898521900177, |
| "learning_rate": 0.0002573787502819761, |
| "loss": 0.2278, |
| "mean_token_accuracy": 0.9361746132373809, |
| "num_tokens": 3213735.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7128355515452289, |
| "grad_norm": 0.4979631304740906, |
| "learning_rate": 0.00025724340175953074, |
| "loss": 0.2013, |
| "mean_token_accuracy": 0.9381689071655274, |
| "num_tokens": 3223919.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.7150913602526505, |
| "grad_norm": 0.5635648369789124, |
| "learning_rate": 0.00025710805323708545, |
| "loss": 0.2066, |
| "mean_token_accuracy": 0.9401957809925079, |
| "num_tokens": 3234145.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.7173471689600722, |
| "grad_norm": 0.7126004695892334, |
| "learning_rate": 0.00025697270471464016, |
| "loss": 0.2267, |
| "mean_token_accuracy": 0.9334402441978454, |
| "num_tokens": 3244336.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.7196029776674938, |
| "grad_norm": 0.7653904557228088, |
| "learning_rate": 0.0002568373561921949, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9352998495101928, |
| "num_tokens": 3254473.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.7218587863749154, |
| "grad_norm": 0.44523507356643677, |
| "learning_rate": 0.0002567020076697496, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.9434902846813202, |
| "num_tokens": 3264693.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.724114595082337, |
| "grad_norm": 0.5072572827339172, |
| "learning_rate": 0.0002565666591473043, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9416171848773957, |
| "num_tokens": 3274913.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.7263704037897586, |
| "grad_norm": 0.42991572618484497, |
| "learning_rate": 0.000256431310624859, |
| "loss": 0.2878, |
| "mean_token_accuracy": 0.9258961975574493, |
| "num_tokens": 3285114.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.7286262124971803, |
| "grad_norm": 0.61916583776474, |
| "learning_rate": 0.0002562959621024137, |
| "loss": 0.2563, |
| "mean_token_accuracy": 0.9288025736808777, |
| "num_tokens": 3295220.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.7308820212046019, |
| "grad_norm": 0.725781261920929, |
| "learning_rate": 0.0002561606135799684, |
| "loss": 0.2307, |
| "mean_token_accuracy": 0.9322963118553161, |
| "num_tokens": 3305451.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.7331378299120235, |
| "grad_norm": 0.6131793260574341, |
| "learning_rate": 0.0002560252650575231, |
| "loss": 0.2268, |
| "mean_token_accuracy": 0.9349699139595031, |
| "num_tokens": 3315579.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7353936386194451, |
| "grad_norm": 0.6788907051086426, |
| "learning_rate": 0.0002558899165350778, |
| "loss": 0.2282, |
| "mean_token_accuracy": 0.935053151845932, |
| "num_tokens": 3325813.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.7376494473268667, |
| "grad_norm": 0.4873131811618805, |
| "learning_rate": 0.0002557545680126325, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.9406931400299072, |
| "num_tokens": 3335985.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.7399052560342883, |
| "grad_norm": 0.5387445688247681, |
| "learning_rate": 0.0002556192194901872, |
| "loss": 0.2805, |
| "mean_token_accuracy": 0.9226358532905579, |
| "num_tokens": 3346199.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.74216106474171, |
| "grad_norm": 0.6286031603813171, |
| "learning_rate": 0.0002554838709677419, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.940284013748169, |
| "num_tokens": 3356424.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.7444168734491315, |
| "grad_norm": 0.8641782402992249, |
| "learning_rate": 0.00025534852244529663, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9438497364521027, |
| "num_tokens": 3366633.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7466726821565531, |
| "grad_norm": 0.6660944223403931, |
| "learning_rate": 0.00025521317392285134, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9389594733715058, |
| "num_tokens": 3376532.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.7489284908639747, |
| "grad_norm": 0.6905304193496704, |
| "learning_rate": 0.00025507782540040605, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9422814130783081, |
| "num_tokens": 3386719.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.7511842995713963, |
| "grad_norm": 1.0209304094314575, |
| "learning_rate": 0.0002549424768779607, |
| "loss": 0.2332, |
| "mean_token_accuracy": 0.9365878164768219, |
| "num_tokens": 3396927.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.7534401082788179, |
| "grad_norm": 0.3851501941680908, |
| "learning_rate": 0.0002548071283555154, |
| "loss": 0.2496, |
| "mean_token_accuracy": 0.9241067111492157, |
| "num_tokens": 3407161.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.7556959169862396, |
| "grad_norm": 0.5775346755981445, |
| "learning_rate": 0.0002546717798330701, |
| "loss": 0.2284, |
| "mean_token_accuracy": 0.9372583270072937, |
| "num_tokens": 3417372.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.7579517256936612, |
| "grad_norm": 0.513624370098114, |
| "learning_rate": 0.00025453643131062483, |
| "loss": 0.2308, |
| "mean_token_accuracy": 0.9305113971233367, |
| "num_tokens": 3427598.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.7602075344010828, |
| "grad_norm": 0.3917955160140991, |
| "learning_rate": 0.00025440108278817954, |
| "loss": 0.3156, |
| "mean_token_accuracy": 0.9161118984222412, |
| "num_tokens": 3437826.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.7624633431085044, |
| "grad_norm": 0.9259962439537048, |
| "learning_rate": 0.00025426573426573425, |
| "loss": 0.2738, |
| "mean_token_accuracy": 0.926971298456192, |
| "num_tokens": 3447977.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.764719151815926, |
| "grad_norm": 0.5740894079208374, |
| "learning_rate": 0.00025413038574328896, |
| "loss": 0.2039, |
| "mean_token_accuracy": 0.940199863910675, |
| "num_tokens": 3458171.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.7669749605233476, |
| "grad_norm": 0.6751810908317566, |
| "learning_rate": 0.00025399503722084367, |
| "loss": 0.226, |
| "mean_token_accuracy": 0.9333556115627288, |
| "num_tokens": 3468359.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.663020133972168, |
| "learning_rate": 0.0002538596886983984, |
| "loss": 0.2014, |
| "mean_token_accuracy": 0.938248485326767, |
| "num_tokens": 3478585.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.7714865779381909, |
| "grad_norm": 0.6453900337219238, |
| "learning_rate": 0.00025372434017595304, |
| "loss": 0.2258, |
| "mean_token_accuracy": 0.9338716864585876, |
| "num_tokens": 3488706.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.7737423866456125, |
| "grad_norm": 0.3844014108181, |
| "learning_rate": 0.00025358899165350775, |
| "loss": 0.2112, |
| "mean_token_accuracy": 0.9385376691818237, |
| "num_tokens": 3498930.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.775998195353034, |
| "grad_norm": 0.5107812285423279, |
| "learning_rate": 0.00025345364313106246, |
| "loss": 0.2576, |
| "mean_token_accuracy": 0.929598605632782, |
| "num_tokens": 3509052.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.7782540040604556, |
| "grad_norm": 0.4670540392398834, |
| "learning_rate": 0.00025331829460861717, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9437798678874969, |
| "num_tokens": 3519236.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.7805098127678772, |
| "grad_norm": 0.5630539655685425, |
| "learning_rate": 0.0002531829460861719, |
| "loss": 0.2292, |
| "mean_token_accuracy": 0.9310850441455841, |
| "num_tokens": 3529476.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.7827656214752989, |
| "grad_norm": 2.5165903568267822, |
| "learning_rate": 0.0002530475975637266, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9452771425247193, |
| "num_tokens": 3539702.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.7850214301827205, |
| "grad_norm": 0.32777339220046997, |
| "learning_rate": 0.0002529122490412813, |
| "loss": 0.2173, |
| "mean_token_accuracy": 0.9382402420043945, |
| "num_tokens": 3549863.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.7872772388901421, |
| "grad_norm": 0.9197781682014465, |
| "learning_rate": 0.000252776900518836, |
| "loss": 0.2861, |
| "mean_token_accuracy": 0.9292895257472992, |
| "num_tokens": 3560004.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.7895330475975637, |
| "grad_norm": 0.8597625494003296, |
| "learning_rate": 0.00025264155199639066, |
| "loss": 0.1964, |
| "mean_token_accuracy": 0.9433383345603943, |
| "num_tokens": 3570177.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.7917888563049853, |
| "grad_norm": 0.5637295842170715, |
| "learning_rate": 0.00025250620347394537, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.94198077917099, |
| "num_tokens": 3580404.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.794044665012407, |
| "grad_norm": 0.4464856684207916, |
| "learning_rate": 0.0002523708549515001, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9511018455028534, |
| "num_tokens": 3590640.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.7963004737198286, |
| "grad_norm": 0.4399365484714508, |
| "learning_rate": 0.0002522355064290548, |
| "loss": 0.2588, |
| "mean_token_accuracy": 0.9256435573101044, |
| "num_tokens": 3600798.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.7985562824272502, |
| "grad_norm": 0.3305966854095459, |
| "learning_rate": 0.0002521001579066095, |
| "loss": 0.233, |
| "mean_token_accuracy": 0.9323022544384003, |
| "num_tokens": 3610941.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.8008120911346718, |
| "grad_norm": 0.44945451617240906, |
| "learning_rate": 0.0002519648093841642, |
| "loss": 0.1944, |
| "mean_token_accuracy": 0.9411108553409576, |
| "num_tokens": 3621174.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.8030678998420934, |
| "grad_norm": 0.40088480710983276, |
| "learning_rate": 0.0002518294608617189, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9453522205352783, |
| "num_tokens": 3631338.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.805323708549515, |
| "grad_norm": 0.6701235771179199, |
| "learning_rate": 0.00025169411233927363, |
| "loss": 0.2435, |
| "mean_token_accuracy": 0.9314075529575347, |
| "num_tokens": 3640587.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.8075795172569367, |
| "grad_norm": 0.4659329354763031, |
| "learning_rate": 0.00025155876381682834, |
| "loss": 0.2118, |
| "mean_token_accuracy": 0.9398883581161499, |
| "num_tokens": 3650742.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.8098353259643583, |
| "grad_norm": 0.5958136320114136, |
| "learning_rate": 0.000251423415294383, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.9447057127952576, |
| "num_tokens": 3660870.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.8120911346717798, |
| "grad_norm": 0.45198115706443787, |
| "learning_rate": 0.0002512880667719377, |
| "loss": 0.1964, |
| "mean_token_accuracy": 0.9426902115345002, |
| "num_tokens": 3671105.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8143469433792014, |
| "grad_norm": 0.5948301553726196, |
| "learning_rate": 0.0002511527182494924, |
| "loss": 0.1765, |
| "mean_token_accuracy": 0.9449096560478211, |
| "num_tokens": 3681293.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.816602752086623, |
| "grad_norm": 0.7184245586395264, |
| "learning_rate": 0.0002510173697270471, |
| "loss": 0.1735, |
| "mean_token_accuracy": 0.9478671312332153, |
| "num_tokens": 3691481.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.8188585607940446, |
| "grad_norm": 0.603680431842804, |
| "learning_rate": 0.00025088202120460184, |
| "loss": 0.2391, |
| "mean_token_accuracy": 0.9321896970272064, |
| "num_tokens": 3701670.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.8211143695014663, |
| "grad_norm": 0.5140425562858582, |
| "learning_rate": 0.00025074667268215655, |
| "loss": 0.2408, |
| "mean_token_accuracy": 0.933314174413681, |
| "num_tokens": 3711844.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.8233701782088879, |
| "grad_norm": 0.516891360282898, |
| "learning_rate": 0.00025061132415971126, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.9327298462390899, |
| "num_tokens": 3722013.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8256259869163095, |
| "grad_norm": 0.5612062215805054, |
| "learning_rate": 0.00025047597563726597, |
| "loss": 0.2317, |
| "mean_token_accuracy": 0.9376449286937714, |
| "num_tokens": 3731596.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.8278817956237311, |
| "grad_norm": 0.4045218825340271, |
| "learning_rate": 0.0002503406271148207, |
| "loss": 0.1935, |
| "mean_token_accuracy": 0.9420763790607453, |
| "num_tokens": 3741743.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.8301376043311527, |
| "grad_norm": 0.45677274465560913, |
| "learning_rate": 0.00025020527859237533, |
| "loss": 0.2115, |
| "mean_token_accuracy": 0.9396097540855408, |
| "num_tokens": 3751873.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8323934130385743, |
| "grad_norm": 0.3399205505847931, |
| "learning_rate": 0.00025006993006993004, |
| "loss": 0.1888, |
| "mean_token_accuracy": 0.9462385714054108, |
| "num_tokens": 3762092.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.834649221745996, |
| "grad_norm": 0.6759438514709473, |
| "learning_rate": 0.00024993458154748475, |
| "loss": 0.237, |
| "mean_token_accuracy": 0.9307721853256226, |
| "num_tokens": 3772329.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8369050304534176, |
| "grad_norm": 0.39573797583580017, |
| "learning_rate": 0.00024979923302503946, |
| "loss": 0.23, |
| "mean_token_accuracy": 0.9365652084350586, |
| "num_tokens": 3782523.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.8391608391608392, |
| "grad_norm": 0.6500651240348816, |
| "learning_rate": 0.00024966388450259417, |
| "loss": 0.2188, |
| "mean_token_accuracy": 0.9423346817493439, |
| "num_tokens": 3792662.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.8414166478682608, |
| "grad_norm": 0.5717751383781433, |
| "learning_rate": 0.0002495285359801489, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9352416634559632, |
| "num_tokens": 3802837.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.8436724565756824, |
| "grad_norm": 0.6442503929138184, |
| "learning_rate": 0.0002493931874577036, |
| "loss": 0.1753, |
| "mean_token_accuracy": 0.9477633893489837, |
| "num_tokens": 3813070.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.8459282652831039, |
| "grad_norm": 0.4221307337284088, |
| "learning_rate": 0.0002492578389352583, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.9431292831897735, |
| "num_tokens": 3823243.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.8481840739905256, |
| "grad_norm": 0.5930933952331543, |
| "learning_rate": 0.00024912249041281296, |
| "loss": 0.2455, |
| "mean_token_accuracy": 0.9271996915340424, |
| "num_tokens": 3833364.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.8504398826979472, |
| "grad_norm": 0.42684584856033325, |
| "learning_rate": 0.00024898714189036767, |
| "loss": 0.197, |
| "mean_token_accuracy": 0.941864401102066, |
| "num_tokens": 3843576.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.8526956914053688, |
| "grad_norm": 0.4301314949989319, |
| "learning_rate": 0.0002488517933679224, |
| "loss": 0.2091, |
| "mean_token_accuracy": 0.9357395350933075, |
| "num_tokens": 3853722.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.8549515001127904, |
| "grad_norm": 0.7294553518295288, |
| "learning_rate": 0.0002487164448454771, |
| "loss": 0.2457, |
| "mean_token_accuracy": 0.930441266298294, |
| "num_tokens": 3863953.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.857207308820212, |
| "grad_norm": 0.7167822122573853, |
| "learning_rate": 0.0002485810963230318, |
| "loss": 0.1813, |
| "mean_token_accuracy": 0.9461415946483612, |
| "num_tokens": 3874173.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.8594631175276336, |
| "grad_norm": 0.45225095748901367, |
| "learning_rate": 0.0002484457478005865, |
| "loss": 0.2711, |
| "mean_token_accuracy": 0.9266981244087219, |
| "num_tokens": 3884297.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.8617189262350553, |
| "grad_norm": 0.5794314742088318, |
| "learning_rate": 0.0002483103992781412, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9450049281120301, |
| "num_tokens": 3894478.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.8639747349424769, |
| "grad_norm": 0.6747323870658875, |
| "learning_rate": 0.0002481750507556959, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.9325950682163239, |
| "num_tokens": 3904677.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.8662305436498985, |
| "grad_norm": 0.664412796497345, |
| "learning_rate": 0.00024803970223325064, |
| "loss": 0.1948, |
| "mean_token_accuracy": 0.943292647600174, |
| "num_tokens": 3914898.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.8684863523573201, |
| "grad_norm": 1.082534909248352, |
| "learning_rate": 0.0002479043537108053, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.9366356670856476, |
| "num_tokens": 3925076.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.8707421610647417, |
| "grad_norm": 0.9012080430984497, |
| "learning_rate": 0.00024776900518836, |
| "loss": 0.2589, |
| "mean_token_accuracy": 0.9262183904647827, |
| "num_tokens": 3935233.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.8729979697721634, |
| "grad_norm": 0.6924684047698975, |
| "learning_rate": 0.0002476336566659147, |
| "loss": 0.2281, |
| "mean_token_accuracy": 0.9359488189220428, |
| "num_tokens": 3945432.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.875253778479585, |
| "grad_norm": 0.4230230152606964, |
| "learning_rate": 0.0002474983081434694, |
| "loss": 0.2253, |
| "mean_token_accuracy": 0.9308477878570557, |
| "num_tokens": 3955665.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.8775095871870066, |
| "grad_norm": 0.5450705885887146, |
| "learning_rate": 0.00024736295962102413, |
| "loss": 0.2488, |
| "mean_token_accuracy": 0.9260222852230072, |
| "num_tokens": 3965842.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.8797653958944281, |
| "grad_norm": 0.7850773930549622, |
| "learning_rate": 0.00024722761109857884, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.9407764256000519, |
| "num_tokens": 3976064.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.8820212046018497, |
| "grad_norm": 0.6162165999412537, |
| "learning_rate": 0.00024709226257613355, |
| "loss": 0.2032, |
| "mean_token_accuracy": 0.939377635717392, |
| "num_tokens": 3986249.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.8842770133092713, |
| "grad_norm": 0.3530557453632355, |
| "learning_rate": 0.00024695691405368826, |
| "loss": 0.2005, |
| "mean_token_accuracy": 0.9405902981758117, |
| "num_tokens": 3996443.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.886532822016693, |
| "grad_norm": 0.47658300399780273, |
| "learning_rate": 0.0002468215655312429, |
| "loss": 0.2388, |
| "mean_token_accuracy": 0.934827721118927, |
| "num_tokens": 4006670.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.8887886307241146, |
| "grad_norm": 0.45294544100761414, |
| "learning_rate": 0.0002466862170087976, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9483415603637695, |
| "num_tokens": 4016900.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.8910444394315362, |
| "grad_norm": 0.650928258895874, |
| "learning_rate": 0.00024655086848635234, |
| "loss": 0.2177, |
| "mean_token_accuracy": 0.9376309931278228, |
| "num_tokens": 4027125.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.8933002481389578, |
| "grad_norm": 0.3383767604827881, |
| "learning_rate": 0.00024641551996390705, |
| "loss": 0.1752, |
| "mean_token_accuracy": 0.94934002161026, |
| "num_tokens": 4037360.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.8955560568463794, |
| "grad_norm": 1.4327523708343506, |
| "learning_rate": 0.0002462801714414617, |
| "loss": 0.1836, |
| "mean_token_accuracy": 0.9417614221572876, |
| "num_tokens": 4047540.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.897811865553801, |
| "grad_norm": 0.5295197367668152, |
| "learning_rate": 0.0002461448229190164, |
| "loss": 0.2672, |
| "mean_token_accuracy": 0.9268444120883942, |
| "num_tokens": 4057757.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.9000676742612227, |
| "grad_norm": 0.7444450855255127, |
| "learning_rate": 0.0002460094743965712, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9425229966640473, |
| "num_tokens": 4067940.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.9023234829686443, |
| "grad_norm": 0.45960938930511475, |
| "learning_rate": 0.0002458741258741259, |
| "loss": 0.1876, |
| "mean_token_accuracy": 0.9434334516525269, |
| "num_tokens": 4078168.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9045792916760659, |
| "grad_norm": 0.42837294936180115, |
| "learning_rate": 0.0002457387773516806, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9469284176826477, |
| "num_tokens": 4088367.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.9068351003834875, |
| "grad_norm": 0.6162718534469604, |
| "learning_rate": 0.00024560342882923525, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.9453383386135101, |
| "num_tokens": 4098559.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.6896116137504578, |
| "learning_rate": 0.00024546808030678996, |
| "loss": 0.2, |
| "mean_token_accuracy": 0.9440081357955933, |
| "num_tokens": 4108750.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.9113467177983307, |
| "grad_norm": 0.4971648156642914, |
| "learning_rate": 0.00024533273178434467, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.943715387582779, |
| "num_tokens": 4118931.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.9136025265057524, |
| "grad_norm": 0.4439462125301361, |
| "learning_rate": 0.0002451973832618994, |
| "loss": 0.1893, |
| "mean_token_accuracy": 0.9430592775344848, |
| "num_tokens": 4128996.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.915858335213174, |
| "grad_norm": 0.5335512161254883, |
| "learning_rate": 0.00024506203473945404, |
| "loss": 0.2348, |
| "mean_token_accuracy": 0.938604736328125, |
| "num_tokens": 4139224.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.9181141439205955, |
| "grad_norm": 0.40199655294418335, |
| "learning_rate": 0.00024492668621700875, |
| "loss": 0.2864, |
| "mean_token_accuracy": 0.928019517660141, |
| "num_tokens": 4149335.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.9203699526280171, |
| "grad_norm": 0.47056302428245544, |
| "learning_rate": 0.0002447913376945635, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.9424499988555908, |
| "num_tokens": 4159563.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.9226257613354387, |
| "grad_norm": 0.6596531271934509, |
| "learning_rate": 0.0002446559891721182, |
| "loss": 0.2427, |
| "mean_token_accuracy": 0.933097630739212, |
| "num_tokens": 4169779.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.9248815700428603, |
| "grad_norm": 0.8583846092224121, |
| "learning_rate": 0.0002445206406496729, |
| "loss": 0.2194, |
| "mean_token_accuracy": 0.9355522513389587, |
| "num_tokens": 4179948.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.927137378750282, |
| "grad_norm": 0.9784001708030701, |
| "learning_rate": 0.0002443852921272276, |
| "loss": 0.2335, |
| "mean_token_accuracy": 0.9337855577468872, |
| "num_tokens": 4190170.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.9293931874577036, |
| "grad_norm": 0.6085280179977417, |
| "learning_rate": 0.0002442499436047823, |
| "loss": 0.2133, |
| "mean_token_accuracy": 0.9390449047088623, |
| "num_tokens": 4200347.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.9316489961651252, |
| "grad_norm": 0.8368825912475586, |
| "learning_rate": 0.000244114595082337, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9399807095527649, |
| "num_tokens": 4210514.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.9339048048725468, |
| "grad_norm": 0.8934887647628784, |
| "learning_rate": 0.0002439792465598917, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.9369701504707336, |
| "num_tokens": 4220722.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.9361606135799684, |
| "grad_norm": 0.8492236137390137, |
| "learning_rate": 0.0002438438980374464, |
| "loss": 0.1979, |
| "mean_token_accuracy": 0.9407924175262451, |
| "num_tokens": 4230927.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9384164222873901, |
| "grad_norm": 0.6423715949058533, |
| "learning_rate": 0.0002437085495150011, |
| "loss": 0.151, |
| "mean_token_accuracy": 0.9534207642078399, |
| "num_tokens": 4241153.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.9406722309948117, |
| "grad_norm": 0.7069862484931946, |
| "learning_rate": 0.00024357320099255582, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.947028785943985, |
| "num_tokens": 4251293.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.9429280397022333, |
| "grad_norm": 0.5257987380027771, |
| "learning_rate": 0.00024343785247011053, |
| "loss": 0.2198, |
| "mean_token_accuracy": 0.9393032670021058, |
| "num_tokens": 4261424.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.9451838484096549, |
| "grad_norm": 0.5543506741523743, |
| "learning_rate": 0.0002433025039476652, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9457716882228852, |
| "num_tokens": 4271649.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.9474396571170765, |
| "grad_norm": 0.5377987623214722, |
| "learning_rate": 0.00024316715542521992, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9503224551677704, |
| "num_tokens": 4281830.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.949695465824498, |
| "grad_norm": 0.5515894889831543, |
| "learning_rate": 0.00024303180690277463, |
| "loss": 0.2387, |
| "mean_token_accuracy": 0.9334891140460968, |
| "num_tokens": 4292015.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.9519512745319197, |
| "grad_norm": 0.44863930344581604, |
| "learning_rate": 0.00024289645838032934, |
| "loss": 0.2127, |
| "mean_token_accuracy": 0.9371248781681061, |
| "num_tokens": 4302252.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.9542070832393413, |
| "grad_norm": 0.6548046469688416, |
| "learning_rate": 0.00024276110985788402, |
| "loss": 0.1915, |
| "mean_token_accuracy": 0.9437361776828765, |
| "num_tokens": 4312481.0, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.9564628919467629, |
| "grad_norm": 0.7284629940986633, |
| "learning_rate": 0.00024262576133543873, |
| "loss": 0.2319, |
| "mean_token_accuracy": 0.9398886144161225, |
| "num_tokens": 4322667.0, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.9587187006541845, |
| "grad_norm": 0.5784673690795898, |
| "learning_rate": 0.00024249041281299344, |
| "loss": 0.2213, |
| "mean_token_accuracy": 0.9347177267074585, |
| "num_tokens": 4332825.0, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.9609745093616061, |
| "grad_norm": 0.7612512707710266, |
| "learning_rate": 0.00024235506429054815, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9467396676540375, |
| "num_tokens": 4342910.0, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.9632303180690277, |
| "grad_norm": 0.6591640114784241, |
| "learning_rate": 0.00024221971576810283, |
| "loss": 0.2696, |
| "mean_token_accuracy": 0.9323035717010498, |
| "num_tokens": 4352998.0, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.9654861267764494, |
| "grad_norm": 0.6853541135787964, |
| "learning_rate": 0.00024208436724565754, |
| "loss": 0.2211, |
| "mean_token_accuracy": 0.9332985162734986, |
| "num_tokens": 4363163.0, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 0.5725349187850952, |
| "learning_rate": 0.00024194901872321225, |
| "loss": 0.2287, |
| "mean_token_accuracy": 0.9395670652389526, |
| "num_tokens": 4373354.0, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.9699977441912926, |
| "grad_norm": 0.7916859984397888, |
| "learning_rate": 0.00024181367020076696, |
| "loss": 0.2157, |
| "mean_token_accuracy": 0.9374566435813904, |
| "num_tokens": 4383585.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.9722535528987142, |
| "grad_norm": 0.5497978925704956, |
| "learning_rate": 0.00024167832167832167, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9466311931610107, |
| "num_tokens": 4393705.0, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.9745093616061358, |
| "grad_norm": 0.5773023366928101, |
| "learning_rate": 0.00024154297315587636, |
| "loss": 0.2037, |
| "mean_token_accuracy": 0.9397094666957855, |
| "num_tokens": 4403932.0, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.9767651703135574, |
| "grad_norm": 0.7037113308906555, |
| "learning_rate": 0.00024140762463343107, |
| "loss": 0.2129, |
| "mean_token_accuracy": 0.935580176115036, |
| "num_tokens": 4414060.0, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.9790209790209791, |
| "grad_norm": 0.44150829315185547, |
| "learning_rate": 0.00024127227611098578, |
| "loss": 0.1892, |
| "mean_token_accuracy": 0.9447483420372009, |
| "num_tokens": 4424251.0, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.9812767877284007, |
| "grad_norm": 1.1108092069625854, |
| "learning_rate": 0.0002411369275885405, |
| "loss": 0.2264, |
| "mean_token_accuracy": 0.9399719953536987, |
| "num_tokens": 4434422.0, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.9835325964358222, |
| "grad_norm": 0.5706174969673157, |
| "learning_rate": 0.00024100157906609517, |
| "loss": 0.1756, |
| "mean_token_accuracy": 0.9461008369922638, |
| "num_tokens": 4444650.0, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.9857884051432438, |
| "grad_norm": 0.6826101541519165, |
| "learning_rate": 0.00024086623054364988, |
| "loss": 0.2267, |
| "mean_token_accuracy": 0.9352168619632721, |
| "num_tokens": 4454873.0, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.9880442138506654, |
| "grad_norm": 0.957870602607727, |
| "learning_rate": 0.0002407308820212046, |
| "loss": 0.2347, |
| "mean_token_accuracy": 0.9347853481769561, |
| "num_tokens": 4465075.0, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.990300022558087, |
| "grad_norm": 0.6360123157501221, |
| "learning_rate": 0.0002405955334987593, |
| "loss": 0.1876, |
| "mean_token_accuracy": 0.9438979625701904, |
| "num_tokens": 4475268.0, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.9925558312655087, |
| "grad_norm": 0.5211275815963745, |
| "learning_rate": 0.00024046018497631398, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.9438638925552368, |
| "num_tokens": 4485503.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.9948116399729303, |
| "grad_norm": 0.977278470993042, |
| "learning_rate": 0.0002403248364538687, |
| "loss": 0.2014, |
| "mean_token_accuracy": 0.9406830132007599, |
| "num_tokens": 4495710.0, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.9970674486803519, |
| "grad_norm": 0.45597749948501587, |
| "learning_rate": 0.0002401894879314234, |
| "loss": 0.2089, |
| "mean_token_accuracy": 0.9395563662052154, |
| "num_tokens": 4505875.0, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.9993232573877735, |
| "grad_norm": 0.5130080580711365, |
| "learning_rate": 0.0002400541394089781, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.9489345788955689, |
| "num_tokens": 4516011.0, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.001579066095195, |
| "grad_norm": 0.3131079077720642, |
| "learning_rate": 0.0002399187908865328, |
| "loss": 0.1764, |
| "mean_token_accuracy": 0.9488461136817932, |
| "num_tokens": 4526245.0, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.0038348748026167, |
| "grad_norm": 0.8407638072967529, |
| "learning_rate": 0.0002397834423640875, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.9605377078056335, |
| "num_tokens": 4536415.0, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.0060906835100383, |
| "grad_norm": 0.5652706027030945, |
| "learning_rate": 0.00023964809384164221, |
| "loss": 0.1479, |
| "mean_token_accuracy": 0.9554095208644867, |
| "num_tokens": 4546571.0, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.0083464922174599, |
| "grad_norm": 0.920248806476593, |
| "learning_rate": 0.00023951274531919692, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.9463054478168488, |
| "num_tokens": 4556755.0, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.0106023009248817, |
| "grad_norm": 1.09098482131958, |
| "learning_rate": 0.00023937739679675163, |
| "loss": 0.188, |
| "mean_token_accuracy": 0.9430878520011902, |
| "num_tokens": 4566934.0, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.0128581096323033, |
| "grad_norm": 0.6064813137054443, |
| "learning_rate": 0.00023924204827430632, |
| "loss": 0.1509, |
| "mean_token_accuracy": 0.9536380171775818, |
| "num_tokens": 4577167.0, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.0151139183397249, |
| "grad_norm": 0.4210309386253357, |
| "learning_rate": 0.00023910669975186103, |
| "loss": 0.1356, |
| "mean_token_accuracy": 0.958275294303894, |
| "num_tokens": 4587306.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0173697270471465, |
| "grad_norm": 0.6188949942588806, |
| "learning_rate": 0.00023897135122941574, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9480436682701111, |
| "num_tokens": 4597517.0, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.019625535754568, |
| "grad_norm": 0.9677643775939941, |
| "learning_rate": 0.00023883600270697045, |
| "loss": 0.1729, |
| "mean_token_accuracy": 0.9452177286148071, |
| "num_tokens": 4607748.0, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.0218813444619896, |
| "grad_norm": 1.0022025108337402, |
| "learning_rate": 0.00023870065418452513, |
| "loss": 0.226, |
| "mean_token_accuracy": 0.9398165047168732, |
| "num_tokens": 4617907.0, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.0241371531694112, |
| "grad_norm": 0.5817978382110596, |
| "learning_rate": 0.00023856530566207984, |
| "loss": 0.1559, |
| "mean_token_accuracy": 0.9518312573432922, |
| "num_tokens": 4628126.0, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.0263929618768328, |
| "grad_norm": 0.5886203646659851, |
| "learning_rate": 0.00023842995713963455, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9445899367332459, |
| "num_tokens": 4638349.0, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.0286487705842544, |
| "grad_norm": 0.565995991230011, |
| "learning_rate": 0.00023829460861718926, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9537485420703888, |
| "num_tokens": 4648556.0, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.030904579291676, |
| "grad_norm": 0.809699535369873, |
| "learning_rate": 0.00023815926009474394, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.9448329627513885, |
| "num_tokens": 4658749.0, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.0331603879990976, |
| "grad_norm": 0.5532633662223816, |
| "learning_rate": 0.00023802391157229865, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.9399157762527466, |
| "num_tokens": 4668925.0, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.0354161967065192, |
| "grad_norm": 0.7670914530754089, |
| "learning_rate": 0.00023788856304985336, |
| "loss": 0.1814, |
| "mean_token_accuracy": 0.9444401502609253, |
| "num_tokens": 4679158.0, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.037672005413941, |
| "grad_norm": 0.24954642355442047, |
| "learning_rate": 0.00023775321452740807, |
| "loss": 0.1463, |
| "mean_token_accuracy": 0.9550969898700714, |
| "num_tokens": 4689389.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.0399278141213626, |
| "grad_norm": 0.5538378953933716, |
| "learning_rate": 0.00023761786600496275, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9498744547367096, |
| "num_tokens": 4699613.0, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.0421836228287842, |
| "grad_norm": 0.5744291543960571, |
| "learning_rate": 0.00023748251748251746, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9534083902835846, |
| "num_tokens": 4709817.0, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.0444394315362058, |
| "grad_norm": 0.6914228796958923, |
| "learning_rate": 0.00023734716896007217, |
| "loss": 0.1511, |
| "mean_token_accuracy": 0.9552951693534851, |
| "num_tokens": 4720047.0, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.0466952402436274, |
| "grad_norm": 0.7641374468803406, |
| "learning_rate": 0.00023721182043762688, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9446970582008362, |
| "num_tokens": 4730215.0, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.048951048951049, |
| "grad_norm": 0.4645124673843384, |
| "learning_rate": 0.0002370764719151816, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9454750001430512, |
| "num_tokens": 4740428.0, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.0512068576584706, |
| "grad_norm": 0.6806756854057312, |
| "learning_rate": 0.00023694112339273628, |
| "loss": 0.1451, |
| "mean_token_accuracy": 0.957587742805481, |
| "num_tokens": 4750619.0, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.0534626663658921, |
| "grad_norm": 0.6630045175552368, |
| "learning_rate": 0.00023680577487029099, |
| "loss": 0.1618, |
| "mean_token_accuracy": 0.950842559337616, |
| "num_tokens": 4760831.0, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.0557184750733137, |
| "grad_norm": 0.5879353284835815, |
| "learning_rate": 0.0002366704263478457, |
| "loss": 0.1502, |
| "mean_token_accuracy": 0.9544346511363984, |
| "num_tokens": 4771006.0, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.0579742837807353, |
| "grad_norm": 0.6933557987213135, |
| "learning_rate": 0.0002365350778254004, |
| "loss": 0.2272, |
| "mean_token_accuracy": 0.938098531961441, |
| "num_tokens": 4781241.0, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.060230092488157, |
| "grad_norm": 0.6140124201774597, |
| "learning_rate": 0.0002363997293029551, |
| "loss": 0.2053, |
| "mean_token_accuracy": 0.9433477580547333, |
| "num_tokens": 4791472.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.0624859011955785, |
| "grad_norm": 0.6066429615020752, |
| "learning_rate": 0.0002362643807805098, |
| "loss": 0.1449, |
| "mean_token_accuracy": 0.9551500737667084, |
| "num_tokens": 4801650.0, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.0647417099030003, |
| "grad_norm": 0.6566262245178223, |
| "learning_rate": 0.0002361290322580645, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9594050168991088, |
| "num_tokens": 4811882.0, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.066997518610422, |
| "grad_norm": 2.896930694580078, |
| "learning_rate": 0.00023599368373561922, |
| "loss": 0.2053, |
| "mean_token_accuracy": 0.9418795704841614, |
| "num_tokens": 4821080.0, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.0692533273178435, |
| "grad_norm": 0.45278868079185486, |
| "learning_rate": 0.0002358583352131739, |
| "loss": 0.172, |
| "mean_token_accuracy": 0.9501473903656006, |
| "num_tokens": 4831283.0, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.071509136025265, |
| "grad_norm": 0.4739263951778412, |
| "learning_rate": 0.0002357229866907286, |
| "loss": 0.2002, |
| "mean_token_accuracy": 0.9419821977615357, |
| "num_tokens": 4841434.0, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.0737649447326867, |
| "grad_norm": 0.8713507652282715, |
| "learning_rate": 0.00023558763816828332, |
| "loss": 0.178, |
| "mean_token_accuracy": 0.9494198262691498, |
| "num_tokens": 4851630.0, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.0760207534401083, |
| "grad_norm": 0.6262771487236023, |
| "learning_rate": 0.00023545228964583803, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9523061156272888, |
| "num_tokens": 4861774.0, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.0782765621475299, |
| "grad_norm": 0.6477741599082947, |
| "learning_rate": 0.0002353169411233927, |
| "loss": 0.203, |
| "mean_token_accuracy": 0.9406819343566895, |
| "num_tokens": 4872001.0, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.0805323708549515, |
| "grad_norm": 0.37195727229118347, |
| "learning_rate": 0.00023518159260094742, |
| "loss": 0.1458, |
| "mean_token_accuracy": 0.953748595714569, |
| "num_tokens": 4882143.0, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.082788179562373, |
| "grad_norm": 0.5744128227233887, |
| "learning_rate": 0.00023504624407850213, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9459884941577912, |
| "num_tokens": 4892371.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.0850439882697946, |
| "grad_norm": 0.7021632194519043, |
| "learning_rate": 0.00023491089555605684, |
| "loss": 0.1736, |
| "mean_token_accuracy": 0.9457263708114624, |
| "num_tokens": 4902458.0, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.0872997969772162, |
| "grad_norm": 0.8485981822013855, |
| "learning_rate": 0.00023477554703361155, |
| "loss": 0.1775, |
| "mean_token_accuracy": 0.9493930280208588, |
| "num_tokens": 4912682.0, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.0895556056846378, |
| "grad_norm": 0.3264307975769043, |
| "learning_rate": 0.00023464019851116623, |
| "loss": 0.1782, |
| "mean_token_accuracy": 0.9486259996891022, |
| "num_tokens": 4922905.0, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.0918114143920596, |
| "grad_norm": 0.7745571136474609, |
| "learning_rate": 0.00023450484998872094, |
| "loss": 0.1528, |
| "mean_token_accuracy": 0.9531067311763763, |
| "num_tokens": 4933131.0, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.0940672230994812, |
| "grad_norm": 0.5187065601348877, |
| "learning_rate": 0.00023436950146627565, |
| "loss": 0.1421, |
| "mean_token_accuracy": 0.9564592063426971, |
| "num_tokens": 4943356.0, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.0963230318069028, |
| "grad_norm": 0.3341211676597595, |
| "learning_rate": 0.00023423415294383036, |
| "loss": 0.1483, |
| "mean_token_accuracy": 0.957532000541687, |
| "num_tokens": 4953583.0, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.0985788405143244, |
| "grad_norm": 0.6960042715072632, |
| "learning_rate": 0.00023409880442138502, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.950124329328537, |
| "num_tokens": 4963819.0, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.100834649221746, |
| "grad_norm": 0.5740098357200623, |
| "learning_rate": 0.00023396345589893976, |
| "loss": 0.1589, |
| "mean_token_accuracy": 0.9537279188632966, |
| "num_tokens": 4973668.0, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.1030904579291676, |
| "grad_norm": 0.7132306098937988, |
| "learning_rate": 0.00023382810737649447, |
| "loss": 0.1633, |
| "mean_token_accuracy": 0.9555535554885864, |
| "num_tokens": 4983783.0, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.1053462666365892, |
| "grad_norm": 0.5167339444160461, |
| "learning_rate": 0.00023369275885404918, |
| "loss": 0.2036, |
| "mean_token_accuracy": 0.9411836564540863, |
| "num_tokens": 4994007.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.1076020753440108, |
| "grad_norm": 0.4466972351074219, |
| "learning_rate": 0.00023355741033160383, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9488433957099914, |
| "num_tokens": 5004178.0, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.1098578840514324, |
| "grad_norm": 0.6669716835021973, |
| "learning_rate": 0.00023342206180915854, |
| "loss": 0.1465, |
| "mean_token_accuracy": 0.9539471685886383, |
| "num_tokens": 5014415.0, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.112113692758854, |
| "grad_norm": 0.5115512609481812, |
| "learning_rate": 0.00023328671328671328, |
| "loss": 0.1902, |
| "mean_token_accuracy": 0.94353746175766, |
| "num_tokens": 5024619.0, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.1143695014662756, |
| "grad_norm": 0.36398640275001526, |
| "learning_rate": 0.000233151364764268, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.951126629114151, |
| "num_tokens": 5034405.0, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.1166253101736974, |
| "grad_norm": 0.5980937480926514, |
| "learning_rate": 0.0002330160162418227, |
| "loss": 0.1722, |
| "mean_token_accuracy": 0.9499568104743957, |
| "num_tokens": 5044616.0, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.118881118881119, |
| "grad_norm": 0.7885400652885437, |
| "learning_rate": 0.00023288066771937735, |
| "loss": 0.1909, |
| "mean_token_accuracy": 0.9432080864906311, |
| "num_tokens": 5054818.0, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.1211369275885406, |
| "grad_norm": 0.7380874752998352, |
| "learning_rate": 0.0002327453191969321, |
| "loss": 0.1843, |
| "mean_token_accuracy": 0.9453416705131531, |
| "num_tokens": 5064910.0, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.1233927362959621, |
| "grad_norm": 0.46912482380867004, |
| "learning_rate": 0.0002326099706744868, |
| "loss": 0.1377, |
| "mean_token_accuracy": 0.9612941980361939, |
| "num_tokens": 5075124.0, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.1256485450033837, |
| "grad_norm": 0.7229384183883667, |
| "learning_rate": 0.0002324746221520415, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.9464283585548401, |
| "num_tokens": 5085268.0, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.1279043537108053, |
| "grad_norm": 0.6191129088401794, |
| "learning_rate": 0.00023233927362959617, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9472849369049072, |
| "num_tokens": 5095405.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.130160162418227, |
| "grad_norm": 0.9127920866012573, |
| "learning_rate": 0.00023220392510715088, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9521571576595307, |
| "num_tokens": 5105635.0, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.1324159711256485, |
| "grad_norm": 0.7310436367988586, |
| "learning_rate": 0.00023206857658470561, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.9507647454738617, |
| "num_tokens": 5115845.0, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.13467177983307, |
| "grad_norm": 0.6670770049095154, |
| "learning_rate": 0.00023193322806226032, |
| "loss": 0.1825, |
| "mean_token_accuracy": 0.9468953788280488, |
| "num_tokens": 5125969.0, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.1369275885404917, |
| "grad_norm": 0.970748245716095, |
| "learning_rate": 0.00023179787953981498, |
| "loss": 0.1982, |
| "mean_token_accuracy": 0.940336960554123, |
| "num_tokens": 5136130.0, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.1391833972479133, |
| "grad_norm": 0.6120514869689941, |
| "learning_rate": 0.0002316625310173697, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9530200004577637, |
| "num_tokens": 5146326.0, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.141439205955335, |
| "grad_norm": 0.6671406030654907, |
| "learning_rate": 0.0002315271824949244, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9505042195320129, |
| "num_tokens": 5156501.0, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.1436950146627567, |
| "grad_norm": 0.6213026642799377, |
| "learning_rate": 0.00023139183397247914, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9575595736503602, |
| "num_tokens": 5166737.0, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.1459508233701783, |
| "grad_norm": 0.8337035179138184, |
| "learning_rate": 0.0002312564854500338, |
| "loss": 0.1522, |
| "mean_token_accuracy": 0.9550527095794678, |
| "num_tokens": 5176918.0, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.1482066320775999, |
| "grad_norm": 0.5575427412986755, |
| "learning_rate": 0.0002311211369275885, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9502976775169373, |
| "num_tokens": 5187057.0, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.1504624407850215, |
| "grad_norm": 0.6455193758010864, |
| "learning_rate": 0.0002309857884051432, |
| "loss": 0.1778, |
| "mean_token_accuracy": 0.9466927468776702, |
| "num_tokens": 5197290.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.152718249492443, |
| "grad_norm": 0.5528632402420044, |
| "learning_rate": 0.00023085043988269795, |
| "loss": 0.2195, |
| "mean_token_accuracy": 0.9405810832977295, |
| "num_tokens": 5207449.0, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.1549740581998647, |
| "grad_norm": 0.5751885771751404, |
| "learning_rate": 0.00023071509136025266, |
| "loss": 0.1519, |
| "mean_token_accuracy": 0.9569954037666321, |
| "num_tokens": 5217669.0, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.1572298669072862, |
| "grad_norm": 0.7101620435714722, |
| "learning_rate": 0.00023057974283780731, |
| "loss": 0.1448, |
| "mean_token_accuracy": 0.9569979846477509, |
| "num_tokens": 5227874.0, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.1594856756147078, |
| "grad_norm": 0.684057354927063, |
| "learning_rate": 0.00023044439431536202, |
| "loss": 0.137, |
| "mean_token_accuracy": 0.959661203622818, |
| "num_tokens": 5238096.0, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.1617414843221294, |
| "grad_norm": 0.6585609912872314, |
| "learning_rate": 0.00023030904579291673, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.950641131401062, |
| "num_tokens": 5248279.0, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.163997293029551, |
| "grad_norm": 0.8113618493080139, |
| "learning_rate": 0.00023017369727047147, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9519901633262634, |
| "num_tokens": 5258400.0, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.1662531017369726, |
| "grad_norm": 0.4892439842224121, |
| "learning_rate": 0.00023003834874802613, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9551227807998657, |
| "num_tokens": 5268609.0, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.1685089104443942, |
| "grad_norm": 0.7058376669883728, |
| "learning_rate": 0.00022990300022558084, |
| "loss": 0.2199, |
| "mean_token_accuracy": 0.9406746447086334, |
| "num_tokens": 5278776.0, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.170764719151816, |
| "grad_norm": 0.3844442367553711, |
| "learning_rate": 0.00022976765170313555, |
| "loss": 0.208, |
| "mean_token_accuracy": 0.9386930227279663, |
| "num_tokens": 5289013.0, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.1730205278592376, |
| "grad_norm": 0.7708050608634949, |
| "learning_rate": 0.00022963230318069026, |
| "loss": 0.1549, |
| "mean_token_accuracy": 0.9572183132171631, |
| "num_tokens": 5299229.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.1752763365666592, |
| "grad_norm": 0.9352420568466187, |
| "learning_rate": 0.00022949695465824494, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9551738202571869, |
| "num_tokens": 5309341.0, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.1775321452740808, |
| "grad_norm": 0.7025801539421082, |
| "learning_rate": 0.00022936160613579965, |
| "loss": 0.1285, |
| "mean_token_accuracy": 0.9594053149223327, |
| "num_tokens": 5319543.0, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.1797879539815024, |
| "grad_norm": 0.5407170057296753, |
| "learning_rate": 0.00022922625761335436, |
| "loss": 0.1351, |
| "mean_token_accuracy": 0.9600762069225312, |
| "num_tokens": 5329768.0, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.182043762688924, |
| "grad_norm": 0.8326946496963501, |
| "learning_rate": 0.00022909090909090907, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9461723744869233, |
| "num_tokens": 5339915.0, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.1842995713963456, |
| "grad_norm": 0.7867940664291382, |
| "learning_rate": 0.00022895556056846375, |
| "loss": 0.2338, |
| "mean_token_accuracy": 0.9370923578739166, |
| "num_tokens": 5349979.0, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.1865553801037672, |
| "grad_norm": 0.5456581711769104, |
| "learning_rate": 0.00022882021204601846, |
| "loss": 0.1983, |
| "mean_token_accuracy": 0.9475960373878479, |
| "num_tokens": 5360121.0, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.1888111888111887, |
| "grad_norm": 0.6422356367111206, |
| "learning_rate": 0.00022868486352357317, |
| "loss": 0.1762, |
| "mean_token_accuracy": 0.9557766139507293, |
| "num_tokens": 5370302.0, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.1910669975186103, |
| "grad_norm": 0.5232251286506653, |
| "learning_rate": 0.00022854951500112788, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9604880928993225, |
| "num_tokens": 5380536.0, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.193322806226032, |
| "grad_norm": 0.5937042832374573, |
| "learning_rate": 0.0002284141664786826, |
| "loss": 0.1605, |
| "mean_token_accuracy": 0.9531785011291504, |
| "num_tokens": 5390757.0, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.1955786149334537, |
| "grad_norm": 0.6249987483024597, |
| "learning_rate": 0.00022827881795623727, |
| "loss": 0.1344, |
| "mean_token_accuracy": 0.959484601020813, |
| "num_tokens": 5400977.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.1978344236408753, |
| "grad_norm": 0.7910540699958801, |
| "learning_rate": 0.00022814346943379198, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9554309785366059, |
| "num_tokens": 5411187.0, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.200090232348297, |
| "grad_norm": 0.9017763137817383, |
| "learning_rate": 0.0002280081209113467, |
| "loss": 0.2021, |
| "mean_token_accuracy": 0.9464009702205658, |
| "num_tokens": 5421260.0, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.2023460410557185, |
| "grad_norm": 0.4037693738937378, |
| "learning_rate": 0.0002278727723889014, |
| "loss": 0.1439, |
| "mean_token_accuracy": 0.9570986866950989, |
| "num_tokens": 5431402.0, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.20460184976314, |
| "grad_norm": 0.36305737495422363, |
| "learning_rate": 0.00022773742386645609, |
| "loss": 0.1387, |
| "mean_token_accuracy": 0.9598490238189697, |
| "num_tokens": 5441591.0, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.2068576584705617, |
| "grad_norm": 0.8021289110183716, |
| "learning_rate": 0.0002276020753440108, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9541407644748687, |
| "num_tokens": 5451828.0, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.2091134671779833, |
| "grad_norm": 0.8966614603996277, |
| "learning_rate": 0.0002274667268215655, |
| "loss": 0.1826, |
| "mean_token_accuracy": 0.9492299973964691, |
| "num_tokens": 5462020.0, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.2113692758854049, |
| "grad_norm": 0.5038102269172668, |
| "learning_rate": 0.00022733137829912022, |
| "loss": 0.1776, |
| "mean_token_accuracy": 0.9493005454540253, |
| "num_tokens": 5472213.0, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.2136250845928265, |
| "grad_norm": 0.8426568508148193, |
| "learning_rate": 0.0002271960297766749, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.9485830664634705, |
| "num_tokens": 5482433.0, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.215880893300248, |
| "grad_norm": 0.5926401019096375, |
| "learning_rate": 0.0002270606812542296, |
| "loss": 0.1725, |
| "mean_token_accuracy": 0.9492874026298523, |
| "num_tokens": 5492653.0, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.2181367020076697, |
| "grad_norm": 0.7410178184509277, |
| "learning_rate": 0.00022692533273178432, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9544486105442047, |
| "num_tokens": 5502830.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.2203925107150915, |
| "grad_norm": 0.5833625793457031, |
| "learning_rate": 0.00022678998420933903, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.9458029627799988, |
| "num_tokens": 5513013.0, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.222648319422513, |
| "grad_norm": 0.753453254699707, |
| "learning_rate": 0.0002266546356868937, |
| "loss": 0.1485, |
| "mean_token_accuracy": 0.9566802024841309, |
| "num_tokens": 5523249.0, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.2249041281299347, |
| "grad_norm": 0.8071026802062988, |
| "learning_rate": 0.00022651928716444842, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.9501352787017823, |
| "num_tokens": 5533427.0, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.2271599368373562, |
| "grad_norm": 0.6357564926147461, |
| "learning_rate": 0.00022638393864200313, |
| "loss": 0.1418, |
| "mean_token_accuracy": 0.9574578821659088, |
| "num_tokens": 5543579.0, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.2294157455447778, |
| "grad_norm": 0.8859591484069824, |
| "learning_rate": 0.00022624859011955784, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.950443959236145, |
| "num_tokens": 5553797.0, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.2316715542521994, |
| "grad_norm": 0.6225905418395996, |
| "learning_rate": 0.00022611324159711255, |
| "loss": 0.1332, |
| "mean_token_accuracy": 0.9579242885112762, |
| "num_tokens": 5563967.0, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.233927362959621, |
| "grad_norm": 0.6247864961624146, |
| "learning_rate": 0.00022597789307466723, |
| "loss": 0.1506, |
| "mean_token_accuracy": 0.9570437967777252, |
| "num_tokens": 5574167.0, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.2361831716670426, |
| "grad_norm": 0.8502485752105713, |
| "learning_rate": 0.00022584254455222194, |
| "loss": 0.1444, |
| "mean_token_accuracy": 0.9581893026828766, |
| "num_tokens": 5584392.0, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.2384389803744642, |
| "grad_norm": 0.5714454054832458, |
| "learning_rate": 0.00022570719602977665, |
| "loss": 0.1446, |
| "mean_token_accuracy": 0.9594759464263916, |
| "num_tokens": 5594584.0, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.2406947890818858, |
| "grad_norm": 0.6027330756187439, |
| "learning_rate": 0.00022557184750733136, |
| "loss": 0.141, |
| "mean_token_accuracy": 0.9551207423210144, |
| "num_tokens": 5604775.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.2429505977893074, |
| "grad_norm": 0.6891913414001465, |
| "learning_rate": 0.00022543649898488605, |
| "loss": 0.1513, |
| "mean_token_accuracy": 0.9528867900371552, |
| "num_tokens": 5614990.0, |
| "step": 5510 |
| }, |
| { |
| "epoch": 1.2452064064967292, |
| "grad_norm": 0.5027481913566589, |
| "learning_rate": 0.00022530115046244075, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9531277477741241, |
| "num_tokens": 5625183.0, |
| "step": 5520 |
| }, |
| { |
| "epoch": 1.2474622152041506, |
| "grad_norm": 0.4770359694957733, |
| "learning_rate": 0.00022516580193999546, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9534553825855255, |
| "num_tokens": 5635367.0, |
| "step": 5530 |
| }, |
| { |
| "epoch": 1.2497180239115724, |
| "grad_norm": 0.8309736847877502, |
| "learning_rate": 0.00022503045341755017, |
| "loss": 0.1461, |
| "mean_token_accuracy": 0.9559681415557861, |
| "num_tokens": 5645594.0, |
| "step": 5540 |
| }, |
| { |
| "epoch": 1.251973832618994, |
| "grad_norm": 0.8505803942680359, |
| "learning_rate": 0.00022489510489510486, |
| "loss": 0.1787, |
| "mean_token_accuracy": 0.9484643459320068, |
| "num_tokens": 5655750.0, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.2542296413264156, |
| "grad_norm": 0.47796082496643066, |
| "learning_rate": 0.00022475975637265957, |
| "loss": 0.1481, |
| "mean_token_accuracy": 0.9564223170280457, |
| "num_tokens": 5665927.0, |
| "step": 5560 |
| }, |
| { |
| "epoch": 1.2564854500338372, |
| "grad_norm": 0.5004701018333435, |
| "learning_rate": 0.00022462440785021428, |
| "loss": 0.1376, |
| "mean_token_accuracy": 0.9571360588073731, |
| "num_tokens": 5676155.0, |
| "step": 5570 |
| }, |
| { |
| "epoch": 1.2587412587412588, |
| "grad_norm": 0.603234052658081, |
| "learning_rate": 0.000224489059327769, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9614892840385437, |
| "num_tokens": 5686280.0, |
| "step": 5580 |
| }, |
| { |
| "epoch": 1.2609970674486803, |
| "grad_norm": 0.6253412961959839, |
| "learning_rate": 0.0002243537108053237, |
| "loss": 0.1412, |
| "mean_token_accuracy": 0.9608110129833222, |
| "num_tokens": 5696432.0, |
| "step": 5590 |
| }, |
| { |
| "epoch": 1.263252876156102, |
| "grad_norm": 0.46702322363853455, |
| "learning_rate": 0.00022421836228287838, |
| "loss": 0.1521, |
| "mean_token_accuracy": 0.9547717452049256, |
| "num_tokens": 5706592.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.2655086848635235, |
| "grad_norm": 0.9571554064750671, |
| "learning_rate": 0.0002240830137604331, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9535882234573364, |
| "num_tokens": 5716770.0, |
| "step": 5610 |
| }, |
| { |
| "epoch": 1.2677644935709451, |
| "grad_norm": 0.5119358897209167, |
| "learning_rate": 0.0002239476652379878, |
| "loss": 0.1245, |
| "mean_token_accuracy": 0.9615546584129333, |
| "num_tokens": 5726960.0, |
| "step": 5620 |
| }, |
| { |
| "epoch": 1.270020302278367, |
| "grad_norm": 0.7958256602287292, |
| "learning_rate": 0.0002238123167155425, |
| "loss": 0.1158, |
| "mean_token_accuracy": 0.9650822639465332, |
| "num_tokens": 5737064.0, |
| "step": 5630 |
| }, |
| { |
| "epoch": 1.2722761109857883, |
| "grad_norm": 0.5314520001411438, |
| "learning_rate": 0.0002236769681930972, |
| "loss": 0.133, |
| "mean_token_accuracy": 0.9586548745632172, |
| "num_tokens": 5747248.0, |
| "step": 5640 |
| }, |
| { |
| "epoch": 1.2745319196932101, |
| "grad_norm": 0.578899085521698, |
| "learning_rate": 0.0002235416196706519, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9567394137382508, |
| "num_tokens": 5757414.0, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.2767877284006317, |
| "grad_norm": 0.5973330140113831, |
| "learning_rate": 0.0002234062711482066, |
| "loss": 0.1489, |
| "mean_token_accuracy": 0.9594153523445129, |
| "num_tokens": 5767569.0, |
| "step": 5660 |
| }, |
| { |
| "epoch": 1.2790435371080533, |
| "grad_norm": 0.7263054251670837, |
| "learning_rate": 0.00022327092262576132, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.9471005976200104, |
| "num_tokens": 5777761.0, |
| "step": 5670 |
| }, |
| { |
| "epoch": 1.2812993458154749, |
| "grad_norm": 0.5503787994384766, |
| "learning_rate": 0.000223135574103316, |
| "loss": 0.1487, |
| "mean_token_accuracy": 0.9587855756282806, |
| "num_tokens": 5787938.0, |
| "step": 5680 |
| }, |
| { |
| "epoch": 1.2835551545228965, |
| "grad_norm": 0.974582314491272, |
| "learning_rate": 0.00022300022558087071, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9670296013355255, |
| "num_tokens": 5798169.0, |
| "step": 5690 |
| }, |
| { |
| "epoch": 1.285810963230318, |
| "grad_norm": 0.5530260801315308, |
| "learning_rate": 0.00022286487705842542, |
| "loss": 0.139, |
| "mean_token_accuracy": 0.958290958404541, |
| "num_tokens": 5808388.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.2880667719377397, |
| "grad_norm": 0.614759087562561, |
| "learning_rate": 0.00022272952853598013, |
| "loss": 0.1319, |
| "mean_token_accuracy": 0.960501229763031, |
| "num_tokens": 5818546.0, |
| "step": 5710 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 0.561374306678772, |
| "learning_rate": 0.00022259418001353482, |
| "loss": 0.1439, |
| "mean_token_accuracy": 0.9606938123703003, |
| "num_tokens": 5828710.0, |
| "step": 5720 |
| }, |
| { |
| "epoch": 1.2925783893525828, |
| "grad_norm": 1.0184314250946045, |
| "learning_rate": 0.00022245883149108953, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.954375433921814, |
| "num_tokens": 5838907.0, |
| "step": 5730 |
| }, |
| { |
| "epoch": 1.2948341980600044, |
| "grad_norm": 0.4517951011657715, |
| "learning_rate": 0.00022232348296864424, |
| "loss": 0.1599, |
| "mean_token_accuracy": 0.9561059713363648, |
| "num_tokens": 5848145.0, |
| "step": 5740 |
| }, |
| { |
| "epoch": 1.297090006767426, |
| "grad_norm": 0.6121286749839783, |
| "learning_rate": 0.00022218813444619895, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9532230257987976, |
| "num_tokens": 5858356.0, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.2993458154748478, |
| "grad_norm": 0.5834189057350159, |
| "learning_rate": 0.00022205278592375366, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9492375791072846, |
| "num_tokens": 5868590.0, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.3016016241822692, |
| "grad_norm": 0.6871075630187988, |
| "learning_rate": 0.00022191743740130834, |
| "loss": 0.1496, |
| "mean_token_accuracy": 0.9571017861366272, |
| "num_tokens": 5878761.0, |
| "step": 5770 |
| }, |
| { |
| "epoch": 1.303857432889691, |
| "grad_norm": 0.6542716026306152, |
| "learning_rate": 0.00022178208887886305, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9534028053283692, |
| "num_tokens": 5888986.0, |
| "step": 5780 |
| }, |
| { |
| "epoch": 1.3061132415971126, |
| "grad_norm": 0.6185471415519714, |
| "learning_rate": 0.00022164674035641776, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9496482908725739, |
| "num_tokens": 5899224.0, |
| "step": 5790 |
| }, |
| { |
| "epoch": 1.3083690503045342, |
| "grad_norm": 0.7506686449050903, |
| "learning_rate": 0.00022151139183397247, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9547667086124421, |
| "num_tokens": 5909447.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.3106248590119558, |
| "grad_norm": 0.5120002031326294, |
| "learning_rate": 0.00022137604331152715, |
| "loss": 0.1437, |
| "mean_token_accuracy": 0.9579054772853851, |
| "num_tokens": 5919631.0, |
| "step": 5810 |
| }, |
| { |
| "epoch": 1.3128806677193774, |
| "grad_norm": 0.4987918436527252, |
| "learning_rate": 0.00022124069478908186, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9541470289230347, |
| "num_tokens": 5929849.0, |
| "step": 5820 |
| }, |
| { |
| "epoch": 1.315136476426799, |
| "grad_norm": 0.5856090784072876, |
| "learning_rate": 0.00022110534626663657, |
| "loss": 0.1631, |
| "mean_token_accuracy": 0.9521755337715149, |
| "num_tokens": 5940060.0, |
| "step": 5830 |
| }, |
| { |
| "epoch": 1.3173922851342206, |
| "grad_norm": 0.5429736971855164, |
| "learning_rate": 0.00022096999774419128, |
| "loss": 0.1354, |
| "mean_token_accuracy": 0.9596905529499054, |
| "num_tokens": 5950290.0, |
| "step": 5840 |
| }, |
| { |
| "epoch": 1.3196480938416422, |
| "grad_norm": 0.8014978170394897, |
| "learning_rate": 0.00022083464922174596, |
| "loss": 0.1462, |
| "mean_token_accuracy": 0.9545858979225159, |
| "num_tokens": 5960485.0, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.3219039025490638, |
| "grad_norm": 0.6086113452911377, |
| "learning_rate": 0.00022069930069930067, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9500519633293152, |
| "num_tokens": 5970670.0, |
| "step": 5860 |
| }, |
| { |
| "epoch": 1.3241597112564856, |
| "grad_norm": 0.9041001200675964, |
| "learning_rate": 0.00022056395217685538, |
| "loss": 0.1866, |
| "mean_token_accuracy": 0.9488508880138398, |
| "num_tokens": 5980902.0, |
| "step": 5870 |
| }, |
| { |
| "epoch": 1.326415519963907, |
| "grad_norm": 0.9248059391975403, |
| "learning_rate": 0.0002204286036544101, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.9538485944271088, |
| "num_tokens": 5991043.0, |
| "step": 5880 |
| }, |
| { |
| "epoch": 1.3286713286713288, |
| "grad_norm": 0.3965446650981903, |
| "learning_rate": 0.00022029325513196478, |
| "loss": 0.1668, |
| "mean_token_accuracy": 0.9556223392486572, |
| "num_tokens": 6001261.0, |
| "step": 5890 |
| }, |
| { |
| "epoch": 1.3309271373787503, |
| "grad_norm": 0.4829697906970978, |
| "learning_rate": 0.00022015790660951949, |
| "loss": 0.137, |
| "mean_token_accuracy": 0.9602304875850678, |
| "num_tokens": 6011366.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.333182946086172, |
| "grad_norm": 0.7699381709098816, |
| "learning_rate": 0.0002200225580870742, |
| "loss": 0.1797, |
| "mean_token_accuracy": 0.949476546049118, |
| "num_tokens": 6021531.0, |
| "step": 5910 |
| }, |
| { |
| "epoch": 1.3354387547935935, |
| "grad_norm": 0.3457069396972656, |
| "learning_rate": 0.0002198872095646289, |
| "loss": 0.136, |
| "mean_token_accuracy": 0.9613852322101593, |
| "num_tokens": 6031652.0, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.3376945635010151, |
| "grad_norm": 0.7896465063095093, |
| "learning_rate": 0.00021975186104218362, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.95490260720253, |
| "num_tokens": 6041831.0, |
| "step": 5930 |
| }, |
| { |
| "epoch": 1.3399503722084367, |
| "grad_norm": 0.5957865118980408, |
| "learning_rate": 0.0002196165125197383, |
| "loss": 0.1384, |
| "mean_token_accuracy": 0.9616434097290039, |
| "num_tokens": 6052062.0, |
| "step": 5940 |
| }, |
| { |
| "epoch": 1.3422061809158583, |
| "grad_norm": 0.986088216304779, |
| "learning_rate": 0.000219481163997293, |
| "loss": 0.1509, |
| "mean_token_accuracy": 0.9559194803237915, |
| "num_tokens": 6062238.0, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.34446198962328, |
| "grad_norm": 0.6079812049865723, |
| "learning_rate": 0.00021934581547484772, |
| "loss": 0.1378, |
| "mean_token_accuracy": 0.9577659904956818, |
| "num_tokens": 6072477.0, |
| "step": 5960 |
| }, |
| { |
| "epoch": 1.3467177983307015, |
| "grad_norm": 0.8939962387084961, |
| "learning_rate": 0.00021921046695240243, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9524900197982789, |
| "num_tokens": 6082680.0, |
| "step": 5970 |
| }, |
| { |
| "epoch": 1.3489736070381233, |
| "grad_norm": 0.7223296761512756, |
| "learning_rate": 0.0002190751184299571, |
| "loss": 0.2035, |
| "mean_token_accuracy": 0.948406583070755, |
| "num_tokens": 6092874.0, |
| "step": 5980 |
| }, |
| { |
| "epoch": 1.3512294157455447, |
| "grad_norm": 0.6591929793357849, |
| "learning_rate": 0.00021893976990751182, |
| "loss": 0.1405, |
| "mean_token_accuracy": 0.95966557264328, |
| "num_tokens": 6103032.0, |
| "step": 5990 |
| }, |
| { |
| "epoch": 1.3534852244529665, |
| "grad_norm": 0.9769535064697266, |
| "learning_rate": 0.00021880442138506653, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9543199658393859, |
| "num_tokens": 6113265.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.355741033160388, |
| "grad_norm": 0.6889714598655701, |
| "learning_rate": 0.00021866907286262124, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9535452902317048, |
| "num_tokens": 6123472.0, |
| "step": 6010 |
| }, |
| { |
| "epoch": 1.3579968418678097, |
| "grad_norm": 0.6904102563858032, |
| "learning_rate": 0.00021853372434017592, |
| "loss": 0.1482, |
| "mean_token_accuracy": 0.9565614700317383, |
| "num_tokens": 6133638.0, |
| "step": 6020 |
| }, |
| { |
| "epoch": 1.3602526505752313, |
| "grad_norm": 1.0406423807144165, |
| "learning_rate": 0.00021839837581773063, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9554621934890747, |
| "num_tokens": 6143811.0, |
| "step": 6030 |
| }, |
| { |
| "epoch": 1.3625084592826529, |
| "grad_norm": 0.5132746696472168, |
| "learning_rate": 0.00021826302729528534, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9485044598579406, |
| "num_tokens": 6154026.0, |
| "step": 6040 |
| }, |
| { |
| "epoch": 1.3647642679900744, |
| "grad_norm": 0.7688167095184326, |
| "learning_rate": 0.00021812767877284005, |
| "loss": 0.1777, |
| "mean_token_accuracy": 0.9504620730876923, |
| "num_tokens": 6164212.0, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.367020076697496, |
| "grad_norm": 0.8709307312965393, |
| "learning_rate": 0.00021799233025039474, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.9431596338748932, |
| "num_tokens": 6174378.0, |
| "step": 6060 |
| }, |
| { |
| "epoch": 1.3692758854049176, |
| "grad_norm": 0.4955613911151886, |
| "learning_rate": 0.00021785698172794945, |
| "loss": 0.1385, |
| "mean_token_accuracy": 0.9571300268173217, |
| "num_tokens": 6184541.0, |
| "step": 6070 |
| }, |
| { |
| "epoch": 1.3715316941123392, |
| "grad_norm": 0.9195837378501892, |
| "learning_rate": 0.00021772163320550416, |
| "loss": 0.151, |
| "mean_token_accuracy": 0.9560777604579925, |
| "num_tokens": 6194772.0, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.3737875028197608, |
| "grad_norm": 0.30228182673454285, |
| "learning_rate": 0.00021758628468305886, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9646747708320618, |
| "num_tokens": 6205001.0, |
| "step": 6090 |
| }, |
| { |
| "epoch": 1.3760433115271824, |
| "grad_norm": 1.0205188989639282, |
| "learning_rate": 0.00021745093616061357, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9546098172664642, |
| "num_tokens": 6215129.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.3782991202346042, |
| "grad_norm": 0.45732060074806213, |
| "learning_rate": 0.00021731558763816826, |
| "loss": 0.1372, |
| "mean_token_accuracy": 0.9615266501903534, |
| "num_tokens": 6225355.0, |
| "step": 6110 |
| }, |
| { |
| "epoch": 1.3805549289420256, |
| "grad_norm": 0.6275461316108704, |
| "learning_rate": 0.00021718023911572297, |
| "loss": 0.1343, |
| "mean_token_accuracy": 0.9584670960903168, |
| "num_tokens": 6235539.0, |
| "step": 6120 |
| }, |
| { |
| "epoch": 1.3828107376494474, |
| "grad_norm": 0.724653959274292, |
| "learning_rate": 0.00021704489059327768, |
| "loss": 0.1223, |
| "mean_token_accuracy": 0.9638613402843476, |
| "num_tokens": 6245761.0, |
| "step": 6130 |
| }, |
| { |
| "epoch": 1.385066546356869, |
| "grad_norm": 0.6194751858711243, |
| "learning_rate": 0.0002169095420708324, |
| "loss": 0.1206, |
| "mean_token_accuracy": 0.9652007341384887, |
| "num_tokens": 6255660.0, |
| "step": 6140 |
| }, |
| { |
| "epoch": 1.3873223550642906, |
| "grad_norm": 1.0562235116958618, |
| "learning_rate": 0.00021677419354838707, |
| "loss": 0.2017, |
| "mean_token_accuracy": 0.9480350255966187, |
| "num_tokens": 6265817.0, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.3895781637717122, |
| "grad_norm": 0.23528407514095306, |
| "learning_rate": 0.00021663884502594178, |
| "loss": 0.1662, |
| "mean_token_accuracy": 0.9554067730903626, |
| "num_tokens": 6275388.0, |
| "step": 6160 |
| }, |
| { |
| "epoch": 1.3918339724791338, |
| "grad_norm": 0.9895961284637451, |
| "learning_rate": 0.0002165034965034965, |
| "loss": 0.1573, |
| "mean_token_accuracy": 0.9531720578670502, |
| "num_tokens": 6285627.0, |
| "step": 6170 |
| }, |
| { |
| "epoch": 1.3940897811865554, |
| "grad_norm": 0.5095604062080383, |
| "learning_rate": 0.0002163681479810512, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9555117368698121, |
| "num_tokens": 6295805.0, |
| "step": 6180 |
| }, |
| { |
| "epoch": 1.396345589893977, |
| "grad_norm": 0.787956953048706, |
| "learning_rate": 0.00021623279945860588, |
| "loss": 0.1872, |
| "mean_token_accuracy": 0.9512019693851471, |
| "num_tokens": 6306039.0, |
| "step": 6190 |
| }, |
| { |
| "epoch": 1.3986013986013985, |
| "grad_norm": 0.7012321352958679, |
| "learning_rate": 0.0002160974509361606, |
| "loss": 0.1559, |
| "mean_token_accuracy": 0.9556455969810486, |
| "num_tokens": 6316186.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.4008572073088201, |
| "grad_norm": 0.5284978747367859, |
| "learning_rate": 0.0002159621024137153, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9619249105453491, |
| "num_tokens": 6326413.0, |
| "step": 6210 |
| }, |
| { |
| "epoch": 1.403113016016242, |
| "grad_norm": 0.45027801394462585, |
| "learning_rate": 0.00021582675389127, |
| "loss": 0.1408, |
| "mean_token_accuracy": 0.9602871358394622, |
| "num_tokens": 6336576.0, |
| "step": 6220 |
| }, |
| { |
| "epoch": 1.4053688247236633, |
| "grad_norm": 0.5853649973869324, |
| "learning_rate": 0.00021569140536882472, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9547618448734283, |
| "num_tokens": 6346767.0, |
| "step": 6230 |
| }, |
| { |
| "epoch": 1.4076246334310851, |
| "grad_norm": 0.4630914628505707, |
| "learning_rate": 0.0002155560568463794, |
| "loss": 0.1327, |
| "mean_token_accuracy": 0.9596076071262359, |
| "num_tokens": 6356979.0, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.4098804421385067, |
| "grad_norm": 0.6433979868888855, |
| "learning_rate": 0.00021542070832393411, |
| "loss": 0.1746, |
| "mean_token_accuracy": 0.9512283504009247, |
| "num_tokens": 6367167.0, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.4121362508459283, |
| "grad_norm": 0.649756908416748, |
| "learning_rate": 0.00021528535980148882, |
| "loss": 0.1216, |
| "mean_token_accuracy": 0.9655301630496979, |
| "num_tokens": 6377389.0, |
| "step": 6260 |
| }, |
| { |
| "epoch": 1.41439205955335, |
| "grad_norm": 0.4103543162345886, |
| "learning_rate": 0.00021515001127904353, |
| "loss": 0.153, |
| "mean_token_accuracy": 0.9575583398342132, |
| "num_tokens": 6387589.0, |
| "step": 6270 |
| }, |
| { |
| "epoch": 1.4166478682607715, |
| "grad_norm": 0.7882847189903259, |
| "learning_rate": 0.00021501466275659822, |
| "loss": 0.1386, |
| "mean_token_accuracy": 0.9569086253643035, |
| "num_tokens": 6397755.0, |
| "step": 6280 |
| }, |
| { |
| "epoch": 1.418903676968193, |
| "grad_norm": 0.8500162363052368, |
| "learning_rate": 0.00021487931423415293, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9510623872280121, |
| "num_tokens": 6407938.0, |
| "step": 6290 |
| }, |
| { |
| "epoch": 1.4211594856756147, |
| "grad_norm": 0.5798035264015198, |
| "learning_rate": 0.00021474396571170764, |
| "loss": 0.121, |
| "mean_token_accuracy": 0.9637139558792114, |
| "num_tokens": 6418138.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.4234152943830363, |
| "grad_norm": 0.44802120327949524, |
| "learning_rate": 0.00021460861718926235, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.9629803776741028, |
| "num_tokens": 6428343.0, |
| "step": 6310 |
| }, |
| { |
| "epoch": 1.4256711030904579, |
| "grad_norm": 0.7741478085517883, |
| "learning_rate": 0.00021447326866681703, |
| "loss": 0.1576, |
| "mean_token_accuracy": 0.9575680911540985, |
| "num_tokens": 6438530.0, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.4279269117978797, |
| "grad_norm": 0.7088395357131958, |
| "learning_rate": 0.00021433792014437174, |
| "loss": 0.1479, |
| "mean_token_accuracy": 0.9561203837394714, |
| "num_tokens": 6448706.0, |
| "step": 6330 |
| }, |
| { |
| "epoch": 1.430182720505301, |
| "grad_norm": 0.8038097023963928, |
| "learning_rate": 0.00021420257162192645, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9569381237030029, |
| "num_tokens": 6458868.0, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.4324385292127229, |
| "grad_norm": 0.6985066533088684, |
| "learning_rate": 0.00021406722309948116, |
| "loss": 0.1469, |
| "mean_token_accuracy": 0.9614906787872315, |
| "num_tokens": 6469064.0, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.4346943379201444, |
| "grad_norm": 0.5924888849258423, |
| "learning_rate": 0.00021393187457703584, |
| "loss": 0.1702, |
| "mean_token_accuracy": 0.9545105516910553, |
| "num_tokens": 6479264.0, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.436950146627566, |
| "grad_norm": 0.4779655933380127, |
| "learning_rate": 0.00021379652605459055, |
| "loss": 0.1185, |
| "mean_token_accuracy": 0.9624211072921753, |
| "num_tokens": 6489432.0, |
| "step": 6370 |
| }, |
| { |
| "epoch": 1.4392059553349876, |
| "grad_norm": 0.5000889301300049, |
| "learning_rate": 0.00021366117753214526, |
| "loss": 0.1218, |
| "mean_token_accuracy": 0.9632967412471771, |
| "num_tokens": 6499626.0, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.4414617640424092, |
| "grad_norm": 0.8437454700469971, |
| "learning_rate": 0.00021352582900969997, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9546212613582611, |
| "num_tokens": 6509837.0, |
| "step": 6390 |
| }, |
| { |
| "epoch": 1.4437175727498308, |
| "grad_norm": 0.4896928668022156, |
| "learning_rate": 0.00021339048048725468, |
| "loss": 0.1125, |
| "mean_token_accuracy": 0.967903059720993, |
| "num_tokens": 6520000.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.4459733814572524, |
| "grad_norm": 0.49815863370895386, |
| "learning_rate": 0.00021325513196480936, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9628502309322358, |
| "num_tokens": 6530239.0, |
| "step": 6410 |
| }, |
| { |
| "epoch": 1.448229190164674, |
| "grad_norm": 0.6222156286239624, |
| "learning_rate": 0.00021311978344236407, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9611857533454895, |
| "num_tokens": 6540447.0, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.4504849988720956, |
| "grad_norm": 0.9778371453285217, |
| "learning_rate": 0.00021298443491991878, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9495638191699982, |
| "num_tokens": 6550622.0, |
| "step": 6430 |
| }, |
| { |
| "epoch": 1.4527408075795172, |
| "grad_norm": 0.6410164833068848, |
| "learning_rate": 0.0002128490863974735, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9525443017482758, |
| "num_tokens": 6560832.0, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.4549966162869388, |
| "grad_norm": 0.389616459608078, |
| "learning_rate": 0.00021271373787502818, |
| "loss": 0.1227, |
| "mean_token_accuracy": 0.9669749975204468, |
| "num_tokens": 6571005.0, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.4572524249943606, |
| "grad_norm": 0.8085306286811829, |
| "learning_rate": 0.00021257838935258289, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.9466955840587616, |
| "num_tokens": 6581166.0, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.459508233701782, |
| "grad_norm": 0.5888795256614685, |
| "learning_rate": 0.0002124430408301376, |
| "loss": 0.1511, |
| "mean_token_accuracy": 0.9562407910823822, |
| "num_tokens": 6591391.0, |
| "step": 6470 |
| }, |
| { |
| "epoch": 1.4617640424092038, |
| "grad_norm": 0.6471384763717651, |
| "learning_rate": 0.0002123076923076923, |
| "loss": 0.1102, |
| "mean_token_accuracy": 0.9670383930206299, |
| "num_tokens": 6601626.0, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.4640198511166254, |
| "grad_norm": 0.5917261242866516, |
| "learning_rate": 0.000212172343785247, |
| "loss": 0.135, |
| "mean_token_accuracy": 0.9612210988998413, |
| "num_tokens": 6611751.0, |
| "step": 6490 |
| }, |
| { |
| "epoch": 1.466275659824047, |
| "grad_norm": 0.704924464225769, |
| "learning_rate": 0.0002120369952628017, |
| "loss": 0.1177, |
| "mean_token_accuracy": 0.9678965330123901, |
| "num_tokens": 6621977.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.4685314685314685, |
| "grad_norm": 0.47947514057159424, |
| "learning_rate": 0.0002119016467403564, |
| "loss": 0.1495, |
| "mean_token_accuracy": 0.958800095319748, |
| "num_tokens": 6632164.0, |
| "step": 6510 |
| }, |
| { |
| "epoch": 1.4707872772388901, |
| "grad_norm": 0.7660108208656311, |
| "learning_rate": 0.00021176629821791112, |
| "loss": 0.1384, |
| "mean_token_accuracy": 0.9571514368057251, |
| "num_tokens": 6642397.0, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.4730430859463117, |
| "grad_norm": 0.5778849124908447, |
| "learning_rate": 0.0002116309496954658, |
| "loss": 0.1679, |
| "mean_token_accuracy": 0.9551712095737457, |
| "num_tokens": 6652607.0, |
| "step": 6530 |
| }, |
| { |
| "epoch": 1.4752988946537333, |
| "grad_norm": 0.6587995290756226, |
| "learning_rate": 0.0002114956011730205, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9550755620002747, |
| "num_tokens": 6662770.0, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.477554703361155, |
| "grad_norm": 0.95082026720047, |
| "learning_rate": 0.00021136025265057522, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9517137944698334, |
| "num_tokens": 6672969.0, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.4798105120685765, |
| "grad_norm": 0.4370115399360657, |
| "learning_rate": 0.00021122490412812993, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9526604056358338, |
| "num_tokens": 6683167.0, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.4820663207759983, |
| "grad_norm": 0.4797029197216034, |
| "learning_rate": 0.00021108955560568464, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.9507135689258576, |
| "num_tokens": 6693384.0, |
| "step": 6570 |
| }, |
| { |
| "epoch": 1.4843221294834197, |
| "grad_norm": 0.9287253022193909, |
| "learning_rate": 0.00021095420708323932, |
| "loss": 0.1423, |
| "mean_token_accuracy": 0.960996150970459, |
| "num_tokens": 6703601.0, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.4865779381908415, |
| "grad_norm": 1.040830135345459, |
| "learning_rate": 0.00021081885856079403, |
| "loss": 0.1147, |
| "mean_token_accuracy": 0.9659351110458374, |
| "num_tokens": 6713774.0, |
| "step": 6590 |
| }, |
| { |
| "epoch": 1.488833746898263, |
| "grad_norm": 0.5492799282073975, |
| "learning_rate": 0.00021068351003834874, |
| "loss": 0.1376, |
| "mean_token_accuracy": 0.9622784972190856, |
| "num_tokens": 6723950.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.4910895556056847, |
| "grad_norm": 0.5340641736984253, |
| "learning_rate": 0.00021054816151590345, |
| "loss": 0.1377, |
| "mean_token_accuracy": 0.9588437557220459, |
| "num_tokens": 6734151.0, |
| "step": 6610 |
| }, |
| { |
| "epoch": 1.4933453643131063, |
| "grad_norm": 0.3860383629798889, |
| "learning_rate": 0.00021041281299345814, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9634852707386017, |
| "num_tokens": 6744322.0, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.4956011730205279, |
| "grad_norm": 0.8424534201622009, |
| "learning_rate": 0.00021027746447101285, |
| "loss": 0.1228, |
| "mean_token_accuracy": 0.9646449565887452, |
| "num_tokens": 6754539.0, |
| "step": 6630 |
| }, |
| { |
| "epoch": 1.4978569817279495, |
| "grad_norm": 0.8719351887702942, |
| "learning_rate": 0.00021014211594856756, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9655321180820465, |
| "num_tokens": 6764694.0, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.500112790435371, |
| "grad_norm": 0.4581547677516937, |
| "learning_rate": 0.00021000676742612227, |
| "loss": 0.1326, |
| "mean_token_accuracy": 0.9618205249309539, |
| "num_tokens": 6774840.0, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.5023685991427926, |
| "grad_norm": 0.8103078007698059, |
| "learning_rate": 0.00020987141890367695, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.9630819737911225, |
| "num_tokens": 6785066.0, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.5046244078502142, |
| "grad_norm": 0.6315047740936279, |
| "learning_rate": 0.00020973607038123166, |
| "loss": 0.1473, |
| "mean_token_accuracy": 0.9575309336185456, |
| "num_tokens": 6795252.0, |
| "step": 6670 |
| }, |
| { |
| "epoch": 1.506880216557636, |
| "grad_norm": 0.8702731132507324, |
| "learning_rate": 0.00020960072185878637, |
| "loss": 0.1364, |
| "mean_token_accuracy": 0.9622900664806366, |
| "num_tokens": 6805452.0, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.5091360252650574, |
| "grad_norm": 0.6531904339790344, |
| "learning_rate": 0.00020946537333634108, |
| "loss": 0.1476, |
| "mean_token_accuracy": 0.9573485612869262, |
| "num_tokens": 6815659.0, |
| "step": 6690 |
| }, |
| { |
| "epoch": 1.5113918339724792, |
| "grad_norm": 0.5445907115936279, |
| "learning_rate": 0.00020933002481389576, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.953666216135025, |
| "num_tokens": 6825881.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.5136476426799006, |
| "grad_norm": 0.823176920413971, |
| "learning_rate": 0.00020919467629145047, |
| "loss": 0.1168, |
| "mean_token_accuracy": 0.9674039006233215, |
| "num_tokens": 6836081.0, |
| "step": 6710 |
| }, |
| { |
| "epoch": 1.5159034513873224, |
| "grad_norm": 0.9127213954925537, |
| "learning_rate": 0.00020905932776900518, |
| "loss": 0.1511, |
| "mean_token_accuracy": 0.9592125058174134, |
| "num_tokens": 6846311.0, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.518159260094744, |
| "grad_norm": 0.5406370162963867, |
| "learning_rate": 0.0002089239792465599, |
| "loss": 0.1493, |
| "mean_token_accuracy": 0.9566941797733307, |
| "num_tokens": 6856531.0, |
| "step": 6730 |
| }, |
| { |
| "epoch": 1.5204150688021656, |
| "grad_norm": 0.5357968807220459, |
| "learning_rate": 0.0002087886307241146, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.950842946767807, |
| "num_tokens": 6866646.0, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.5226708775095872, |
| "grad_norm": 0.3849286437034607, |
| "learning_rate": 0.00020865328220166928, |
| "loss": 0.1016, |
| "mean_token_accuracy": 0.9700107276439667, |
| "num_tokens": 6876800.0, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.5249266862170088, |
| "grad_norm": 0.6728788614273071, |
| "learning_rate": 0.000208517933679224, |
| "loss": 0.1458, |
| "mean_token_accuracy": 0.9588227331638336, |
| "num_tokens": 6886918.0, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.5271824949244304, |
| "grad_norm": 1.1982979774475098, |
| "learning_rate": 0.0002083825851567787, |
| "loss": 0.1186, |
| "mean_token_accuracy": 0.9652492642402649, |
| "num_tokens": 6897103.0, |
| "step": 6770 |
| }, |
| { |
| "epoch": 1.529438303631852, |
| "grad_norm": 0.6889848113059998, |
| "learning_rate": 0.0002082472366343334, |
| "loss": 0.1214, |
| "mean_token_accuracy": 0.9651449739933013, |
| "num_tokens": 6907282.0, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.5316941123392738, |
| "grad_norm": 0.5323916077613831, |
| "learning_rate": 0.00020811188811188807, |
| "loss": 0.117, |
| "mean_token_accuracy": 0.9649187743663787, |
| "num_tokens": 6917456.0, |
| "step": 6790 |
| }, |
| { |
| "epoch": 1.5339499210466951, |
| "grad_norm": 0.41671881079673767, |
| "learning_rate": 0.0002079765395894428, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9648886620998383, |
| "num_tokens": 6927691.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.536205729754117, |
| "grad_norm": 0.7938935160636902, |
| "learning_rate": 0.00020784119106699751, |
| "loss": 0.1343, |
| "mean_token_accuracy": 0.9606435716152191, |
| "num_tokens": 6937915.0, |
| "step": 6810 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 0.6101523637771606, |
| "learning_rate": 0.00020770584254455222, |
| "loss": 0.1521, |
| "mean_token_accuracy": 0.9544383645057678, |
| "num_tokens": 6948128.0, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.5407173471689601, |
| "grad_norm": 0.46042972803115845, |
| "learning_rate": 0.00020757049402210688, |
| "loss": 0.1374, |
| "mean_token_accuracy": 0.9600911319255829, |
| "num_tokens": 6958310.0, |
| "step": 6830 |
| }, |
| { |
| "epoch": 1.5429731558763817, |
| "grad_norm": 0.47602182626724243, |
| "learning_rate": 0.00020743514549966162, |
| "loss": 0.162, |
| "mean_token_accuracy": 0.954825884103775, |
| "num_tokens": 6968547.0, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.5452289645838033, |
| "grad_norm": 1.1721361875534058, |
| "learning_rate": 0.00020729979697721633, |
| "loss": 0.141, |
| "mean_token_accuracy": 0.9617628633975983, |
| "num_tokens": 6978722.0, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.547484773291225, |
| "grad_norm": 0.7760915160179138, |
| "learning_rate": 0.00020716444845477104, |
| "loss": 0.1268, |
| "mean_token_accuracy": 0.9629230618476867, |
| "num_tokens": 6988914.0, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.5497405819986465, |
| "grad_norm": 0.6691769957542419, |
| "learning_rate": 0.00020702909993232575, |
| "loss": 0.1492, |
| "mean_token_accuracy": 0.9562802612781525, |
| "num_tokens": 6999110.0, |
| "step": 6870 |
| }, |
| { |
| "epoch": 1.551996390706068, |
| "grad_norm": 0.565916895866394, |
| "learning_rate": 0.0002068937514098804, |
| "loss": 0.1294, |
| "mean_token_accuracy": 0.9653188228607178, |
| "num_tokens": 7009304.0, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.5542521994134897, |
| "grad_norm": 0.6766910552978516, |
| "learning_rate": 0.00020675840288743514, |
| "loss": 0.1093, |
| "mean_token_accuracy": 0.9655160486698151, |
| "num_tokens": 7019515.0, |
| "step": 6890 |
| }, |
| { |
| "epoch": 1.5565080081209115, |
| "grad_norm": 0.6299457550048828, |
| "learning_rate": 0.00020662305436498985, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9556167304515839, |
| "num_tokens": 7029604.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.5587638168283329, |
| "grad_norm": 0.8913159370422363, |
| "learning_rate": 0.00020648770584254456, |
| "loss": 0.1272, |
| "mean_token_accuracy": 0.9642847359180451, |
| "num_tokens": 7039834.0, |
| "step": 6910 |
| }, |
| { |
| "epoch": 1.5610196255357547, |
| "grad_norm": 0.7572903037071228, |
| "learning_rate": 0.00020635235732009921, |
| "loss": 0.1319, |
| "mean_token_accuracy": 0.9626257240772247, |
| "num_tokens": 7049997.0, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.563275434243176, |
| "grad_norm": 0.5362765789031982, |
| "learning_rate": 0.00020621700879765392, |
| "loss": 0.1401, |
| "mean_token_accuracy": 0.9602336525917053, |
| "num_tokens": 7060185.0, |
| "step": 6930 |
| }, |
| { |
| "epoch": 1.5655312429505979, |
| "grad_norm": 0.48010891675949097, |
| "learning_rate": 0.00020608166027520866, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9639032125473023, |
| "num_tokens": 7070418.0, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.5677870516580192, |
| "grad_norm": 0.5274568200111389, |
| "learning_rate": 0.00020594631175276337, |
| "loss": 0.1453, |
| "mean_token_accuracy": 0.961913114786148, |
| "num_tokens": 7080638.0, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.570042860365441, |
| "grad_norm": 0.7161872386932373, |
| "learning_rate": 0.00020581096323031803, |
| "loss": 0.1229, |
| "mean_token_accuracy": 0.966279947757721, |
| "num_tokens": 7090667.0, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.5722986690728626, |
| "grad_norm": 0.6061383485794067, |
| "learning_rate": 0.00020567561470787274, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.958649742603302, |
| "num_tokens": 7100841.0, |
| "step": 6970 |
| }, |
| { |
| "epoch": 1.5745544777802842, |
| "grad_norm": 0.8167970180511475, |
| "learning_rate": 0.00020554026618542745, |
| "loss": 0.1267, |
| "mean_token_accuracy": 0.9638277113437652, |
| "num_tokens": 7111056.0, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.5768102864877058, |
| "grad_norm": 0.6468409895896912, |
| "learning_rate": 0.00020540491766298218, |
| "loss": 0.1179, |
| "mean_token_accuracy": 0.9672865152359009, |
| "num_tokens": 7121242.0, |
| "step": 6990 |
| }, |
| { |
| "epoch": 1.5790660951951274, |
| "grad_norm": 0.7238821983337402, |
| "learning_rate": 0.00020526956914053684, |
| "loss": 0.1433, |
| "mean_token_accuracy": 0.9589922726154327, |
| "num_tokens": 7131470.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.581321903902549, |
| "grad_norm": 0.5528222322463989, |
| "learning_rate": 0.00020513422061809155, |
| "loss": 0.1357, |
| "mean_token_accuracy": 0.9585762560367584, |
| "num_tokens": 7141689.0, |
| "step": 7010 |
| }, |
| { |
| "epoch": 1.5835777126099706, |
| "grad_norm": 0.5497334003448486, |
| "learning_rate": 0.00020499887209564626, |
| "loss": 0.1169, |
| "mean_token_accuracy": 0.9675468802452087, |
| "num_tokens": 7151896.0, |
| "step": 7020 |
| }, |
| { |
| "epoch": 1.5858335213173924, |
| "grad_norm": 0.6677653193473816, |
| "learning_rate": 0.000204863523573201, |
| "loss": 0.1397, |
| "mean_token_accuracy": 0.961426842212677, |
| "num_tokens": 7162088.0, |
| "step": 7030 |
| }, |
| { |
| "epoch": 1.5880893300248138, |
| "grad_norm": 0.8899350762367249, |
| "learning_rate": 0.0002047281750507557, |
| "loss": 0.1373, |
| "mean_token_accuracy": 0.9617144644260407, |
| "num_tokens": 7172269.0, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.5903451387322356, |
| "grad_norm": 0.5083749890327454, |
| "learning_rate": 0.00020459282652831036, |
| "loss": 0.1105, |
| "mean_token_accuracy": 0.9664644658565521, |
| "num_tokens": 7182507.0, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.592600947439657, |
| "grad_norm": 0.4294250011444092, |
| "learning_rate": 0.00020445747800586507, |
| "loss": 0.1074, |
| "mean_token_accuracy": 0.9671262919902801, |
| "num_tokens": 7192719.0, |
| "step": 7060 |
| }, |
| { |
| "epoch": 1.5948567561470788, |
| "grad_norm": 0.46395134925842285, |
| "learning_rate": 0.00020432212948341978, |
| "loss": 0.1865, |
| "mean_token_accuracy": 0.9541430771350861, |
| "num_tokens": 7202885.0, |
| "step": 7070 |
| }, |
| { |
| "epoch": 1.5971125648545004, |
| "grad_norm": 0.6415492296218872, |
| "learning_rate": 0.00020418678096097452, |
| "loss": 0.1065, |
| "mean_token_accuracy": 0.9670535027980804, |
| "num_tokens": 7213086.0, |
| "step": 7080 |
| }, |
| { |
| "epoch": 1.599368373561922, |
| "grad_norm": 0.4826420247554779, |
| "learning_rate": 0.00020405143243852917, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9614757120609283, |
| "num_tokens": 7223238.0, |
| "step": 7090 |
| }, |
| { |
| "epoch": 1.6016241822693436, |
| "grad_norm": 0.8876851201057434, |
| "learning_rate": 0.00020391608391608388, |
| "loss": 0.1365, |
| "mean_token_accuracy": 0.9598813712596893, |
| "num_tokens": 7233407.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.6038799909767651, |
| "grad_norm": 0.8092009425163269, |
| "learning_rate": 0.0002037807353936386, |
| "loss": 0.1901, |
| "mean_token_accuracy": 0.9505953013896942, |
| "num_tokens": 7243542.0, |
| "step": 7110 |
| }, |
| { |
| "epoch": 1.6061357996841867, |
| "grad_norm": 0.5989572405815125, |
| "learning_rate": 0.0002036453868711933, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9629263341426849, |
| "num_tokens": 7253772.0, |
| "step": 7120 |
| }, |
| { |
| "epoch": 1.6083916083916083, |
| "grad_norm": 0.832798421382904, |
| "learning_rate": 0.00020351003834874799, |
| "loss": 0.0883, |
| "mean_token_accuracy": 0.9736064672470093, |
| "num_tokens": 7263938.0, |
| "step": 7130 |
| }, |
| { |
| "epoch": 1.6106474170990301, |
| "grad_norm": 0.7204803228378296, |
| "learning_rate": 0.0002033746898263027, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9653593361377716, |
| "num_tokens": 7274164.0, |
| "step": 7140 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 0.5595267415046692, |
| "learning_rate": 0.0002032393413038574, |
| "loss": 0.1233, |
| "mean_token_accuracy": 0.9639688670635224, |
| "num_tokens": 7284362.0, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.6151590345138733, |
| "grad_norm": 0.4636051654815674, |
| "learning_rate": 0.00020310399278141212, |
| "loss": 0.1573, |
| "mean_token_accuracy": 0.9560793101787567, |
| "num_tokens": 7294517.0, |
| "step": 7160 |
| }, |
| { |
| "epoch": 1.6174148432212947, |
| "grad_norm": 0.5867611169815063, |
| "learning_rate": 0.0002029686442589668, |
| "loss": 0.1055, |
| "mean_token_accuracy": 0.967164421081543, |
| "num_tokens": 7304698.0, |
| "step": 7170 |
| }, |
| { |
| "epoch": 1.6196706519287165, |
| "grad_norm": 0.5442803502082825, |
| "learning_rate": 0.0002028332957365215, |
| "loss": 0.2436, |
| "mean_token_accuracy": 0.9414969682693481, |
| "num_tokens": 7314848.0, |
| "step": 7180 |
| }, |
| { |
| "epoch": 1.621926460636138, |
| "grad_norm": 1.0676794052124023, |
| "learning_rate": 0.00020269794721407622, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.965214467048645, |
| "num_tokens": 7325069.0, |
| "step": 7190 |
| }, |
| { |
| "epoch": 1.6241822693435597, |
| "grad_norm": 0.5160236954689026, |
| "learning_rate": 0.00020256259869163093, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9648967385292053, |
| "num_tokens": 7335222.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.6264380780509813, |
| "grad_norm": 0.6989278197288513, |
| "learning_rate": 0.00020242725016918564, |
| "loss": 0.1307, |
| "mean_token_accuracy": 0.9661418199539185, |
| "num_tokens": 7345433.0, |
| "step": 7210 |
| }, |
| { |
| "epoch": 1.6286938867584029, |
| "grad_norm": 0.6635163426399231, |
| "learning_rate": 0.00020229190164674032, |
| "loss": 0.1211, |
| "mean_token_accuracy": 0.963777381181717, |
| "num_tokens": 7355635.0, |
| "step": 7220 |
| }, |
| { |
| "epoch": 1.6309496954658245, |
| "grad_norm": 0.763543963432312, |
| "learning_rate": 0.00020215655312429503, |
| "loss": 0.1534, |
| "mean_token_accuracy": 0.9606209695339203, |
| "num_tokens": 7365833.0, |
| "step": 7230 |
| }, |
| { |
| "epoch": 1.633205504173246, |
| "grad_norm": 0.5316635966300964, |
| "learning_rate": 0.00020202120460184974, |
| "loss": 0.1378, |
| "mean_token_accuracy": 0.9582312524318695, |
| "num_tokens": 7376065.0, |
| "step": 7240 |
| }, |
| { |
| "epoch": 1.6354613128806679, |
| "grad_norm": 0.683691680431366, |
| "learning_rate": 0.00020188585607940445, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9717151343822479, |
| "num_tokens": 7386250.0, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.6377171215880892, |
| "grad_norm": 0.584567129611969, |
| "learning_rate": 0.00020175050755695913, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9670240402221679, |
| "num_tokens": 7396448.0, |
| "step": 7260 |
| }, |
| { |
| "epoch": 1.639972930295511, |
| "grad_norm": 0.34023529291152954, |
| "learning_rate": 0.00020161515903451384, |
| "loss": 0.1148, |
| "mean_token_accuracy": 0.9664259791374207, |
| "num_tokens": 7406629.0, |
| "step": 7270 |
| }, |
| { |
| "epoch": 1.6422287390029324, |
| "grad_norm": 0.6313138008117676, |
| "learning_rate": 0.00020147981051206855, |
| "loss": 0.1484, |
| "mean_token_accuracy": 0.9585749089717865, |
| "num_tokens": 7416830.0, |
| "step": 7280 |
| }, |
| { |
| "epoch": 1.6444845477103542, |
| "grad_norm": 0.7951473593711853, |
| "learning_rate": 0.00020134446198962326, |
| "loss": 0.1256, |
| "mean_token_accuracy": 0.9634339690208436, |
| "num_tokens": 7427068.0, |
| "step": 7290 |
| }, |
| { |
| "epoch": 1.6467403564177756, |
| "grad_norm": 0.7924121618270874, |
| "learning_rate": 0.00020120911346717795, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.9546219527721405, |
| "num_tokens": 7437275.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.6489961651251974, |
| "grad_norm": 0.3072904944419861, |
| "learning_rate": 0.00020107376494473266, |
| "loss": 0.1016, |
| "mean_token_accuracy": 0.9672701954841614, |
| "num_tokens": 7447492.0, |
| "step": 7310 |
| }, |
| { |
| "epoch": 1.651251973832619, |
| "grad_norm": 0.46338242292404175, |
| "learning_rate": 0.00020093841642228737, |
| "loss": 0.1036, |
| "mean_token_accuracy": 0.968935513496399, |
| "num_tokens": 7457654.0, |
| "step": 7320 |
| }, |
| { |
| "epoch": 1.6535077825400406, |
| "grad_norm": 0.8118281960487366, |
| "learning_rate": 0.00020080306789984208, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9685357689857483, |
| "num_tokens": 7467849.0, |
| "step": 7330 |
| }, |
| { |
| "epoch": 1.6557635912474622, |
| "grad_norm": 0.7755963802337646, |
| "learning_rate": 0.00020066771937739676, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9649538099765778, |
| "num_tokens": 7477315.0, |
| "step": 7340 |
| }, |
| { |
| "epoch": 1.6580193999548838, |
| "grad_norm": 0.5522985458374023, |
| "learning_rate": 0.00020053237085495147, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.9658150196075439, |
| "num_tokens": 7487505.0, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.6602752086623056, |
| "grad_norm": 0.5838783383369446, |
| "learning_rate": 0.00020039702233250618, |
| "loss": 0.1244, |
| "mean_token_accuracy": 0.9633826553821564, |
| "num_tokens": 7497725.0, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.662531017369727, |
| "grad_norm": 0.9119488596916199, |
| "learning_rate": 0.0002002616738100609, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.9742124974727631, |
| "num_tokens": 7507941.0, |
| "step": 7370 |
| }, |
| { |
| "epoch": 1.6647868260771488, |
| "grad_norm": 1.0427502393722534, |
| "learning_rate": 0.0002001263252876156, |
| "loss": 0.1457, |
| "mean_token_accuracy": 0.9598093390464782, |
| "num_tokens": 7518134.0, |
| "step": 7380 |
| }, |
| { |
| "epoch": 1.6670426347845702, |
| "grad_norm": 0.8163001537322998, |
| "learning_rate": 0.00019999097676517028, |
| "loss": 0.1522, |
| "mean_token_accuracy": 0.9592224955558777, |
| "num_tokens": 7528369.0, |
| "step": 7390 |
| }, |
| { |
| "epoch": 1.669298443491992, |
| "grad_norm": 0.37992551922798157, |
| "learning_rate": 0.000199855628242725, |
| "loss": 0.1174, |
| "mean_token_accuracy": 0.9663581132888794, |
| "num_tokens": 7538476.0, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.6715542521994133, |
| "grad_norm": 0.7863211035728455, |
| "learning_rate": 0.0001997202797202797, |
| "loss": 0.1304, |
| "mean_token_accuracy": 0.9637087225914002, |
| "num_tokens": 7548657.0, |
| "step": 7410 |
| }, |
| { |
| "epoch": 1.6738100609068352, |
| "grad_norm": 0.6097214818000793, |
| "learning_rate": 0.0001995849311978344, |
| "loss": 0.1527, |
| "mean_token_accuracy": 0.9590888619422913, |
| "num_tokens": 7558840.0, |
| "step": 7420 |
| }, |
| { |
| "epoch": 1.6760658696142567, |
| "grad_norm": 0.44298750162124634, |
| "learning_rate": 0.0001994495826753891, |
| "loss": 0.1137, |
| "mean_token_accuracy": 0.9671386659145356, |
| "num_tokens": 7569072.0, |
| "step": 7430 |
| }, |
| { |
| "epoch": 1.6783216783216783, |
| "grad_norm": 0.7080332636833191, |
| "learning_rate": 0.0001993142341529438, |
| "loss": 0.1486, |
| "mean_token_accuracy": 0.9594590961933136, |
| "num_tokens": 7579245.0, |
| "step": 7440 |
| }, |
| { |
| "epoch": 1.6805774870291, |
| "grad_norm": 0.4493122100830078, |
| "learning_rate": 0.0001991788856304985, |
| "loss": 0.1082, |
| "mean_token_accuracy": 0.9686892807483674, |
| "num_tokens": 7589468.0, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.6828332957365215, |
| "grad_norm": 1.414336919784546, |
| "learning_rate": 0.00019904353710805322, |
| "loss": 0.1421, |
| "mean_token_accuracy": 0.9614466667175293, |
| "num_tokens": 7599607.0, |
| "step": 7460 |
| }, |
| { |
| "epoch": 1.685089104443943, |
| "grad_norm": 0.7419303059577942, |
| "learning_rate": 0.0001989081885856079, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9705542147159576, |
| "num_tokens": 7609836.0, |
| "step": 7470 |
| }, |
| { |
| "epoch": 1.6873449131513647, |
| "grad_norm": 0.7655690908432007, |
| "learning_rate": 0.00019877284006316261, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.9575229167938233, |
| "num_tokens": 7620039.0, |
| "step": 7480 |
| }, |
| { |
| "epoch": 1.6896007218587865, |
| "grad_norm": 0.6142362356185913, |
| "learning_rate": 0.00019863749154071732, |
| "loss": 0.1352, |
| "mean_token_accuracy": 0.9614336788654327, |
| "num_tokens": 7630163.0, |
| "step": 7490 |
| }, |
| { |
| "epoch": 1.6918565305662079, |
| "grad_norm": 0.4549749791622162, |
| "learning_rate": 0.00019850214301827203, |
| "loss": 0.1184, |
| "mean_token_accuracy": 0.9650885164737701, |
| "num_tokens": 7640378.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.6941123392736297, |
| "grad_norm": 0.7649783492088318, |
| "learning_rate": 0.00019836679449582674, |
| "loss": 0.1125, |
| "mean_token_accuracy": 0.9654013931751251, |
| "num_tokens": 7650518.0, |
| "step": 7510 |
| }, |
| { |
| "epoch": 1.696368147981051, |
| "grad_norm": 0.8831228613853455, |
| "learning_rate": 0.00019823144597338143, |
| "loss": 0.1224, |
| "mean_token_accuracy": 0.9647625327110291, |
| "num_tokens": 7660716.0, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.6986239566884729, |
| "grad_norm": 0.5036336779594421, |
| "learning_rate": 0.00019809609745093614, |
| "loss": 0.1187, |
| "mean_token_accuracy": 0.9658556759357453, |
| "num_tokens": 7670880.0, |
| "step": 7530 |
| }, |
| { |
| "epoch": 1.7008797653958945, |
| "grad_norm": 0.909724235534668, |
| "learning_rate": 0.00019796074892849085, |
| "loss": 0.1226, |
| "mean_token_accuracy": 0.9632156550884247, |
| "num_tokens": 7681090.0, |
| "step": 7540 |
| }, |
| { |
| "epoch": 1.703135574103316, |
| "grad_norm": 0.35965245962142944, |
| "learning_rate": 0.00019782540040604556, |
| "loss": 0.1177, |
| "mean_token_accuracy": 0.964445275068283, |
| "num_tokens": 7691232.0, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.7053913828107377, |
| "grad_norm": 0.5753873586654663, |
| "learning_rate": 0.00019769005188360024, |
| "loss": 0.1234, |
| "mean_token_accuracy": 0.9669915854930877, |
| "num_tokens": 7701429.0, |
| "step": 7560 |
| }, |
| { |
| "epoch": 1.7076471915181592, |
| "grad_norm": 0.4937607944011688, |
| "learning_rate": 0.00019755470336115495, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.963130658864975, |
| "num_tokens": 7711663.0, |
| "step": 7570 |
| }, |
| { |
| "epoch": 1.7099030002255808, |
| "grad_norm": 0.48959338665008545, |
| "learning_rate": 0.00019741935483870966, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9724132180213928, |
| "num_tokens": 7721895.0, |
| "step": 7580 |
| }, |
| { |
| "epoch": 1.7121588089330024, |
| "grad_norm": 0.7161306738853455, |
| "learning_rate": 0.00019728400631626437, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9645233631134034, |
| "num_tokens": 7732066.0, |
| "step": 7590 |
| }, |
| { |
| "epoch": 1.7144146176404242, |
| "grad_norm": 0.9491952061653137, |
| "learning_rate": 0.00019714865779381905, |
| "loss": 0.1113, |
| "mean_token_accuracy": 0.9660575449466705, |
| "num_tokens": 7742256.0, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.7166704263478456, |
| "grad_norm": 0.7226278185844421, |
| "learning_rate": 0.00019701330927137376, |
| "loss": 0.0989, |
| "mean_token_accuracy": 0.9679902136325836, |
| "num_tokens": 7752469.0, |
| "step": 7610 |
| }, |
| { |
| "epoch": 1.7189262350552674, |
| "grad_norm": 0.3322307765483856, |
| "learning_rate": 0.00019687796074892847, |
| "loss": 0.1009, |
| "mean_token_accuracy": 0.970562607049942, |
| "num_tokens": 7762644.0, |
| "step": 7620 |
| }, |
| { |
| "epoch": 1.7211820437626888, |
| "grad_norm": 0.629764199256897, |
| "learning_rate": 0.00019674261222648318, |
| "loss": 0.0939, |
| "mean_token_accuracy": 0.9733090102672577, |
| "num_tokens": 7772882.0, |
| "step": 7630 |
| }, |
| { |
| "epoch": 1.7234378524701106, |
| "grad_norm": 0.4462561011314392, |
| "learning_rate": 0.00019660726370403786, |
| "loss": 0.0984, |
| "mean_token_accuracy": 0.9717909157276153, |
| "num_tokens": 7783096.0, |
| "step": 7640 |
| }, |
| { |
| "epoch": 1.725693661177532, |
| "grad_norm": 0.504454493522644, |
| "learning_rate": 0.00019647191518159257, |
| "loss": 0.0827, |
| "mean_token_accuracy": 0.9726765751838684, |
| "num_tokens": 7793293.0, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.7279494698849538, |
| "grad_norm": 0.6186042428016663, |
| "learning_rate": 0.00019633656665914728, |
| "loss": 0.0995, |
| "mean_token_accuracy": 0.9694512605667114, |
| "num_tokens": 7803513.0, |
| "step": 7660 |
| }, |
| { |
| "epoch": 1.7302052785923754, |
| "grad_norm": 0.6722903251647949, |
| "learning_rate": 0.000196201218136702, |
| "loss": 0.1414, |
| "mean_token_accuracy": 0.9589116036891937, |
| "num_tokens": 7813743.0, |
| "step": 7670 |
| }, |
| { |
| "epoch": 1.732461087299797, |
| "grad_norm": 0.818533182144165, |
| "learning_rate": 0.0001960658696142567, |
| "loss": 0.1103, |
| "mean_token_accuracy": 0.9665028691291809, |
| "num_tokens": 7823182.0, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.7347168960072186, |
| "grad_norm": 0.43735024333000183, |
| "learning_rate": 0.0001959305210918114, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.969660896062851, |
| "num_tokens": 7833362.0, |
| "step": 7690 |
| }, |
| { |
| "epoch": 1.7369727047146402, |
| "grad_norm": 0.8122400045394897, |
| "learning_rate": 0.0001957951725693661, |
| "loss": 0.1217, |
| "mean_token_accuracy": 0.9671069800853729, |
| "num_tokens": 7843548.0, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.739228513422062, |
| "grad_norm": 0.9384815692901611, |
| "learning_rate": 0.0001956598240469208, |
| "loss": 0.1085, |
| "mean_token_accuracy": 0.9692832350730896, |
| "num_tokens": 7853778.0, |
| "step": 7710 |
| }, |
| { |
| "epoch": 1.7414843221294833, |
| "grad_norm": 0.6304382085800171, |
| "learning_rate": 0.00019552447552447552, |
| "loss": 0.1068, |
| "mean_token_accuracy": 0.9693370401859284, |
| "num_tokens": 7863983.0, |
| "step": 7720 |
| }, |
| { |
| "epoch": 1.7437401308369052, |
| "grad_norm": 0.6020340323448181, |
| "learning_rate": 0.0001953891270020302, |
| "loss": 0.0795, |
| "mean_token_accuracy": 0.9766138076782227, |
| "num_tokens": 7874213.0, |
| "step": 7730 |
| }, |
| { |
| "epoch": 1.7459959395443265, |
| "grad_norm": 0.6551034450531006, |
| "learning_rate": 0.0001952537784795849, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.9719179630279541, |
| "num_tokens": 7884443.0, |
| "step": 7740 |
| }, |
| { |
| "epoch": 1.7482517482517483, |
| "grad_norm": 0.9742296934127808, |
| "learning_rate": 0.00019511842995713962, |
| "loss": 0.1558, |
| "mean_token_accuracy": 0.9584499776363373, |
| "num_tokens": 7894597.0, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.7505075569591697, |
| "grad_norm": 0.6788113117218018, |
| "learning_rate": 0.00019498308143469433, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9651906430721283, |
| "num_tokens": 7904692.0, |
| "step": 7760 |
| }, |
| { |
| "epoch": 1.7527633656665915, |
| "grad_norm": 0.8864907026290894, |
| "learning_rate": 0.000194847732912249, |
| "loss": 0.1254, |
| "mean_token_accuracy": 0.9674174845218658, |
| "num_tokens": 7914807.0, |
| "step": 7770 |
| }, |
| { |
| "epoch": 1.7550191743740131, |
| "grad_norm": 0.9627271294593811, |
| "learning_rate": 0.00019471238438980372, |
| "loss": 0.0996, |
| "mean_token_accuracy": 0.9694191575050354, |
| "num_tokens": 7925013.0, |
| "step": 7780 |
| }, |
| { |
| "epoch": 1.7572749830814347, |
| "grad_norm": 0.5663970708847046, |
| "learning_rate": 0.00019457703586735843, |
| "loss": 0.1393, |
| "mean_token_accuracy": 0.9625206768512726, |
| "num_tokens": 7935221.0, |
| "step": 7790 |
| }, |
| { |
| "epoch": 1.7595307917888563, |
| "grad_norm": 0.506974995136261, |
| "learning_rate": 0.00019444168734491314, |
| "loss": 0.1484, |
| "mean_token_accuracy": 0.9573349118232727, |
| "num_tokens": 7945383.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.7617866004962779, |
| "grad_norm": 0.3117770254611969, |
| "learning_rate": 0.00019430633882246782, |
| "loss": 0.1104, |
| "mean_token_accuracy": 0.9667424440383912, |
| "num_tokens": 7955526.0, |
| "step": 7810 |
| }, |
| { |
| "epoch": 1.7640424092036995, |
| "grad_norm": 0.7051275968551636, |
| "learning_rate": 0.00019417099030002253, |
| "loss": 0.1049, |
| "mean_token_accuracy": 0.9673680782318115, |
| "num_tokens": 7965740.0, |
| "step": 7820 |
| }, |
| { |
| "epoch": 1.766298217911121, |
| "grad_norm": 0.6643967032432556, |
| "learning_rate": 0.00019403564177757724, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9644637405872345, |
| "num_tokens": 7975965.0, |
| "step": 7830 |
| }, |
| { |
| "epoch": 1.7685540266185429, |
| "grad_norm": 0.8619371652603149, |
| "learning_rate": 0.00019390029325513195, |
| "loss": 0.1165, |
| "mean_token_accuracy": 0.9674583613872528, |
| "num_tokens": 7986109.0, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.7708098353259643, |
| "grad_norm": 0.7922738790512085, |
| "learning_rate": 0.00019376494473268666, |
| "loss": 0.1456, |
| "mean_token_accuracy": 0.9613803088665008, |
| "num_tokens": 7996242.0, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.773065644033386, |
| "grad_norm": 0.49722597002983093, |
| "learning_rate": 0.00019362959621024135, |
| "loss": 0.1501, |
| "mean_token_accuracy": 0.9591331839561462, |
| "num_tokens": 8006480.0, |
| "step": 7860 |
| }, |
| { |
| "epoch": 1.7753214527408074, |
| "grad_norm": 0.797990083694458, |
| "learning_rate": 0.00019349424768779606, |
| "loss": 0.1185, |
| "mean_token_accuracy": 0.9682628512382507, |
| "num_tokens": 8016688.0, |
| "step": 7870 |
| }, |
| { |
| "epoch": 1.7775772614482293, |
| "grad_norm": 0.6355772614479065, |
| "learning_rate": 0.00019335889916535077, |
| "loss": 0.1186, |
| "mean_token_accuracy": 0.965477454662323, |
| "num_tokens": 8026854.0, |
| "step": 7880 |
| }, |
| { |
| "epoch": 1.7798330701556508, |
| "grad_norm": 0.4601174294948578, |
| "learning_rate": 0.00019322355064290548, |
| "loss": 0.1037, |
| "mean_token_accuracy": 0.969682228565216, |
| "num_tokens": 8037088.0, |
| "step": 7890 |
| }, |
| { |
| "epoch": 1.7820888788630724, |
| "grad_norm": 0.6090744733810425, |
| "learning_rate": 0.00019308820212046016, |
| "loss": 0.0997, |
| "mean_token_accuracy": 0.9699548482894897, |
| "num_tokens": 8047280.0, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.784344687570494, |
| "grad_norm": 0.6074991226196289, |
| "learning_rate": 0.00019295285359801487, |
| "loss": 0.1002, |
| "mean_token_accuracy": 0.9707536101341248, |
| "num_tokens": 8057474.0, |
| "step": 7910 |
| }, |
| { |
| "epoch": 1.7866004962779156, |
| "grad_norm": 0.4148198068141937, |
| "learning_rate": 0.00019281750507556958, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.9689879715442657, |
| "num_tokens": 8067611.0, |
| "step": 7920 |
| }, |
| { |
| "epoch": 1.7888563049853372, |
| "grad_norm": 0.6076057553291321, |
| "learning_rate": 0.0001926821565531243, |
| "loss": 0.1021, |
| "mean_token_accuracy": 0.9703522503376008, |
| "num_tokens": 8077802.0, |
| "step": 7930 |
| }, |
| { |
| "epoch": 1.7911121136927588, |
| "grad_norm": 1.0338988304138184, |
| "learning_rate": 0.00019254680803067897, |
| "loss": 0.1797, |
| "mean_token_accuracy": 0.9537634730339051, |
| "num_tokens": 8087956.0, |
| "step": 7940 |
| }, |
| { |
| "epoch": 1.7933679224001806, |
| "grad_norm": 0.5177671909332275, |
| "learning_rate": 0.00019241145950823368, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.9720286428928375, |
| "num_tokens": 8098191.0, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.795623731107602, |
| "grad_norm": 0.6910755038261414, |
| "learning_rate": 0.0001922761109857884, |
| "loss": 0.1018, |
| "mean_token_accuracy": 0.9682543575763702, |
| "num_tokens": 8108272.0, |
| "step": 7960 |
| }, |
| { |
| "epoch": 1.7978795398150238, |
| "grad_norm": 0.6151471138000488, |
| "learning_rate": 0.0001921407624633431, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9646850287914276, |
| "num_tokens": 8118449.0, |
| "step": 7970 |
| }, |
| { |
| "epoch": 1.8001353485224452, |
| "grad_norm": 0.8103246092796326, |
| "learning_rate": 0.00019200541394089778, |
| "loss": 0.1128, |
| "mean_token_accuracy": 0.9669484615325927, |
| "num_tokens": 8128647.0, |
| "step": 7980 |
| }, |
| { |
| "epoch": 1.802391157229867, |
| "grad_norm": 0.6422575116157532, |
| "learning_rate": 0.0001918700654184525, |
| "loss": 0.0805, |
| "mean_token_accuracy": 0.9736741304397583, |
| "num_tokens": 8138873.0, |
| "step": 7990 |
| }, |
| { |
| "epoch": 1.8046469659372886, |
| "grad_norm": 0.5007938742637634, |
| "learning_rate": 0.0001917347168960072, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9583836376667023, |
| "num_tokens": 8149003.0, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.8069027746447102, |
| "grad_norm": 0.8241115808486938, |
| "learning_rate": 0.0001915993683735619, |
| "loss": 0.1099, |
| "mean_token_accuracy": 0.9675259053707123, |
| "num_tokens": 8159211.0, |
| "step": 8010 |
| }, |
| { |
| "epoch": 1.8091585833521318, |
| "grad_norm": 0.9143343567848206, |
| "learning_rate": 0.00019146401985111662, |
| "loss": 0.1099, |
| "mean_token_accuracy": 0.9669674336910248, |
| "num_tokens": 8169432.0, |
| "step": 8020 |
| }, |
| { |
| "epoch": 1.8114143920595533, |
| "grad_norm": 0.9340387582778931, |
| "learning_rate": 0.0001913286713286713, |
| "loss": 0.1322, |
| "mean_token_accuracy": 0.9651595711708069, |
| "num_tokens": 8179601.0, |
| "step": 8030 |
| }, |
| { |
| "epoch": 1.813670200766975, |
| "grad_norm": 1.0919502973556519, |
| "learning_rate": 0.00019119332280622601, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.95865718126297, |
| "num_tokens": 8189813.0, |
| "step": 8040 |
| }, |
| { |
| "epoch": 1.8159260094743965, |
| "grad_norm": 0.9550254940986633, |
| "learning_rate": 0.00019105797428378072, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9707346975803375, |
| "num_tokens": 8200037.0, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 0.6538637280464172, |
| "learning_rate": 0.00019092262576133543, |
| "loss": 0.1331, |
| "mean_token_accuracy": 0.9647262156009674, |
| "num_tokens": 8210241.0, |
| "step": 8060 |
| }, |
| { |
| "epoch": 1.8204376268892397, |
| "grad_norm": 0.5565653443336487, |
| "learning_rate": 0.00019078727723889012, |
| "loss": 0.1207, |
| "mean_token_accuracy": 0.9637064814567566, |
| "num_tokens": 8220471.0, |
| "step": 8070 |
| }, |
| { |
| "epoch": 1.8226934355966615, |
| "grad_norm": 0.804456889629364, |
| "learning_rate": 0.00019065192871644483, |
| "loss": 0.1091, |
| "mean_token_accuracy": 0.9688849687576294, |
| "num_tokens": 8230699.0, |
| "step": 8080 |
| }, |
| { |
| "epoch": 1.824949244304083, |
| "grad_norm": 0.5959724187850952, |
| "learning_rate": 0.00019051658019399954, |
| "loss": 0.1176, |
| "mean_token_accuracy": 0.9674373865127563, |
| "num_tokens": 8240936.0, |
| "step": 8090 |
| }, |
| { |
| "epoch": 1.8272050530115047, |
| "grad_norm": 0.671328067779541, |
| "learning_rate": 0.00019038123167155425, |
| "loss": 0.1, |
| "mean_token_accuracy": 0.9702531158924103, |
| "num_tokens": 8251138.0, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.829460861718926, |
| "grad_norm": 0.5372695922851562, |
| "learning_rate": 0.00019024588314910893, |
| "loss": 0.1265, |
| "mean_token_accuracy": 0.9620508432388306, |
| "num_tokens": 8261303.0, |
| "step": 8110 |
| }, |
| { |
| "epoch": 1.831716670426348, |
| "grad_norm": 0.6847373843193054, |
| "learning_rate": 0.00019011053462666364, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.9624595642089844, |
| "num_tokens": 8271418.0, |
| "step": 8120 |
| }, |
| { |
| "epoch": 1.8339724791337695, |
| "grad_norm": 0.5652275085449219, |
| "learning_rate": 0.00018997518610421835, |
| "loss": 0.0966, |
| "mean_token_accuracy": 0.970450347661972, |
| "num_tokens": 8281647.0, |
| "step": 8130 |
| }, |
| { |
| "epoch": 1.836228287841191, |
| "grad_norm": 0.21371297538280487, |
| "learning_rate": 0.00018983983758177306, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9742642045021057, |
| "num_tokens": 8291483.0, |
| "step": 8140 |
| }, |
| { |
| "epoch": 1.8384840965486127, |
| "grad_norm": 0.7062776684761047, |
| "learning_rate": 0.00018970448905932777, |
| "loss": 0.1026, |
| "mean_token_accuracy": 0.9685595810413361, |
| "num_tokens": 8301705.0, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.8407399052560343, |
| "grad_norm": 0.6951163411140442, |
| "learning_rate": 0.00018956914053688245, |
| "loss": 0.0935, |
| "mean_token_accuracy": 0.9725548028945923, |
| "num_tokens": 8311909.0, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.8429957139634559, |
| "grad_norm": 0.5619714260101318, |
| "learning_rate": 0.00018943379201443716, |
| "loss": 0.105, |
| "mean_token_accuracy": 0.9708254158496856, |
| "num_tokens": 8322106.0, |
| "step": 8170 |
| }, |
| { |
| "epoch": 1.8452515226708774, |
| "grad_norm": 0.7393009662628174, |
| "learning_rate": 0.00018929844349199187, |
| "loss": 0.1224, |
| "mean_token_accuracy": 0.9665905058383941, |
| "num_tokens": 8332162.0, |
| "step": 8180 |
| }, |
| { |
| "epoch": 1.8475073313782993, |
| "grad_norm": 0.705487847328186, |
| "learning_rate": 0.00018916309496954658, |
| "loss": 0.1046, |
| "mean_token_accuracy": 0.9709407091140747, |
| "num_tokens": 8341682.0, |
| "step": 8190 |
| }, |
| { |
| "epoch": 1.8497631400857206, |
| "grad_norm": 0.7456323504447937, |
| "learning_rate": 0.00018902774644710126, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9597473442554474, |
| "num_tokens": 8351800.0, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.8520189487931424, |
| "grad_norm": 0.4004161059856415, |
| "learning_rate": 0.00018889239792465597, |
| "loss": 0.072, |
| "mean_token_accuracy": 0.9788155972957611, |
| "num_tokens": 8362016.0, |
| "step": 8210 |
| }, |
| { |
| "epoch": 1.8542747575005638, |
| "grad_norm": 0.8284559845924377, |
| "learning_rate": 0.00018875704940221068, |
| "loss": 0.156, |
| "mean_token_accuracy": 0.9555248856544495, |
| "num_tokens": 8372161.0, |
| "step": 8220 |
| }, |
| { |
| "epoch": 1.8565305662079856, |
| "grad_norm": 0.5218929052352905, |
| "learning_rate": 0.0001886217008797654, |
| "loss": 0.1166, |
| "mean_token_accuracy": 0.9678529024124145, |
| "num_tokens": 8382286.0, |
| "step": 8230 |
| }, |
| { |
| "epoch": 1.8587863749154072, |
| "grad_norm": 0.65278160572052, |
| "learning_rate": 0.00018848635235732008, |
| "loss": 0.1258, |
| "mean_token_accuracy": 0.9670017480850219, |
| "num_tokens": 8392414.0, |
| "step": 8240 |
| }, |
| { |
| "epoch": 1.8610421836228288, |
| "grad_norm": 0.507361888885498, |
| "learning_rate": 0.0001883510038348748, |
| "loss": 0.1036, |
| "mean_token_accuracy": 0.9709287106990814, |
| "num_tokens": 8402535.0, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.8632979923302504, |
| "grad_norm": 0.5099123120307922, |
| "learning_rate": 0.0001882156553124295, |
| "loss": 0.1396, |
| "mean_token_accuracy": 0.9616460084915162, |
| "num_tokens": 8412680.0, |
| "step": 8260 |
| }, |
| { |
| "epoch": 1.865553801037672, |
| "grad_norm": 0.8495880961418152, |
| "learning_rate": 0.0001880803067899842, |
| "loss": 0.1025, |
| "mean_token_accuracy": 0.9701466143131257, |
| "num_tokens": 8422874.0, |
| "step": 8270 |
| }, |
| { |
| "epoch": 1.8678096097450936, |
| "grad_norm": 0.20612621307373047, |
| "learning_rate": 0.0001879449582675389, |
| "loss": 0.1181, |
| "mean_token_accuracy": 0.9679302871227264, |
| "num_tokens": 8433083.0, |
| "step": 8280 |
| }, |
| { |
| "epoch": 1.8700654184525152, |
| "grad_norm": 0.5034496188163757, |
| "learning_rate": 0.0001878096097450936, |
| "loss": 0.1, |
| "mean_token_accuracy": 0.9701763093471527, |
| "num_tokens": 8443293.0, |
| "step": 8290 |
| }, |
| { |
| "epoch": 1.872321227159937, |
| "grad_norm": 0.8790525794029236, |
| "learning_rate": 0.0001876742612226483, |
| "loss": 0.0974, |
| "mean_token_accuracy": 0.9718030214309692, |
| "num_tokens": 8453487.0, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.8745770358673584, |
| "grad_norm": 0.4784752428531647, |
| "learning_rate": 0.00018753891270020302, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9723727226257324, |
| "num_tokens": 8463707.0, |
| "step": 8310 |
| }, |
| { |
| "epoch": 1.8768328445747802, |
| "grad_norm": 0.7611798644065857, |
| "learning_rate": 0.00018740356417775773, |
| "loss": 0.1092, |
| "mean_token_accuracy": 0.9694570362567901, |
| "num_tokens": 8473928.0, |
| "step": 8320 |
| }, |
| { |
| "epoch": 1.8790886532822015, |
| "grad_norm": 0.4173850417137146, |
| "learning_rate": 0.0001872682156553124, |
| "loss": 0.1328, |
| "mean_token_accuracy": 0.9634264826774597, |
| "num_tokens": 8484091.0, |
| "step": 8330 |
| }, |
| { |
| "epoch": 1.8813444619896234, |
| "grad_norm": 0.515573263168335, |
| "learning_rate": 0.00018713286713286712, |
| "loss": 0.1139, |
| "mean_token_accuracy": 0.966448575258255, |
| "num_tokens": 8494322.0, |
| "step": 8340 |
| }, |
| { |
| "epoch": 1.883600270697045, |
| "grad_norm": 0.7708337306976318, |
| "learning_rate": 0.00018699751861042183, |
| "loss": 0.1153, |
| "mean_token_accuracy": 0.9649886667728425, |
| "num_tokens": 8504469.0, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.8858560794044665, |
| "grad_norm": 1.0154062509536743, |
| "learning_rate": 0.00018686217008797654, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9634427964687348, |
| "num_tokens": 8514619.0, |
| "step": 8360 |
| }, |
| { |
| "epoch": 1.8881118881118881, |
| "grad_norm": 0.6789688467979431, |
| "learning_rate": 0.00018672682156553122, |
| "loss": 0.1395, |
| "mean_token_accuracy": 0.9595559239387512, |
| "num_tokens": 8524806.0, |
| "step": 8370 |
| }, |
| { |
| "epoch": 1.8903676968193097, |
| "grad_norm": 0.7808207273483276, |
| "learning_rate": 0.00018659147304308593, |
| "loss": 0.1796, |
| "mean_token_accuracy": 0.9548425853252411, |
| "num_tokens": 8534972.0, |
| "step": 8380 |
| }, |
| { |
| "epoch": 1.8926235055267313, |
| "grad_norm": 0.8315436840057373, |
| "learning_rate": 0.00018645612452064064, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9659779012203217, |
| "num_tokens": 8545209.0, |
| "step": 8390 |
| }, |
| { |
| "epoch": 1.894879314234153, |
| "grad_norm": 0.6535826921463013, |
| "learning_rate": 0.00018632077599819535, |
| "loss": 0.1134, |
| "mean_token_accuracy": 0.9662691533565522, |
| "num_tokens": 8555446.0, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.8971351229415747, |
| "grad_norm": 0.5366471409797668, |
| "learning_rate": 0.00018618542747575004, |
| "loss": 0.1369, |
| "mean_token_accuracy": 0.9663753628730773, |
| "num_tokens": 8565580.0, |
| "step": 8410 |
| }, |
| { |
| "epoch": 1.899390931648996, |
| "grad_norm": 0.5309544205665588, |
| "learning_rate": 0.00018605007895330475, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9748947978019714, |
| "num_tokens": 8575743.0, |
| "step": 8420 |
| }, |
| { |
| "epoch": 1.901646740356418, |
| "grad_norm": 0.4496230483055115, |
| "learning_rate": 0.00018591473043085946, |
| "loss": 0.1105, |
| "mean_token_accuracy": 0.969891893863678, |
| "num_tokens": 8585905.0, |
| "step": 8430 |
| }, |
| { |
| "epoch": 1.9039025490638393, |
| "grad_norm": 0.5235305428504944, |
| "learning_rate": 0.00018577938190841417, |
| "loss": 0.1068, |
| "mean_token_accuracy": 0.9693613052368164, |
| "num_tokens": 8596130.0, |
| "step": 8440 |
| }, |
| { |
| "epoch": 1.906158357771261, |
| "grad_norm": 0.3482052683830261, |
| "learning_rate": 0.00018564403338596885, |
| "loss": 0.0934, |
| "mean_token_accuracy": 0.9705743670463562, |
| "num_tokens": 8606333.0, |
| "step": 8450 |
| }, |
| { |
| "epoch": 1.9084141664786824, |
| "grad_norm": 0.3155059814453125, |
| "learning_rate": 0.00018550868486352356, |
| "loss": 0.1494, |
| "mean_token_accuracy": 0.9634097695350647, |
| "num_tokens": 8616546.0, |
| "step": 8460 |
| }, |
| { |
| "epoch": 1.9106699751861043, |
| "grad_norm": 0.6361089944839478, |
| "learning_rate": 0.00018537333634107827, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9672301173210144, |
| "num_tokens": 8626008.0, |
| "step": 8470 |
| }, |
| { |
| "epoch": 1.9129257838935259, |
| "grad_norm": 0.37798941135406494, |
| "learning_rate": 0.00018523798781863298, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.9712181925773621, |
| "num_tokens": 8636230.0, |
| "step": 8480 |
| }, |
| { |
| "epoch": 1.9151815926009474, |
| "grad_norm": 0.60988849401474, |
| "learning_rate": 0.0001851026392961877, |
| "loss": 0.0886, |
| "mean_token_accuracy": 0.9740067481994629, |
| "num_tokens": 8646377.0, |
| "step": 8490 |
| }, |
| { |
| "epoch": 1.917437401308369, |
| "grad_norm": 0.9676559567451477, |
| "learning_rate": 0.00018496729077374237, |
| "loss": 0.1134, |
| "mean_token_accuracy": 0.9681664168834686, |
| "num_tokens": 8656547.0, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.9196932100157906, |
| "grad_norm": 0.7825080156326294, |
| "learning_rate": 0.00018483194225129708, |
| "loss": 0.1107, |
| "mean_token_accuracy": 0.9664545893669129, |
| "num_tokens": 8666781.0, |
| "step": 8510 |
| }, |
| { |
| "epoch": 1.9219490187232122, |
| "grad_norm": 0.34856459498405457, |
| "learning_rate": 0.0001846965937288518, |
| "loss": 0.1477, |
| "mean_token_accuracy": 0.9601014196872711, |
| "num_tokens": 8676896.0, |
| "step": 8520 |
| }, |
| { |
| "epoch": 1.9242048274306338, |
| "grad_norm": 0.4000522494316101, |
| "learning_rate": 0.0001845612452064065, |
| "loss": 0.1003, |
| "mean_token_accuracy": 0.9701106131076813, |
| "num_tokens": 8687076.0, |
| "step": 8530 |
| }, |
| { |
| "epoch": 1.9264606361380556, |
| "grad_norm": 0.859682559967041, |
| "learning_rate": 0.00018442589668396118, |
| "loss": 0.1115, |
| "mean_token_accuracy": 0.9683105528354645, |
| "num_tokens": 8697310.0, |
| "step": 8540 |
| }, |
| { |
| "epoch": 1.928716444845477, |
| "grad_norm": 0.39500635862350464, |
| "learning_rate": 0.0001842905481615159, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9657883048057556, |
| "num_tokens": 8707496.0, |
| "step": 8550 |
| }, |
| { |
| "epoch": 1.9309722535528988, |
| "grad_norm": 0.7265903353691101, |
| "learning_rate": 0.0001841551996390706, |
| "loss": 0.1104, |
| "mean_token_accuracy": 0.9682238936424256, |
| "num_tokens": 8717691.0, |
| "step": 8560 |
| }, |
| { |
| "epoch": 1.9332280622603202, |
| "grad_norm": 0.7054004073143005, |
| "learning_rate": 0.0001840198511166253, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9653518259525299, |
| "num_tokens": 8727910.0, |
| "step": 8570 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 0.630401611328125, |
| "learning_rate": 0.00018388450259418, |
| "loss": 0.0861, |
| "mean_token_accuracy": 0.97560213804245, |
| "num_tokens": 8737095.0, |
| "step": 8580 |
| }, |
| { |
| "epoch": 1.9377396796751636, |
| "grad_norm": 0.4708097279071808, |
| "learning_rate": 0.0001837491540717347, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.9685471832752228, |
| "num_tokens": 8747301.0, |
| "step": 8590 |
| }, |
| { |
| "epoch": 1.9399954883825852, |
| "grad_norm": 0.43274563550949097, |
| "learning_rate": 0.00018361380554928942, |
| "loss": 0.1082, |
| "mean_token_accuracy": 0.9686918735504151, |
| "num_tokens": 8757435.0, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.9422512970900068, |
| "grad_norm": 0.4595888555049896, |
| "learning_rate": 0.00018347845702684412, |
| "loss": 0.0887, |
| "mean_token_accuracy": 0.9735994637012482, |
| "num_tokens": 8767568.0, |
| "step": 8610 |
| }, |
| { |
| "epoch": 1.9445071057974284, |
| "grad_norm": 0.6666122674942017, |
| "learning_rate": 0.0001833431085043988, |
| "loss": 0.1097, |
| "mean_token_accuracy": 0.9671940863132477, |
| "num_tokens": 8777783.0, |
| "step": 8620 |
| }, |
| { |
| "epoch": 1.94676291450485, |
| "grad_norm": 0.45760247111320496, |
| "learning_rate": 0.00018320775998195352, |
| "loss": 0.1247, |
| "mean_token_accuracy": 0.9661338448524475, |
| "num_tokens": 8787974.0, |
| "step": 8630 |
| }, |
| { |
| "epoch": 1.9490187232122715, |
| "grad_norm": 0.8178554177284241, |
| "learning_rate": 0.00018307241145950823, |
| "loss": 0.0977, |
| "mean_token_accuracy": 0.9710348606109619, |
| "num_tokens": 8798199.0, |
| "step": 8640 |
| }, |
| { |
| "epoch": 1.9512745319196934, |
| "grad_norm": 0.5238173007965088, |
| "learning_rate": 0.00018293706293706294, |
| "loss": 0.0859, |
| "mean_token_accuracy": 0.9734928429126739, |
| "num_tokens": 8808393.0, |
| "step": 8650 |
| }, |
| { |
| "epoch": 1.9535303406271147, |
| "grad_norm": 0.5230134129524231, |
| "learning_rate": 0.00018280171441461765, |
| "loss": 0.1213, |
| "mean_token_accuracy": 0.963700121641159, |
| "num_tokens": 8818596.0, |
| "step": 8660 |
| }, |
| { |
| "epoch": 1.9557861493345365, |
| "grad_norm": 0.4369744062423706, |
| "learning_rate": 0.00018266636589217233, |
| "loss": 0.0795, |
| "mean_token_accuracy": 0.9761726558208466, |
| "num_tokens": 8828762.0, |
| "step": 8670 |
| }, |
| { |
| "epoch": 1.958041958041958, |
| "grad_norm": 0.5722286105155945, |
| "learning_rate": 0.00018253101736972704, |
| "loss": 0.1321, |
| "mean_token_accuracy": 0.965042096376419, |
| "num_tokens": 8838866.0, |
| "step": 8680 |
| }, |
| { |
| "epoch": 1.9602977667493797, |
| "grad_norm": 0.6464564800262451, |
| "learning_rate": 0.00018239566884728175, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9651407182216645, |
| "num_tokens": 8849045.0, |
| "step": 8690 |
| }, |
| { |
| "epoch": 1.9625535754568013, |
| "grad_norm": 0.49602097272872925, |
| "learning_rate": 0.00018226032032483646, |
| "loss": 0.1277, |
| "mean_token_accuracy": 0.9648896515369415, |
| "num_tokens": 8859264.0, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.964809384164223, |
| "grad_norm": 0.4740694463253021, |
| "learning_rate": 0.00018212497180239112, |
| "loss": 0.0997, |
| "mean_token_accuracy": 0.9698693633079529, |
| "num_tokens": 8869487.0, |
| "step": 8710 |
| }, |
| { |
| "epoch": 1.9670651928716445, |
| "grad_norm": 0.672024130821228, |
| "learning_rate": 0.00018198962327994585, |
| "loss": 0.1151, |
| "mean_token_accuracy": 0.9668726742267608, |
| "num_tokens": 8879699.0, |
| "step": 8720 |
| }, |
| { |
| "epoch": 1.969321001579066, |
| "grad_norm": 0.4891161620616913, |
| "learning_rate": 0.00018185427475750056, |
| "loss": 0.1115, |
| "mean_token_accuracy": 0.9718928813934327, |
| "num_tokens": 8889850.0, |
| "step": 8730 |
| }, |
| { |
| "epoch": 1.9715768102864877, |
| "grad_norm": 0.7066110968589783, |
| "learning_rate": 0.00018171892623505527, |
| "loss": 0.132, |
| "mean_token_accuracy": 0.9635340571403503, |
| "num_tokens": 8899992.0, |
| "step": 8740 |
| }, |
| { |
| "epoch": 1.9738326189939093, |
| "grad_norm": 0.727239727973938, |
| "learning_rate": 0.00018158357771260993, |
| "loss": 0.1036, |
| "mean_token_accuracy": 0.970716518163681, |
| "num_tokens": 8910215.0, |
| "step": 8750 |
| }, |
| { |
| "epoch": 1.976088427701331, |
| "grad_norm": 0.4740559160709381, |
| "learning_rate": 0.00018144822919016466, |
| "loss": 0.1136, |
| "mean_token_accuracy": 0.9690234005451203, |
| "num_tokens": 8920410.0, |
| "step": 8760 |
| }, |
| { |
| "epoch": 1.9783442364087525, |
| "grad_norm": 0.45103248953819275, |
| "learning_rate": 0.00018131288066771937, |
| "loss": 0.0803, |
| "mean_token_accuracy": 0.9752394497394562, |
| "num_tokens": 8930634.0, |
| "step": 8770 |
| }, |
| { |
| "epoch": 1.9806000451161743, |
| "grad_norm": 0.7137285470962524, |
| "learning_rate": 0.00018117753214527408, |
| "loss": 0.0891, |
| "mean_token_accuracy": 0.9740745484828949, |
| "num_tokens": 8940866.0, |
| "step": 8780 |
| }, |
| { |
| "epoch": 1.9828558538235956, |
| "grad_norm": 0.7460572123527527, |
| "learning_rate": 0.0001810421836228288, |
| "loss": 0.1182, |
| "mean_token_accuracy": 0.9643153727054596, |
| "num_tokens": 8951069.0, |
| "step": 8790 |
| }, |
| { |
| "epoch": 1.9851116625310175, |
| "grad_norm": 0.7210149168968201, |
| "learning_rate": 0.00018090683510038345, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9729661166667938, |
| "num_tokens": 8960810.0, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.9873674712384388, |
| "grad_norm": 0.4664352238178253, |
| "learning_rate": 0.0001807714865779382, |
| "loss": 0.0964, |
| "mean_token_accuracy": 0.9717876076698303, |
| "num_tokens": 8971031.0, |
| "step": 8810 |
| }, |
| { |
| "epoch": 1.9896232799458606, |
| "grad_norm": 0.7352085113525391, |
| "learning_rate": 0.0001806361380554929, |
| "loss": 0.1127, |
| "mean_token_accuracy": 0.9690958023071289, |
| "num_tokens": 8981169.0, |
| "step": 8820 |
| }, |
| { |
| "epoch": 1.9918790886532822, |
| "grad_norm": 0.6887751221656799, |
| "learning_rate": 0.0001805007895330476, |
| "loss": 0.1206, |
| "mean_token_accuracy": 0.964536988735199, |
| "num_tokens": 8991349.0, |
| "step": 8830 |
| }, |
| { |
| "epoch": 1.9941348973607038, |
| "grad_norm": 0.5474720001220703, |
| "learning_rate": 0.00018036544101060226, |
| "loss": 0.0761, |
| "mean_token_accuracy": 0.9767542481422424, |
| "num_tokens": 9001504.0, |
| "step": 8840 |
| }, |
| { |
| "epoch": 1.9963907060681254, |
| "grad_norm": 0.7074306607246399, |
| "learning_rate": 0.00018023009248815697, |
| "loss": 0.108, |
| "mean_token_accuracy": 0.9695817053318023, |
| "num_tokens": 9011695.0, |
| "step": 8850 |
| }, |
| { |
| "epoch": 1.998646514775547, |
| "grad_norm": 0.5465816259384155, |
| "learning_rate": 0.0001800947439657117, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.9601962268352509, |
| "num_tokens": 9021880.0, |
| "step": 8860 |
| }, |
| { |
| "epoch": 2.000902323482969, |
| "grad_norm": 0.6290394067764282, |
| "learning_rate": 0.00017995939544326642, |
| "loss": 0.0804, |
| "mean_token_accuracy": 0.9762711524963379, |
| "num_tokens": 9032059.0, |
| "step": 8870 |
| }, |
| { |
| "epoch": 2.00315813219039, |
| "grad_norm": 0.5912337303161621, |
| "learning_rate": 0.00017982404692082107, |
| "loss": 0.1058, |
| "mean_token_accuracy": 0.9690792024135589, |
| "num_tokens": 9042287.0, |
| "step": 8880 |
| }, |
| { |
| "epoch": 2.005413940897812, |
| "grad_norm": 0.591269850730896, |
| "learning_rate": 0.00017968869839837578, |
| "loss": 0.0892, |
| "mean_token_accuracy": 0.9733344137668609, |
| "num_tokens": 9052495.0, |
| "step": 8890 |
| }, |
| { |
| "epoch": 2.0076697496052334, |
| "grad_norm": 1.021296739578247, |
| "learning_rate": 0.0001795533498759305, |
| "loss": 0.1061, |
| "mean_token_accuracy": 0.9714495182037354, |
| "num_tokens": 9062732.0, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.009925558312655, |
| "grad_norm": 0.3600910007953644, |
| "learning_rate": 0.00017941800135348523, |
| "loss": 0.089, |
| "mean_token_accuracy": 0.9722425937652588, |
| "num_tokens": 9072909.0, |
| "step": 8910 |
| }, |
| { |
| "epoch": 2.0121813670200766, |
| "grad_norm": 0.8148804903030396, |
| "learning_rate": 0.0001792826528310399, |
| "loss": 0.0982, |
| "mean_token_accuracy": 0.9704652488231659, |
| "num_tokens": 9083116.0, |
| "step": 8920 |
| }, |
| { |
| "epoch": 2.0144371757274984, |
| "grad_norm": 0.6611748933792114, |
| "learning_rate": 0.0001791473043085946, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9746616184711456, |
| "num_tokens": 9093299.0, |
| "step": 8930 |
| }, |
| { |
| "epoch": 2.0166929844349197, |
| "grad_norm": 0.32425668835639954, |
| "learning_rate": 0.0001790119557861493, |
| "loss": 0.0813, |
| "mean_token_accuracy": 0.976304441690445, |
| "num_tokens": 9103515.0, |
| "step": 8940 |
| }, |
| { |
| "epoch": 2.0189487931423415, |
| "grad_norm": 0.61649489402771, |
| "learning_rate": 0.00017887660726370404, |
| "loss": 0.0898, |
| "mean_token_accuracy": 0.9724894106388092, |
| "num_tokens": 9113738.0, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.0212046018497634, |
| "grad_norm": 0.4641760289669037, |
| "learning_rate": 0.00017874125874125875, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9755397379398346, |
| "num_tokens": 9123963.0, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.0234604105571847, |
| "grad_norm": 0.46858569979667664, |
| "learning_rate": 0.0001786059102188134, |
| "loss": 0.0884, |
| "mean_token_accuracy": 0.974696409702301, |
| "num_tokens": 9134154.0, |
| "step": 8970 |
| }, |
| { |
| "epoch": 2.0257162192646065, |
| "grad_norm": 0.7182545065879822, |
| "learning_rate": 0.00017847056169636812, |
| "loss": 0.0936, |
| "mean_token_accuracy": 0.9723126113414764, |
| "num_tokens": 9144310.0, |
| "step": 8980 |
| }, |
| { |
| "epoch": 2.027972027972028, |
| "grad_norm": 0.832270622253418, |
| "learning_rate": 0.00017833521317392283, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9737198114395141, |
| "num_tokens": 9154434.0, |
| "step": 8990 |
| }, |
| { |
| "epoch": 2.0302278366794497, |
| "grad_norm": 0.6002981066703796, |
| "learning_rate": 0.00017819986465147757, |
| "loss": 0.0994, |
| "mean_token_accuracy": 0.9703204393386841, |
| "num_tokens": 9164498.0, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.032483645386871, |
| "grad_norm": 0.41282710433006287, |
| "learning_rate": 0.00017806451612903222, |
| "loss": 0.105, |
| "mean_token_accuracy": 0.9722214996814728, |
| "num_tokens": 9174726.0, |
| "step": 9010 |
| }, |
| { |
| "epoch": 2.034739454094293, |
| "grad_norm": 0.5806052088737488, |
| "learning_rate": 0.00017792916760658693, |
| "loss": 0.1056, |
| "mean_token_accuracy": 0.9686106681823731, |
| "num_tokens": 9184936.0, |
| "step": 9020 |
| }, |
| { |
| "epoch": 2.0369952628017143, |
| "grad_norm": 0.4993740916252136, |
| "learning_rate": 0.00017779381908414164, |
| "loss": 0.1028, |
| "mean_token_accuracy": 0.970165628194809, |
| "num_tokens": 9195123.0, |
| "step": 9030 |
| }, |
| { |
| "epoch": 2.039251071509136, |
| "grad_norm": 0.9050776958465576, |
| "learning_rate": 0.00017765847056169635, |
| "loss": 0.0967, |
| "mean_token_accuracy": 0.9713717639446259, |
| "num_tokens": 9205265.0, |
| "step": 9040 |
| }, |
| { |
| "epoch": 2.0415068802165575, |
| "grad_norm": 0.6305001974105835, |
| "learning_rate": 0.00017752312203925103, |
| "loss": 0.0995, |
| "mean_token_accuracy": 0.9726520895957946, |
| "num_tokens": 9215412.0, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.0437626889239793, |
| "grad_norm": 0.458783894777298, |
| "learning_rate": 0.00017738777351680574, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9607601463794708, |
| "num_tokens": 9225569.0, |
| "step": 9060 |
| }, |
| { |
| "epoch": 2.0460184976314006, |
| "grad_norm": 0.47300443053245544, |
| "learning_rate": 0.00017725242499436045, |
| "loss": 0.0904, |
| "mean_token_accuracy": 0.972942465543747, |
| "num_tokens": 9235688.0, |
| "step": 9070 |
| }, |
| { |
| "epoch": 2.0482743063388225, |
| "grad_norm": 0.7441193461418152, |
| "learning_rate": 0.00017711707647191516, |
| "loss": 0.0901, |
| "mean_token_accuracy": 0.9721292495727539, |
| "num_tokens": 9245923.0, |
| "step": 9080 |
| }, |
| { |
| "epoch": 2.0505301150462443, |
| "grad_norm": 0.5683284997940063, |
| "learning_rate": 0.00017698172794946985, |
| "loss": 0.0899, |
| "mean_token_accuracy": 0.9734027445316314, |
| "num_tokens": 9256159.0, |
| "step": 9090 |
| }, |
| { |
| "epoch": 2.0527859237536656, |
| "grad_norm": 0.4916854798793793, |
| "learning_rate": 0.00017684637942702456, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9744769513607026, |
| "num_tokens": 9266394.0, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.0550417324610875, |
| "grad_norm": 0.6196737885475159, |
| "learning_rate": 0.00017671103090457927, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9734901905059814, |
| "num_tokens": 9276563.0, |
| "step": 9110 |
| }, |
| { |
| "epoch": 2.057297541168509, |
| "grad_norm": 0.4199332594871521, |
| "learning_rate": 0.00017657568238213398, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9740483283996582, |
| "num_tokens": 9286778.0, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.0595533498759306, |
| "grad_norm": 0.8482980132102966, |
| "learning_rate": 0.00017644033385968869, |
| "loss": 0.1052, |
| "mean_token_accuracy": 0.9692319512367249, |
| "num_tokens": 9296960.0, |
| "step": 9130 |
| }, |
| { |
| "epoch": 2.061809158583352, |
| "grad_norm": 0.7818817496299744, |
| "learning_rate": 0.00017630498533724337, |
| "loss": 0.1009, |
| "mean_token_accuracy": 0.972999757528305, |
| "num_tokens": 9307194.0, |
| "step": 9140 |
| }, |
| { |
| "epoch": 2.064064967290774, |
| "grad_norm": 0.9999156594276428, |
| "learning_rate": 0.00017616963681479808, |
| "loss": 0.0764, |
| "mean_token_accuracy": 0.9767278075218201, |
| "num_tokens": 9317430.0, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.066320775998195, |
| "grad_norm": 0.3332657217979431, |
| "learning_rate": 0.0001760342882923528, |
| "loss": 0.0953, |
| "mean_token_accuracy": 0.9738193869590759, |
| "num_tokens": 9327609.0, |
| "step": 9160 |
| }, |
| { |
| "epoch": 2.068576584705617, |
| "grad_norm": 0.5017508268356323, |
| "learning_rate": 0.0001758989397699075, |
| "loss": 0.1039, |
| "mean_token_accuracy": 0.9686254799365998, |
| "num_tokens": 9337754.0, |
| "step": 9170 |
| }, |
| { |
| "epoch": 2.0708323934130384, |
| "grad_norm": 0.4979402422904968, |
| "learning_rate": 0.00017576359124746218, |
| "loss": 0.1049, |
| "mean_token_accuracy": 0.9714104771614075, |
| "num_tokens": 9347920.0, |
| "step": 9180 |
| }, |
| { |
| "epoch": 2.07308820212046, |
| "grad_norm": 0.4234994649887085, |
| "learning_rate": 0.0001756282427250169, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9733720302581788, |
| "num_tokens": 9358064.0, |
| "step": 9190 |
| }, |
| { |
| "epoch": 2.075344010827882, |
| "grad_norm": 1.5676687955856323, |
| "learning_rate": 0.0001754928942025716, |
| "loss": 0.0943, |
| "mean_token_accuracy": 0.9714210510253907, |
| "num_tokens": 9368295.0, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.0775998195353034, |
| "grad_norm": 0.46907201409339905, |
| "learning_rate": 0.0001753575456801263, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9753334045410156, |
| "num_tokens": 9378521.0, |
| "step": 9210 |
| }, |
| { |
| "epoch": 2.079855628242725, |
| "grad_norm": 0.7339596152305603, |
| "learning_rate": 0.000175222197157681, |
| "loss": 0.0935, |
| "mean_token_accuracy": 0.9735293567180634, |
| "num_tokens": 9388697.0, |
| "step": 9220 |
| }, |
| { |
| "epoch": 2.0821114369501466, |
| "grad_norm": 0.542509138584137, |
| "learning_rate": 0.0001750868486352357, |
| "loss": 0.0897, |
| "mean_token_accuracy": 0.9746010303497314, |
| "num_tokens": 9398881.0, |
| "step": 9230 |
| }, |
| { |
| "epoch": 2.0843672456575684, |
| "grad_norm": 0.5223485231399536, |
| "learning_rate": 0.0001749515001127904, |
| "loss": 0.1131, |
| "mean_token_accuracy": 0.9674809694290161, |
| "num_tokens": 9409095.0, |
| "step": 9240 |
| }, |
| { |
| "epoch": 2.0866230543649897, |
| "grad_norm": 0.7211313843727112, |
| "learning_rate": 0.00017481615159034512, |
| "loss": 0.1069, |
| "mean_token_accuracy": 0.9681223928928375, |
| "num_tokens": 9419227.0, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.0888788630724116, |
| "grad_norm": 0.5504807829856873, |
| "learning_rate": 0.0001746808030678998, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9665165781974793, |
| "num_tokens": 9429353.0, |
| "step": 9260 |
| }, |
| { |
| "epoch": 2.091134671779833, |
| "grad_norm": 0.5223894715309143, |
| "learning_rate": 0.00017454545454545452, |
| "loss": 0.0947, |
| "mean_token_accuracy": 0.9707706809043884, |
| "num_tokens": 9439548.0, |
| "step": 9270 |
| }, |
| { |
| "epoch": 2.0933904804872547, |
| "grad_norm": 0.5483041405677795, |
| "learning_rate": 0.00017441010602300923, |
| "loss": 0.0905, |
| "mean_token_accuracy": 0.9730142235755921, |
| "num_tokens": 9449758.0, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.095646289194676, |
| "grad_norm": 0.4744812250137329, |
| "learning_rate": 0.00017427475750056394, |
| "loss": 0.0999, |
| "mean_token_accuracy": 0.9714322686195374, |
| "num_tokens": 9459901.0, |
| "step": 9290 |
| }, |
| { |
| "epoch": 2.097902097902098, |
| "grad_norm": 0.47873175144195557, |
| "learning_rate": 0.00017413940897811864, |
| "loss": 0.1186, |
| "mean_token_accuracy": 0.9649298131465912, |
| "num_tokens": 9470029.0, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.1001579066095193, |
| "grad_norm": 0.5018359422683716, |
| "learning_rate": 0.00017400406045567333, |
| "loss": 0.1124, |
| "mean_token_accuracy": 0.9666579306125641, |
| "num_tokens": 9480263.0, |
| "step": 9310 |
| }, |
| { |
| "epoch": 2.102413715316941, |
| "grad_norm": 0.6129780411720276, |
| "learning_rate": 0.00017386871193322804, |
| "loss": 0.0862, |
| "mean_token_accuracy": 0.9743798732757568, |
| "num_tokens": 9490447.0, |
| "step": 9320 |
| }, |
| { |
| "epoch": 2.104669524024363, |
| "grad_norm": 0.37456271052360535, |
| "learning_rate": 0.00017373336341078275, |
| "loss": 0.1023, |
| "mean_token_accuracy": 0.9715150356292724, |
| "num_tokens": 9500613.0, |
| "step": 9330 |
| }, |
| { |
| "epoch": 2.1069253327317843, |
| "grad_norm": 1.205664038658142, |
| "learning_rate": 0.00017359801488833746, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9699222087860108, |
| "num_tokens": 9510834.0, |
| "step": 9340 |
| }, |
| { |
| "epoch": 2.109181141439206, |
| "grad_norm": 1.3629356622695923, |
| "learning_rate": 0.00017346266636589214, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9720664858818054, |
| "num_tokens": 9521014.0, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.1114369501466275, |
| "grad_norm": 0.7105079293251038, |
| "learning_rate": 0.00017332731784344685, |
| "loss": 0.0966, |
| "mean_token_accuracy": 0.9712238907814026, |
| "num_tokens": 9531193.0, |
| "step": 9360 |
| }, |
| { |
| "epoch": 2.1136927588540493, |
| "grad_norm": 0.6067842841148376, |
| "learning_rate": 0.00017319196932100156, |
| "loss": 0.0939, |
| "mean_token_accuracy": 0.9708913087844848, |
| "num_tokens": 9541408.0, |
| "step": 9370 |
| }, |
| { |
| "epoch": 2.1159485675614707, |
| "grad_norm": 0.6547707915306091, |
| "learning_rate": 0.00017305662079855627, |
| "loss": 0.0896, |
| "mean_token_accuracy": 0.9725289940834045, |
| "num_tokens": 9551647.0, |
| "step": 9380 |
| }, |
| { |
| "epoch": 2.1182043762688925, |
| "grad_norm": 0.44134852290153503, |
| "learning_rate": 0.00017292127227611095, |
| "loss": 0.1144, |
| "mean_token_accuracy": 0.9662079870700836, |
| "num_tokens": 9561784.0, |
| "step": 9390 |
| }, |
| { |
| "epoch": 2.120460184976314, |
| "grad_norm": 0.8880122303962708, |
| "learning_rate": 0.00017278592375366566, |
| "loss": 0.1135, |
| "mean_token_accuracy": 0.967824912071228, |
| "num_tokens": 9571964.0, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.1227159936837356, |
| "grad_norm": 0.44176623225212097, |
| "learning_rate": 0.00017265057523122037, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9716949999332428, |
| "num_tokens": 9582130.0, |
| "step": 9410 |
| }, |
| { |
| "epoch": 2.124971802391157, |
| "grad_norm": 0.6585561037063599, |
| "learning_rate": 0.00017251522670877508, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9737663388252258, |
| "num_tokens": 9592325.0, |
| "step": 9420 |
| }, |
| { |
| "epoch": 2.127227611098579, |
| "grad_norm": 0.6109176874160767, |
| "learning_rate": 0.0001723798781863298, |
| "loss": 0.1147, |
| "mean_token_accuracy": 0.9680899202823638, |
| "num_tokens": 9602555.0, |
| "step": 9430 |
| }, |
| { |
| "epoch": 2.1294834198060006, |
| "grad_norm": 0.4303933382034302, |
| "learning_rate": 0.00017224452966388447, |
| "loss": 0.0873, |
| "mean_token_accuracy": 0.9742409646511078, |
| "num_tokens": 9612766.0, |
| "step": 9440 |
| }, |
| { |
| "epoch": 2.131739228513422, |
| "grad_norm": 0.5676075220108032, |
| "learning_rate": 0.00017210918114143918, |
| "loss": 0.0815, |
| "mean_token_accuracy": 0.9740564048290252, |
| "num_tokens": 9622987.0, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.133995037220844, |
| "grad_norm": 0.3643551468849182, |
| "learning_rate": 0.0001719738326189939, |
| "loss": 0.0962, |
| "mean_token_accuracy": 0.9718107163906098, |
| "num_tokens": 9633207.0, |
| "step": 9460 |
| }, |
| { |
| "epoch": 2.136250845928265, |
| "grad_norm": 0.31422924995422363, |
| "learning_rate": 0.0001718384840965486, |
| "loss": 0.0781, |
| "mean_token_accuracy": 0.9776098966598511, |
| "num_tokens": 9643377.0, |
| "step": 9470 |
| }, |
| { |
| "epoch": 2.138506654635687, |
| "grad_norm": 0.5971875786781311, |
| "learning_rate": 0.0001717031355741033, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9688220500946045, |
| "num_tokens": 9653564.0, |
| "step": 9480 |
| }, |
| { |
| "epoch": 2.1407624633431084, |
| "grad_norm": 0.5687423944473267, |
| "learning_rate": 0.000171567787051658, |
| "loss": 0.0996, |
| "mean_token_accuracy": 0.971611624956131, |
| "num_tokens": 9663760.0, |
| "step": 9490 |
| }, |
| { |
| "epoch": 2.14301827205053, |
| "grad_norm": 0.44779282808303833, |
| "learning_rate": 0.0001714324385292127, |
| "loss": 0.0887, |
| "mean_token_accuracy": 0.9741411805152893, |
| "num_tokens": 9673629.0, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.1452740807579516, |
| "grad_norm": 0.7396071553230286, |
| "learning_rate": 0.00017129709000676742, |
| "loss": 0.1019, |
| "mean_token_accuracy": 0.9707873880863189, |
| "num_tokens": 9683832.0, |
| "step": 9510 |
| }, |
| { |
| "epoch": 2.1475298894653734, |
| "grad_norm": 0.6024800539016724, |
| "learning_rate": 0.0001711617414843221, |
| "loss": 0.0935, |
| "mean_token_accuracy": 0.9714532136917114, |
| "num_tokens": 9694005.0, |
| "step": 9520 |
| }, |
| { |
| "epoch": 2.1497856981727947, |
| "grad_norm": 0.8652124404907227, |
| "learning_rate": 0.0001710263929618768, |
| "loss": 0.0953, |
| "mean_token_accuracy": 0.9726161181926727, |
| "num_tokens": 9704147.0, |
| "step": 9530 |
| }, |
| { |
| "epoch": 2.1520415068802166, |
| "grad_norm": 0.36145398020744324, |
| "learning_rate": 0.00017089104443943152, |
| "loss": 0.0884, |
| "mean_token_accuracy": 0.9719941794872284, |
| "num_tokens": 9714357.0, |
| "step": 9540 |
| }, |
| { |
| "epoch": 2.1542973155876384, |
| "grad_norm": 0.6179787516593933, |
| "learning_rate": 0.00017075569591698623, |
| "loss": 0.0783, |
| "mean_token_accuracy": 0.9779595613479615, |
| "num_tokens": 9723620.0, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.1565531242950597, |
| "grad_norm": 0.7116292119026184, |
| "learning_rate": 0.0001706203473945409, |
| "loss": 0.1045, |
| "mean_token_accuracy": 0.9707187950611115, |
| "num_tokens": 9733848.0, |
| "step": 9560 |
| }, |
| { |
| "epoch": 2.1588089330024816, |
| "grad_norm": 0.4667085111141205, |
| "learning_rate": 0.00017048499887209562, |
| "loss": 0.0713, |
| "mean_token_accuracy": 0.9795779466629029, |
| "num_tokens": 9743989.0, |
| "step": 9570 |
| }, |
| { |
| "epoch": 2.161064741709903, |
| "grad_norm": 0.5031459927558899, |
| "learning_rate": 0.00017034965034965033, |
| "loss": 0.0955, |
| "mean_token_accuracy": 0.9732905268669129, |
| "num_tokens": 9754221.0, |
| "step": 9580 |
| }, |
| { |
| "epoch": 2.1633205504173247, |
| "grad_norm": 0.42553097009658813, |
| "learning_rate": 0.00017021430182720504, |
| "loss": 0.0831, |
| "mean_token_accuracy": 0.9736292123794555, |
| "num_tokens": 9764370.0, |
| "step": 9590 |
| }, |
| { |
| "epoch": 2.165576359124746, |
| "grad_norm": 0.43609100580215454, |
| "learning_rate": 0.00017007895330475975, |
| "loss": 0.0977, |
| "mean_token_accuracy": 0.9721475183963776, |
| "num_tokens": 9774519.0, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.167832167832168, |
| "grad_norm": 0.3701200783252716, |
| "learning_rate": 0.00016994360478231443, |
| "loss": 0.0966, |
| "mean_token_accuracy": 0.9708240926265717, |
| "num_tokens": 9784743.0, |
| "step": 9610 |
| }, |
| { |
| "epoch": 2.1700879765395893, |
| "grad_norm": 0.7588269114494324, |
| "learning_rate": 0.00016980825625986914, |
| "loss": 0.0812, |
| "mean_token_accuracy": 0.9742601990699769, |
| "num_tokens": 9794970.0, |
| "step": 9620 |
| }, |
| { |
| "epoch": 2.172343785247011, |
| "grad_norm": 0.547471821308136, |
| "learning_rate": 0.00016967290773742385, |
| "loss": 0.1067, |
| "mean_token_accuracy": 0.9700693309307098, |
| "num_tokens": 9805155.0, |
| "step": 9630 |
| }, |
| { |
| "epoch": 2.1745995939544325, |
| "grad_norm": 0.5657344460487366, |
| "learning_rate": 0.00016953755921497856, |
| "loss": 0.1072, |
| "mean_token_accuracy": 0.9706630408763885, |
| "num_tokens": 9815333.0, |
| "step": 9640 |
| }, |
| { |
| "epoch": 2.1768554026618543, |
| "grad_norm": 0.6330400705337524, |
| "learning_rate": 0.00016940221069253325, |
| "loss": 0.0767, |
| "mean_token_accuracy": 0.9774750828742981, |
| "num_tokens": 9825549.0, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.1791112113692757, |
| "grad_norm": 1.0108215808868408, |
| "learning_rate": 0.00016926686217008796, |
| "loss": 0.1165, |
| "mean_token_accuracy": 0.9687487602233886, |
| "num_tokens": 9835749.0, |
| "step": 9660 |
| }, |
| { |
| "epoch": 2.1813670200766975, |
| "grad_norm": 0.559883177280426, |
| "learning_rate": 0.00016913151364764267, |
| "loss": 0.0684, |
| "mean_token_accuracy": 0.9800824344158172, |
| "num_tokens": 9845942.0, |
| "step": 9670 |
| }, |
| { |
| "epoch": 2.1836228287841193, |
| "grad_norm": 0.3940314054489136, |
| "learning_rate": 0.00016899616512519738, |
| "loss": 0.0889, |
| "mean_token_accuracy": 0.9734469890594483, |
| "num_tokens": 9856099.0, |
| "step": 9680 |
| }, |
| { |
| "epoch": 2.1858786374915407, |
| "grad_norm": 0.5258107781410217, |
| "learning_rate": 0.00016886081660275206, |
| "loss": 0.1021, |
| "mean_token_accuracy": 0.9695713877677917, |
| "num_tokens": 9866266.0, |
| "step": 9690 |
| }, |
| { |
| "epoch": 2.1881344461989625, |
| "grad_norm": 0.8086346387863159, |
| "learning_rate": 0.00016872546808030677, |
| "loss": 0.0823, |
| "mean_token_accuracy": 0.9752463161945343, |
| "num_tokens": 9876451.0, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.190390254906384, |
| "grad_norm": 0.4422919452190399, |
| "learning_rate": 0.00016859011955786148, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9748727321624756, |
| "num_tokens": 9886651.0, |
| "step": 9710 |
| }, |
| { |
| "epoch": 2.1926460636138057, |
| "grad_norm": 0.5345275402069092, |
| "learning_rate": 0.0001684547710354162, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9734846889972687, |
| "num_tokens": 9896883.0, |
| "step": 9720 |
| }, |
| { |
| "epoch": 2.194901872321227, |
| "grad_norm": 0.6745142340660095, |
| "learning_rate": 0.00016831942251297087, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.9711147665977478, |
| "num_tokens": 9907087.0, |
| "step": 9730 |
| }, |
| { |
| "epoch": 2.197157681028649, |
| "grad_norm": 1.496596097946167, |
| "learning_rate": 0.00016818407399052558, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9734964728355407, |
| "num_tokens": 9917322.0, |
| "step": 9740 |
| }, |
| { |
| "epoch": 2.19941348973607, |
| "grad_norm": 0.5157150626182556, |
| "learning_rate": 0.0001680487254680803, |
| "loss": 0.0947, |
| "mean_token_accuracy": 0.9723085820674896, |
| "num_tokens": 9927490.0, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.201669298443492, |
| "grad_norm": 0.689978301525116, |
| "learning_rate": 0.000167913376945635, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9742916345596313, |
| "num_tokens": 9937648.0, |
| "step": 9760 |
| }, |
| { |
| "epoch": 2.203925107150914, |
| "grad_norm": 0.5611025094985962, |
| "learning_rate": 0.0001677780284231897, |
| "loss": 0.0785, |
| "mean_token_accuracy": 0.9771084129810333, |
| "num_tokens": 9947834.0, |
| "step": 9770 |
| }, |
| { |
| "epoch": 2.206180915858335, |
| "grad_norm": 0.6031618714332581, |
| "learning_rate": 0.0001676426799007444, |
| "loss": 0.0954, |
| "mean_token_accuracy": 0.9708240628242493, |
| "num_tokens": 9958057.0, |
| "step": 9780 |
| }, |
| { |
| "epoch": 2.208436724565757, |
| "grad_norm": 0.40428003668785095, |
| "learning_rate": 0.0001675073313782991, |
| "loss": 0.1122, |
| "mean_token_accuracy": 0.9690653860569001, |
| "num_tokens": 9968222.0, |
| "step": 9790 |
| }, |
| { |
| "epoch": 2.2106925332731784, |
| "grad_norm": 0.3744548559188843, |
| "learning_rate": 0.0001673719828558538, |
| "loss": 0.1027, |
| "mean_token_accuracy": 0.9691601276397706, |
| "num_tokens": 9978423.0, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.2129483419806, |
| "grad_norm": 0.901164710521698, |
| "learning_rate": 0.00016723663433340852, |
| "loss": 0.0715, |
| "mean_token_accuracy": 0.9771885097026825, |
| "num_tokens": 9988644.0, |
| "step": 9810 |
| }, |
| { |
| "epoch": 2.2152041506880216, |
| "grad_norm": 0.5163309574127197, |
| "learning_rate": 0.0001671012858109632, |
| "loss": 0.0705, |
| "mean_token_accuracy": 0.9775005519390106, |
| "num_tokens": 9998878.0, |
| "step": 9820 |
| }, |
| { |
| "epoch": 2.2174599593954434, |
| "grad_norm": 0.6472623944282532, |
| "learning_rate": 0.00016696593728851792, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9749352514743805, |
| "num_tokens": 10009100.0, |
| "step": 9830 |
| }, |
| { |
| "epoch": 2.2197157681028648, |
| "grad_norm": 0.6509614586830139, |
| "learning_rate": 0.00016683058876607263, |
| "loss": 0.0922, |
| "mean_token_accuracy": 0.9715542435646057, |
| "num_tokens": 10019340.0, |
| "step": 9840 |
| }, |
| { |
| "epoch": 2.2219715768102866, |
| "grad_norm": 0.6951918005943298, |
| "learning_rate": 0.00016669524024362734, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9700810790061951, |
| "num_tokens": 10029577.0, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.224227385517708, |
| "grad_norm": 0.894263505935669, |
| "learning_rate": 0.00016655989172118202, |
| "loss": 0.1011, |
| "mean_token_accuracy": 0.9722074806690216, |
| "num_tokens": 10039785.0, |
| "step": 9860 |
| }, |
| { |
| "epoch": 2.2264831942251297, |
| "grad_norm": 0.611517071723938, |
| "learning_rate": 0.00016642454319873673, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9712512075901032, |
| "num_tokens": 10049965.0, |
| "step": 9870 |
| }, |
| { |
| "epoch": 2.228739002932551, |
| "grad_norm": 0.3283737301826477, |
| "learning_rate": 0.00016628919467629144, |
| "loss": 0.1159, |
| "mean_token_accuracy": 0.9696214973926545, |
| "num_tokens": 10060161.0, |
| "step": 9880 |
| }, |
| { |
| "epoch": 2.230994811639973, |
| "grad_norm": 0.5803564786911011, |
| "learning_rate": 0.00016615384615384615, |
| "loss": 0.0653, |
| "mean_token_accuracy": 0.9800610899925232, |
| "num_tokens": 10070313.0, |
| "step": 9890 |
| }, |
| { |
| "epoch": 2.2332506203473947, |
| "grad_norm": 0.7790770530700684, |
| "learning_rate": 0.00016601849763140083, |
| "loss": 0.1069, |
| "mean_token_accuracy": 0.9694907486438751, |
| "num_tokens": 10080514.0, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.235506429054816, |
| "grad_norm": 0.31714528799057007, |
| "learning_rate": 0.00016588314910895554, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9785817444324494, |
| "num_tokens": 10090403.0, |
| "step": 9910 |
| }, |
| { |
| "epoch": 2.237762237762238, |
| "grad_norm": 0.547499418258667, |
| "learning_rate": 0.00016574780058651025, |
| "loss": 0.1475, |
| "mean_token_accuracy": 0.9605782866477967, |
| "num_tokens": 10100605.0, |
| "step": 9920 |
| }, |
| { |
| "epoch": 2.2400180464696593, |
| "grad_norm": 0.2478630244731903, |
| "learning_rate": 0.00016561245206406496, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9756044149398804, |
| "num_tokens": 10110816.0, |
| "step": 9930 |
| }, |
| { |
| "epoch": 2.242273855177081, |
| "grad_norm": 0.4540363550186157, |
| "learning_rate": 0.00016547710354161967, |
| "loss": 0.0754, |
| "mean_token_accuracy": 0.9743645370006562, |
| "num_tokens": 10120970.0, |
| "step": 9940 |
| }, |
| { |
| "epoch": 2.2445296638845025, |
| "grad_norm": 0.5862691402435303, |
| "learning_rate": 0.00016534175501917435, |
| "loss": 0.115, |
| "mean_token_accuracy": 0.9685929179191589, |
| "num_tokens": 10131187.0, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.2467854725919243, |
| "grad_norm": 0.892983078956604, |
| "learning_rate": 0.00016520640649672906, |
| "loss": 0.1041, |
| "mean_token_accuracy": 0.9705977141857147, |
| "num_tokens": 10141375.0, |
| "step": 9960 |
| }, |
| { |
| "epoch": 2.2490412812993457, |
| "grad_norm": 0.5582557320594788, |
| "learning_rate": 0.00016507105797428377, |
| "loss": 0.0623, |
| "mean_token_accuracy": 0.9798148095607757, |
| "num_tokens": 10151567.0, |
| "step": 9970 |
| }, |
| { |
| "epoch": 2.2512970900067675, |
| "grad_norm": 0.6146891117095947, |
| "learning_rate": 0.00016493570945183848, |
| "loss": 0.1045, |
| "mean_token_accuracy": 0.9701870501041412, |
| "num_tokens": 10161709.0, |
| "step": 9980 |
| }, |
| { |
| "epoch": 2.253552898714189, |
| "grad_norm": 0.6650478839874268, |
| "learning_rate": 0.00016480036092939316, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9658924698829651, |
| "num_tokens": 10171850.0, |
| "step": 9990 |
| }, |
| { |
| "epoch": 2.2558087074216107, |
| "grad_norm": 0.6161482930183411, |
| "learning_rate": 0.00016466501240694787, |
| "loss": 0.0957, |
| "mean_token_accuracy": 0.9716996252536774, |
| "num_tokens": 10182072.0, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 22165, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.239786402987008e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |