| { |
| "config": { |
| "epochs": 1, |
| "batch_size": 16, |
| "gradient_accumulation_steps": 4, |
| "effective_batch_size": 64, |
| "cross_max_length": 64, |
| "lora_r": 8, |
| "lora_alpha": 16, |
| "target_modules": [ |
| "query_key_value" |
| ], |
| "total_steps": 13249, |
| "start_time": "2026-01-11T03:45:55.374803" |
| }, |
| "steps": [ |
| { |
| "step": 100, |
| "epoch": 0, |
| "loss": 3.0553906083106996, |
| "perplexity": 21.229476317462176, |
| "grad_norm": 0.9999998862745418, |
| "lr": 1.8882175226586103e-06, |
| "cross_attn_scale": 0.0755287009063444, |
| "tokens_per_sec": 29659.667067018858, |
| "step_time_ms": 72.3056697845459, |
| "elapsed_sec": 11.671439170837402, |
| "progress_pct": 0.7547739452034116 |
| }, |
| { |
| "step": 200, |
| "epoch": 0, |
| "loss": 3.0449283146858215, |
| "perplexity": 21.00852514617355, |
| "grad_norm": 0.9999998771969448, |
| "lr": 3.7764350453172205e-06, |
| "cross_attn_scale": 0.1510574018126888, |
| "tokens_per_sec": 29998.955284666587, |
| "step_time_ms": 68.73083829879761, |
| "elapsed_sec": 22.878730058670044, |
| "progress_pct": 1.5095478904068231 |
| }, |
| { |
| "step": 300, |
| "epoch": 0, |
| "loss": 3.047647387981415, |
| "perplexity": 21.065726598105954, |
| "grad_norm": 0.9999998694186822, |
| "lr": 5.664652567975831e-06, |
| "cross_attn_scale": 0.22658610271903323, |
| "tokens_per_sec": 30055.703757973926, |
| "step_time_ms": 68.56269359588623, |
| "elapsed_sec": 33.98429822921753, |
| "progress_pct": 2.2643218356102346 |
| }, |
| { |
| "step": 400, |
| "epoch": 0, |
| "loss": 3.0421357417106627, |
| "perplexity": 20.949939147514087, |
| "grad_norm": 0.9999998708066421, |
| "lr": 7.552870090634441e-06, |
| "cross_attn_scale": 0.3021148036253776, |
| "tokens_per_sec": 30210.86789891559, |
| "step_time_ms": 68.70847702026367, |
| "elapsed_sec": 45.12389397621155, |
| "progress_pct": 3.0190957808136463 |
| }, |
| { |
| "step": 500, |
| "epoch": 0, |
| "loss": 3.0340122056007384, |
| "perplexity": 20.780440954816772, |
| "grad_norm": 0.9999998721696909, |
| "lr": 9.441087613293053e-06, |
| "cross_attn_scale": 0.3776435045317221, |
| "tokens_per_sec": 30287.550862030017, |
| "step_time_ms": 68.67650508880615, |
| "elapsed_sec": 56.31241059303284, |
| "progress_pct": 3.773869726017058 |
| }, |
| { |
| "step": 600, |
| "epoch": 0, |
| "loss": 3.0240328454971315, |
| "perplexity": 20.574096755393853, |
| "grad_norm": 0.9999998774025239, |
| "lr": 1.1329305135951662e-05, |
| "cross_attn_scale": 0.45317220543806647, |
| "tokens_per_sec": 30337.509813968918, |
| "step_time_ms": 68.6249566078186, |
| "elapsed_sec": 67.54812812805176, |
| "progress_pct": 4.528643671220469 |
| }, |
| { |
| "step": 700, |
| "epoch": 0, |
| "loss": 3.0103702998161315, |
| "perplexity": 20.294913736798694, |
| "grad_norm": 0.9999998854215264, |
| "lr": 1.3217522658610274e-05, |
| "cross_attn_scale": 0.5287009063444109, |
| "tokens_per_sec": 30390.876870190146, |
| "step_time_ms": 68.71308088302612, |
| "elapsed_sec": 78.77034974098206, |
| "progress_pct": 5.283417616423881 |
| }, |
| { |
| "step": 800, |
| "epoch": 0, |
| "loss": 3.005305087566376, |
| "perplexity": 20.192375599076605, |
| "grad_norm": 0.9999998881305956, |
| "lr": 1.5105740181268882e-05, |
| "cross_attn_scale": 0.6042296072507553, |
| "tokens_per_sec": 30399.323564352733, |
| "step_time_ms": 68.67244720458984, |
| "elapsed_sec": 89.90821766853333, |
| "progress_pct": 6.0381915616272925 |
| }, |
| { |
| "step": 900, |
| "epoch": 0, |
| "loss": 2.9808620524406435, |
| "perplexity": 19.704795892601492, |
| "grad_norm": 0.999999884566103, |
| "lr": 1.6993957703927492e-05, |
| "cross_attn_scale": 0.6797583081570997, |
| "tokens_per_sec": 30464.53898853465, |
| "step_time_ms": 68.634774684906, |
| "elapsed_sec": 101.1823616027832, |
| "progress_pct": 6.792965506830704 |
| }, |
| { |
| "step": 1000, |
| "epoch": 0, |
| "loss": 2.9657063841819764, |
| "perplexity": 19.408408192965855, |
| "grad_norm": 0.9999998912970435, |
| "lr": 1.8882175226586106e-05, |
| "cross_attn_scale": 0.7552870090634441, |
| "tokens_per_sec": 30482.017177706704, |
| "step_time_ms": 68.62752199172974, |
| "elapsed_sec": 112.40210843086243, |
| "progress_pct": 7.547739452034116 |
| }, |
| { |
| "step": 1100, |
| "epoch": 0, |
| "loss": 2.9404901576042177, |
| "perplexity": 18.925120330844894, |
| "grad_norm": 0.9999998957905454, |
| "lr": 2.0770392749244713e-05, |
| "cross_attn_scale": 0.8308157099697885, |
| "tokens_per_sec": 30520.18917730466, |
| "step_time_ms": 68.61571073532104, |
| "elapsed_sec": 123.61365056037903, |
| "progress_pct": 8.302513397237528 |
| }, |
| { |
| "step": 1200, |
| "epoch": 0, |
| "loss": 2.916158149242401, |
| "perplexity": 18.470191258417472, |
| "grad_norm": 0.9999999059134038, |
| "lr": 2.2658610271903323e-05, |
| "cross_attn_scale": 0.9063444108761329, |
| "tokens_per_sec": 30514.05504365179, |
| "step_time_ms": 68.67437601089478, |
| "elapsed_sec": 134.80070066452026, |
| "progress_pct": 9.057287342440938 |
| }, |
| { |
| "step": 1300, |
| "epoch": 0, |
| "loss": 2.9002408885955813, |
| "perplexity": 18.17852384113777, |
| "grad_norm": 0.9999999221452138, |
| "lr": 2.4546827794561937e-05, |
| "cross_attn_scale": 0.9818731117824774, |
| "tokens_per_sec": 30518.41110587493, |
| "step_time_ms": 68.52020263671875, |
| "elapsed_sec": 145.9552721977234, |
| "progress_pct": 9.812061287644351 |
| }, |
| { |
| "step": 1400, |
| "epoch": 0, |
| "loss": 2.872870879173279, |
| "perplexity": 17.687724699355996, |
| "grad_norm": 0.9999999227685785, |
| "lr": 2.6435045317220547e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30511.622849857926, |
| "step_time_ms": 68.70761156082153, |
| "elapsed_sec": 157.15584921836853, |
| "progress_pct": 10.566835232847762 |
| }, |
| { |
| "step": 1500, |
| "epoch": 0, |
| "loss": 2.8776234197616577, |
| "perplexity": 17.771986398880262, |
| "grad_norm": 0.9999999237417435, |
| "lr": 2.8323262839879154e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30493.903728302623, |
| "step_time_ms": 68.75503063201904, |
| "elapsed_sec": 168.2848823070526, |
| "progress_pct": 11.321609178051174 |
| }, |
| { |
| "step": 1600, |
| "epoch": 0, |
| "loss": 2.8588545417785642, |
| "perplexity": 17.44153693803145, |
| "grad_norm": 0.9999999207595216, |
| "lr": 3.0211480362537764e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30505.121130819396, |
| "step_time_ms": 68.64941120147705, |
| "elapsed_sec": 179.4573106765747, |
| "progress_pct": 12.076383123254585 |
| }, |
| { |
| "step": 1700, |
| "epoch": 0, |
| "loss": 2.8620681262016294, |
| "perplexity": 17.4976769464644, |
| "grad_norm": 0.9999999136486478, |
| "lr": 3.209969788519638e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30496.028601579812, |
| "step_time_ms": 68.41788530349731, |
| "elapsed_sec": 190.5410726070404, |
| "progress_pct": 12.831157068457998 |
| }, |
| { |
| "step": 1800, |
| "epoch": 0, |
| "loss": 2.851705029010773, |
| "perplexity": 17.317283153287743, |
| "grad_norm": 0.9999999101516841, |
| "lr": 3.3987915407854985e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30486.163746654358, |
| "step_time_ms": 68.65042924880981, |
| "elapsed_sec": 201.64810013771057, |
| "progress_pct": 13.585931013661408 |
| }, |
| { |
| "step": 1900, |
| "epoch": 0, |
| "loss": 2.839317078590393, |
| "perplexity": 17.104080804749838, |
| "grad_norm": 0.9999999094789626, |
| "lr": 3.5876132930513595e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30499.8027812949, |
| "step_time_ms": 68.49920272827148, |
| "elapsed_sec": 212.8020317554474, |
| "progress_pct": 14.34070495886482 |
| }, |
| { |
| "step": 2000, |
| "epoch": 0, |
| "loss": 2.833750886917114, |
| "perplexity": 17.009140685170195, |
| "grad_norm": 0.9999999050466546, |
| "lr": 3.776435045317221e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30498.478093732738, |
| "step_time_ms": 68.63363027572632, |
| "elapsed_sec": 223.98661923408508, |
| "progress_pct": 15.095478904068232 |
| }, |
| { |
| "step": 2100, |
| "epoch": 0, |
| "loss": 2.833835325241089, |
| "perplexity": 17.010576969139752, |
| "grad_norm": 0.9999999051555772, |
| "lr": 3.9652567975830816e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30496.763241222277, |
| "step_time_ms": 68.50829124450684, |
| "elapsed_sec": 235.1098358631134, |
| "progress_pct": 15.850252849271643 |
| }, |
| { |
| "step": 2200, |
| "epoch": 0, |
| "loss": 2.830011742115021, |
| "perplexity": 16.9456598012593, |
| "grad_norm": 0.9999999086205261, |
| "lr": 4.1540785498489426e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30504.708020780217, |
| "step_time_ms": 68.68033170700073, |
| "elapsed_sec": 246.3288288116455, |
| "progress_pct": 16.605026794475055 |
| }, |
| { |
| "step": 2300, |
| "epoch": 0, |
| "loss": 2.826364483833313, |
| "perplexity": 16.88396717606802, |
| "grad_norm": 0.9999999082911996, |
| "lr": 4.342900302114804e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30501.204524633395, |
| "step_time_ms": 68.73678207397461, |
| "elapsed_sec": 257.5635986328125, |
| "progress_pct": 17.359800739678466 |
| }, |
| { |
| "step": 2400, |
| "epoch": 0, |
| "loss": 2.8305963826179505, |
| "perplexity": 16.95556981694399, |
| "grad_norm": 0.9999999116385626, |
| "lr": 4.5317220543806646e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30500.794054619466, |
| "step_time_ms": 68.69770526885986, |
| "elapsed_sec": 268.69559478759766, |
| "progress_pct": 18.114574684881877 |
| }, |
| { |
| "step": 2500, |
| "epoch": 0, |
| "loss": 2.823909387588501, |
| "perplexity": 16.842566254075912, |
| "grad_norm": 0.9999999056484468, |
| "lr": 4.720543806646526e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30494.381494274585, |
| "step_time_ms": 68.81577491760254, |
| "elapsed_sec": 279.87601590156555, |
| "progress_pct": 18.86934863008529 |
| }, |
| { |
| "step": 2600, |
| "epoch": 0, |
| "loss": 2.8317417907714844, |
| "perplexity": 16.975001991621266, |
| "grad_norm": 0.9999999060478559, |
| "lr": 4.9093655589123874e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30493.046658925927, |
| "step_time_ms": 68.76836061477661, |
| "elapsed_sec": 291.05359983444214, |
| "progress_pct": 19.624122575288702 |
| }, |
| { |
| "step": 2700, |
| "epoch": 0, |
| "loss": 2.8178824305534365, |
| "perplexity": 16.741362113751652, |
| "grad_norm": 0.9999999106669464, |
| "lr": 5.098187311178248e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30497.10975803011, |
| "step_time_ms": 68.62552165985107, |
| "elapsed_sec": 302.2955641746521, |
| "progress_pct": 20.378896520492113 |
| }, |
| { |
| "step": 2800, |
| "epoch": 0, |
| "loss": 2.8133517289161682, |
| "perplexity": 16.665683564931342, |
| "grad_norm": 0.9999999143853615, |
| "lr": 5.2870090634441094e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30502.20993387066, |
| "step_time_ms": 68.7237548828125, |
| "elapsed_sec": 313.51184129714966, |
| "progress_pct": 21.133670465695523 |
| }, |
| { |
| "step": 2900, |
| "epoch": 0, |
| "loss": 2.8192299604415894, |
| "perplexity": 16.763936806188713, |
| "grad_norm": 0.9999999153165114, |
| "lr": 5.4758308157099705e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30495.965582028683, |
| "step_time_ms": 68.75383377075195, |
| "elapsed_sec": 324.68157052993774, |
| "progress_pct": 21.888444410898934 |
| }, |
| { |
| "step": 3000, |
| "epoch": 0, |
| "loss": 2.814334969520569, |
| "perplexity": 16.682078000228916, |
| "grad_norm": 0.9999999103789803, |
| "lr": 5.664652567975831e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30497.05613414167, |
| "step_time_ms": 68.89568090438843, |
| "elapsed_sec": 335.9107172489166, |
| "progress_pct": 22.64321835610235 |
| }, |
| { |
| "step": 3100, |
| "epoch": 0, |
| "loss": 2.812694685459137, |
| "perplexity": 16.654737083138976, |
| "grad_norm": 0.9999999103699377, |
| "lr": 5.853474320241692e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30503.323091845978, |
| "step_time_ms": 68.7230896949768, |
| "elapsed_sec": 347.12381887435913, |
| "progress_pct": 23.39799230130576 |
| }, |
| { |
| "step": 3200, |
| "epoch": 0, |
| "loss": 2.8209491848945616, |
| "perplexity": 16.79278256533621, |
| "grad_norm": 0.9999999116175871, |
| "lr": 6.042296072507553e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30471.149506940925, |
| "step_time_ms": 72.36854791641235, |
| "elapsed_sec": 358.6222107410431, |
| "progress_pct": 24.15276624650917 |
| }, |
| { |
| "step": 3300, |
| "epoch": 0, |
| "loss": 2.823811297416687, |
| "perplexity": 16.84091424488252, |
| "grad_norm": 0.9999999183970942, |
| "lr": 6.231117824773414e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30475.15581021754, |
| "step_time_ms": 68.7209701538086, |
| "elapsed_sec": 369.78560733795166, |
| "progress_pct": 24.90754019171258 |
| }, |
| { |
| "step": 3400, |
| "epoch": 0, |
| "loss": 2.8320892524719237, |
| "perplexity": 16.98090117948981, |
| "grad_norm": 0.9999999248059107, |
| "lr": 6.419939577039276e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29670.286366915152, |
| "step_time_ms": 68.97594928741455, |
| "elapsed_sec": 391.05166888237, |
| "progress_pct": 25.662314136915995 |
| }, |
| { |
| "step": 3500, |
| "epoch": 0, |
| "loss": 2.815865466594696, |
| "perplexity": 16.70762941999754, |
| "grad_norm": 0.9999999286289808, |
| "lr": 6.608761329305136e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29686.79626575453, |
| "step_time_ms": 69.19201135635376, |
| "elapsed_sec": 402.31939792633057, |
| "progress_pct": 26.417088082119406 |
| }, |
| { |
| "step": 3600, |
| "epoch": 0, |
| "loss": 2.846110601425171, |
| "perplexity": 17.220673356933453, |
| "grad_norm": 0.9999999358015611, |
| "lr": 6.797583081570997e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29707.146446527197, |
| "step_time_ms": 68.82575511932373, |
| "elapsed_sec": 413.6111161708832, |
| "progress_pct": 27.171862027322817 |
| }, |
| { |
| "step": 3700, |
| "epoch": 0, |
| "loss": 2.8651936769485475, |
| "perplexity": 17.55245238082157, |
| "grad_norm": 0.9999999406451835, |
| "lr": 6.986404833836858e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29731.035920613755, |
| "step_time_ms": 68.87631177902222, |
| "elapsed_sec": 424.8875832557678, |
| "progress_pct": 27.926635972526228 |
| }, |
| { |
| "step": 3800, |
| "epoch": 0, |
| "loss": 2.8802473044395445, |
| "perplexity": 17.81867927325758, |
| "grad_norm": 0.999999938128731, |
| "lr": 7.175226586102719e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29740.50111024254, |
| "step_time_ms": 68.9161491394043, |
| "elapsed_sec": 436.10438680648804, |
| "progress_pct": 28.68140991772964 |
| }, |
| { |
| "step": 3900, |
| "epoch": 0, |
| "loss": 2.8760248684883116, |
| "perplexity": 17.7435996622643, |
| "grad_norm": 0.9999999370933762, |
| "lr": 7.36404833836858e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29756.389974960584, |
| "step_time_ms": 68.69559288024902, |
| "elapsed_sec": 447.3066461086273, |
| "progress_pct": 29.436183862933053 |
| }, |
| { |
| "step": 4000, |
| "epoch": 0, |
| "loss": 2.833030352592468, |
| "perplexity": 16.996889429728867, |
| "grad_norm": 0.9999999344961908, |
| "lr": 7.552870090634442e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29775.895470741994, |
| "step_time_ms": 68.65659475326538, |
| "elapsed_sec": 458.5267641544342, |
| "progress_pct": 30.190957808136464 |
| }, |
| { |
| "step": 4100, |
| "epoch": 0, |
| "loss": 2.8502792358398437, |
| "perplexity": 17.29260988289878, |
| "grad_norm": 0.9999999411152237, |
| "lr": 7.741691842900302e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29789.044694580516, |
| "step_time_ms": 68.78466129302979, |
| "elapsed_sec": 469.70932245254517, |
| "progress_pct": 30.945731753339874 |
| }, |
| { |
| "step": 4200, |
| "epoch": 0, |
| "loss": 2.845259420871735, |
| "perplexity": 17.206021691146056, |
| "grad_norm": 0.9999999597855691, |
| "lr": 7.930513595166163e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29810.506707537246, |
| "step_time_ms": 68.6014461517334, |
| "elapsed_sec": 480.918527841568, |
| "progress_pct": 31.700505698543285 |
| }, |
| { |
| "step": 4300, |
| "epoch": 0, |
| "loss": 2.851608917713165, |
| "perplexity": 17.315618846713456, |
| "grad_norm": 0.9999999759995412, |
| "lr": 8.119335347432024e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29823.229438646857, |
| "step_time_ms": 68.48395347595215, |
| "elapsed_sec": 492.0337359905243, |
| "progress_pct": 32.4552796437467 |
| }, |
| { |
| "step": 4400, |
| "epoch": 0, |
| "loss": 2.8928983926773073, |
| "perplexity": 18.045536929430227, |
| "grad_norm": 0.9999999802354099, |
| "lr": 8.308157099697885e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29839.21784462899, |
| "step_time_ms": 68.63518953323364, |
| "elapsed_sec": 503.2420446872711, |
| "progress_pct": 33.21005358895011 |
| }, |
| { |
| "step": 4500, |
| "epoch": 0, |
| "loss": 2.8919497418403624, |
| "perplexity": 18.028426133087862, |
| "grad_norm": 0.9999999832088211, |
| "lr": 8.496978851963746e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29851.35171674971, |
| "step_time_ms": 68.82947206497192, |
| "elapsed_sec": 514.4808900356293, |
| "progress_pct": 33.96482753415352 |
| }, |
| { |
| "step": 4600, |
| "epoch": 0, |
| "loss": 2.912613160610199, |
| "perplexity": 18.404830560286392, |
| "grad_norm": 0.9999999852952282, |
| "lr": 8.685800604229609e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29868.9929982354, |
| "step_time_ms": 68.60780239105225, |
| "elapsed_sec": 525.6990418434143, |
| "progress_pct": 34.71960147935693 |
| }, |
| { |
| "step": 4700, |
| "epoch": 0, |
| "loss": 2.9198369789123535, |
| "perplexity": 18.538265085260896, |
| "grad_norm": 0.9999999861072904, |
| "lr": 8.874622356495468e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29881.651406257948, |
| "step_time_ms": 68.78975629806519, |
| "elapsed_sec": 536.9214298725128, |
| "progress_pct": 35.47437542456034 |
| }, |
| { |
| "step": 4800, |
| "epoch": 0, |
| "loss": 2.9220591092109682, |
| "perplexity": 18.57950532941875, |
| "grad_norm": 0.9999999939528834, |
| "lr": 9.063444108761329e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29894.195528337976, |
| "step_time_ms": 68.80178928375244, |
| "elapsed_sec": 548.1407246589661, |
| "progress_pct": 36.22914936976375 |
| }, |
| { |
| "step": 4900, |
| "epoch": 0, |
| "loss": 2.9370435547828673, |
| "perplexity": 18.86000523511343, |
| "grad_norm": 1.0000000011274732, |
| "lr": 9.25226586102719e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29899.872434533, |
| "step_time_ms": 68.81359100341797, |
| "elapsed_sec": 559.2738242149353, |
| "progress_pct": 36.983923314967164 |
| }, |
| { |
| "step": 5000, |
| "epoch": 0, |
| "loss": 2.9583297395706176, |
| "perplexity": 19.265766020381903, |
| "grad_norm": 0.9999999952426983, |
| "lr": 9.441087613293051e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29914.890276799215, |
| "step_time_ms": 68.60477924346924, |
| "elapsed_sec": 570.4796454906464, |
| "progress_pct": 37.73869726017058 |
| }, |
| { |
| "step": 5100, |
| "epoch": 0, |
| "loss": 2.9550197100639344, |
| "perplexity": 19.20210119074545, |
| "grad_norm": 0.9999999955847756, |
| "lr": 9.629909365558912e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29927.735715089886, |
| "step_time_ms": 68.55671644210815, |
| "elapsed_sec": 581.667058467865, |
| "progress_pct": 38.49347120537399 |
| }, |
| { |
| "step": 5200, |
| "epoch": 0, |
| "loss": 2.9828960919380187, |
| "perplexity": 19.744917015876656, |
| "grad_norm": 0.9999999973577894, |
| "lr": 9.818731117824775e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29941.160552589776, |
| "step_time_ms": 68.76250982284546, |
| "elapsed_sec": 592.8996963500977, |
| "progress_pct": 39.248245150577404 |
| }, |
| { |
| "step": 5300, |
| "epoch": 0, |
| "loss": 2.9812774753570555, |
| "perplexity": 19.712983416903363, |
| "grad_norm": 0.9999999977527527, |
| "lr": 9.999999828225707e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29953.48526766276, |
| "step_time_ms": 68.57383966445923, |
| "elapsed_sec": 604.1068956851959, |
| "progress_pct": 40.003019095780814 |
| }, |
| { |
| "step": 5400, |
| "epoch": 0, |
| "loss": 2.9874549293518067, |
| "perplexity": 19.835136373740465, |
| "grad_norm": 1.000000005050804, |
| "lr": 9.99988388103099e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29965.8283068889, |
| "step_time_ms": 68.63211393356323, |
| "elapsed_sec": 615.2640204429626, |
| "progress_pct": 40.757793040984225 |
| }, |
| { |
| "step": 5500, |
| "epoch": 0, |
| "loss": 2.9855289316177367, |
| "perplexity": 19.7969707113172, |
| "grad_norm": 1.0000000047012423, |
| "lr": 9.999553221781354e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29979.453810055362, |
| "step_time_ms": 68.55385541915894, |
| "elapsed_sec": 626.467483997345, |
| "progress_pct": 41.512566986187636 |
| }, |
| { |
| "step": 5600, |
| "epoch": 0, |
| "loss": 3.001195156574249, |
| "perplexity": 20.10955663548813, |
| "grad_norm": 1.0000000009146937, |
| "lr": 9.999007864819872e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29985.720598712633, |
| "step_time_ms": 68.79624128341675, |
| "elapsed_sec": 637.6670501232147, |
| "progress_pct": 42.26734093139105 |
| }, |
| { |
| "step": 5700, |
| "epoch": 0, |
| "loss": 2.9869230651855467, |
| "perplexity": 19.824589580449725, |
| "grad_norm": 0.9999999955026968, |
| "lr": 9.998247833802596e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30003.619229602187, |
| "step_time_ms": 68.55745553970337, |
| "elapsed_sec": 648.9761068820953, |
| "progress_pct": 43.02211487659446 |
| }, |
| { |
| "step": 5800, |
| "epoch": 0, |
| "loss": 2.992214176654816, |
| "perplexity": 19.929761687056967, |
| "grad_norm": 0.9999999920373142, |
| "lr": 9.997273161697535e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30007.238330623877, |
| "step_time_ms": 68.54370355606079, |
| "elapsed_sec": 660.0942339897156, |
| "progress_pct": 43.77688882179787 |
| }, |
| { |
| "step": 5900, |
| "epoch": 0, |
| "loss": 2.9899471187591553, |
| "perplexity": 19.884630939803454, |
| "grad_norm": 0.9999999905480129, |
| "lr": 9.996083890783225e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30016.763888893925, |
| "step_time_ms": 68.59871864318848, |
| "elapsed_sec": 671.2416126728058, |
| "progress_pct": 44.531662767001286 |
| }, |
| { |
| "step": 6000, |
| "epoch": 0, |
| "loss": 2.973553490638733, |
| "perplexity": 19.561307160862828, |
| "grad_norm": 0.9999999961927993, |
| "lr": 9.9946800726469e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30019.099421691335, |
| "step_time_ms": 68.57965469360352, |
| "elapsed_sec": 682.3533148765564, |
| "progress_pct": 45.2864367122047 |
| }, |
| { |
| "step": 6100, |
| "epoch": 0, |
| "loss": 2.9827182602882387, |
| "perplexity": 19.74140605689802, |
| "grad_norm": 0.9999999982933864, |
| "lr": 9.993061768182244e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30026.30841714172, |
| "step_time_ms": 68.58452320098877, |
| "elapsed_sec": 693.5168223381042, |
| "progress_pct": 46.04121065740811 |
| }, |
| { |
| "step": 6200, |
| "epoch": 0, |
| "loss": 2.988657066822052, |
| "perplexity": 19.858995272367505, |
| "grad_norm": 0.9999999972547158, |
| "lr": 9.991229047586758e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30032.30921841359, |
| "step_time_ms": 68.62870931625366, |
| "elapsed_sec": 704.7209339141846, |
| "progress_pct": 46.79598460261152 |
| }, |
| { |
| "step": 6300, |
| "epoch": 0, |
| "loss": 3.0013710594177248, |
| "perplexity": 20.11309427481263, |
| "grad_norm": 1.0000000035538437, |
| "lr": 9.989181990358713e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30027.30892734739, |
| "step_time_ms": 72.44897842407227, |
| "elapsed_sec": 716.3112769126892, |
| "progress_pct": 47.55075854781493 |
| }, |
| { |
| "step": 6400, |
| "epoch": 0, |
| "loss": 3.0535635590553283, |
| "perplexity": 21.1907244301477, |
| "grad_norm": 1.0000000054736338, |
| "lr": 9.9869206852937e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30031.377185455563, |
| "step_time_ms": 68.55860948562622, |
| "elapsed_sec": 727.4697349071503, |
| "progress_pct": 48.30553249301834 |
| }, |
| { |
| "step": 6500, |
| "epoch": 0, |
| "loss": 3.0514983248710634, |
| "perplexity": 21.147005781827758, |
| "grad_norm": 1.0000000088419636, |
| "lr": 9.984445230480777e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30037.350771694248, |
| "step_time_ms": 68.51820468902588, |
| "elapsed_sec": 738.6134738922119, |
| "progress_pct": 49.06030643822175 |
| }, |
| { |
| "step": 6600, |
| "epoch": 0, |
| "loss": 3.073709189891815, |
| "perplexity": 21.62195405375797, |
| "grad_norm": 1.0000000080646183, |
| "lr": 9.981755733298221e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 30044.33637658769, |
| "step_time_ms": 68.57419729232788, |
| "elapsed_sec": 749.8031148910522, |
| "progress_pct": 49.81508038342516 |
| }, |
| { |
| "step": 6700, |
| "epoch": 0, |
| "loss": 3.060198895931244, |
| "perplexity": 21.331799548252498, |
| "grad_norm": 1.0000000045052264, |
| "lr": 9.978852310408861e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29765.687951709533, |
| "step_time_ms": 68.89270305633545, |
| "elapsed_sec": 768.0259914398193, |
| "progress_pct": 50.56985432862857 |
| }, |
| { |
| "step": 6800, |
| "epoch": 0, |
| "loss": 3.0322815942764283, |
| "perplexity": 20.744509189304374, |
| "grad_norm": 1.0000000008077732, |
| "lr": 9.975735087755025e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29777.555021316624, |
| "step_time_ms": 68.59505653381348, |
| "elapsed_sec": 779.2872176170349, |
| "progress_pct": 51.32462827383199 |
| }, |
| { |
| "step": 6900, |
| "epoch": 0, |
| "loss": 3.022260241508484, |
| "perplexity": 20.537659333523006, |
| "grad_norm": 0.9999999955198917, |
| "lr": 9.972404200553072e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29787.611626839698, |
| "step_time_ms": 68.62642526626587, |
| "elapsed_sec": 790.5256149768829, |
| "progress_pct": 52.0794022190354 |
| }, |
| { |
| "step": 7000, |
| "epoch": 0, |
| "loss": 3.0193919491767884, |
| "perplexity": 20.478835724727677, |
| "grad_norm": 0.9999999978591363, |
| "lr": 9.968859793287533e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29800.945444945188, |
| "step_time_ms": 68.56673240661621, |
| "elapsed_sec": 801.7361745834351, |
| "progress_pct": 52.83417616423881 |
| }, |
| { |
| "step": 7100, |
| "epoch": 0, |
| "loss": 3.0333949518203736, |
| "perplexity": 20.767618106962548, |
| "grad_norm": 1.0000000012009478, |
| "lr": 9.965102019704832e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29810.921233546793, |
| "step_time_ms": 68.66058826446533, |
| "elapsed_sec": 812.898763179779, |
| "progress_pct": 53.58895010944222 |
| }, |
| { |
| "step": 7200, |
| "epoch": 0, |
| "loss": 3.070095462799072, |
| "perplexity": 21.543959223489487, |
| "grad_norm": 1.0000000010922643, |
| "lr": 9.961131042806633e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29820.53550024627, |
| "step_time_ms": 68.54245662689209, |
| "elapsed_sec": 824.1274540424347, |
| "progress_pct": 54.343724054645634 |
| }, |
| { |
| "step": 7300, |
| "epoch": 0, |
| "loss": 3.1172192811965944, |
| "perplexity": 22.58349390377302, |
| "grad_norm": 1.0000000016193091, |
| "lr": 9.956947034842753e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29828.650771642715, |
| "step_time_ms": 68.64347219467163, |
| "elapsed_sec": 835.327624797821, |
| "progress_pct": 55.098497999849045 |
| }, |
| { |
| "step": 7400, |
| "epoch": 0, |
| "loss": 3.0517469382286073, |
| "perplexity": 21.15226386352477, |
| "grad_norm": 0.9999999990690044, |
| "lr": 9.952550177303704e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29836.492375958653, |
| "step_time_ms": 68.64686489105225, |
| "elapsed_sec": 846.4573409557343, |
| "progress_pct": 55.853271945052455 |
| }, |
| { |
| "step": 7500, |
| "epoch": 0, |
| "loss": 3.043672297000885, |
| "perplexity": 20.982154631438764, |
| "grad_norm": 0.9999999930456833, |
| "lr": 9.947940660912811e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29846.262143054675, |
| "step_time_ms": 68.66245746612549, |
| "elapsed_sec": 857.693733215332, |
| "progress_pct": 56.608045890255866 |
| }, |
| { |
| "step": 7600, |
| "epoch": 0, |
| "loss": 3.035440671443939, |
| "perplexity": 20.8101463164234, |
| "grad_norm": 0.9999999920525307, |
| "lr": 9.943118685617943e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29854.453670190185, |
| "step_time_ms": 68.69853019714355, |
| "elapsed_sec": 868.9143431186676, |
| "progress_pct": 57.36281983545928 |
| }, |
| { |
| "step": 7700, |
| "epoch": 0, |
| "loss": 2.9939205646514893, |
| "perplexity": 19.963798625028573, |
| "grad_norm": 0.9999999901331601, |
| "lr": 9.93808446058284e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29860.685174223207, |
| "step_time_ms": 68.79801273345947, |
| "elapsed_sec": 880.1283643245697, |
| "progress_pct": 58.117593780662695 |
| }, |
| { |
| "step": 7800, |
| "epoch": 0, |
| "loss": 3.024644274711609, |
| "perplexity": 20.586680205764154, |
| "grad_norm": 0.9999999972906101, |
| "lr": 9.932838204178041e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29871.18400819665, |
| "step_time_ms": 68.64381074905396, |
| "elapsed_sec": 891.3801338672638, |
| "progress_pct": 58.872367725866106 |
| }, |
| { |
| "step": 7900, |
| "epoch": 0, |
| "loss": 3.0337059926986694, |
| "perplexity": 20.77407868983925, |
| "grad_norm": 0.9999999996058188, |
| "lr": 9.927380143971407e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29876.69149581676, |
| "step_time_ms": 68.74019622802734, |
| "elapsed_sec": 902.5685124397278, |
| "progress_pct": 59.627141671069516 |
| }, |
| { |
| "step": 8000, |
| "epoch": 0, |
| "loss": 3.0616763854026794, |
| "perplexity": 21.36334035235707, |
| "grad_norm": 1.0000000019298847, |
| "lr": 9.921710516718252e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29880.446721366232, |
| "step_time_ms": 68.79402160644531, |
| "elapsed_sec": 913.6925647258759, |
| "progress_pct": 60.38191561627293 |
| }, |
| { |
| "step": 8100, |
| "epoch": 0, |
| "loss": 3.0444650769233705, |
| "perplexity": 20.99879545774598, |
| "grad_norm": 1.0000000104659683, |
| "lr": 9.915829568351079e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29885.601052957238, |
| "step_time_ms": 68.65685224533081, |
| "elapsed_sec": 924.8072993755341, |
| "progress_pct": 61.13668956147634 |
| }, |
| { |
| "step": 8200, |
| "epoch": 0, |
| "loss": 2.9955936670303345, |
| "perplexity": 19.997228061635678, |
| "grad_norm": 1.0000000080553764, |
| "lr": 9.909737553968903e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29894.409481672225, |
| "step_time_ms": 68.80224943161011, |
| "elapsed_sec": 936.0114979743958, |
| "progress_pct": 61.89146350667975 |
| }, |
| { |
| "step": 8300, |
| "epoch": 0, |
| "loss": 3.0462084817886352, |
| "perplexity": 21.0354367909702, |
| "grad_norm": 1.0000000055912892, |
| "lr": 9.903434737826193e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29898.611187936058, |
| "step_time_ms": 68.72424364089966, |
| "elapsed_sec": 947.1763026714325, |
| "progress_pct": 62.64623745188316 |
| }, |
| { |
| "step": 8400, |
| "epoch": 0, |
| "loss": 3.04290563583374, |
| "perplexity": 20.966074593037924, |
| "grad_norm": 1.0000000033133467, |
| "lr": 9.896921393321402e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29903.666075391204, |
| "step_time_ms": 68.6596155166626, |
| "elapsed_sec": 958.3606882095337, |
| "progress_pct": 63.40101139708657 |
| }, |
| { |
| "step": 8500, |
| "epoch": 0, |
| "loss": 3.0263629794120788, |
| "perplexity": 20.622093053197236, |
| "grad_norm": 0.9999999969913438, |
| "lr": 9.890197802985112e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29909.96927952112, |
| "step_time_ms": 68.68655204772949, |
| "elapsed_sec": 969.5727109909058, |
| "progress_pct": 64.15578534228999 |
| }, |
| { |
| "step": 8600, |
| "epoch": 0, |
| "loss": 2.982111701965332, |
| "perplexity": 19.729435373574137, |
| "grad_norm": 0.9999999969984414, |
| "lr": 9.883264258467786e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29918.46704130954, |
| "step_time_ms": 68.8025951385498, |
| "elapsed_sec": 980.8517916202545, |
| "progress_pct": 64.9105592874934 |
| }, |
| { |
| "step": 8700, |
| "epoch": 0, |
| "loss": 3.008047037124634, |
| "perplexity": 20.247818049896424, |
| "grad_norm": 1.000000000964267, |
| "lr": 9.876121060527097e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29925.865710112143, |
| "step_time_ms": 68.60105514526367, |
| "elapsed_sec": 992.0734219551086, |
| "progress_pct": 65.66533323269681 |
| }, |
| { |
| "step": 8800, |
| "epoch": 0, |
| "loss": 3.0379968214035036, |
| "perplexity": 20.86340821478766, |
| "grad_norm": 1.000000000967493, |
| "lr": 9.868768519014906e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29930.96776317607, |
| "step_time_ms": 68.73708248138428, |
| "elapsed_sec": 1003.2306418418884, |
| "progress_pct": 66.42010717790022 |
| }, |
| { |
| "step": 8900, |
| "epoch": 0, |
| "loss": 3.0114793157577515, |
| "perplexity": 20.317433604804787, |
| "grad_norm": 1.0000000049415028, |
| "lr": 9.861206952863805e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29938.500064531298, |
| "step_time_ms": 68.66981267929077, |
| "elapsed_sec": 1014.4962818622589, |
| "progress_pct": 67.17488112310363 |
| }, |
| { |
| "step": 9000, |
| "epoch": 0, |
| "loss": 3.0084391617774964, |
| "perplexity": 20.255759275393853, |
| "grad_norm": 1.0000000038688899, |
| "lr": 9.853436690073285e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29943.498254578597, |
| "step_time_ms": 68.49103927612305, |
| "elapsed_sec": 1025.6270906925201, |
| "progress_pct": 67.92965506830704 |
| }, |
| { |
| "step": 9100, |
| "epoch": 0, |
| "loss": 3.0745689249038697, |
| "perplexity": 21.640551197849472, |
| "grad_norm": 1.000000004718636, |
| "lr": 9.845458067695514e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29948.06303955987, |
| "step_time_ms": 69.01569843292236, |
| "elapsed_sec": 1036.9010162353516, |
| "progress_pct": 68.68442901351045 |
| }, |
| { |
| "step": 9200, |
| "epoch": 0, |
| "loss": 3.0796532964706422, |
| "perplexity": 21.750859988858302, |
| "grad_norm": 1.000000002997285, |
| "lr": 9.837271431820715e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29949.88973194273, |
| "step_time_ms": 68.61918210983276, |
| "elapsed_sec": 1048.0913045406342, |
| "progress_pct": 69.43920295871386 |
| }, |
| { |
| "step": 9300, |
| "epoch": 0, |
| "loss": 3.153324291706085, |
| "perplexity": 23.4137695523185, |
| "grad_norm": 0.9999999963941149, |
| "lr": 9.828877137562149e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29954.18284575665, |
| "step_time_ms": 68.8855242729187, |
| "elapsed_sec": 1059.2905225753784, |
| "progress_pct": 70.19397690391727 |
| }, |
| { |
| "step": 9400, |
| "epoch": 0, |
| "loss": 3.3674811506271363, |
| "perplexity": 29.005374796594197, |
| "grad_norm": 0.9999999980681187, |
| "lr": 9.820275549040714e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29950.33890410087, |
| "step_time_ms": 72.31015920639038, |
| "elapsed_sec": 1070.8681161403656, |
| "progress_pct": 70.94875084912069 |
| }, |
| { |
| "step": 9500, |
| "epoch": 0, |
| "loss": 3.40507611989975, |
| "perplexity": 30.116588108336245, |
| "grad_norm": 0.9999999904071671, |
| "lr": 9.811467039369153e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29961.53980985831, |
| "step_time_ms": 68.4125828742981, |
| "elapsed_sec": 1082.0211913585663, |
| "progress_pct": 71.7035247943241 |
| }, |
| { |
| "step": 9600, |
| "epoch": 0, |
| "loss": 3.374427659511566, |
| "perplexity": 29.207562326020838, |
| "grad_norm": 0.9999999853951721, |
| "lr": 9.802451990635867e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29966.307797668982, |
| "step_time_ms": 68.59289169311523, |
| "elapsed_sec": 1093.2418575286865, |
| "progress_pct": 72.4582987395275 |
| }, |
| { |
| "step": 9700, |
| "epoch": 0, |
| "loss": 3.4392935371398927, |
| "perplexity": 31.164933521357394, |
| "grad_norm": 0.999999970808979, |
| "lr": 9.793230793888339e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29969.481180441442, |
| "step_time_ms": 68.55150938034058, |
| "elapsed_sec": 1104.3426744937897, |
| "progress_pct": 73.21307268473092 |
| }, |
| { |
| "step": 9800, |
| "epoch": 0, |
| "loss": 3.418562099933624, |
| "perplexity": 30.525490844018883, |
| "grad_norm": 0.9999999525241351, |
| "lr": 9.783803849116176e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29976.895268205295, |
| "step_time_ms": 68.71764898300171, |
| "elapsed_sec": 1115.6174013614655, |
| "progress_pct": 73.96784662993433 |
| }, |
| { |
| "step": 9900, |
| "epoch": 0, |
| "loss": 3.3591481018066407, |
| "perplexity": 28.76467586317593, |
| "grad_norm": 0.9999999456205916, |
| "lr": 9.774171565233752e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29984.36230858003, |
| "step_time_ms": 68.62653970718384, |
| "elapsed_sec": 1126.913010597229, |
| "progress_pct": 74.72262057513774 |
| }, |
| { |
| "step": 10000, |
| "epoch": 0, |
| "loss": 3.265201942920685, |
| "perplexity": 26.185398412761046, |
| "grad_norm": 0.999999952462774, |
| "lr": 9.764334360062481e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29767.874544478855, |
| "step_time_ms": 68.8528823852539, |
| "elapsed_sec": 1146.5747394561768, |
| "progress_pct": 75.47739452034116 |
| }, |
| { |
| "step": 10100, |
| "epoch": 0, |
| "loss": 3.204775149822235, |
| "perplexity": 24.64995684610944, |
| "grad_norm": 0.9999999653872187, |
| "lr": 9.754292660312681e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29778.15250087888, |
| "step_time_ms": 68.64858627319336, |
| "elapsed_sec": 1157.871127128601, |
| "progress_pct": 76.23216846554458 |
| }, |
| { |
| "step": 10200, |
| "epoch": 0, |
| "loss": 3.2032279491424562, |
| "perplexity": 24.61184790481275, |
| "grad_norm": 0.9999999683836718, |
| "lr": 9.744046901565074e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29782.495889385045, |
| "step_time_ms": 68.81840467453003, |
| "elapsed_sec": 1169.0545053482056, |
| "progress_pct": 76.98694241074799 |
| }, |
| { |
| "step": 10300, |
| "epoch": 0, |
| "loss": 3.169970953464508, |
| "perplexity": 23.80679284153244, |
| "grad_norm": 0.9999999616953779, |
| "lr": 9.733597528251886e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29786.453693018226, |
| "step_time_ms": 68.88345241546631, |
| "elapsed_sec": 1180.3069396018982, |
| "progress_pct": 77.7417163559514 |
| }, |
| { |
| "step": 10400, |
| "epoch": 0, |
| "loss": 3.1525204300880434, |
| "perplexity": 23.394955684527936, |
| "grad_norm": 0.9999999453561814, |
| "lr": 9.722944993637574e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29792.717834850704, |
| "step_time_ms": 68.55859756469727, |
| "elapsed_sec": 1191.465786933899, |
| "progress_pct": 78.49649030115481 |
| }, |
| { |
| "step": 10500, |
| "epoch": 0, |
| "loss": 3.12509893655777, |
| "perplexity": 22.762146990598122, |
| "grad_norm": 0.9999999442481443, |
| "lr": 9.712089759799156e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29801.167133433617, |
| "step_time_ms": 68.67520809173584, |
| "elapsed_sec": 1202.713901758194, |
| "progress_pct": 79.25126424635822 |
| }, |
| { |
| "step": 10600, |
| "epoch": 0, |
| "loss": 3.1125614857673645, |
| "perplexity": 22.478549204294474, |
| "grad_norm": 0.9999999499257827, |
| "lr": 9.701032297606176e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29806.945147531915, |
| "step_time_ms": 68.78292798995972, |
| "elapsed_sec": 1213.9093027114868, |
| "progress_pct": 80.00603819156163 |
| }, |
| { |
| "step": 10700, |
| "epoch": 0, |
| "loss": 3.17107980966568, |
| "perplexity": 23.83320579278082, |
| "grad_norm": 0.9999999544155758, |
| "lr": 9.689773086700275e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29811.05019124841, |
| "step_time_ms": 68.5623836517334, |
| "elapsed_sec": 1225.0971624851227, |
| "progress_pct": 80.76081213676504 |
| }, |
| { |
| "step": 10800, |
| "epoch": 0, |
| "loss": 3.1426844120025637, |
| "perplexity": 23.16597047477597, |
| "grad_norm": 0.9999999623638297, |
| "lr": 9.678312615474386e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29817.68600049851, |
| "step_time_ms": 68.59636545181274, |
| "elapsed_sec": 1236.327594280243, |
| "progress_pct": 81.51558608196845 |
| }, |
| { |
| "step": 10900, |
| "epoch": 0, |
| "loss": 3.1451004695892335, |
| "perplexity": 23.22200846173652, |
| "grad_norm": 0.9999999613214944, |
| "lr": 9.666651381051555e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29821.05204868826, |
| "step_time_ms": 68.49246263504028, |
| "elapsed_sec": 1247.4301691055298, |
| "progress_pct": 82.27036002717186 |
| }, |
| { |
| "step": 11000, |
| "epoch": 0, |
| "loss": 3.1183106231689455, |
| "perplexity": 22.608153672225175, |
| "grad_norm": 0.9999999582262188, |
| "lr": 9.654789889263361e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29829.229888281192, |
| "step_time_ms": 68.70523452758789, |
| "elapsed_sec": 1258.7128846645355, |
| "progress_pct": 83.02513397237527 |
| }, |
| { |
| "step": 11100, |
| "epoch": 0, |
| "loss": 3.1134038853645323, |
| "perplexity": 22.497493103135188, |
| "grad_norm": 0.9999999537734283, |
| "lr": 9.642728654627993e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29835.080075716916, |
| "step_time_ms": 68.42459917068481, |
| "elapsed_sec": 1269.830243587494, |
| "progress_pct": 83.77990791757868 |
| }, |
| { |
| "step": 11200, |
| "epoch": 0, |
| "loss": 3.1126963663101197, |
| "perplexity": 22.481581327694126, |
| "grad_norm": 0.9999999430285232, |
| "lr": 9.630468200327917e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29840.622569238225, |
| "step_time_ms": 68.69045734405518, |
| "elapsed_sec": 1281.010505437851, |
| "progress_pct": 84.5346818627821 |
| }, |
| { |
| "step": 11300, |
| "epoch": 0, |
| "loss": 3.1477469062805175, |
| "perplexity": 23.283545427902375, |
| "grad_norm": 0.9999999335019356, |
| "lr": 9.618009058187196e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29847.154809522483, |
| "step_time_ms": 68.6976432800293, |
| "elapsed_sec": 1292.2503416538239, |
| "progress_pct": 85.2894558079855 |
| }, |
| { |
| "step": 11400, |
| "epoch": 0, |
| "loss": 3.1873206615448, |
| "perplexity": 24.223437629163133, |
| "grad_norm": 0.9999999261044604, |
| "lr": 9.605351768648408e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29852.87892273204, |
| "step_time_ms": 68.48160028457642, |
| "elapsed_sec": 1303.4236028194427, |
| "progress_pct": 86.04422975318892 |
| }, |
| { |
| "step": 11500, |
| "epoch": 0, |
| "loss": 3.12576908826828, |
| "perplexity": 22.77740619475855, |
| "grad_norm": 0.9999999320799811, |
| "lr": 9.59249688074921e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29859.745047711865, |
| "step_time_ms": 68.7877631187439, |
| "elapsed_sec": 1314.6685926914215, |
| "progress_pct": 86.79900369839233 |
| }, |
| { |
| "step": 11600, |
| "epoch": 0, |
| "loss": 3.120550537109375, |
| "perplexity": 22.65885074815001, |
| "grad_norm": 0.999999936853886, |
| "lr": 9.579444952098517e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29862.469704053565, |
| "step_time_ms": 68.95772695541382, |
| "elapsed_sec": 1325.9032957553864, |
| "progress_pct": 87.55377764359574 |
| }, |
| { |
| "step": 11700, |
| "epoch": 0, |
| "loss": 3.10750075340271, |
| "perplexity": 22.365078647061083, |
| "grad_norm": 0.9999999361651032, |
| "lr": 9.566196548852327e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29868.643271348836, |
| "step_time_ms": 68.56596231460571, |
| "elapsed_sec": 1337.0910301208496, |
| "progress_pct": 88.30855158879915 |
| }, |
| { |
| "step": 11800, |
| "epoch": 0, |
| "loss": 3.0350870537757872, |
| "perplexity": 20.802788781962846, |
| "grad_norm": 0.9999999511592819, |
| "lr": 9.552752245689144e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29873.580938625204, |
| "step_time_ms": 68.68452787399292, |
| "elapsed_sec": 1348.3001613616943, |
| "progress_pct": 89.06332553400257 |
| }, |
| { |
| "step": 11900, |
| "epoch": 0, |
| "loss": 2.983770315647125, |
| "perplexity": 19.76218603786065, |
| "grad_norm": 0.9999999564182059, |
| "lr": 9.539112625785066e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29876.49008130546, |
| "step_time_ms": 68.60888242721558, |
| "elapsed_sec": 1359.4474079608917, |
| "progress_pct": 89.81809947920598 |
| }, |
| { |
| "step": 12000, |
| "epoch": 0, |
| "loss": 2.9881193041801453, |
| "perplexity": 19.848318717587674, |
| "grad_norm": 0.9999999694847291, |
| "lr": 9.525278280788481e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29880.676071785907, |
| "step_time_ms": 68.67148876190186, |
| "elapsed_sec": 1370.6449580192566, |
| "progress_pct": 90.5728734244094 |
| }, |
| { |
| "step": 12100, |
| "epoch": 0, |
| "loss": 3.010297691822052, |
| "perplexity": 20.293440217317546, |
| "grad_norm": 0.9999999756513553, |
| "lr": 9.511249810794403e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29890.41306297647, |
| "step_time_ms": 68.7269401550293, |
| "elapsed_sec": 1381.9189755916595, |
| "progress_pct": 91.3276473696128 |
| }, |
| { |
| "step": 12200, |
| "epoch": 0, |
| "loss": 3.044492042064667, |
| "perplexity": 20.99936170086693, |
| "grad_norm": 0.9999999580831208, |
| "lr": 9.497027824318445e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29895.381058749288, |
| "step_time_ms": 68.52395057678223, |
| "elapsed_sec": 1393.132401227951, |
| "progress_pct": 92.08242131481622 |
| }, |
| { |
| "step": 12300, |
| "epoch": 0, |
| "loss": 3.0311736488342285, |
| "perplexity": 20.721538132586037, |
| "grad_norm": 0.9999999508302908, |
| "lr": 9.482612938270422e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29901.363092922238, |
| "step_time_ms": 68.59883308410645, |
| "elapsed_sec": 1404.3868458271027, |
| "progress_pct": 92.83719526001963 |
| }, |
| { |
| "step": 12400, |
| "epoch": 0, |
| "loss": 3.026858425140381, |
| "perplexity": 20.632312712543328, |
| "grad_norm": 0.9999999444981733, |
| "lr": 9.468005777927585e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29905.344204329296, |
| "step_time_ms": 68.69802236557007, |
| "elapsed_sec": 1415.589993238449, |
| "progress_pct": 93.59196920522304 |
| }, |
| { |
| "step": 12500, |
| "epoch": 0, |
| "loss": 3.0245711183547974, |
| "perplexity": 20.58517421432855, |
| "grad_norm": 0.9999999515947503, |
| "lr": 9.453206976907509e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29900.696944622938, |
| "step_time_ms": 72.52441644668579, |
| "elapsed_sec": 1427.1709144115448, |
| "progress_pct": 94.34674315042645 |
| }, |
| { |
| "step": 12600, |
| "epoch": 0, |
| "loss": 2.980508255958557, |
| "perplexity": 19.6978256382329, |
| "grad_norm": 0.9999999723068747, |
| "lr": 9.438217177140604e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29907.388512661983, |
| "step_time_ms": 68.8193130493164, |
| "elapsed_sec": 1438.4461545944214, |
| "progress_pct": 95.10151709562986 |
| }, |
| { |
| "step": 12700, |
| "epoch": 0, |
| "loss": 2.958230459690094, |
| "perplexity": 19.263853412376523, |
| "grad_norm": 0.9999999839033161, |
| "lr": 9.423037028842264e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29910.816981131564, |
| "step_time_ms": 68.63985061645508, |
| "elapsed_sec": 1449.6005918979645, |
| "progress_pct": 95.85629104083327 |
| }, |
| { |
| "step": 12800, |
| "epoch": 0, |
| "loss": 3.013811137676239, |
| "perplexity": 20.364865521711458, |
| "grad_norm": 0.9999999886467441, |
| "lr": 9.407667190484671e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29915.38890613257, |
| "step_time_ms": 68.79305124282837, |
| "elapsed_sec": 1460.799962759018, |
| "progress_pct": 96.61106498603668 |
| }, |
| { |
| "step": 12900, |
| "epoch": 0, |
| "loss": 3.0150211238861084, |
| "perplexity": 20.389521641933122, |
| "grad_norm": 0.9999999887120172, |
| "lr": 9.392108328768228e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29917.230695050563, |
| "step_time_ms": 68.78490209579468, |
| "elapsed_sec": 1472.0201361179352, |
| "progress_pct": 97.36583893124009 |
| }, |
| { |
| "step": 13000, |
| "epoch": 0, |
| "loss": 3.05732549905777, |
| "perplexity": 21.270592799877004, |
| "grad_norm": 0.9999999784801565, |
| "lr": 9.376361118592639e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29922.00874517518, |
| "step_time_ms": 68.64551305770874, |
| "elapsed_sec": 1483.2417628765106, |
| "progress_pct": 98.1206128764435 |
| }, |
| { |
| "step": 13100, |
| "epoch": 0, |
| "loss": 3.0526057434082032, |
| "perplexity": 21.170437339911786, |
| "grad_norm": 0.9999999756636413, |
| "lr": 9.360426243027637e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29925.620308368234, |
| "step_time_ms": 68.60826253890991, |
| "elapsed_sec": 1494.4739503860474, |
| "progress_pct": 98.87538682164691 |
| }, |
| { |
| "step": 13200, |
| "epoch": 0, |
| "loss": 3.0945112991333006, |
| "perplexity": 22.076447122031155, |
| "grad_norm": 0.9999999676132567, |
| "lr": 9.344304393283349e-05, |
| "cross_attn_scale": 1, |
| "tokens_per_sec": 29928.514932177743, |
| "step_time_ms": 68.54766130447388, |
| "elapsed_sec": 1505.5981595516205, |
| "progress_pct": 99.63016076685032 |
| } |
| ], |
| "checkpoint": { |
| "pct": 100, |
| "step": 13249, |
| "final_loss": 3.102254876783655, |
| "total_tokens": 45225439, |
| "total_time_sec": 1511.2042155265808 |
| }, |
| "final": { |
| "total_time_sec": 1518.5682985782623, |
| "total_tokens": 45225439, |
| "final_loss": 3.102254876783655, |
| "final_perplexity": 22.248061401271826, |
| "avg_tokens_per_sec": 29781.62986962237 |
| } |
| } |