{ "config": { "epochs": 1, "batch_size": 16, "gradient_accumulation_steps": 4, "effective_batch_size": 64, "cross_max_length": 64, "lora_r": 8, "lora_alpha": 16, "target_modules": [ "query_key_value" ], "total_steps": 13249, "start_time": "2026-01-11T03:45:55.374803" }, "steps": [ { "step": 100, "epoch": 0, "loss": 3.0553906083106996, "perplexity": 21.229476317462176, "grad_norm": 0.9999998862745418, "lr": 1.8882175226586103e-06, "cross_attn_scale": 0.0755287009063444, "tokens_per_sec": 29659.667067018858, "step_time_ms": 72.3056697845459, "elapsed_sec": 11.671439170837402, "progress_pct": 0.7547739452034116 }, { "step": 200, "epoch": 0, "loss": 3.0449283146858215, "perplexity": 21.00852514617355, "grad_norm": 0.9999998771969448, "lr": 3.7764350453172205e-06, "cross_attn_scale": 0.1510574018126888, "tokens_per_sec": 29998.955284666587, "step_time_ms": 68.73083829879761, "elapsed_sec": 22.878730058670044, "progress_pct": 1.5095478904068231 }, { "step": 300, "epoch": 0, "loss": 3.047647387981415, "perplexity": 21.065726598105954, "grad_norm": 0.9999998694186822, "lr": 5.664652567975831e-06, "cross_attn_scale": 0.22658610271903323, "tokens_per_sec": 30055.703757973926, "step_time_ms": 68.56269359588623, "elapsed_sec": 33.98429822921753, "progress_pct": 2.2643218356102346 }, { "step": 400, "epoch": 0, "loss": 3.0421357417106627, "perplexity": 20.949939147514087, "grad_norm": 0.9999998708066421, "lr": 7.552870090634441e-06, "cross_attn_scale": 0.3021148036253776, "tokens_per_sec": 30210.86789891559, "step_time_ms": 68.70847702026367, "elapsed_sec": 45.12389397621155, "progress_pct": 3.0190957808136463 }, { "step": 500, "epoch": 0, "loss": 3.0340122056007384, "perplexity": 20.780440954816772, "grad_norm": 0.9999998721696909, "lr": 9.441087613293053e-06, "cross_attn_scale": 0.3776435045317221, "tokens_per_sec": 30287.550862030017, "step_time_ms": 68.67650508880615, "elapsed_sec": 56.31241059303284, "progress_pct": 3.773869726017058 }, { "step": 600, "epoch": 0, "loss": 3.0240328454971315, "perplexity": 20.574096755393853, "grad_norm": 0.9999998774025239, "lr": 1.1329305135951662e-05, "cross_attn_scale": 0.45317220543806647, "tokens_per_sec": 30337.509813968918, "step_time_ms": 68.6249566078186, "elapsed_sec": 67.54812812805176, "progress_pct": 4.528643671220469 }, { "step": 700, "epoch": 0, "loss": 3.0103702998161315, "perplexity": 20.294913736798694, "grad_norm": 0.9999998854215264, "lr": 1.3217522658610274e-05, "cross_attn_scale": 0.5287009063444109, "tokens_per_sec": 30390.876870190146, "step_time_ms": 68.71308088302612, "elapsed_sec": 78.77034974098206, "progress_pct": 5.283417616423881 }, { "step": 800, "epoch": 0, "loss": 3.005305087566376, "perplexity": 20.192375599076605, "grad_norm": 0.9999998881305956, "lr": 1.5105740181268882e-05, "cross_attn_scale": 0.6042296072507553, "tokens_per_sec": 30399.323564352733, "step_time_ms": 68.67244720458984, "elapsed_sec": 89.90821766853333, "progress_pct": 6.0381915616272925 }, { "step": 900, "epoch": 0, "loss": 2.9808620524406435, "perplexity": 19.704795892601492, "grad_norm": 0.999999884566103, "lr": 1.6993957703927492e-05, "cross_attn_scale": 0.6797583081570997, "tokens_per_sec": 30464.53898853465, "step_time_ms": 68.634774684906, "elapsed_sec": 101.1823616027832, "progress_pct": 6.792965506830704 }, { "step": 1000, "epoch": 0, "loss": 2.9657063841819764, "perplexity": 19.408408192965855, "grad_norm": 0.9999998912970435, "lr": 1.8882175226586106e-05, "cross_attn_scale": 0.7552870090634441, "tokens_per_sec": 30482.017177706704, "step_time_ms": 68.62752199172974, "elapsed_sec": 112.40210843086243, "progress_pct": 7.547739452034116 }, { "step": 1100, "epoch": 0, "loss": 2.9404901576042177, "perplexity": 18.925120330844894, "grad_norm": 0.9999998957905454, "lr": 2.0770392749244713e-05, "cross_attn_scale": 0.8308157099697885, "tokens_per_sec": 30520.18917730466, "step_time_ms": 68.61571073532104, "elapsed_sec": 123.61365056037903, "progress_pct": 8.302513397237528 }, { "step": 1200, "epoch": 0, "loss": 2.916158149242401, "perplexity": 18.470191258417472, "grad_norm": 0.9999999059134038, "lr": 2.2658610271903323e-05, "cross_attn_scale": 0.9063444108761329, "tokens_per_sec": 30514.05504365179, "step_time_ms": 68.67437601089478, "elapsed_sec": 134.80070066452026, "progress_pct": 9.057287342440938 }, { "step": 1300, "epoch": 0, "loss": 2.9002408885955813, "perplexity": 18.17852384113777, "grad_norm": 0.9999999221452138, "lr": 2.4546827794561937e-05, "cross_attn_scale": 0.9818731117824774, "tokens_per_sec": 30518.41110587493, "step_time_ms": 68.52020263671875, "elapsed_sec": 145.9552721977234, "progress_pct": 9.812061287644351 }, { "step": 1400, "epoch": 0, "loss": 2.872870879173279, "perplexity": 17.687724699355996, "grad_norm": 0.9999999227685785, "lr": 2.6435045317220547e-05, "cross_attn_scale": 1, "tokens_per_sec": 30511.622849857926, "step_time_ms": 68.70761156082153, "elapsed_sec": 157.15584921836853, "progress_pct": 10.566835232847762 }, { "step": 1500, "epoch": 0, "loss": 2.8776234197616577, "perplexity": 17.771986398880262, "grad_norm": 0.9999999237417435, "lr": 2.8323262839879154e-05, "cross_attn_scale": 1, "tokens_per_sec": 30493.903728302623, "step_time_ms": 68.75503063201904, "elapsed_sec": 168.2848823070526, "progress_pct": 11.321609178051174 }, { "step": 1600, "epoch": 0, "loss": 2.8588545417785642, "perplexity": 17.44153693803145, "grad_norm": 0.9999999207595216, "lr": 3.0211480362537764e-05, "cross_attn_scale": 1, "tokens_per_sec": 30505.121130819396, "step_time_ms": 68.64941120147705, "elapsed_sec": 179.4573106765747, "progress_pct": 12.076383123254585 }, { "step": 1700, "epoch": 0, "loss": 2.8620681262016294, "perplexity": 17.4976769464644, "grad_norm": 0.9999999136486478, "lr": 3.209969788519638e-05, "cross_attn_scale": 1, "tokens_per_sec": 30496.028601579812, "step_time_ms": 68.41788530349731, "elapsed_sec": 190.5410726070404, "progress_pct": 12.831157068457998 }, { "step": 1800, "epoch": 0, "loss": 2.851705029010773, "perplexity": 17.317283153287743, "grad_norm": 0.9999999101516841, "lr": 3.3987915407854985e-05, "cross_attn_scale": 1, "tokens_per_sec": 30486.163746654358, "step_time_ms": 68.65042924880981, "elapsed_sec": 201.64810013771057, "progress_pct": 13.585931013661408 }, { "step": 1900, "epoch": 0, "loss": 2.839317078590393, "perplexity": 17.104080804749838, "grad_norm": 0.9999999094789626, "lr": 3.5876132930513595e-05, "cross_attn_scale": 1, "tokens_per_sec": 30499.8027812949, "step_time_ms": 68.49920272827148, "elapsed_sec": 212.8020317554474, "progress_pct": 14.34070495886482 }, { "step": 2000, "epoch": 0, "loss": 2.833750886917114, "perplexity": 17.009140685170195, "grad_norm": 0.9999999050466546, "lr": 3.776435045317221e-05, "cross_attn_scale": 1, "tokens_per_sec": 30498.478093732738, "step_time_ms": 68.63363027572632, "elapsed_sec": 223.98661923408508, "progress_pct": 15.095478904068232 }, { "step": 2100, "epoch": 0, "loss": 2.833835325241089, "perplexity": 17.010576969139752, "grad_norm": 0.9999999051555772, "lr": 3.9652567975830816e-05, "cross_attn_scale": 1, "tokens_per_sec": 30496.763241222277, "step_time_ms": 68.50829124450684, "elapsed_sec": 235.1098358631134, "progress_pct": 15.850252849271643 }, { "step": 2200, "epoch": 0, "loss": 2.830011742115021, "perplexity": 16.9456598012593, "grad_norm": 0.9999999086205261, "lr": 4.1540785498489426e-05, "cross_attn_scale": 1, "tokens_per_sec": 30504.708020780217, "step_time_ms": 68.68033170700073, "elapsed_sec": 246.3288288116455, "progress_pct": 16.605026794475055 }, { "step": 2300, "epoch": 0, "loss": 2.826364483833313, "perplexity": 16.88396717606802, "grad_norm": 0.9999999082911996, "lr": 4.342900302114804e-05, "cross_attn_scale": 1, "tokens_per_sec": 30501.204524633395, "step_time_ms": 68.73678207397461, "elapsed_sec": 257.5635986328125, "progress_pct": 17.359800739678466 }, { "step": 2400, "epoch": 0, "loss": 2.8305963826179505, "perplexity": 16.95556981694399, "grad_norm": 0.9999999116385626, "lr": 4.5317220543806646e-05, "cross_attn_scale": 1, "tokens_per_sec": 30500.794054619466, "step_time_ms": 68.69770526885986, "elapsed_sec": 268.69559478759766, "progress_pct": 18.114574684881877 }, { "step": 2500, "epoch": 0, "loss": 2.823909387588501, "perplexity": 16.842566254075912, "grad_norm": 0.9999999056484468, "lr": 4.720543806646526e-05, "cross_attn_scale": 1, "tokens_per_sec": 30494.381494274585, "step_time_ms": 68.81577491760254, "elapsed_sec": 279.87601590156555, "progress_pct": 18.86934863008529 }, { "step": 2600, "epoch": 0, "loss": 2.8317417907714844, "perplexity": 16.975001991621266, "grad_norm": 0.9999999060478559, "lr": 4.9093655589123874e-05, "cross_attn_scale": 1, "tokens_per_sec": 30493.046658925927, "step_time_ms": 68.76836061477661, "elapsed_sec": 291.05359983444214, "progress_pct": 19.624122575288702 }, { "step": 2700, "epoch": 0, "loss": 2.8178824305534365, "perplexity": 16.741362113751652, "grad_norm": 0.9999999106669464, "lr": 5.098187311178248e-05, "cross_attn_scale": 1, "tokens_per_sec": 30497.10975803011, "step_time_ms": 68.62552165985107, "elapsed_sec": 302.2955641746521, "progress_pct": 20.378896520492113 }, { "step": 2800, "epoch": 0, "loss": 2.8133517289161682, "perplexity": 16.665683564931342, "grad_norm": 0.9999999143853615, "lr": 5.2870090634441094e-05, "cross_attn_scale": 1, "tokens_per_sec": 30502.20993387066, "step_time_ms": 68.7237548828125, "elapsed_sec": 313.51184129714966, "progress_pct": 21.133670465695523 }, { "step": 2900, "epoch": 0, "loss": 2.8192299604415894, "perplexity": 16.763936806188713, "grad_norm": 0.9999999153165114, "lr": 5.4758308157099705e-05, "cross_attn_scale": 1, "tokens_per_sec": 30495.965582028683, "step_time_ms": 68.75383377075195, "elapsed_sec": 324.68157052993774, "progress_pct": 21.888444410898934 }, { "step": 3000, "epoch": 0, "loss": 2.814334969520569, "perplexity": 16.682078000228916, "grad_norm": 0.9999999103789803, "lr": 5.664652567975831e-05, "cross_attn_scale": 1, "tokens_per_sec": 30497.05613414167, "step_time_ms": 68.89568090438843, "elapsed_sec": 335.9107172489166, "progress_pct": 22.64321835610235 }, { "step": 3100, "epoch": 0, "loss": 2.812694685459137, "perplexity": 16.654737083138976, "grad_norm": 0.9999999103699377, "lr": 5.853474320241692e-05, "cross_attn_scale": 1, "tokens_per_sec": 30503.323091845978, "step_time_ms": 68.7230896949768, "elapsed_sec": 347.12381887435913, "progress_pct": 23.39799230130576 }, { "step": 3200, "epoch": 0, "loss": 2.8209491848945616, "perplexity": 16.79278256533621, "grad_norm": 0.9999999116175871, "lr": 6.042296072507553e-05, "cross_attn_scale": 1, "tokens_per_sec": 30471.149506940925, "step_time_ms": 72.36854791641235, "elapsed_sec": 358.6222107410431, "progress_pct": 24.15276624650917 }, { "step": 3300, "epoch": 0, "loss": 2.823811297416687, "perplexity": 16.84091424488252, "grad_norm": 0.9999999183970942, "lr": 6.231117824773414e-05, "cross_attn_scale": 1, "tokens_per_sec": 30475.15581021754, "step_time_ms": 68.7209701538086, "elapsed_sec": 369.78560733795166, "progress_pct": 24.90754019171258 }, { "step": 3400, "epoch": 0, "loss": 2.8320892524719237, "perplexity": 16.98090117948981, "grad_norm": 0.9999999248059107, "lr": 6.419939577039276e-05, "cross_attn_scale": 1, "tokens_per_sec": 29670.286366915152, "step_time_ms": 68.97594928741455, "elapsed_sec": 391.05166888237, "progress_pct": 25.662314136915995 }, { "step": 3500, "epoch": 0, "loss": 2.815865466594696, "perplexity": 16.70762941999754, "grad_norm": 0.9999999286289808, "lr": 6.608761329305136e-05, "cross_attn_scale": 1, "tokens_per_sec": 29686.79626575453, "step_time_ms": 69.19201135635376, "elapsed_sec": 402.31939792633057, "progress_pct": 26.417088082119406 }, { "step": 3600, "epoch": 0, "loss": 2.846110601425171, "perplexity": 17.220673356933453, "grad_norm": 0.9999999358015611, "lr": 6.797583081570997e-05, "cross_attn_scale": 1, "tokens_per_sec": 29707.146446527197, "step_time_ms": 68.82575511932373, "elapsed_sec": 413.6111161708832, "progress_pct": 27.171862027322817 }, { "step": 3700, "epoch": 0, "loss": 2.8651936769485475, "perplexity": 17.55245238082157, "grad_norm": 0.9999999406451835, "lr": 6.986404833836858e-05, "cross_attn_scale": 1, "tokens_per_sec": 29731.035920613755, "step_time_ms": 68.87631177902222, "elapsed_sec": 424.8875832557678, "progress_pct": 27.926635972526228 }, { "step": 3800, "epoch": 0, "loss": 2.8802473044395445, "perplexity": 17.81867927325758, "grad_norm": 0.999999938128731, "lr": 7.175226586102719e-05, "cross_attn_scale": 1, "tokens_per_sec": 29740.50111024254, "step_time_ms": 68.9161491394043, "elapsed_sec": 436.10438680648804, "progress_pct": 28.68140991772964 }, { "step": 3900, "epoch": 0, "loss": 2.8760248684883116, "perplexity": 17.7435996622643, "grad_norm": 0.9999999370933762, "lr": 7.36404833836858e-05, "cross_attn_scale": 1, "tokens_per_sec": 29756.389974960584, "step_time_ms": 68.69559288024902, "elapsed_sec": 447.3066461086273, "progress_pct": 29.436183862933053 }, { "step": 4000, "epoch": 0, "loss": 2.833030352592468, "perplexity": 16.996889429728867, "grad_norm": 0.9999999344961908, "lr": 7.552870090634442e-05, "cross_attn_scale": 1, "tokens_per_sec": 29775.895470741994, "step_time_ms": 68.65659475326538, "elapsed_sec": 458.5267641544342, "progress_pct": 30.190957808136464 }, { "step": 4100, "epoch": 0, "loss": 2.8502792358398437, "perplexity": 17.29260988289878, "grad_norm": 0.9999999411152237, "lr": 7.741691842900302e-05, "cross_attn_scale": 1, "tokens_per_sec": 29789.044694580516, "step_time_ms": 68.78466129302979, "elapsed_sec": 469.70932245254517, "progress_pct": 30.945731753339874 }, { "step": 4200, "epoch": 0, "loss": 2.845259420871735, "perplexity": 17.206021691146056, "grad_norm": 0.9999999597855691, "lr": 7.930513595166163e-05, "cross_attn_scale": 1, "tokens_per_sec": 29810.506707537246, "step_time_ms": 68.6014461517334, "elapsed_sec": 480.918527841568, "progress_pct": 31.700505698543285 }, { "step": 4300, "epoch": 0, "loss": 2.851608917713165, "perplexity": 17.315618846713456, "grad_norm": 0.9999999759995412, "lr": 8.119335347432024e-05, "cross_attn_scale": 1, "tokens_per_sec": 29823.229438646857, "step_time_ms": 68.48395347595215, "elapsed_sec": 492.0337359905243, "progress_pct": 32.4552796437467 }, { "step": 4400, "epoch": 0, "loss": 2.8928983926773073, "perplexity": 18.045536929430227, "grad_norm": 0.9999999802354099, "lr": 8.308157099697885e-05, "cross_attn_scale": 1, "tokens_per_sec": 29839.21784462899, "step_time_ms": 68.63518953323364, "elapsed_sec": 503.2420446872711, "progress_pct": 33.21005358895011 }, { "step": 4500, "epoch": 0, "loss": 2.8919497418403624, "perplexity": 18.028426133087862, "grad_norm": 0.9999999832088211, "lr": 8.496978851963746e-05, "cross_attn_scale": 1, "tokens_per_sec": 29851.35171674971, "step_time_ms": 68.82947206497192, "elapsed_sec": 514.4808900356293, "progress_pct": 33.96482753415352 }, { "step": 4600, "epoch": 0, "loss": 2.912613160610199, "perplexity": 18.404830560286392, "grad_norm": 0.9999999852952282, "lr": 8.685800604229609e-05, "cross_attn_scale": 1, "tokens_per_sec": 29868.9929982354, "step_time_ms": 68.60780239105225, "elapsed_sec": 525.6990418434143, "progress_pct": 34.71960147935693 }, { "step": 4700, "epoch": 0, "loss": 2.9198369789123535, "perplexity": 18.538265085260896, "grad_norm": 0.9999999861072904, "lr": 8.874622356495468e-05, "cross_attn_scale": 1, "tokens_per_sec": 29881.651406257948, "step_time_ms": 68.78975629806519, "elapsed_sec": 536.9214298725128, "progress_pct": 35.47437542456034 }, { "step": 4800, "epoch": 0, "loss": 2.9220591092109682, "perplexity": 18.57950532941875, "grad_norm": 0.9999999939528834, "lr": 9.063444108761329e-05, "cross_attn_scale": 1, "tokens_per_sec": 29894.195528337976, "step_time_ms": 68.80178928375244, "elapsed_sec": 548.1407246589661, "progress_pct": 36.22914936976375 }, { "step": 4900, "epoch": 0, "loss": 2.9370435547828673, "perplexity": 18.86000523511343, "grad_norm": 1.0000000011274732, "lr": 9.25226586102719e-05, "cross_attn_scale": 1, "tokens_per_sec": 29899.872434533, "step_time_ms": 68.81359100341797, "elapsed_sec": 559.2738242149353, "progress_pct": 36.983923314967164 }, { "step": 5000, "epoch": 0, "loss": 2.9583297395706176, "perplexity": 19.265766020381903, "grad_norm": 0.9999999952426983, "lr": 9.441087613293051e-05, "cross_attn_scale": 1, "tokens_per_sec": 29914.890276799215, "step_time_ms": 68.60477924346924, "elapsed_sec": 570.4796454906464, "progress_pct": 37.73869726017058 }, { "step": 5100, "epoch": 0, "loss": 2.9550197100639344, "perplexity": 19.20210119074545, "grad_norm": 0.9999999955847756, "lr": 9.629909365558912e-05, "cross_attn_scale": 1, "tokens_per_sec": 29927.735715089886, "step_time_ms": 68.55671644210815, "elapsed_sec": 581.667058467865, "progress_pct": 38.49347120537399 }, { "step": 5200, "epoch": 0, "loss": 2.9828960919380187, "perplexity": 19.744917015876656, "grad_norm": 0.9999999973577894, "lr": 9.818731117824775e-05, "cross_attn_scale": 1, "tokens_per_sec": 29941.160552589776, "step_time_ms": 68.76250982284546, "elapsed_sec": 592.8996963500977, "progress_pct": 39.248245150577404 }, { "step": 5300, "epoch": 0, "loss": 2.9812774753570555, "perplexity": 19.712983416903363, "grad_norm": 0.9999999977527527, "lr": 9.999999828225707e-05, "cross_attn_scale": 1, "tokens_per_sec": 29953.48526766276, "step_time_ms": 68.57383966445923, "elapsed_sec": 604.1068956851959, "progress_pct": 40.003019095780814 }, { "step": 5400, "epoch": 0, "loss": 2.9874549293518067, "perplexity": 19.835136373740465, "grad_norm": 1.000000005050804, "lr": 9.99988388103099e-05, "cross_attn_scale": 1, "tokens_per_sec": 29965.8283068889, "step_time_ms": 68.63211393356323, "elapsed_sec": 615.2640204429626, "progress_pct": 40.757793040984225 }, { "step": 5500, "epoch": 0, "loss": 2.9855289316177367, "perplexity": 19.7969707113172, "grad_norm": 1.0000000047012423, "lr": 9.999553221781354e-05, "cross_attn_scale": 1, "tokens_per_sec": 29979.453810055362, "step_time_ms": 68.55385541915894, "elapsed_sec": 626.467483997345, "progress_pct": 41.512566986187636 }, { "step": 5600, "epoch": 0, "loss": 3.001195156574249, "perplexity": 20.10955663548813, "grad_norm": 1.0000000009146937, "lr": 9.999007864819872e-05, "cross_attn_scale": 1, "tokens_per_sec": 29985.720598712633, "step_time_ms": 68.79624128341675, "elapsed_sec": 637.6670501232147, "progress_pct": 42.26734093139105 }, { "step": 5700, "epoch": 0, "loss": 2.9869230651855467, "perplexity": 19.824589580449725, "grad_norm": 0.9999999955026968, "lr": 9.998247833802596e-05, "cross_attn_scale": 1, "tokens_per_sec": 30003.619229602187, "step_time_ms": 68.55745553970337, "elapsed_sec": 648.9761068820953, "progress_pct": 43.02211487659446 }, { "step": 5800, "epoch": 0, "loss": 2.992214176654816, "perplexity": 19.929761687056967, "grad_norm": 0.9999999920373142, "lr": 9.997273161697535e-05, "cross_attn_scale": 1, "tokens_per_sec": 30007.238330623877, "step_time_ms": 68.54370355606079, "elapsed_sec": 660.0942339897156, "progress_pct": 43.77688882179787 }, { "step": 5900, "epoch": 0, "loss": 2.9899471187591553, "perplexity": 19.884630939803454, "grad_norm": 0.9999999905480129, "lr": 9.996083890783225e-05, "cross_attn_scale": 1, "tokens_per_sec": 30016.763888893925, "step_time_ms": 68.59871864318848, "elapsed_sec": 671.2416126728058, "progress_pct": 44.531662767001286 }, { "step": 6000, "epoch": 0, "loss": 2.973553490638733, "perplexity": 19.561307160862828, "grad_norm": 0.9999999961927993, "lr": 9.9946800726469e-05, "cross_attn_scale": 1, "tokens_per_sec": 30019.099421691335, "step_time_ms": 68.57965469360352, "elapsed_sec": 682.3533148765564, "progress_pct": 45.2864367122047 }, { "step": 6100, "epoch": 0, "loss": 2.9827182602882387, "perplexity": 19.74140605689802, "grad_norm": 0.9999999982933864, "lr": 9.993061768182244e-05, "cross_attn_scale": 1, "tokens_per_sec": 30026.30841714172, "step_time_ms": 68.58452320098877, "elapsed_sec": 693.5168223381042, "progress_pct": 46.04121065740811 }, { "step": 6200, "epoch": 0, "loss": 2.988657066822052, "perplexity": 19.858995272367505, "grad_norm": 0.9999999972547158, "lr": 9.991229047586758e-05, "cross_attn_scale": 1, "tokens_per_sec": 30032.30921841359, "step_time_ms": 68.62870931625366, "elapsed_sec": 704.7209339141846, "progress_pct": 46.79598460261152 }, { "step": 6300, "epoch": 0, "loss": 3.0013710594177248, "perplexity": 20.11309427481263, "grad_norm": 1.0000000035538437, "lr": 9.989181990358713e-05, "cross_attn_scale": 1, "tokens_per_sec": 30027.30892734739, "step_time_ms": 72.44897842407227, "elapsed_sec": 716.3112769126892, "progress_pct": 47.55075854781493 }, { "step": 6400, "epoch": 0, "loss": 3.0535635590553283, "perplexity": 21.1907244301477, "grad_norm": 1.0000000054736338, "lr": 9.9869206852937e-05, "cross_attn_scale": 1, "tokens_per_sec": 30031.377185455563, "step_time_ms": 68.55860948562622, "elapsed_sec": 727.4697349071503, "progress_pct": 48.30553249301834 }, { "step": 6500, "epoch": 0, "loss": 3.0514983248710634, "perplexity": 21.147005781827758, "grad_norm": 1.0000000088419636, "lr": 9.984445230480777e-05, "cross_attn_scale": 1, "tokens_per_sec": 30037.350771694248, "step_time_ms": 68.51820468902588, "elapsed_sec": 738.6134738922119, "progress_pct": 49.06030643822175 }, { "step": 6600, "epoch": 0, "loss": 3.073709189891815, "perplexity": 21.62195405375797, "grad_norm": 1.0000000080646183, "lr": 9.981755733298221e-05, "cross_attn_scale": 1, "tokens_per_sec": 30044.33637658769, "step_time_ms": 68.57419729232788, "elapsed_sec": 749.8031148910522, "progress_pct": 49.81508038342516 }, { "step": 6700, "epoch": 0, "loss": 3.060198895931244, "perplexity": 21.331799548252498, "grad_norm": 1.0000000045052264, "lr": 9.978852310408861e-05, "cross_attn_scale": 1, "tokens_per_sec": 29765.687951709533, "step_time_ms": 68.89270305633545, "elapsed_sec": 768.0259914398193, "progress_pct": 50.56985432862857 }, { "step": 6800, "epoch": 0, "loss": 3.0322815942764283, "perplexity": 20.744509189304374, "grad_norm": 1.0000000008077732, "lr": 9.975735087755025e-05, "cross_attn_scale": 1, "tokens_per_sec": 29777.555021316624, "step_time_ms": 68.59505653381348, "elapsed_sec": 779.2872176170349, "progress_pct": 51.32462827383199 }, { "step": 6900, "epoch": 0, "loss": 3.022260241508484, "perplexity": 20.537659333523006, "grad_norm": 0.9999999955198917, "lr": 9.972404200553072e-05, "cross_attn_scale": 1, "tokens_per_sec": 29787.611626839698, "step_time_ms": 68.62642526626587, "elapsed_sec": 790.5256149768829, "progress_pct": 52.0794022190354 }, { "step": 7000, "epoch": 0, "loss": 3.0193919491767884, "perplexity": 20.478835724727677, "grad_norm": 0.9999999978591363, "lr": 9.968859793287533e-05, "cross_attn_scale": 1, "tokens_per_sec": 29800.945444945188, "step_time_ms": 68.56673240661621, "elapsed_sec": 801.7361745834351, "progress_pct": 52.83417616423881 }, { "step": 7100, "epoch": 0, "loss": 3.0333949518203736, "perplexity": 20.767618106962548, "grad_norm": 1.0000000012009478, "lr": 9.965102019704832e-05, "cross_attn_scale": 1, "tokens_per_sec": 29810.921233546793, "step_time_ms": 68.66058826446533, "elapsed_sec": 812.898763179779, "progress_pct": 53.58895010944222 }, { "step": 7200, "epoch": 0, "loss": 3.070095462799072, "perplexity": 21.543959223489487, "grad_norm": 1.0000000010922643, "lr": 9.961131042806633e-05, "cross_attn_scale": 1, "tokens_per_sec": 29820.53550024627, "step_time_ms": 68.54245662689209, "elapsed_sec": 824.1274540424347, "progress_pct": 54.343724054645634 }, { "step": 7300, "epoch": 0, "loss": 3.1172192811965944, "perplexity": 22.58349390377302, "grad_norm": 1.0000000016193091, "lr": 9.956947034842753e-05, "cross_attn_scale": 1, "tokens_per_sec": 29828.650771642715, "step_time_ms": 68.64347219467163, "elapsed_sec": 835.327624797821, "progress_pct": 55.098497999849045 }, { "step": 7400, "epoch": 0, "loss": 3.0517469382286073, "perplexity": 21.15226386352477, "grad_norm": 0.9999999990690044, "lr": 9.952550177303704e-05, "cross_attn_scale": 1, "tokens_per_sec": 29836.492375958653, "step_time_ms": 68.64686489105225, "elapsed_sec": 846.4573409557343, "progress_pct": 55.853271945052455 }, { "step": 7500, "epoch": 0, "loss": 3.043672297000885, "perplexity": 20.982154631438764, "grad_norm": 0.9999999930456833, "lr": 9.947940660912811e-05, "cross_attn_scale": 1, "tokens_per_sec": 29846.262143054675, "step_time_ms": 68.66245746612549, "elapsed_sec": 857.693733215332, "progress_pct": 56.608045890255866 }, { "step": 7600, "epoch": 0, "loss": 3.035440671443939, "perplexity": 20.8101463164234, "grad_norm": 0.9999999920525307, "lr": 9.943118685617943e-05, "cross_attn_scale": 1, "tokens_per_sec": 29854.453670190185, "step_time_ms": 68.69853019714355, "elapsed_sec": 868.9143431186676, "progress_pct": 57.36281983545928 }, { "step": 7700, "epoch": 0, "loss": 2.9939205646514893, "perplexity": 19.963798625028573, "grad_norm": 0.9999999901331601, "lr": 9.93808446058284e-05, "cross_attn_scale": 1, "tokens_per_sec": 29860.685174223207, "step_time_ms": 68.79801273345947, "elapsed_sec": 880.1283643245697, "progress_pct": 58.117593780662695 }, { "step": 7800, "epoch": 0, "loss": 3.024644274711609, "perplexity": 20.586680205764154, "grad_norm": 0.9999999972906101, "lr": 9.932838204178041e-05, "cross_attn_scale": 1, "tokens_per_sec": 29871.18400819665, "step_time_ms": 68.64381074905396, "elapsed_sec": 891.3801338672638, "progress_pct": 58.872367725866106 }, { "step": 7900, "epoch": 0, "loss": 3.0337059926986694, "perplexity": 20.77407868983925, "grad_norm": 0.9999999996058188, "lr": 9.927380143971407e-05, "cross_attn_scale": 1, "tokens_per_sec": 29876.69149581676, "step_time_ms": 68.74019622802734, "elapsed_sec": 902.5685124397278, "progress_pct": 59.627141671069516 }, { "step": 8000, "epoch": 0, "loss": 3.0616763854026794, "perplexity": 21.36334035235707, "grad_norm": 1.0000000019298847, "lr": 9.921710516718252e-05, "cross_attn_scale": 1, "tokens_per_sec": 29880.446721366232, "step_time_ms": 68.79402160644531, "elapsed_sec": 913.6925647258759, "progress_pct": 60.38191561627293 }, { "step": 8100, "epoch": 0, "loss": 3.0444650769233705, "perplexity": 20.99879545774598, "grad_norm": 1.0000000104659683, "lr": 9.915829568351079e-05, "cross_attn_scale": 1, "tokens_per_sec": 29885.601052957238, "step_time_ms": 68.65685224533081, "elapsed_sec": 924.8072993755341, "progress_pct": 61.13668956147634 }, { "step": 8200, "epoch": 0, "loss": 2.9955936670303345, "perplexity": 19.997228061635678, "grad_norm": 1.0000000080553764, "lr": 9.909737553968903e-05, "cross_attn_scale": 1, "tokens_per_sec": 29894.409481672225, "step_time_ms": 68.80224943161011, "elapsed_sec": 936.0114979743958, "progress_pct": 61.89146350667975 }, { "step": 8300, "epoch": 0, "loss": 3.0462084817886352, "perplexity": 21.0354367909702, "grad_norm": 1.0000000055912892, "lr": 9.903434737826193e-05, "cross_attn_scale": 1, "tokens_per_sec": 29898.611187936058, "step_time_ms": 68.72424364089966, "elapsed_sec": 947.1763026714325, "progress_pct": 62.64623745188316 }, { "step": 8400, "epoch": 0, "loss": 3.04290563583374, "perplexity": 20.966074593037924, "grad_norm": 1.0000000033133467, "lr": 9.896921393321402e-05, "cross_attn_scale": 1, "tokens_per_sec": 29903.666075391204, "step_time_ms": 68.6596155166626, "elapsed_sec": 958.3606882095337, "progress_pct": 63.40101139708657 }, { "step": 8500, "epoch": 0, "loss": 3.0263629794120788, "perplexity": 20.622093053197236, "grad_norm": 0.9999999969913438, "lr": 9.890197802985112e-05, "cross_attn_scale": 1, "tokens_per_sec": 29909.96927952112, "step_time_ms": 68.68655204772949, "elapsed_sec": 969.5727109909058, "progress_pct": 64.15578534228999 }, { "step": 8600, "epoch": 0, "loss": 2.982111701965332, "perplexity": 19.729435373574137, "grad_norm": 0.9999999969984414, "lr": 9.883264258467786e-05, "cross_attn_scale": 1, "tokens_per_sec": 29918.46704130954, "step_time_ms": 68.8025951385498, "elapsed_sec": 980.8517916202545, "progress_pct": 64.9105592874934 }, { "step": 8700, "epoch": 0, "loss": 3.008047037124634, "perplexity": 20.247818049896424, "grad_norm": 1.000000000964267, "lr": 9.876121060527097e-05, "cross_attn_scale": 1, "tokens_per_sec": 29925.865710112143, "step_time_ms": 68.60105514526367, "elapsed_sec": 992.0734219551086, "progress_pct": 65.66533323269681 }, { "step": 8800, "epoch": 0, "loss": 3.0379968214035036, "perplexity": 20.86340821478766, "grad_norm": 1.000000000967493, "lr": 9.868768519014906e-05, "cross_attn_scale": 1, "tokens_per_sec": 29930.96776317607, "step_time_ms": 68.73708248138428, "elapsed_sec": 1003.2306418418884, "progress_pct": 66.42010717790022 }, { "step": 8900, "epoch": 0, "loss": 3.0114793157577515, "perplexity": 20.317433604804787, "grad_norm": 1.0000000049415028, "lr": 9.861206952863805e-05, "cross_attn_scale": 1, "tokens_per_sec": 29938.500064531298, "step_time_ms": 68.66981267929077, "elapsed_sec": 1014.4962818622589, "progress_pct": 67.17488112310363 }, { "step": 9000, "epoch": 0, "loss": 3.0084391617774964, "perplexity": 20.255759275393853, "grad_norm": 1.0000000038688899, "lr": 9.853436690073285e-05, "cross_attn_scale": 1, "tokens_per_sec": 29943.498254578597, "step_time_ms": 68.49103927612305, "elapsed_sec": 1025.6270906925201, "progress_pct": 67.92965506830704 }, { "step": 9100, "epoch": 0, "loss": 3.0745689249038697, "perplexity": 21.640551197849472, "grad_norm": 1.000000004718636, "lr": 9.845458067695514e-05, "cross_attn_scale": 1, "tokens_per_sec": 29948.06303955987, "step_time_ms": 69.01569843292236, "elapsed_sec": 1036.9010162353516, "progress_pct": 68.68442901351045 }, { "step": 9200, "epoch": 0, "loss": 3.0796532964706422, "perplexity": 21.750859988858302, "grad_norm": 1.000000002997285, "lr": 9.837271431820715e-05, "cross_attn_scale": 1, "tokens_per_sec": 29949.88973194273, "step_time_ms": 68.61918210983276, "elapsed_sec": 1048.0913045406342, "progress_pct": 69.43920295871386 }, { "step": 9300, "epoch": 0, "loss": 3.153324291706085, "perplexity": 23.4137695523185, "grad_norm": 0.9999999963941149, "lr": 9.828877137562149e-05, "cross_attn_scale": 1, "tokens_per_sec": 29954.18284575665, "step_time_ms": 68.8855242729187, "elapsed_sec": 1059.2905225753784, "progress_pct": 70.19397690391727 }, { "step": 9400, "epoch": 0, "loss": 3.3674811506271363, "perplexity": 29.005374796594197, "grad_norm": 0.9999999980681187, "lr": 9.820275549040714e-05, "cross_attn_scale": 1, "tokens_per_sec": 29950.33890410087, "step_time_ms": 72.31015920639038, "elapsed_sec": 1070.8681161403656, "progress_pct": 70.94875084912069 }, { "step": 9500, "epoch": 0, "loss": 3.40507611989975, "perplexity": 30.116588108336245, "grad_norm": 0.9999999904071671, "lr": 9.811467039369153e-05, "cross_attn_scale": 1, "tokens_per_sec": 29961.53980985831, "step_time_ms": 68.4125828742981, "elapsed_sec": 1082.0211913585663, "progress_pct": 71.7035247943241 }, { "step": 9600, "epoch": 0, "loss": 3.374427659511566, "perplexity": 29.207562326020838, "grad_norm": 0.9999999853951721, "lr": 9.802451990635867e-05, "cross_attn_scale": 1, "tokens_per_sec": 29966.307797668982, "step_time_ms": 68.59289169311523, "elapsed_sec": 1093.2418575286865, "progress_pct": 72.4582987395275 }, { "step": 9700, "epoch": 0, "loss": 3.4392935371398927, "perplexity": 31.164933521357394, "grad_norm": 0.999999970808979, "lr": 9.793230793888339e-05, "cross_attn_scale": 1, "tokens_per_sec": 29969.481180441442, "step_time_ms": 68.55150938034058, "elapsed_sec": 1104.3426744937897, "progress_pct": 73.21307268473092 }, { "step": 9800, "epoch": 0, "loss": 3.418562099933624, "perplexity": 30.525490844018883, "grad_norm": 0.9999999525241351, "lr": 9.783803849116176e-05, "cross_attn_scale": 1, "tokens_per_sec": 29976.895268205295, "step_time_ms": 68.71764898300171, "elapsed_sec": 1115.6174013614655, "progress_pct": 73.96784662993433 }, { "step": 9900, "epoch": 0, "loss": 3.3591481018066407, "perplexity": 28.76467586317593, "grad_norm": 0.9999999456205916, "lr": 9.774171565233752e-05, "cross_attn_scale": 1, "tokens_per_sec": 29984.36230858003, "step_time_ms": 68.62653970718384, "elapsed_sec": 1126.913010597229, "progress_pct": 74.72262057513774 }, { "step": 10000, "epoch": 0, "loss": 3.265201942920685, "perplexity": 26.185398412761046, "grad_norm": 0.999999952462774, "lr": 9.764334360062481e-05, "cross_attn_scale": 1, "tokens_per_sec": 29767.874544478855, "step_time_ms": 68.8528823852539, "elapsed_sec": 1146.5747394561768, "progress_pct": 75.47739452034116 }, { "step": 10100, "epoch": 0, "loss": 3.204775149822235, "perplexity": 24.64995684610944, "grad_norm": 0.9999999653872187, "lr": 9.754292660312681e-05, "cross_attn_scale": 1, "tokens_per_sec": 29778.15250087888, "step_time_ms": 68.64858627319336, "elapsed_sec": 1157.871127128601, "progress_pct": 76.23216846554458 }, { "step": 10200, "epoch": 0, "loss": 3.2032279491424562, "perplexity": 24.61184790481275, "grad_norm": 0.9999999683836718, "lr": 9.744046901565074e-05, "cross_attn_scale": 1, "tokens_per_sec": 29782.495889385045, "step_time_ms": 68.81840467453003, "elapsed_sec": 1169.0545053482056, "progress_pct": 76.98694241074799 }, { "step": 10300, "epoch": 0, "loss": 3.169970953464508, "perplexity": 23.80679284153244, "grad_norm": 0.9999999616953779, "lr": 9.733597528251886e-05, "cross_attn_scale": 1, "tokens_per_sec": 29786.453693018226, "step_time_ms": 68.88345241546631, "elapsed_sec": 1180.3069396018982, "progress_pct": 77.7417163559514 }, { "step": 10400, "epoch": 0, "loss": 3.1525204300880434, "perplexity": 23.394955684527936, "grad_norm": 0.9999999453561814, "lr": 9.722944993637574e-05, "cross_attn_scale": 1, "tokens_per_sec": 29792.717834850704, "step_time_ms": 68.55859756469727, "elapsed_sec": 1191.465786933899, "progress_pct": 78.49649030115481 }, { "step": 10500, "epoch": 0, "loss": 3.12509893655777, "perplexity": 22.762146990598122, "grad_norm": 0.9999999442481443, "lr": 9.712089759799156e-05, "cross_attn_scale": 1, "tokens_per_sec": 29801.167133433617, "step_time_ms": 68.67520809173584, "elapsed_sec": 1202.713901758194, "progress_pct": 79.25126424635822 }, { "step": 10600, "epoch": 0, "loss": 3.1125614857673645, "perplexity": 22.478549204294474, "grad_norm": 0.9999999499257827, "lr": 9.701032297606176e-05, "cross_attn_scale": 1, "tokens_per_sec": 29806.945147531915, "step_time_ms": 68.78292798995972, "elapsed_sec": 1213.9093027114868, "progress_pct": 80.00603819156163 }, { "step": 10700, "epoch": 0, "loss": 3.17107980966568, "perplexity": 23.83320579278082, "grad_norm": 0.9999999544155758, "lr": 9.689773086700275e-05, "cross_attn_scale": 1, "tokens_per_sec": 29811.05019124841, "step_time_ms": 68.5623836517334, "elapsed_sec": 1225.0971624851227, "progress_pct": 80.76081213676504 }, { "step": 10800, "epoch": 0, "loss": 3.1426844120025637, "perplexity": 23.16597047477597, "grad_norm": 0.9999999623638297, "lr": 9.678312615474386e-05, "cross_attn_scale": 1, "tokens_per_sec": 29817.68600049851, "step_time_ms": 68.59636545181274, "elapsed_sec": 1236.327594280243, "progress_pct": 81.51558608196845 }, { "step": 10900, "epoch": 0, "loss": 3.1451004695892335, "perplexity": 23.22200846173652, "grad_norm": 0.9999999613214944, "lr": 9.666651381051555e-05, "cross_attn_scale": 1, "tokens_per_sec": 29821.05204868826, "step_time_ms": 68.49246263504028, "elapsed_sec": 1247.4301691055298, "progress_pct": 82.27036002717186 }, { "step": 11000, "epoch": 0, "loss": 3.1183106231689455, "perplexity": 22.608153672225175, "grad_norm": 0.9999999582262188, "lr": 9.654789889263361e-05, "cross_attn_scale": 1, "tokens_per_sec": 29829.229888281192, "step_time_ms": 68.70523452758789, "elapsed_sec": 1258.7128846645355, "progress_pct": 83.02513397237527 }, { "step": 11100, "epoch": 0, "loss": 3.1134038853645323, "perplexity": 22.497493103135188, "grad_norm": 0.9999999537734283, "lr": 9.642728654627993e-05, "cross_attn_scale": 1, "tokens_per_sec": 29835.080075716916, "step_time_ms": 68.42459917068481, "elapsed_sec": 1269.830243587494, "progress_pct": 83.77990791757868 }, { "step": 11200, "epoch": 0, "loss": 3.1126963663101197, "perplexity": 22.481581327694126, "grad_norm": 0.9999999430285232, "lr": 9.630468200327917e-05, "cross_attn_scale": 1, "tokens_per_sec": 29840.622569238225, "step_time_ms": 68.69045734405518, "elapsed_sec": 1281.010505437851, "progress_pct": 84.5346818627821 }, { "step": 11300, "epoch": 0, "loss": 3.1477469062805175, "perplexity": 23.283545427902375, "grad_norm": 0.9999999335019356, "lr": 9.618009058187196e-05, "cross_attn_scale": 1, "tokens_per_sec": 29847.154809522483, "step_time_ms": 68.6976432800293, "elapsed_sec": 1292.2503416538239, "progress_pct": 85.2894558079855 }, { "step": 11400, "epoch": 0, "loss": 3.1873206615448, "perplexity": 24.223437629163133, "grad_norm": 0.9999999261044604, "lr": 9.605351768648408e-05, "cross_attn_scale": 1, "tokens_per_sec": 29852.87892273204, "step_time_ms": 68.48160028457642, "elapsed_sec": 1303.4236028194427, "progress_pct": 86.04422975318892 }, { "step": 11500, "epoch": 0, "loss": 3.12576908826828, "perplexity": 22.77740619475855, "grad_norm": 0.9999999320799811, "lr": 9.59249688074921e-05, "cross_attn_scale": 1, "tokens_per_sec": 29859.745047711865, "step_time_ms": 68.7877631187439, "elapsed_sec": 1314.6685926914215, "progress_pct": 86.79900369839233 }, { "step": 11600, "epoch": 0, "loss": 3.120550537109375, "perplexity": 22.65885074815001, "grad_norm": 0.999999936853886, "lr": 9.579444952098517e-05, "cross_attn_scale": 1, "tokens_per_sec": 29862.469704053565, "step_time_ms": 68.95772695541382, "elapsed_sec": 1325.9032957553864, "progress_pct": 87.55377764359574 }, { "step": 11700, "epoch": 0, "loss": 3.10750075340271, "perplexity": 22.365078647061083, "grad_norm": 0.9999999361651032, "lr": 9.566196548852327e-05, "cross_attn_scale": 1, "tokens_per_sec": 29868.643271348836, "step_time_ms": 68.56596231460571, "elapsed_sec": 1337.0910301208496, "progress_pct": 88.30855158879915 }, { "step": 11800, "epoch": 0, "loss": 3.0350870537757872, "perplexity": 20.802788781962846, "grad_norm": 0.9999999511592819, "lr": 9.552752245689144e-05, "cross_attn_scale": 1, "tokens_per_sec": 29873.580938625204, "step_time_ms": 68.68452787399292, "elapsed_sec": 1348.3001613616943, "progress_pct": 89.06332553400257 }, { "step": 11900, "epoch": 0, "loss": 2.983770315647125, "perplexity": 19.76218603786065, "grad_norm": 0.9999999564182059, "lr": 9.539112625785066e-05, "cross_attn_scale": 1, "tokens_per_sec": 29876.49008130546, "step_time_ms": 68.60888242721558, "elapsed_sec": 1359.4474079608917, "progress_pct": 89.81809947920598 }, { "step": 12000, "epoch": 0, "loss": 2.9881193041801453, "perplexity": 19.848318717587674, "grad_norm": 0.9999999694847291, "lr": 9.525278280788481e-05, "cross_attn_scale": 1, "tokens_per_sec": 29880.676071785907, "step_time_ms": 68.67148876190186, "elapsed_sec": 1370.6449580192566, "progress_pct": 90.5728734244094 }, { "step": 12100, "epoch": 0, "loss": 3.010297691822052, "perplexity": 20.293440217317546, "grad_norm": 0.9999999756513553, "lr": 9.511249810794403e-05, "cross_attn_scale": 1, "tokens_per_sec": 29890.41306297647, "step_time_ms": 68.7269401550293, "elapsed_sec": 1381.9189755916595, "progress_pct": 91.3276473696128 }, { "step": 12200, "epoch": 0, "loss": 3.044492042064667, "perplexity": 20.99936170086693, "grad_norm": 0.9999999580831208, "lr": 9.497027824318445e-05, "cross_attn_scale": 1, "tokens_per_sec": 29895.381058749288, "step_time_ms": 68.52395057678223, "elapsed_sec": 1393.132401227951, "progress_pct": 92.08242131481622 }, { "step": 12300, "epoch": 0, "loss": 3.0311736488342285, "perplexity": 20.721538132586037, "grad_norm": 0.9999999508302908, "lr": 9.482612938270422e-05, "cross_attn_scale": 1, "tokens_per_sec": 29901.363092922238, "step_time_ms": 68.59883308410645, "elapsed_sec": 1404.3868458271027, "progress_pct": 92.83719526001963 }, { "step": 12400, "epoch": 0, "loss": 3.026858425140381, "perplexity": 20.632312712543328, "grad_norm": 0.9999999444981733, "lr": 9.468005777927585e-05, "cross_attn_scale": 1, "tokens_per_sec": 29905.344204329296, "step_time_ms": 68.69802236557007, "elapsed_sec": 1415.589993238449, "progress_pct": 93.59196920522304 }, { "step": 12500, "epoch": 0, "loss": 3.0245711183547974, "perplexity": 20.58517421432855, "grad_norm": 0.9999999515947503, "lr": 9.453206976907509e-05, "cross_attn_scale": 1, "tokens_per_sec": 29900.696944622938, "step_time_ms": 72.52441644668579, "elapsed_sec": 1427.1709144115448, "progress_pct": 94.34674315042645 }, { "step": 12600, "epoch": 0, "loss": 2.980508255958557, "perplexity": 19.6978256382329, "grad_norm": 0.9999999723068747, "lr": 9.438217177140604e-05, "cross_attn_scale": 1, "tokens_per_sec": 29907.388512661983, "step_time_ms": 68.8193130493164, "elapsed_sec": 1438.4461545944214, "progress_pct": 95.10151709562986 }, { "step": 12700, "epoch": 0, "loss": 2.958230459690094, "perplexity": 19.263853412376523, "grad_norm": 0.9999999839033161, "lr": 9.423037028842264e-05, "cross_attn_scale": 1, "tokens_per_sec": 29910.816981131564, "step_time_ms": 68.63985061645508, "elapsed_sec": 1449.6005918979645, "progress_pct": 95.85629104083327 }, { "step": 12800, "epoch": 0, "loss": 3.013811137676239, "perplexity": 20.364865521711458, "grad_norm": 0.9999999886467441, "lr": 9.407667190484671e-05, "cross_attn_scale": 1, "tokens_per_sec": 29915.38890613257, "step_time_ms": 68.79305124282837, "elapsed_sec": 1460.799962759018, "progress_pct": 96.61106498603668 }, { "step": 12900, "epoch": 0, "loss": 3.0150211238861084, "perplexity": 20.389521641933122, "grad_norm": 0.9999999887120172, "lr": 9.392108328768228e-05, "cross_attn_scale": 1, "tokens_per_sec": 29917.230695050563, "step_time_ms": 68.78490209579468, "elapsed_sec": 1472.0201361179352, "progress_pct": 97.36583893124009 }, { "step": 13000, "epoch": 0, "loss": 3.05732549905777, "perplexity": 21.270592799877004, "grad_norm": 0.9999999784801565, "lr": 9.376361118592639e-05, "cross_attn_scale": 1, "tokens_per_sec": 29922.00874517518, "step_time_ms": 68.64551305770874, "elapsed_sec": 1483.2417628765106, "progress_pct": 98.1206128764435 }, { "step": 13100, "epoch": 0, "loss": 3.0526057434082032, "perplexity": 21.170437339911786, "grad_norm": 0.9999999756636413, "lr": 9.360426243027637e-05, "cross_attn_scale": 1, "tokens_per_sec": 29925.620308368234, "step_time_ms": 68.60826253890991, "elapsed_sec": 1494.4739503860474, "progress_pct": 98.87538682164691 }, { "step": 13200, "epoch": 0, "loss": 3.0945112991333006, "perplexity": 22.076447122031155, "grad_norm": 0.9999999676132567, "lr": 9.344304393283349e-05, "cross_attn_scale": 1, "tokens_per_sec": 29928.514932177743, "step_time_ms": 68.54766130447388, "elapsed_sec": 1505.5981595516205, "progress_pct": 99.63016076685032 } ], "checkpoint": { "pct": 100, "step": 13249, "final_loss": 3.102254876783655, "total_tokens": 45225439, "total_time_sec": 1511.2042155265808 }, "final": { "total_time_sec": 1518.5682985782623, "total_tokens": 45225439, "final_loss": 3.102254876783655, "final_perplexity": 22.248061401271826, "avg_tokens_per_sec": 29781.62986962237 } }