base-test / training_log_final.json
quinnlue's picture
Upload training_log_final.json with huggingface_hub
1219a66 verified
{
"config": {
"epochs": 1,
"batch_size": 16,
"gradient_accumulation_steps": 4,
"effective_batch_size": 64,
"cross_max_length": 64,
"lora_r": 8,
"lora_alpha": 16,
"target_modules": [
"query_key_value"
],
"total_steps": 13249,
"start_time": "2026-01-11T03:45:55.374803"
},
"steps": [
{
"step": 100,
"epoch": 0,
"loss": 3.0553906083106996,
"perplexity": 21.229476317462176,
"grad_norm": 0.9999998862745418,
"lr": 1.8882175226586103e-06,
"cross_attn_scale": 0.0755287009063444,
"tokens_per_sec": 29659.667067018858,
"step_time_ms": 72.3056697845459,
"elapsed_sec": 11.671439170837402,
"progress_pct": 0.7547739452034116
},
{
"step": 200,
"epoch": 0,
"loss": 3.0449283146858215,
"perplexity": 21.00852514617355,
"grad_norm": 0.9999998771969448,
"lr": 3.7764350453172205e-06,
"cross_attn_scale": 0.1510574018126888,
"tokens_per_sec": 29998.955284666587,
"step_time_ms": 68.73083829879761,
"elapsed_sec": 22.878730058670044,
"progress_pct": 1.5095478904068231
},
{
"step": 300,
"epoch": 0,
"loss": 3.047647387981415,
"perplexity": 21.065726598105954,
"grad_norm": 0.9999998694186822,
"lr": 5.664652567975831e-06,
"cross_attn_scale": 0.22658610271903323,
"tokens_per_sec": 30055.703757973926,
"step_time_ms": 68.56269359588623,
"elapsed_sec": 33.98429822921753,
"progress_pct": 2.2643218356102346
},
{
"step": 400,
"epoch": 0,
"loss": 3.0421357417106627,
"perplexity": 20.949939147514087,
"grad_norm": 0.9999998708066421,
"lr": 7.552870090634441e-06,
"cross_attn_scale": 0.3021148036253776,
"tokens_per_sec": 30210.86789891559,
"step_time_ms": 68.70847702026367,
"elapsed_sec": 45.12389397621155,
"progress_pct": 3.0190957808136463
},
{
"step": 500,
"epoch": 0,
"loss": 3.0340122056007384,
"perplexity": 20.780440954816772,
"grad_norm": 0.9999998721696909,
"lr": 9.441087613293053e-06,
"cross_attn_scale": 0.3776435045317221,
"tokens_per_sec": 30287.550862030017,
"step_time_ms": 68.67650508880615,
"elapsed_sec": 56.31241059303284,
"progress_pct": 3.773869726017058
},
{
"step": 600,
"epoch": 0,
"loss": 3.0240328454971315,
"perplexity": 20.574096755393853,
"grad_norm": 0.9999998774025239,
"lr": 1.1329305135951662e-05,
"cross_attn_scale": 0.45317220543806647,
"tokens_per_sec": 30337.509813968918,
"step_time_ms": 68.6249566078186,
"elapsed_sec": 67.54812812805176,
"progress_pct": 4.528643671220469
},
{
"step": 700,
"epoch": 0,
"loss": 3.0103702998161315,
"perplexity": 20.294913736798694,
"grad_norm": 0.9999998854215264,
"lr": 1.3217522658610274e-05,
"cross_attn_scale": 0.5287009063444109,
"tokens_per_sec": 30390.876870190146,
"step_time_ms": 68.71308088302612,
"elapsed_sec": 78.77034974098206,
"progress_pct": 5.283417616423881
},
{
"step": 800,
"epoch": 0,
"loss": 3.005305087566376,
"perplexity": 20.192375599076605,
"grad_norm": 0.9999998881305956,
"lr": 1.5105740181268882e-05,
"cross_attn_scale": 0.6042296072507553,
"tokens_per_sec": 30399.323564352733,
"step_time_ms": 68.67244720458984,
"elapsed_sec": 89.90821766853333,
"progress_pct": 6.0381915616272925
},
{
"step": 900,
"epoch": 0,
"loss": 2.9808620524406435,
"perplexity": 19.704795892601492,
"grad_norm": 0.999999884566103,
"lr": 1.6993957703927492e-05,
"cross_attn_scale": 0.6797583081570997,
"tokens_per_sec": 30464.53898853465,
"step_time_ms": 68.634774684906,
"elapsed_sec": 101.1823616027832,
"progress_pct": 6.792965506830704
},
{
"step": 1000,
"epoch": 0,
"loss": 2.9657063841819764,
"perplexity": 19.408408192965855,
"grad_norm": 0.9999998912970435,
"lr": 1.8882175226586106e-05,
"cross_attn_scale": 0.7552870090634441,
"tokens_per_sec": 30482.017177706704,
"step_time_ms": 68.62752199172974,
"elapsed_sec": 112.40210843086243,
"progress_pct": 7.547739452034116
},
{
"step": 1100,
"epoch": 0,
"loss": 2.9404901576042177,
"perplexity": 18.925120330844894,
"grad_norm": 0.9999998957905454,
"lr": 2.0770392749244713e-05,
"cross_attn_scale": 0.8308157099697885,
"tokens_per_sec": 30520.18917730466,
"step_time_ms": 68.61571073532104,
"elapsed_sec": 123.61365056037903,
"progress_pct": 8.302513397237528
},
{
"step": 1200,
"epoch": 0,
"loss": 2.916158149242401,
"perplexity": 18.470191258417472,
"grad_norm": 0.9999999059134038,
"lr": 2.2658610271903323e-05,
"cross_attn_scale": 0.9063444108761329,
"tokens_per_sec": 30514.05504365179,
"step_time_ms": 68.67437601089478,
"elapsed_sec": 134.80070066452026,
"progress_pct": 9.057287342440938
},
{
"step": 1300,
"epoch": 0,
"loss": 2.9002408885955813,
"perplexity": 18.17852384113777,
"grad_norm": 0.9999999221452138,
"lr": 2.4546827794561937e-05,
"cross_attn_scale": 0.9818731117824774,
"tokens_per_sec": 30518.41110587493,
"step_time_ms": 68.52020263671875,
"elapsed_sec": 145.9552721977234,
"progress_pct": 9.812061287644351
},
{
"step": 1400,
"epoch": 0,
"loss": 2.872870879173279,
"perplexity": 17.687724699355996,
"grad_norm": 0.9999999227685785,
"lr": 2.6435045317220547e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30511.622849857926,
"step_time_ms": 68.70761156082153,
"elapsed_sec": 157.15584921836853,
"progress_pct": 10.566835232847762
},
{
"step": 1500,
"epoch": 0,
"loss": 2.8776234197616577,
"perplexity": 17.771986398880262,
"grad_norm": 0.9999999237417435,
"lr": 2.8323262839879154e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30493.903728302623,
"step_time_ms": 68.75503063201904,
"elapsed_sec": 168.2848823070526,
"progress_pct": 11.321609178051174
},
{
"step": 1600,
"epoch": 0,
"loss": 2.8588545417785642,
"perplexity": 17.44153693803145,
"grad_norm": 0.9999999207595216,
"lr": 3.0211480362537764e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30505.121130819396,
"step_time_ms": 68.64941120147705,
"elapsed_sec": 179.4573106765747,
"progress_pct": 12.076383123254585
},
{
"step": 1700,
"epoch": 0,
"loss": 2.8620681262016294,
"perplexity": 17.4976769464644,
"grad_norm": 0.9999999136486478,
"lr": 3.209969788519638e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30496.028601579812,
"step_time_ms": 68.41788530349731,
"elapsed_sec": 190.5410726070404,
"progress_pct": 12.831157068457998
},
{
"step": 1800,
"epoch": 0,
"loss": 2.851705029010773,
"perplexity": 17.317283153287743,
"grad_norm": 0.9999999101516841,
"lr": 3.3987915407854985e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30486.163746654358,
"step_time_ms": 68.65042924880981,
"elapsed_sec": 201.64810013771057,
"progress_pct": 13.585931013661408
},
{
"step": 1900,
"epoch": 0,
"loss": 2.839317078590393,
"perplexity": 17.104080804749838,
"grad_norm": 0.9999999094789626,
"lr": 3.5876132930513595e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30499.8027812949,
"step_time_ms": 68.49920272827148,
"elapsed_sec": 212.8020317554474,
"progress_pct": 14.34070495886482
},
{
"step": 2000,
"epoch": 0,
"loss": 2.833750886917114,
"perplexity": 17.009140685170195,
"grad_norm": 0.9999999050466546,
"lr": 3.776435045317221e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30498.478093732738,
"step_time_ms": 68.63363027572632,
"elapsed_sec": 223.98661923408508,
"progress_pct": 15.095478904068232
},
{
"step": 2100,
"epoch": 0,
"loss": 2.833835325241089,
"perplexity": 17.010576969139752,
"grad_norm": 0.9999999051555772,
"lr": 3.9652567975830816e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30496.763241222277,
"step_time_ms": 68.50829124450684,
"elapsed_sec": 235.1098358631134,
"progress_pct": 15.850252849271643
},
{
"step": 2200,
"epoch": 0,
"loss": 2.830011742115021,
"perplexity": 16.9456598012593,
"grad_norm": 0.9999999086205261,
"lr": 4.1540785498489426e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30504.708020780217,
"step_time_ms": 68.68033170700073,
"elapsed_sec": 246.3288288116455,
"progress_pct": 16.605026794475055
},
{
"step": 2300,
"epoch": 0,
"loss": 2.826364483833313,
"perplexity": 16.88396717606802,
"grad_norm": 0.9999999082911996,
"lr": 4.342900302114804e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30501.204524633395,
"step_time_ms": 68.73678207397461,
"elapsed_sec": 257.5635986328125,
"progress_pct": 17.359800739678466
},
{
"step": 2400,
"epoch": 0,
"loss": 2.8305963826179505,
"perplexity": 16.95556981694399,
"grad_norm": 0.9999999116385626,
"lr": 4.5317220543806646e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30500.794054619466,
"step_time_ms": 68.69770526885986,
"elapsed_sec": 268.69559478759766,
"progress_pct": 18.114574684881877
},
{
"step": 2500,
"epoch": 0,
"loss": 2.823909387588501,
"perplexity": 16.842566254075912,
"grad_norm": 0.9999999056484468,
"lr": 4.720543806646526e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30494.381494274585,
"step_time_ms": 68.81577491760254,
"elapsed_sec": 279.87601590156555,
"progress_pct": 18.86934863008529
},
{
"step": 2600,
"epoch": 0,
"loss": 2.8317417907714844,
"perplexity": 16.975001991621266,
"grad_norm": 0.9999999060478559,
"lr": 4.9093655589123874e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30493.046658925927,
"step_time_ms": 68.76836061477661,
"elapsed_sec": 291.05359983444214,
"progress_pct": 19.624122575288702
},
{
"step": 2700,
"epoch": 0,
"loss": 2.8178824305534365,
"perplexity": 16.741362113751652,
"grad_norm": 0.9999999106669464,
"lr": 5.098187311178248e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30497.10975803011,
"step_time_ms": 68.62552165985107,
"elapsed_sec": 302.2955641746521,
"progress_pct": 20.378896520492113
},
{
"step": 2800,
"epoch": 0,
"loss": 2.8133517289161682,
"perplexity": 16.665683564931342,
"grad_norm": 0.9999999143853615,
"lr": 5.2870090634441094e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30502.20993387066,
"step_time_ms": 68.7237548828125,
"elapsed_sec": 313.51184129714966,
"progress_pct": 21.133670465695523
},
{
"step": 2900,
"epoch": 0,
"loss": 2.8192299604415894,
"perplexity": 16.763936806188713,
"grad_norm": 0.9999999153165114,
"lr": 5.4758308157099705e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30495.965582028683,
"step_time_ms": 68.75383377075195,
"elapsed_sec": 324.68157052993774,
"progress_pct": 21.888444410898934
},
{
"step": 3000,
"epoch": 0,
"loss": 2.814334969520569,
"perplexity": 16.682078000228916,
"grad_norm": 0.9999999103789803,
"lr": 5.664652567975831e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30497.05613414167,
"step_time_ms": 68.89568090438843,
"elapsed_sec": 335.9107172489166,
"progress_pct": 22.64321835610235
},
{
"step": 3100,
"epoch": 0,
"loss": 2.812694685459137,
"perplexity": 16.654737083138976,
"grad_norm": 0.9999999103699377,
"lr": 5.853474320241692e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30503.323091845978,
"step_time_ms": 68.7230896949768,
"elapsed_sec": 347.12381887435913,
"progress_pct": 23.39799230130576
},
{
"step": 3200,
"epoch": 0,
"loss": 2.8209491848945616,
"perplexity": 16.79278256533621,
"grad_norm": 0.9999999116175871,
"lr": 6.042296072507553e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30471.149506940925,
"step_time_ms": 72.36854791641235,
"elapsed_sec": 358.6222107410431,
"progress_pct": 24.15276624650917
},
{
"step": 3300,
"epoch": 0,
"loss": 2.823811297416687,
"perplexity": 16.84091424488252,
"grad_norm": 0.9999999183970942,
"lr": 6.231117824773414e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30475.15581021754,
"step_time_ms": 68.7209701538086,
"elapsed_sec": 369.78560733795166,
"progress_pct": 24.90754019171258
},
{
"step": 3400,
"epoch": 0,
"loss": 2.8320892524719237,
"perplexity": 16.98090117948981,
"grad_norm": 0.9999999248059107,
"lr": 6.419939577039276e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29670.286366915152,
"step_time_ms": 68.97594928741455,
"elapsed_sec": 391.05166888237,
"progress_pct": 25.662314136915995
},
{
"step": 3500,
"epoch": 0,
"loss": 2.815865466594696,
"perplexity": 16.70762941999754,
"grad_norm": 0.9999999286289808,
"lr": 6.608761329305136e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29686.79626575453,
"step_time_ms": 69.19201135635376,
"elapsed_sec": 402.31939792633057,
"progress_pct": 26.417088082119406
},
{
"step": 3600,
"epoch": 0,
"loss": 2.846110601425171,
"perplexity": 17.220673356933453,
"grad_norm": 0.9999999358015611,
"lr": 6.797583081570997e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29707.146446527197,
"step_time_ms": 68.82575511932373,
"elapsed_sec": 413.6111161708832,
"progress_pct": 27.171862027322817
},
{
"step": 3700,
"epoch": 0,
"loss": 2.8651936769485475,
"perplexity": 17.55245238082157,
"grad_norm": 0.9999999406451835,
"lr": 6.986404833836858e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29731.035920613755,
"step_time_ms": 68.87631177902222,
"elapsed_sec": 424.8875832557678,
"progress_pct": 27.926635972526228
},
{
"step": 3800,
"epoch": 0,
"loss": 2.8802473044395445,
"perplexity": 17.81867927325758,
"grad_norm": 0.999999938128731,
"lr": 7.175226586102719e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29740.50111024254,
"step_time_ms": 68.9161491394043,
"elapsed_sec": 436.10438680648804,
"progress_pct": 28.68140991772964
},
{
"step": 3900,
"epoch": 0,
"loss": 2.8760248684883116,
"perplexity": 17.7435996622643,
"grad_norm": 0.9999999370933762,
"lr": 7.36404833836858e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29756.389974960584,
"step_time_ms": 68.69559288024902,
"elapsed_sec": 447.3066461086273,
"progress_pct": 29.436183862933053
},
{
"step": 4000,
"epoch": 0,
"loss": 2.833030352592468,
"perplexity": 16.996889429728867,
"grad_norm": 0.9999999344961908,
"lr": 7.552870090634442e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29775.895470741994,
"step_time_ms": 68.65659475326538,
"elapsed_sec": 458.5267641544342,
"progress_pct": 30.190957808136464
},
{
"step": 4100,
"epoch": 0,
"loss": 2.8502792358398437,
"perplexity": 17.29260988289878,
"grad_norm": 0.9999999411152237,
"lr": 7.741691842900302e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29789.044694580516,
"step_time_ms": 68.78466129302979,
"elapsed_sec": 469.70932245254517,
"progress_pct": 30.945731753339874
},
{
"step": 4200,
"epoch": 0,
"loss": 2.845259420871735,
"perplexity": 17.206021691146056,
"grad_norm": 0.9999999597855691,
"lr": 7.930513595166163e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29810.506707537246,
"step_time_ms": 68.6014461517334,
"elapsed_sec": 480.918527841568,
"progress_pct": 31.700505698543285
},
{
"step": 4300,
"epoch": 0,
"loss": 2.851608917713165,
"perplexity": 17.315618846713456,
"grad_norm": 0.9999999759995412,
"lr": 8.119335347432024e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29823.229438646857,
"step_time_ms": 68.48395347595215,
"elapsed_sec": 492.0337359905243,
"progress_pct": 32.4552796437467
},
{
"step": 4400,
"epoch": 0,
"loss": 2.8928983926773073,
"perplexity": 18.045536929430227,
"grad_norm": 0.9999999802354099,
"lr": 8.308157099697885e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29839.21784462899,
"step_time_ms": 68.63518953323364,
"elapsed_sec": 503.2420446872711,
"progress_pct": 33.21005358895011
},
{
"step": 4500,
"epoch": 0,
"loss": 2.8919497418403624,
"perplexity": 18.028426133087862,
"grad_norm": 0.9999999832088211,
"lr": 8.496978851963746e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29851.35171674971,
"step_time_ms": 68.82947206497192,
"elapsed_sec": 514.4808900356293,
"progress_pct": 33.96482753415352
},
{
"step": 4600,
"epoch": 0,
"loss": 2.912613160610199,
"perplexity": 18.404830560286392,
"grad_norm": 0.9999999852952282,
"lr": 8.685800604229609e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29868.9929982354,
"step_time_ms": 68.60780239105225,
"elapsed_sec": 525.6990418434143,
"progress_pct": 34.71960147935693
},
{
"step": 4700,
"epoch": 0,
"loss": 2.9198369789123535,
"perplexity": 18.538265085260896,
"grad_norm": 0.9999999861072904,
"lr": 8.874622356495468e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29881.651406257948,
"step_time_ms": 68.78975629806519,
"elapsed_sec": 536.9214298725128,
"progress_pct": 35.47437542456034
},
{
"step": 4800,
"epoch": 0,
"loss": 2.9220591092109682,
"perplexity": 18.57950532941875,
"grad_norm": 0.9999999939528834,
"lr": 9.063444108761329e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29894.195528337976,
"step_time_ms": 68.80178928375244,
"elapsed_sec": 548.1407246589661,
"progress_pct": 36.22914936976375
},
{
"step": 4900,
"epoch": 0,
"loss": 2.9370435547828673,
"perplexity": 18.86000523511343,
"grad_norm": 1.0000000011274732,
"lr": 9.25226586102719e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29899.872434533,
"step_time_ms": 68.81359100341797,
"elapsed_sec": 559.2738242149353,
"progress_pct": 36.983923314967164
},
{
"step": 5000,
"epoch": 0,
"loss": 2.9583297395706176,
"perplexity": 19.265766020381903,
"grad_norm": 0.9999999952426983,
"lr": 9.441087613293051e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29914.890276799215,
"step_time_ms": 68.60477924346924,
"elapsed_sec": 570.4796454906464,
"progress_pct": 37.73869726017058
},
{
"step": 5100,
"epoch": 0,
"loss": 2.9550197100639344,
"perplexity": 19.20210119074545,
"grad_norm": 0.9999999955847756,
"lr": 9.629909365558912e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29927.735715089886,
"step_time_ms": 68.55671644210815,
"elapsed_sec": 581.667058467865,
"progress_pct": 38.49347120537399
},
{
"step": 5200,
"epoch": 0,
"loss": 2.9828960919380187,
"perplexity": 19.744917015876656,
"grad_norm": 0.9999999973577894,
"lr": 9.818731117824775e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29941.160552589776,
"step_time_ms": 68.76250982284546,
"elapsed_sec": 592.8996963500977,
"progress_pct": 39.248245150577404
},
{
"step": 5300,
"epoch": 0,
"loss": 2.9812774753570555,
"perplexity": 19.712983416903363,
"grad_norm": 0.9999999977527527,
"lr": 9.999999828225707e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29953.48526766276,
"step_time_ms": 68.57383966445923,
"elapsed_sec": 604.1068956851959,
"progress_pct": 40.003019095780814
},
{
"step": 5400,
"epoch": 0,
"loss": 2.9874549293518067,
"perplexity": 19.835136373740465,
"grad_norm": 1.000000005050804,
"lr": 9.99988388103099e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29965.8283068889,
"step_time_ms": 68.63211393356323,
"elapsed_sec": 615.2640204429626,
"progress_pct": 40.757793040984225
},
{
"step": 5500,
"epoch": 0,
"loss": 2.9855289316177367,
"perplexity": 19.7969707113172,
"grad_norm": 1.0000000047012423,
"lr": 9.999553221781354e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29979.453810055362,
"step_time_ms": 68.55385541915894,
"elapsed_sec": 626.467483997345,
"progress_pct": 41.512566986187636
},
{
"step": 5600,
"epoch": 0,
"loss": 3.001195156574249,
"perplexity": 20.10955663548813,
"grad_norm": 1.0000000009146937,
"lr": 9.999007864819872e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29985.720598712633,
"step_time_ms": 68.79624128341675,
"elapsed_sec": 637.6670501232147,
"progress_pct": 42.26734093139105
},
{
"step": 5700,
"epoch": 0,
"loss": 2.9869230651855467,
"perplexity": 19.824589580449725,
"grad_norm": 0.9999999955026968,
"lr": 9.998247833802596e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30003.619229602187,
"step_time_ms": 68.55745553970337,
"elapsed_sec": 648.9761068820953,
"progress_pct": 43.02211487659446
},
{
"step": 5800,
"epoch": 0,
"loss": 2.992214176654816,
"perplexity": 19.929761687056967,
"grad_norm": 0.9999999920373142,
"lr": 9.997273161697535e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30007.238330623877,
"step_time_ms": 68.54370355606079,
"elapsed_sec": 660.0942339897156,
"progress_pct": 43.77688882179787
},
{
"step": 5900,
"epoch": 0,
"loss": 2.9899471187591553,
"perplexity": 19.884630939803454,
"grad_norm": 0.9999999905480129,
"lr": 9.996083890783225e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30016.763888893925,
"step_time_ms": 68.59871864318848,
"elapsed_sec": 671.2416126728058,
"progress_pct": 44.531662767001286
},
{
"step": 6000,
"epoch": 0,
"loss": 2.973553490638733,
"perplexity": 19.561307160862828,
"grad_norm": 0.9999999961927993,
"lr": 9.9946800726469e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30019.099421691335,
"step_time_ms": 68.57965469360352,
"elapsed_sec": 682.3533148765564,
"progress_pct": 45.2864367122047
},
{
"step": 6100,
"epoch": 0,
"loss": 2.9827182602882387,
"perplexity": 19.74140605689802,
"grad_norm": 0.9999999982933864,
"lr": 9.993061768182244e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30026.30841714172,
"step_time_ms": 68.58452320098877,
"elapsed_sec": 693.5168223381042,
"progress_pct": 46.04121065740811
},
{
"step": 6200,
"epoch": 0,
"loss": 2.988657066822052,
"perplexity": 19.858995272367505,
"grad_norm": 0.9999999972547158,
"lr": 9.991229047586758e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30032.30921841359,
"step_time_ms": 68.62870931625366,
"elapsed_sec": 704.7209339141846,
"progress_pct": 46.79598460261152
},
{
"step": 6300,
"epoch": 0,
"loss": 3.0013710594177248,
"perplexity": 20.11309427481263,
"grad_norm": 1.0000000035538437,
"lr": 9.989181990358713e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30027.30892734739,
"step_time_ms": 72.44897842407227,
"elapsed_sec": 716.3112769126892,
"progress_pct": 47.55075854781493
},
{
"step": 6400,
"epoch": 0,
"loss": 3.0535635590553283,
"perplexity": 21.1907244301477,
"grad_norm": 1.0000000054736338,
"lr": 9.9869206852937e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30031.377185455563,
"step_time_ms": 68.55860948562622,
"elapsed_sec": 727.4697349071503,
"progress_pct": 48.30553249301834
},
{
"step": 6500,
"epoch": 0,
"loss": 3.0514983248710634,
"perplexity": 21.147005781827758,
"grad_norm": 1.0000000088419636,
"lr": 9.984445230480777e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30037.350771694248,
"step_time_ms": 68.51820468902588,
"elapsed_sec": 738.6134738922119,
"progress_pct": 49.06030643822175
},
{
"step": 6600,
"epoch": 0,
"loss": 3.073709189891815,
"perplexity": 21.62195405375797,
"grad_norm": 1.0000000080646183,
"lr": 9.981755733298221e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 30044.33637658769,
"step_time_ms": 68.57419729232788,
"elapsed_sec": 749.8031148910522,
"progress_pct": 49.81508038342516
},
{
"step": 6700,
"epoch": 0,
"loss": 3.060198895931244,
"perplexity": 21.331799548252498,
"grad_norm": 1.0000000045052264,
"lr": 9.978852310408861e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29765.687951709533,
"step_time_ms": 68.89270305633545,
"elapsed_sec": 768.0259914398193,
"progress_pct": 50.56985432862857
},
{
"step": 6800,
"epoch": 0,
"loss": 3.0322815942764283,
"perplexity": 20.744509189304374,
"grad_norm": 1.0000000008077732,
"lr": 9.975735087755025e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29777.555021316624,
"step_time_ms": 68.59505653381348,
"elapsed_sec": 779.2872176170349,
"progress_pct": 51.32462827383199
},
{
"step": 6900,
"epoch": 0,
"loss": 3.022260241508484,
"perplexity": 20.537659333523006,
"grad_norm": 0.9999999955198917,
"lr": 9.972404200553072e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29787.611626839698,
"step_time_ms": 68.62642526626587,
"elapsed_sec": 790.5256149768829,
"progress_pct": 52.0794022190354
},
{
"step": 7000,
"epoch": 0,
"loss": 3.0193919491767884,
"perplexity": 20.478835724727677,
"grad_norm": 0.9999999978591363,
"lr": 9.968859793287533e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29800.945444945188,
"step_time_ms": 68.56673240661621,
"elapsed_sec": 801.7361745834351,
"progress_pct": 52.83417616423881
},
{
"step": 7100,
"epoch": 0,
"loss": 3.0333949518203736,
"perplexity": 20.767618106962548,
"grad_norm": 1.0000000012009478,
"lr": 9.965102019704832e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29810.921233546793,
"step_time_ms": 68.66058826446533,
"elapsed_sec": 812.898763179779,
"progress_pct": 53.58895010944222
},
{
"step": 7200,
"epoch": 0,
"loss": 3.070095462799072,
"perplexity": 21.543959223489487,
"grad_norm": 1.0000000010922643,
"lr": 9.961131042806633e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29820.53550024627,
"step_time_ms": 68.54245662689209,
"elapsed_sec": 824.1274540424347,
"progress_pct": 54.343724054645634
},
{
"step": 7300,
"epoch": 0,
"loss": 3.1172192811965944,
"perplexity": 22.58349390377302,
"grad_norm": 1.0000000016193091,
"lr": 9.956947034842753e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29828.650771642715,
"step_time_ms": 68.64347219467163,
"elapsed_sec": 835.327624797821,
"progress_pct": 55.098497999849045
},
{
"step": 7400,
"epoch": 0,
"loss": 3.0517469382286073,
"perplexity": 21.15226386352477,
"grad_norm": 0.9999999990690044,
"lr": 9.952550177303704e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29836.492375958653,
"step_time_ms": 68.64686489105225,
"elapsed_sec": 846.4573409557343,
"progress_pct": 55.853271945052455
},
{
"step": 7500,
"epoch": 0,
"loss": 3.043672297000885,
"perplexity": 20.982154631438764,
"grad_norm": 0.9999999930456833,
"lr": 9.947940660912811e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29846.262143054675,
"step_time_ms": 68.66245746612549,
"elapsed_sec": 857.693733215332,
"progress_pct": 56.608045890255866
},
{
"step": 7600,
"epoch": 0,
"loss": 3.035440671443939,
"perplexity": 20.8101463164234,
"grad_norm": 0.9999999920525307,
"lr": 9.943118685617943e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29854.453670190185,
"step_time_ms": 68.69853019714355,
"elapsed_sec": 868.9143431186676,
"progress_pct": 57.36281983545928
},
{
"step": 7700,
"epoch": 0,
"loss": 2.9939205646514893,
"perplexity": 19.963798625028573,
"grad_norm": 0.9999999901331601,
"lr": 9.93808446058284e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29860.685174223207,
"step_time_ms": 68.79801273345947,
"elapsed_sec": 880.1283643245697,
"progress_pct": 58.117593780662695
},
{
"step": 7800,
"epoch": 0,
"loss": 3.024644274711609,
"perplexity": 20.586680205764154,
"grad_norm": 0.9999999972906101,
"lr": 9.932838204178041e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29871.18400819665,
"step_time_ms": 68.64381074905396,
"elapsed_sec": 891.3801338672638,
"progress_pct": 58.872367725866106
},
{
"step": 7900,
"epoch": 0,
"loss": 3.0337059926986694,
"perplexity": 20.77407868983925,
"grad_norm": 0.9999999996058188,
"lr": 9.927380143971407e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29876.69149581676,
"step_time_ms": 68.74019622802734,
"elapsed_sec": 902.5685124397278,
"progress_pct": 59.627141671069516
},
{
"step": 8000,
"epoch": 0,
"loss": 3.0616763854026794,
"perplexity": 21.36334035235707,
"grad_norm": 1.0000000019298847,
"lr": 9.921710516718252e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29880.446721366232,
"step_time_ms": 68.79402160644531,
"elapsed_sec": 913.6925647258759,
"progress_pct": 60.38191561627293
},
{
"step": 8100,
"epoch": 0,
"loss": 3.0444650769233705,
"perplexity": 20.99879545774598,
"grad_norm": 1.0000000104659683,
"lr": 9.915829568351079e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29885.601052957238,
"step_time_ms": 68.65685224533081,
"elapsed_sec": 924.8072993755341,
"progress_pct": 61.13668956147634
},
{
"step": 8200,
"epoch": 0,
"loss": 2.9955936670303345,
"perplexity": 19.997228061635678,
"grad_norm": 1.0000000080553764,
"lr": 9.909737553968903e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29894.409481672225,
"step_time_ms": 68.80224943161011,
"elapsed_sec": 936.0114979743958,
"progress_pct": 61.89146350667975
},
{
"step": 8300,
"epoch": 0,
"loss": 3.0462084817886352,
"perplexity": 21.0354367909702,
"grad_norm": 1.0000000055912892,
"lr": 9.903434737826193e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29898.611187936058,
"step_time_ms": 68.72424364089966,
"elapsed_sec": 947.1763026714325,
"progress_pct": 62.64623745188316
},
{
"step": 8400,
"epoch": 0,
"loss": 3.04290563583374,
"perplexity": 20.966074593037924,
"grad_norm": 1.0000000033133467,
"lr": 9.896921393321402e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29903.666075391204,
"step_time_ms": 68.6596155166626,
"elapsed_sec": 958.3606882095337,
"progress_pct": 63.40101139708657
},
{
"step": 8500,
"epoch": 0,
"loss": 3.0263629794120788,
"perplexity": 20.622093053197236,
"grad_norm": 0.9999999969913438,
"lr": 9.890197802985112e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29909.96927952112,
"step_time_ms": 68.68655204772949,
"elapsed_sec": 969.5727109909058,
"progress_pct": 64.15578534228999
},
{
"step": 8600,
"epoch": 0,
"loss": 2.982111701965332,
"perplexity": 19.729435373574137,
"grad_norm": 0.9999999969984414,
"lr": 9.883264258467786e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29918.46704130954,
"step_time_ms": 68.8025951385498,
"elapsed_sec": 980.8517916202545,
"progress_pct": 64.9105592874934
},
{
"step": 8700,
"epoch": 0,
"loss": 3.008047037124634,
"perplexity": 20.247818049896424,
"grad_norm": 1.000000000964267,
"lr": 9.876121060527097e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29925.865710112143,
"step_time_ms": 68.60105514526367,
"elapsed_sec": 992.0734219551086,
"progress_pct": 65.66533323269681
},
{
"step": 8800,
"epoch": 0,
"loss": 3.0379968214035036,
"perplexity": 20.86340821478766,
"grad_norm": 1.000000000967493,
"lr": 9.868768519014906e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29930.96776317607,
"step_time_ms": 68.73708248138428,
"elapsed_sec": 1003.2306418418884,
"progress_pct": 66.42010717790022
},
{
"step": 8900,
"epoch": 0,
"loss": 3.0114793157577515,
"perplexity": 20.317433604804787,
"grad_norm": 1.0000000049415028,
"lr": 9.861206952863805e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29938.500064531298,
"step_time_ms": 68.66981267929077,
"elapsed_sec": 1014.4962818622589,
"progress_pct": 67.17488112310363
},
{
"step": 9000,
"epoch": 0,
"loss": 3.0084391617774964,
"perplexity": 20.255759275393853,
"grad_norm": 1.0000000038688899,
"lr": 9.853436690073285e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29943.498254578597,
"step_time_ms": 68.49103927612305,
"elapsed_sec": 1025.6270906925201,
"progress_pct": 67.92965506830704
},
{
"step": 9100,
"epoch": 0,
"loss": 3.0745689249038697,
"perplexity": 21.640551197849472,
"grad_norm": 1.000000004718636,
"lr": 9.845458067695514e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29948.06303955987,
"step_time_ms": 69.01569843292236,
"elapsed_sec": 1036.9010162353516,
"progress_pct": 68.68442901351045
},
{
"step": 9200,
"epoch": 0,
"loss": 3.0796532964706422,
"perplexity": 21.750859988858302,
"grad_norm": 1.000000002997285,
"lr": 9.837271431820715e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29949.88973194273,
"step_time_ms": 68.61918210983276,
"elapsed_sec": 1048.0913045406342,
"progress_pct": 69.43920295871386
},
{
"step": 9300,
"epoch": 0,
"loss": 3.153324291706085,
"perplexity": 23.4137695523185,
"grad_norm": 0.9999999963941149,
"lr": 9.828877137562149e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29954.18284575665,
"step_time_ms": 68.8855242729187,
"elapsed_sec": 1059.2905225753784,
"progress_pct": 70.19397690391727
},
{
"step": 9400,
"epoch": 0,
"loss": 3.3674811506271363,
"perplexity": 29.005374796594197,
"grad_norm": 0.9999999980681187,
"lr": 9.820275549040714e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29950.33890410087,
"step_time_ms": 72.31015920639038,
"elapsed_sec": 1070.8681161403656,
"progress_pct": 70.94875084912069
},
{
"step": 9500,
"epoch": 0,
"loss": 3.40507611989975,
"perplexity": 30.116588108336245,
"grad_norm": 0.9999999904071671,
"lr": 9.811467039369153e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29961.53980985831,
"step_time_ms": 68.4125828742981,
"elapsed_sec": 1082.0211913585663,
"progress_pct": 71.7035247943241
},
{
"step": 9600,
"epoch": 0,
"loss": 3.374427659511566,
"perplexity": 29.207562326020838,
"grad_norm": 0.9999999853951721,
"lr": 9.802451990635867e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29966.307797668982,
"step_time_ms": 68.59289169311523,
"elapsed_sec": 1093.2418575286865,
"progress_pct": 72.4582987395275
},
{
"step": 9700,
"epoch": 0,
"loss": 3.4392935371398927,
"perplexity": 31.164933521357394,
"grad_norm": 0.999999970808979,
"lr": 9.793230793888339e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29969.481180441442,
"step_time_ms": 68.55150938034058,
"elapsed_sec": 1104.3426744937897,
"progress_pct": 73.21307268473092
},
{
"step": 9800,
"epoch": 0,
"loss": 3.418562099933624,
"perplexity": 30.525490844018883,
"grad_norm": 0.9999999525241351,
"lr": 9.783803849116176e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29976.895268205295,
"step_time_ms": 68.71764898300171,
"elapsed_sec": 1115.6174013614655,
"progress_pct": 73.96784662993433
},
{
"step": 9900,
"epoch": 0,
"loss": 3.3591481018066407,
"perplexity": 28.76467586317593,
"grad_norm": 0.9999999456205916,
"lr": 9.774171565233752e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29984.36230858003,
"step_time_ms": 68.62653970718384,
"elapsed_sec": 1126.913010597229,
"progress_pct": 74.72262057513774
},
{
"step": 10000,
"epoch": 0,
"loss": 3.265201942920685,
"perplexity": 26.185398412761046,
"grad_norm": 0.999999952462774,
"lr": 9.764334360062481e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29767.874544478855,
"step_time_ms": 68.8528823852539,
"elapsed_sec": 1146.5747394561768,
"progress_pct": 75.47739452034116
},
{
"step": 10100,
"epoch": 0,
"loss": 3.204775149822235,
"perplexity": 24.64995684610944,
"grad_norm": 0.9999999653872187,
"lr": 9.754292660312681e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29778.15250087888,
"step_time_ms": 68.64858627319336,
"elapsed_sec": 1157.871127128601,
"progress_pct": 76.23216846554458
},
{
"step": 10200,
"epoch": 0,
"loss": 3.2032279491424562,
"perplexity": 24.61184790481275,
"grad_norm": 0.9999999683836718,
"lr": 9.744046901565074e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29782.495889385045,
"step_time_ms": 68.81840467453003,
"elapsed_sec": 1169.0545053482056,
"progress_pct": 76.98694241074799
},
{
"step": 10300,
"epoch": 0,
"loss": 3.169970953464508,
"perplexity": 23.80679284153244,
"grad_norm": 0.9999999616953779,
"lr": 9.733597528251886e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29786.453693018226,
"step_time_ms": 68.88345241546631,
"elapsed_sec": 1180.3069396018982,
"progress_pct": 77.7417163559514
},
{
"step": 10400,
"epoch": 0,
"loss": 3.1525204300880434,
"perplexity": 23.394955684527936,
"grad_norm": 0.9999999453561814,
"lr": 9.722944993637574e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29792.717834850704,
"step_time_ms": 68.55859756469727,
"elapsed_sec": 1191.465786933899,
"progress_pct": 78.49649030115481
},
{
"step": 10500,
"epoch": 0,
"loss": 3.12509893655777,
"perplexity": 22.762146990598122,
"grad_norm": 0.9999999442481443,
"lr": 9.712089759799156e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29801.167133433617,
"step_time_ms": 68.67520809173584,
"elapsed_sec": 1202.713901758194,
"progress_pct": 79.25126424635822
},
{
"step": 10600,
"epoch": 0,
"loss": 3.1125614857673645,
"perplexity": 22.478549204294474,
"grad_norm": 0.9999999499257827,
"lr": 9.701032297606176e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29806.945147531915,
"step_time_ms": 68.78292798995972,
"elapsed_sec": 1213.9093027114868,
"progress_pct": 80.00603819156163
},
{
"step": 10700,
"epoch": 0,
"loss": 3.17107980966568,
"perplexity": 23.83320579278082,
"grad_norm": 0.9999999544155758,
"lr": 9.689773086700275e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29811.05019124841,
"step_time_ms": 68.5623836517334,
"elapsed_sec": 1225.0971624851227,
"progress_pct": 80.76081213676504
},
{
"step": 10800,
"epoch": 0,
"loss": 3.1426844120025637,
"perplexity": 23.16597047477597,
"grad_norm": 0.9999999623638297,
"lr": 9.678312615474386e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29817.68600049851,
"step_time_ms": 68.59636545181274,
"elapsed_sec": 1236.327594280243,
"progress_pct": 81.51558608196845
},
{
"step": 10900,
"epoch": 0,
"loss": 3.1451004695892335,
"perplexity": 23.22200846173652,
"grad_norm": 0.9999999613214944,
"lr": 9.666651381051555e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29821.05204868826,
"step_time_ms": 68.49246263504028,
"elapsed_sec": 1247.4301691055298,
"progress_pct": 82.27036002717186
},
{
"step": 11000,
"epoch": 0,
"loss": 3.1183106231689455,
"perplexity": 22.608153672225175,
"grad_norm": 0.9999999582262188,
"lr": 9.654789889263361e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29829.229888281192,
"step_time_ms": 68.70523452758789,
"elapsed_sec": 1258.7128846645355,
"progress_pct": 83.02513397237527
},
{
"step": 11100,
"epoch": 0,
"loss": 3.1134038853645323,
"perplexity": 22.497493103135188,
"grad_norm": 0.9999999537734283,
"lr": 9.642728654627993e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29835.080075716916,
"step_time_ms": 68.42459917068481,
"elapsed_sec": 1269.830243587494,
"progress_pct": 83.77990791757868
},
{
"step": 11200,
"epoch": 0,
"loss": 3.1126963663101197,
"perplexity": 22.481581327694126,
"grad_norm": 0.9999999430285232,
"lr": 9.630468200327917e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29840.622569238225,
"step_time_ms": 68.69045734405518,
"elapsed_sec": 1281.010505437851,
"progress_pct": 84.5346818627821
},
{
"step": 11300,
"epoch": 0,
"loss": 3.1477469062805175,
"perplexity": 23.283545427902375,
"grad_norm": 0.9999999335019356,
"lr": 9.618009058187196e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29847.154809522483,
"step_time_ms": 68.6976432800293,
"elapsed_sec": 1292.2503416538239,
"progress_pct": 85.2894558079855
},
{
"step": 11400,
"epoch": 0,
"loss": 3.1873206615448,
"perplexity": 24.223437629163133,
"grad_norm": 0.9999999261044604,
"lr": 9.605351768648408e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29852.87892273204,
"step_time_ms": 68.48160028457642,
"elapsed_sec": 1303.4236028194427,
"progress_pct": 86.04422975318892
},
{
"step": 11500,
"epoch": 0,
"loss": 3.12576908826828,
"perplexity": 22.77740619475855,
"grad_norm": 0.9999999320799811,
"lr": 9.59249688074921e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29859.745047711865,
"step_time_ms": 68.7877631187439,
"elapsed_sec": 1314.6685926914215,
"progress_pct": 86.79900369839233
},
{
"step": 11600,
"epoch": 0,
"loss": 3.120550537109375,
"perplexity": 22.65885074815001,
"grad_norm": 0.999999936853886,
"lr": 9.579444952098517e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29862.469704053565,
"step_time_ms": 68.95772695541382,
"elapsed_sec": 1325.9032957553864,
"progress_pct": 87.55377764359574
},
{
"step": 11700,
"epoch": 0,
"loss": 3.10750075340271,
"perplexity": 22.365078647061083,
"grad_norm": 0.9999999361651032,
"lr": 9.566196548852327e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29868.643271348836,
"step_time_ms": 68.56596231460571,
"elapsed_sec": 1337.0910301208496,
"progress_pct": 88.30855158879915
},
{
"step": 11800,
"epoch": 0,
"loss": 3.0350870537757872,
"perplexity": 20.802788781962846,
"grad_norm": 0.9999999511592819,
"lr": 9.552752245689144e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29873.580938625204,
"step_time_ms": 68.68452787399292,
"elapsed_sec": 1348.3001613616943,
"progress_pct": 89.06332553400257
},
{
"step": 11900,
"epoch": 0,
"loss": 2.983770315647125,
"perplexity": 19.76218603786065,
"grad_norm": 0.9999999564182059,
"lr": 9.539112625785066e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29876.49008130546,
"step_time_ms": 68.60888242721558,
"elapsed_sec": 1359.4474079608917,
"progress_pct": 89.81809947920598
},
{
"step": 12000,
"epoch": 0,
"loss": 2.9881193041801453,
"perplexity": 19.848318717587674,
"grad_norm": 0.9999999694847291,
"lr": 9.525278280788481e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29880.676071785907,
"step_time_ms": 68.67148876190186,
"elapsed_sec": 1370.6449580192566,
"progress_pct": 90.5728734244094
},
{
"step": 12100,
"epoch": 0,
"loss": 3.010297691822052,
"perplexity": 20.293440217317546,
"grad_norm": 0.9999999756513553,
"lr": 9.511249810794403e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29890.41306297647,
"step_time_ms": 68.7269401550293,
"elapsed_sec": 1381.9189755916595,
"progress_pct": 91.3276473696128
},
{
"step": 12200,
"epoch": 0,
"loss": 3.044492042064667,
"perplexity": 20.99936170086693,
"grad_norm": 0.9999999580831208,
"lr": 9.497027824318445e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29895.381058749288,
"step_time_ms": 68.52395057678223,
"elapsed_sec": 1393.132401227951,
"progress_pct": 92.08242131481622
},
{
"step": 12300,
"epoch": 0,
"loss": 3.0311736488342285,
"perplexity": 20.721538132586037,
"grad_norm": 0.9999999508302908,
"lr": 9.482612938270422e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29901.363092922238,
"step_time_ms": 68.59883308410645,
"elapsed_sec": 1404.3868458271027,
"progress_pct": 92.83719526001963
},
{
"step": 12400,
"epoch": 0,
"loss": 3.026858425140381,
"perplexity": 20.632312712543328,
"grad_norm": 0.9999999444981733,
"lr": 9.468005777927585e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29905.344204329296,
"step_time_ms": 68.69802236557007,
"elapsed_sec": 1415.589993238449,
"progress_pct": 93.59196920522304
},
{
"step": 12500,
"epoch": 0,
"loss": 3.0245711183547974,
"perplexity": 20.58517421432855,
"grad_norm": 0.9999999515947503,
"lr": 9.453206976907509e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29900.696944622938,
"step_time_ms": 72.52441644668579,
"elapsed_sec": 1427.1709144115448,
"progress_pct": 94.34674315042645
},
{
"step": 12600,
"epoch": 0,
"loss": 2.980508255958557,
"perplexity": 19.6978256382329,
"grad_norm": 0.9999999723068747,
"lr": 9.438217177140604e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29907.388512661983,
"step_time_ms": 68.8193130493164,
"elapsed_sec": 1438.4461545944214,
"progress_pct": 95.10151709562986
},
{
"step": 12700,
"epoch": 0,
"loss": 2.958230459690094,
"perplexity": 19.263853412376523,
"grad_norm": 0.9999999839033161,
"lr": 9.423037028842264e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29910.816981131564,
"step_time_ms": 68.63985061645508,
"elapsed_sec": 1449.6005918979645,
"progress_pct": 95.85629104083327
},
{
"step": 12800,
"epoch": 0,
"loss": 3.013811137676239,
"perplexity": 20.364865521711458,
"grad_norm": 0.9999999886467441,
"lr": 9.407667190484671e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29915.38890613257,
"step_time_ms": 68.79305124282837,
"elapsed_sec": 1460.799962759018,
"progress_pct": 96.61106498603668
},
{
"step": 12900,
"epoch": 0,
"loss": 3.0150211238861084,
"perplexity": 20.389521641933122,
"grad_norm": 0.9999999887120172,
"lr": 9.392108328768228e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29917.230695050563,
"step_time_ms": 68.78490209579468,
"elapsed_sec": 1472.0201361179352,
"progress_pct": 97.36583893124009
},
{
"step": 13000,
"epoch": 0,
"loss": 3.05732549905777,
"perplexity": 21.270592799877004,
"grad_norm": 0.9999999784801565,
"lr": 9.376361118592639e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29922.00874517518,
"step_time_ms": 68.64551305770874,
"elapsed_sec": 1483.2417628765106,
"progress_pct": 98.1206128764435
},
{
"step": 13100,
"epoch": 0,
"loss": 3.0526057434082032,
"perplexity": 21.170437339911786,
"grad_norm": 0.9999999756636413,
"lr": 9.360426243027637e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29925.620308368234,
"step_time_ms": 68.60826253890991,
"elapsed_sec": 1494.4739503860474,
"progress_pct": 98.87538682164691
},
{
"step": 13200,
"epoch": 0,
"loss": 3.0945112991333006,
"perplexity": 22.076447122031155,
"grad_norm": 0.9999999676132567,
"lr": 9.344304393283349e-05,
"cross_attn_scale": 1,
"tokens_per_sec": 29928.514932177743,
"step_time_ms": 68.54766130447388,
"elapsed_sec": 1505.5981595516205,
"progress_pct": 99.63016076685032
}
],
"checkpoint": {
"pct": 100,
"step": 13249,
"final_loss": 3.102254876783655,
"total_tokens": 45225439,
"total_time_sec": 1511.2042155265808
},
"final": {
"total_time_sec": 1518.5682985782623,
"total_tokens": 45225439,
"final_loss": 3.102254876783655,
"final_perplexity": 22.248061401271826,
"avg_tokens_per_sec": 29781.62986962237
}
}