// Pocket TTS ONNX Web Worker
console.log('Pocket TTS Worker Starting...');
self.postMessage({ type: 'status', status: 'Worker Thread Started', state: 'idle' });

// Load ONNX Runtime (will be loaded dynamically in loadModels for module worker)
let ort = null;

// Configuration
const MODELS = {
    mimi_encoder: './onnx/mimi_encoder.onnx',
    text_conditioner: './onnx/text_conditioner.onnx',
    flow_lm_main: './onnx/flow_lm_main_int8.onnx',
    flow_lm_flow: './onnx/flow_lm_flow_int8.onnx',
    mimi_decoder: './onnx/mimi_decoder_int8.onnx',
    tokenizer: './tokenizer.model',
    voices: './voices.bin'
};

const SAMPLE_RATE = 24000;
const SAMPLES_PER_FRAME = 1920;
const MAX_FRAMES = 500;
const DEBUG_LOGS = false;
// Text chunking target; lower if long passages hit generation limits.
const CHUNK_TARGET_TOKENS = 50;
const CHUNK_GAP_SEC = 0.25;
// If true, re-run voice conditioning per chunk to avoid stale AR state.
const RESET_FLOW_STATE_EACH_CHUNK = true;
// If true, reset decoder state per chunk to avoid carry-over artifacts.
const RESET_MIMI_STATE_EACH_CHUNK = true;

// State
let mimiEncoderSession = null;
let textConditionerSession = null;
let flowLmMainSession = null;
let flowLmFlowSession = null;
let mimiDecoderSession = null;
let tokenizerProcessor = null;
let tokenizerModelB64 = null;
let predefinedVoices = {};
let stTensors = []; // Optimization: Pre-allocated s/t tensors for max LSD
let isGenerating = false;
let isReady = false;

// Dynamic LSD (Latent Solver/Diffusion steps)
const MAX_LSD = 10;  // Default/max quality
let currentLSD = MAX_LSD;

// Current voice embedding (cached)
let currentVoiceEmbedding = null;
let currentVoiceName = null;

// Text preprocessing utilities
const ONES = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];
const TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];
const ORDINAL_ONES = ['', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth'];
const ORDINAL_TENS = ['', '', 'twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth'];

function numberToWords(num, options = {}) {
    const { andword = '', zero = 'zero', group = 0 } = options;
    if (num === 0) return zero;
    const convert = (n) => {
        if (n < 20) return ONES[n];
        if (n < 100) return TENS[Math.floor(n / 10)] + (n % 10 ? ' ' + ONES[n % 10] : '');
        if (n < 1000) {
            const remainder = n % 100;
            return ONES[Math.floor(n / 100)] + ' hundred' + (remainder ? (andword ? ' ' + andword + ' ' : ' ') + convert(remainder) : '');
        }
        if (n < 1000000) {
            const thousands = Math.floor(n / 1000);
            const remainder = n % 1000;
            return convert(thousands) + ' thousand' + (remainder ? ' ' + convert(remainder) : '');
        }
        if (n < 1000000000) {
            const millions = Math.floor(n / 1000000);
            const remainder = n % 1000000;
            return convert(millions) + ' million' + (remainder ? ' ' + convert(remainder) : '');
        }
        const billions = Math.floor(n / 1000000000);
        const remainder = n % 1000000000;
        return convert(billions) + ' billion' + (remainder ? ' ' + convert(remainder) : '');
    };
    if (group === 2 && num > 1000 && num < 10000) {
        const high = Math.floor(num / 100);
        const low = num % 100;
        if (low === 0) return convert(high) + ' hundred';
        else if (low < 10) return convert(high) + ' ' + (zero === 'oh' ? 'oh' : zero) + ' ' + ONES[low];
        else return convert(high) + ' ' + convert(low);
    }
    return convert(num);
}

function ordinalToWords(num) {
    if (num < 20) return ORDINAL_ONES[num] || numberToWords(num) + 'th';
    if (num < 100) {
        const tens = Math.floor(num / 10);
        const ones = num % 10;
        if (ones === 0) return ORDINAL_TENS[tens];
        return TENS[tens] + ' ' + ORDINAL_ONES[ones];
    }
    const cardinal = numberToWords(num);
    if (cardinal.endsWith('y')) return cardinal.slice(0, -1) + 'ieth';
    if (cardinal.endsWith('one')) return cardinal.slice(0, -3) + 'first';
    if (cardinal.endsWith('two')) return cardinal.slice(0, -3) + 'second';
    if (cardinal.endsWith('three')) return cardinal.slice(0, -5) + 'third';
    if (cardinal.endsWith('ve')) return cardinal.slice(0, -2) + 'fth';
    if (cardinal.endsWith('e')) return cardinal.slice(0, -1) + 'th';
    if (cardinal.endsWith('t')) return cardinal + 'h';
    return cardinal + 'th';
}

const UNICODE_MAP = {
    'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a', 'æ': 'ae', 'ç': 'c', 'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e', 'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i', 'ñ': 'n', 'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o', 'ø': 'o', 'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u', 'ý': 'y', 'ÿ': 'y', 'ß': 'ss', 'œ': 'oe', 'ð': 'd', 'þ': 'th', 'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A', 'Æ': 'AE', 'Ç': 'C', 'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E', 'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I', 'Ñ': 'N', 'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O', 'Ø': 'O', 'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U', 'Ý': 'Y', '\u201C': '"', '\u201D': '"', '\u2018': "'", '\u2019': "'", '\u2026': '...', '\u2013': '-', '\u2014': '-'
};

function convertToAscii(text) {
    return text.split('').map(c => UNICODE_MAP[c] || c).join('').normalize('NFD').replace(/[\u0300-\u036f]/g, '');
}

const ABBREVIATIONS = [
    [/\bmrs\./gi, 'misuss'], [/\bms\./gi, 'miss'], [/\bmr\./gi, 'mister'], [/\bdr\./gi, 'doctor'], [/\bst\./gi, 'saint'], [/\bco\./gi, 'company'], [/\bjr\./gi, 'junior'], [/\bmaj\./gi, 'major'], [/\bgen\./gi, 'general'], [/\bdrs\./gi, 'doctors'], [/\brev\./gi, 'reverend'], [/\blt\./gi, 'lieutenant'], [/\bhon\./gi, 'honorable'], [/\bsgt\./gi, 'sergeant'], [/\bcapt\./gi, 'captain'], [/\besq\./gi, 'esquire'], [/\bltd\./gi, 'limited'], [/\bcol\./gi, 'colonel'], [/\bft\./gi, 'fort']
];
const CASED_ABBREVIATIONS = [
    [/\bTTS\b/g, 'text to speech'], [/\bHz\b/g, 'hertz'], [/\bkHz\b/g, 'kilohertz'], [/\bKBs\b/g, 'kilobytes'], [/\bKB\b/g, 'kilobyte'], [/\bMBs\b/g, 'megabytes'], [/\bMB\b/g, 'megabyte'], [/\bGBs\b/g, 'gigabytes'], [/\bGB\b/g, 'gigabyte'], [/\bTBs\b/g, 'terabytes'], [/\bTB\b/g, 'terabyte'], [/\bAPIs\b/g, "a p i's"], [/\bAPI\b/g, 'a p i'], [/\bCLIs\b/g, "c l i's"], [/\bCLI\b/g, 'c l i'], [/\bCPUs\b/g, "c p u's"], [/\bCPU\b/g, 'c p u'], [/\bGPUs\b/g, "g p u's"], [/\bGPU\b/g, 'g p u'], [/\bAve\b/g, 'avenue'], [/\betc\b/g, 'etcetera']
];

function expandAbbreviations(text) {
    for (const [regex, replacement] of [...ABBREVIATIONS, ...CASED_ABBREVIATIONS]) text = text.replace(regex, replacement);
    return text;
}

const NUM_PREFIX_RE = /#(\d)/g;
const NUM_SUFFIX_RE = /(\d)([KMBT])/gi;
const NUM_LETTER_SPLIT_RE = /(\d)([a-z])|([a-z])(\d)/gi;
const COMMA_NUMBER_RE = /(\d[\d,]+\d)/g;
const DATE_RE = /(^|[^/])(\d\d?[/-]\d\d?[/-]\d\d(?:\d\d)?)($|[^/])/g;
const PHONE_NUMBER_RE = /\(?\d{3}\)?[-.\s]\d{3}[-.\s]?\d{4}/g;
const TIME_RE = /(\d\d?):(\d\d)(?::(\d\d))?/g;
const POUNDS_RE = /£([\d,]*\d+)/g;
const DOLLARS_RE = /\$([\d.,]*\d+)/g;
const DECIMAL_NUMBER_RE = /(\d+(?:\.\d+)+)/g;
const MULTIPLY_RE = /(\d)\s?\*\s?(\d)/g;
const DIVIDE_RE = /(\d)\s?\/\s?(\d)/g;
const ADD_RE = /(\d)\s?\+\s?(\d)/g;
const SUBTRACT_RE = /(\d)?\s?-\s?(\d)/g;
const FRACTION_RE = /(\d+)\/(\d+)/g;
const ORDINAL_RE = /(\d+)(st|nd|rd|th)/gi;
const NUMBER_RE = /\d+/g;

function normalizeNumbers(text) {
    text = text.replace(NUM_PREFIX_RE, (_, d) => `number ${d}`);
    text = text.replace(NUM_SUFFIX_RE, (_, num, suffix) => {
        const map = { k: 'thousand', m: 'million', b: 'billion', t: 'trillion' };
        return `${num} ${map[suffix.toLowerCase()]}`;
    });
    for (let i = 0; i < 2; i++) {
        text = text.replace(NUM_LETTER_SPLIT_RE, (m, d1, l1, l2, d2) => {
            if (d1 && l1) return `${d1} ${l1}`;
            if (l2 && d2) return `${l2} ${d2}`;
            return m;
        });
    }
    text = text.replace(COMMA_NUMBER_RE, m => m.replace(/,/g, ''));
    text = text.replace(DATE_RE, (_, pre, date, post) => pre + date.split(/[./-]/).join(' dash ') + post);
    text = text.replace(PHONE_NUMBER_RE, m => {
        const digits = m.replace(/\D/g, '');
        return digits.length === 10 ? `${digits.slice(0, 3).split('').join(' ')}, ${digits.slice(3, 6).split('').join(' ')}, ${digits.slice(6).split('').join(' ')}` : m;
    });
    text = text.replace(TIME_RE, (_, hours, minutes, seconds) => {
        const h = parseInt(hours), m = parseInt(minutes), s = seconds ? parseInt(seconds) : 0;
        if (!seconds) return m === 0 ? (h === 0 ? '0' : h > 12 ? `${hours} minutes` : `${hours} o'clock`) : minutes.startsWith('0') ? `${hours} oh ${minutes[1]}` : `${hours} ${minutes}`;
        let res = '';
        if (h !== 0) res = hours + ' ' + (m === 0 ? 'oh oh' : minutes.startsWith('0') ? `oh ${minutes[1]}` : minutes);
        else if (m !== 0) res = minutes + ' ' + (s === 0 ? 'oh oh' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
        else res = seconds;
        return res + ' ' + (s === 0 ? '' : seconds.startsWith('0') ? `oh ${seconds[1]}` : seconds);
    });
    text = text.replace(POUNDS_RE, (_, amount) => `${amount.replace(/,/g, '')} pounds`);
    text = text.replace(DOLLARS_RE, (_, amount) => {
        const parts = amount.replace(/,/g, '').split('.');
        const dollars = parseInt(parts[0]) || 0;
        const cents = parts[1] ? parseInt(parts[1]) : 0;
        if (dollars && cents) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}, ${cents} ${cents === 1 ? 'cent' : 'cents'}`;
        if (dollars) return `${dollars} ${dollars === 1 ? 'dollar' : 'dollars'}`;
        if (cents) return `${cents} ${cents === 1 ? 'cent' : 'cents'}`;
        return 'zero dollars';
    });
    text = text.replace(DECIMAL_NUMBER_RE, m => m.split('.').join(' point ').split('').join(' '));
    text = text.replace(MULTIPLY_RE, '$1 times $2');
    text = text.replace(DIVIDE_RE, '$1 over $2');
    text = text.replace(ADD_RE, '$1 plus $2');
    text = text.replace(SUBTRACT_RE, (_, a, b) => (a ? a : '') + ' minus ' + b);
    text = text.replace(FRACTION_RE, '$1 over $2');
    text = text.replace(ORDINAL_RE, (_, num) => ordinalToWords(parseInt(num)));
    text = text.replace(NUMBER_RE, m => {
        const num = parseInt(m);
        if (num > 1000 && num < 3000) {
            if (num === 2000) return 'two thousand';
            if (num > 2000 && num < 2010) return 'two thousand ' + numberToWords(num % 100);
            if (num % 100 === 0) return numberToWords(Math.floor(num / 100)) + ' hundred';
            return numberToWords(num, { zero: 'oh', group: 2 });
        }
        return numberToWords(num);
    });
    return text;
}

const SPECIAL_CHARACTERS = [
    [/@/g, ' at '], [/&/g, ' and '], [/%/g, ' percent '], [/:/g, '.'], [/;/g, ','], [/\+/g, ' plus '], [/\\/g, ' backslash '], [/~/g, ' about '], [/(^| )<3/g, ' heart '], [/<=/g, ' less than or equal to '], [/>=/g, ' greater than or equal to '], [/</g, ' less than '], [/>/g, ' greater than '], [/=/g, ' equals '], [/\//g, ' slash '], [/_/g, ' '],
];
const LINK_HEADER_RE = /https?:\/\//gi;
const DASH_RE = /(.) - (.)/g;
const DOT_RE = /([A-Z])\.([A-Z])/gi;
const PARENTHESES_RE = /[\(\[\{][^\)\]\}]*[\)\]\}](.)?/g;

function normalizeSpecial(text) {
    text = text.replace(LINK_HEADER_RE, 'h t t p s colon slash slash ');
    text = text.replace(DASH_RE, '$1, $2');
    text = text.replace(DOT_RE, '$1 dot $2');
    text = text.replace(PARENTHESES_RE, (m, after) => {
        let result = m.replace(/[\(\[\{]/g, ', ').replace(/[\)\]\}]/g, ', ');
        if (after && /[$.!?,]/.test(after)) result = result.slice(0, -2) + after;
        return result;
    });
    return text;
}

function expandSpecialCharacters(text) {
    for (const [regex, replacement] of SPECIAL_CHARACTERS) text = text.replace(regex, replacement);
    return text;
}

function collapseWhitespace(text) {
    return text.replace(/\s+/g, ' ').replace(/ ([.\?!,])/g, '$1');
}

function dedupPunctuation(text) {
    return text.replace(/\.\.\.+/g, '[ELLIPSIS]').replace(/,+/g, ',').replace(/[.,]*\.[.,]*/g, '.').replace(/[.,!]*![.,!]*/g, '!').replace(/[.,!?]*\?[.,!?]*/g, '?').replace(/\[ELLIPSIS\]/g, '...');
}

const SENTENCE_SPLIT_RE = /[^.!?]+[.!?]+|[^.!?]+$/g;

function splitTextIntoSentences(text) {
    const matches = text.match(SENTENCE_SPLIT_RE);
    if (!matches) return [];
    return matches.map(sentence => sentence.trim()).filter(Boolean);
}

function splitTokenIdsIntoChunks(tokenIds, maxTokens) {
    const chunks = [];
    for (let i = 0; i < tokenIds.length; i += maxTokens) {
        const chunkText = tokenizerProcessor.decodeIds(tokenIds.slice(i, i + maxTokens)).trim();
        if (chunkText) chunks.push(chunkText);
    }
    return chunks;
}

// Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
function splitIntoBestSentences(text) {
    const preparedText = prepareText(text);
    if (!preparedText) return [];

    const sentences = splitTextIntoSentences(preparedText);
    if (sentences.length === 0) return [];

    // Merge sentences into chunks that stay within the token target
    const chunks = [];
    let currentChunk = '';
    for (const sentenceText of sentences) {
        const sentenceTokenIds = tokenizerProcessor.encodeIds(sentenceText);
        const sentenceTokens = sentenceTokenIds.length;

        if (sentenceTokens > CHUNK_TARGET_TOKENS) {
            if (currentChunk !== '') {
                chunks.push(currentChunk.trim());
                currentChunk = '';
            }
            const splitChunks = splitTokenIdsIntoChunks(sentenceTokenIds, CHUNK_TARGET_TOKENS);
            for (const splitChunk of splitChunks) {
                if (splitChunk) chunks.push(splitChunk.trim());
            }
            continue;
        }

        if (currentChunk === '') {
            currentChunk = sentenceText;
            continue;
        }

        const combined = `${currentChunk} ${sentenceText}`;
        const combinedTokens = tokenizerProcessor.encodeIds(combined).length;
        if (combinedTokens > CHUNK_TARGET_TOKENS) {
            chunks.push(currentChunk.trim());
            currentChunk = sentenceText;
        } else {
            currentChunk = combined;
        }
    }

    if (currentChunk !== '') {
        chunks.push(currentChunk.trim());
    }

    return chunks;
}

// Pocket TTS specific text preprocessing
function prepareText(text) {
    text = text.trim();
    if (!text) return '';

    // Convert to ASCII
    text = convertToAscii(text);

    // Normalize numbers first
    text = normalizeNumbers(text);

    // Normalize special characters
    text = normalizeSpecial(text);

    // Expand abbreviations
    text = expandAbbreviations(text);

    // Expand special characters
    text = expandSpecialCharacters(text);

    // Collapse whitespace
    text = collapseWhitespace(text);

    // Deduplicate punctuation
    text = dedupPunctuation(text);

    // Final cleanup
    text = text.trim();

    // Ensure proper punctuation at end
    if (text && text[text.length - 1].match(/[a-zA-Z0-9]/)) {
        text = text + '.';
    }

    // Capitalize first letter
    if (text && !text[0].match(/[A-Z]/)) {
        text = text[0].toUpperCase() + text.slice(1);
    }

    return text;
}

// ----------------------------------------------------------------------------
// Worker Logic
// ----------------------------------------------------------------------------

self.onmessage = async (e) => {
    const { type, data } = e.data;
    console.log('Worker received message:', type);

    if (type === 'load') {
        try {
            await loadModels();
            postMessage({ type: 'loaded' });
        } catch (err) {
            postMessage({ type: 'error', error: err.toString() });
        }
    } else if (type === 'generate') {
        if (!isReady) {
            postMessage({ type: 'error', error: 'Models are not loaded yet.' });
            return;
        }
        if (isGenerating) return;
        try {
            await startGeneration(data.text, data.voice);
        } catch (err) {
            console.error('Generation Error:', err);
            postMessage({ type: 'error', error: err.toString() });
        }
    } else if (type === 'encode_voice') {
        if (!isReady) {
            postMessage({ type: 'error', error: 'Models are not loaded yet.' });
            return;
        }
        try {
            const embedding = await encodeVoiceAudio(data.audio);
            currentVoiceEmbedding = embedding;
            currentVoiceName = 'custom';
            postMessage({ type: 'voice_encoded', voiceName: 'custom' });
        } catch (err) {
            console.error('Voice encoding error:', err);
            postMessage({ type: 'error', error: 'Failed to encode voice: ' + err.toString() });
        }
    } else if (type === 'set_voice') {
        if (!isReady) {
            postMessage({ type: 'error', error: 'Models are not loaded yet.' });
            return;
        }
        if (data.voiceName === 'custom') {
            // Custom voice already set via encode_voice
            postMessage({ type: 'voice_set', voiceName: 'custom' });
        } else if (predefinedVoices[data.voiceName]) {
            currentVoiceEmbedding = predefinedVoices[data.voiceName];
            currentVoiceName = data.voiceName;
            postMessage({ type: 'voice_set', voiceName: data.voiceName });
        } else {
            postMessage({ type: 'error', error: `Unknown voice: ${data.voiceName}` });
        }
    } else if (type === 'set_lsd') {
        // Dynamic LSD adjustment for edge devices
        const newLSD = Math.max(1, Math.min(MAX_LSD, data.lsd));
        if (newLSD !== currentLSD) {
            console.log(`LSD adjusted: ${currentLSD} → ${newLSD}`);
            currentLSD = newLSD;
        }
    } else if (type === 'stop') {
        isGenerating = false;
        postMessage({ type: 'status', status: 'Stopped', state: 'idle' });
    }
};

async function loadModels() {
    if (mimiEncoderSession) return;

    postMessage({ type: 'status', status: 'Loading ONNX Runtime...', state: 'loading' });

    // Load ONNX Runtime dynamically
    const version = '1.20.0';
    const cdnBase = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/`;

    try {
        const ortModule = await import(`https://cdn.jsdelivr.net/npm/onnxruntime-web@${version}/dist/ort.min.mjs`);
        ort = ortModule.default || ortModule;
    } catch (e) {
        console.error('Failed to load ONNX Runtime:', e);
        throw new Error('Failed to load ONNX Runtime: ' + e.message);
    }

    if (!ort) {
        throw new Error('ONNX Runtime failed to load');
    }

    postMessage({ type: 'status', status: 'Loading models...', state: 'loading' });

    // Configure WASM Paths
    ort.env.wasm.wasmPaths = cdnBase;

    // Enable SIMD for significant performance boost (2-4x faster)
    ort.env.wasm.simd = true;

    // Configure multi-threading
    if (!self.crossOriginIsolated) {
        console.warn('Environment is not cross-origin isolated. Disabling WASM multi-threading.');
        console.warn('To enable multi-threading, serve with headers:');
        console.warn('  Cross-Origin-Opener-Policy: same-origin');
        console.warn('  Cross-Origin-Embedder-Policy: require-corp');
        ort.env.wasm.numThreads = 1;
    } else {
        const threads = Math.min(navigator.hardwareConcurrency || 4, 8);
        ort.env.wasm.numThreads = threads;
        if (DEBUG_LOGS) {
            console.log(`Multi-threading enabled with ${threads} threads`);
        }
    }

    console.log(`ORT: crossOriginIsolated=${self.crossOriginIsolated}, simd=${ort.env.wasm.simd}, threads=${ort.env.wasm.numThreads}`);

    try {
        const sessionOptions = {
            executionProviders: ['wasm'],
            graphOptimizationLevel: 'all'
        };

        // Load all models in parallel
        postMessage({ type: 'status', status: 'Loading MIMI encoder...', state: 'loading' });
        if (DEBUG_LOGS) {
            console.log('Loading MIMI encoder...');
        }

        const [encoderRes, textCondRes, flowMainRes, flowFlowRes, decoderRes] = await Promise.all([
            ort.InferenceSession.create(MODELS.mimi_encoder, sessionOptions),
            ort.InferenceSession.create(MODELS.text_conditioner, sessionOptions),
            ort.InferenceSession.create(MODELS.flow_lm_main, sessionOptions),
            ort.InferenceSession.create(MODELS.flow_lm_flow, sessionOptions),
            ort.InferenceSession.create(MODELS.mimi_decoder, sessionOptions)
        ]);

        mimiEncoderSession = encoderRes;
        textConditionerSession = textCondRes;
        flowLmMainSession = flowMainRes;
        flowLmFlowSession = flowFlowRes;
        mimiDecoderSession = decoderRes;

        if (DEBUG_LOGS) {
            console.log('All models loaded successfully');
            console.log('Flow LM Main inputs:', flowLmMainSession.inputNames);
            console.log('Flow LM Main outputs:', flowLmMainSession.outputNames);
            console.log('MIMI decoder inputs:', mimiDecoderSession.inputNames);
            console.log('MIMI decoder outputs:', mimiDecoderSession.outputNames);
        }

        // Load tokenizer
        postMessage({ type: 'status', status: 'Loading tokenizer...', state: 'loading' });
        if (DEBUG_LOGS) {
            console.log('Loading tokenizer...');
        }

        const tokenizerResponse = await fetch(MODELS.tokenizer);
        if (!tokenizerResponse.ok) {
            throw new Error(`Failed to load tokenizer: ${tokenizerResponse.statusText}`);
        }
        const tokenizerBuffer = await tokenizerResponse.arrayBuffer();
        tokenizerModelB64 = btoa(String.fromCharCode(...new Uint8Array(tokenizerBuffer)));

        // Import and initialize sentencepiece processor
        const spModule = await import('./sentencepiece.js?v=2');
        const SentencePieceProcessor = spModule.SentencePieceProcessor;
        if (!SentencePieceProcessor) {
            throw new Error('SentencePieceProcessor not found in sentencepiece.js');
        }
        tokenizerProcessor = new SentencePieceProcessor();
        await tokenizerProcessor.loadFromB64StringModel(tokenizerModelB64);
        if (DEBUG_LOGS) {
            console.log('Tokenizer loaded');
        }

        // Load predefined voices
        postMessage({ type: 'status', status: 'Loading voices...', state: 'loading' });
        if (DEBUG_LOGS) {
            console.log('Loading predefined voices...');
        }

        try {
            const voicesResponse = await fetch(MODELS.voices);
            if (voicesResponse.ok) {
                const voicesData = await voicesResponse.arrayBuffer();
                predefinedVoices = parseVoicesBin(voicesData);
                if (DEBUG_LOGS) {
                    console.log('Loaded voices:', Object.keys(predefinedVoices));
                }

                // Set default voice
                if (predefinedVoices['cosette']) {
                    currentVoiceEmbedding = predefinedVoices['cosette'];
                    currentVoiceName = 'cosette';
                } else {
                    // Use first available voice
                    const firstVoice = Object.keys(predefinedVoices)[0];
                    if (firstVoice) {
                        currentVoiceEmbedding = predefinedVoices[firstVoice];
                        currentVoiceName = firstVoice;
                    }
                }
            }
        } catch (e) {
            console.warn('Could not load predefined voices:', e);
        }

        // Send list of available voices
        postMessage({
            type: 'voices_loaded',
            voices: Object.keys(predefinedVoices),
            defaultVoice: currentVoiceName
        });

        // Pre-allocate s/t tensors for Flow Matching Loop (Optimization)
        // Pre-allocate for MAX_LSD to support dynamic switching
        if (DEBUG_LOGS) {
            console.log(`Pre-allocating Flow Matching tensors for LSD 1-${MAX_LSD}...`);
        }
        stTensors = {};

        for (let lsd = 1; lsd <= MAX_LSD; lsd++) {
            stTensors[lsd] = [];
            const dt = 1.0 / lsd;
            for (let j = 0; j < lsd; j++) {
                const s = j / lsd;
                const t = s + dt;
                stTensors[lsd].push({
                    s: new ort.Tensor('float32', new Float32Array([s]), [1, 1]),
                    t: new ort.Tensor('float32', new Float32Array([t]), [1, 1])
                });
            }
        }

        isReady = true;
        postMessage({ type: 'status', status: 'Ready', state: 'idle' });
        postMessage({ type: 'model_status', status: 'ready', text: 'Ready' });
        postMessage({ type: 'loaded' });

    } catch (err) {
        console.error('Model load failed:', err);
        throw err;
    }
}

function parseVoicesBin(buffer) {
    // Simple binary format:
    // Header: 4 bytes (uint32) = number of voices
    // For each voice:
    //   - 32 bytes: voice name (null-terminated string)
    //   - 4 bytes (uint32): number of frames
    //   - 4 bytes (uint32): embedding dim (1024)
    //   - frames * dim * 4 bytes: float32 embeddings

    const voices = {};
    const view = new DataView(buffer);
    let offset = 0;

    const numVoices = view.getUint32(offset, true);
    offset += 4;

    for (let i = 0; i < numVoices; i++) {
        // Read voice name
        const nameBytes = new Uint8Array(buffer, offset, 32);
        const nameEnd = nameBytes.indexOf(0);
        const name = new TextDecoder().decode(nameBytes.subarray(0, nameEnd > 0 ? nameEnd : 32)).trim();
        offset += 32;

        // Read dimensions
        const numFrames = view.getUint32(offset, true);
        offset += 4;
        const embDim = view.getUint32(offset, true);
        offset += 4;

        // Read embeddings
        const embSize = numFrames * embDim;
        const embeddings = new Float32Array(buffer, offset, embSize);
        offset += embSize * 4;

        // Store as [1, numFrames, embDim] shaped array info
        voices[name] = {
            data: new Float32Array(embeddings),
            shape: [1, numFrames, embDim]
        };

        console.log(`Loaded voice '${name}': ${numFrames} frames, ${embDim} dim`);
    }

    return voices;
}

async function encodeVoiceAudio(audioData) {
    // audioData should be Float32Array at 24kHz, mono
    // Reshape to [1, 1, samples]
    const input = new ort.Tensor('float32', audioData, [1, 1, audioData.length]);

    const outputs = await mimiEncoderSession.run({ audio: input });
    const embeddings = outputs[mimiEncoderSession.outputNames[0]];

    return {
        data: new Float32Array(embeddings.data),
        shape: embeddings.dims
    };
}

// Hardcoded state shapes extracted from ONNX model metadata
// These are the initial shapes - dynamic dimensions start at 0
const FLOW_LM_STATE_SHAPES = {
    // KV cache layers: [kv=2, batch=1, max_seq=1000, heads=16, head_dim=64]
    state_0: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_1: { shape: [0], dtype: 'float32' },  // dynamic
    state_2: { shape: [1], dtype: 'int64' },    // step counter
    state_3: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_4: { shape: [0], dtype: 'float32' },
    state_5: { shape: [1], dtype: 'int64' },
    state_6: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_7: { shape: [0], dtype: 'float32' },
    state_8: { shape: [1], dtype: 'int64' },
    state_9: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_10: { shape: [0], dtype: 'float32' },
    state_11: { shape: [1], dtype: 'int64' },
    state_12: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_13: { shape: [0], dtype: 'float32' },
    state_14: { shape: [1], dtype: 'int64' },
    state_15: { shape: [2, 1, 1000, 16, 64], dtype: 'float32' },
    state_16: { shape: [0], dtype: 'float32' },
    state_17: { shape: [1], dtype: 'int64' },
};

const MIMI_DECODER_STATE_SHAPES = {
    state_0: { shape: [1], dtype: 'bool' },
    state_1: { shape: [1, 512, 6], dtype: 'float32' },
    state_2: { shape: [1], dtype: 'bool' },
    state_3: { shape: [1, 64, 2], dtype: 'float32' },
    state_4: { shape: [1, 256, 6], dtype: 'float32' },
    state_5: { shape: [1], dtype: 'bool' },
    state_6: { shape: [1, 256, 2], dtype: 'float32' },
    state_7: { shape: [1], dtype: 'bool' },
    state_8: { shape: [1, 128, 0], dtype: 'float32' },  // dynamic
    state_9: { shape: [1, 128, 5], dtype: 'float32' },
    state_10: { shape: [1], dtype: 'bool' },
    state_11: { shape: [1, 128, 2], dtype: 'float32' },
    state_12: { shape: [1], dtype: 'bool' },
    state_13: { shape: [1, 64, 0], dtype: 'float32' },  // dynamic
    state_14: { shape: [1, 64, 4], dtype: 'float32' },
    state_15: { shape: [1], dtype: 'bool' },
    state_16: { shape: [1, 64, 2], dtype: 'float32' },
    state_17: { shape: [1], dtype: 'bool' },
    state_18: { shape: [1, 32, 0], dtype: 'float32' },  // dynamic
    state_19: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
    state_20: { shape: [1], dtype: 'int64' },
    state_21: { shape: [1], dtype: 'int64' },
    state_22: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
    state_23: { shape: [1], dtype: 'int64' },
    state_24: { shape: [1], dtype: 'int64' },
    state_25: { shape: [1], dtype: 'bool' },
    state_26: { shape: [1, 512, 16], dtype: 'float32' },
    state_27: { shape: [1], dtype: 'bool' },
    state_28: { shape: [1, 1, 6], dtype: 'float32' },
    state_29: { shape: [1], dtype: 'bool' },
    state_30: { shape: [1, 64, 2], dtype: 'float32' },
    state_31: { shape: [1], dtype: 'bool' },
    state_32: { shape: [1, 32, 0], dtype: 'float32' },  // dynamic
    state_33: { shape: [1], dtype: 'bool' },
    state_34: { shape: [1, 512, 2], dtype: 'float32' },
    state_35: { shape: [1], dtype: 'bool' },
    state_36: { shape: [1, 64, 4], dtype: 'float32' },
    state_37: { shape: [1], dtype: 'bool' },
    state_38: { shape: [1, 128, 2], dtype: 'float32' },
    state_39: { shape: [1], dtype: 'bool' },
    state_40: { shape: [1, 64, 0], dtype: 'float32' },  // dynamic
    state_41: { shape: [1], dtype: 'bool' },
    state_42: { shape: [1, 128, 5], dtype: 'float32' },
    state_43: { shape: [1], dtype: 'bool' },
    state_44: { shape: [1, 256, 2], dtype: 'float32' },
    state_45: { shape: [1], dtype: 'bool' },
    state_46: { shape: [1, 128, 0], dtype: 'float32' },  // dynamic
    state_47: { shape: [1], dtype: 'bool' },
    state_48: { shape: [1, 256, 6], dtype: 'float32' },
    state_49: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
    state_50: { shape: [1], dtype: 'int64' },
    state_51: { shape: [1], dtype: 'int64' },
    state_52: { shape: [2, 1, 8, 1000, 64], dtype: 'float32' },
    state_53: { shape: [1], dtype: 'int64' },
    state_54: { shape: [1], dtype: 'int64' },
    state_55: { shape: [1, 512, 16], dtype: 'float32' },
};

function initState(session, stateShapes) {
    /**
     * Initialize state tensors for a stateful ONNX model using hardcoded shapes.
     */
    const state = {};

    for (const inputName of session.inputNames) {
        if (inputName.startsWith('state_')) {
            const stateInfo = stateShapes[inputName];
            if (!stateInfo) {
                console.warn(`Unknown state input: ${inputName}, skipping`);
                continue;
            }

            const { shape, dtype } = stateInfo;
            const size = shape.reduce((a, b) => a * b, 1);

            let data;
            if (dtype === 'int64') {
                data = new BigInt64Array(size);
            } else if (dtype === 'bool') {
                data = new Uint8Array(size);
            } else {
                data = new Float32Array(size);
            }

            state[inputName] = new ort.Tensor(dtype, data, shape);
            if (DEBUG_LOGS) {
                console.log(`Init state ${inputName}: shape=${JSON.stringify(shape)}, dtype=${dtype}`);
            }
        }
    }

    return state;
}

async function startGeneration(text, voiceName) {
    isGenerating = true;
    currentLSD = MAX_LSD;  // Reset to max quality for each new generation
    postMessage({ type: 'status', status: 'Generating...', state: 'running' });
    postMessage({ type: 'generation_started', data: { time: performance.now() } });

    try {
        // Split text into sentence chunks (target <= CHUNK_TARGET_TOKENS tokens)
        const chunks = splitIntoBestSentences(text);
        console.log(`Split into ${chunks.length} chunks:`, chunks);

        if (chunks.length === 0) {
            throw new Error('No text to generate');
        }

        // Get voice embedding
        let voiceEmb = currentVoiceEmbedding;
        if (voiceName && voiceName !== currentVoiceName) {
            if (predefinedVoices[voiceName]) {
                voiceEmb = predefinedVoices[voiceName];
                currentVoiceEmbedding = voiceEmb;
                currentVoiceName = voiceName;
            }
        }

        if (!voiceEmb) {
            throw new Error('No voice embedding available. Please select a voice or upload custom audio.');
        }

        // Run generation pipeline with chunks
        await runGenerationPipeline(voiceEmb, chunks);

    } catch (err) {
        console.error('Generation error:', err);
        postMessage({ type: 'error', error: err.toString() });
    } finally {
        if (isGenerating) {
            postMessage({ type: 'stream_ended' });
            postMessage({ type: 'status', status: 'Finished', state: 'idle' });
        }
        isGenerating = false;
    }
}

async function runGenerationPipeline(voiceEmb, chunks) {
    // Initialize state - may be reset per chunk
    let mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
    const emptySeq = new ort.Tensor('float32', new Float32Array(0), [1, 0, 32]);
    const emptyTextEmb = new ort.Tensor('float32', new Float32Array(0), [1, 0, 1024]);

    // Voice embedding tensor
    const voiceTensor = new ort.Tensor('float32', voiceEmb.data, voiceEmb.shape);
    console.log('Voice embeddings shape:', voiceEmb.shape);

    async function buildVoiceConditionedState() {
        let flowLmState = initState(flowLmMainSession, FLOW_LM_STATE_SHAPES);
        console.log('Running voice conditioning...');
        const voiceCondInputs = {
            sequence: emptySeq,
            text_embeddings: voiceTensor,
            ...flowLmState
        };

        let condResult = await flowLmMainSession.run(voiceCondInputs);

        // Update state from voice conditioning
        for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
            const outputName = flowLmMainSession.outputNames[i];
            if (outputName.startsWith('out_state_')) {
                const stateIdx = parseInt(outputName.replace('out_state_', ''));
                flowLmState[`state_${stateIdx}`] = condResult[outputName];
            }
        }
        return flowLmState;
    }

    let flowLmState = await buildVoiceConditionedState();

    // Streaming parameters
    const FIRST_CHUNK_FRAMES = 3;
    const NORMAL_CHUNK_FRAMES = 12;

    // Tracking across all chunks
    const allGeneratedLatents = [];
    let isFirstAudioChunk = true;
    let totalDecodedFrames = 0;
    let totalFlowLmTime = 0;
    let totalDecodeTime = 0;
    const arStartTime = performance.now();

    // Process each text chunk
    for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
        if (!isGenerating) break;

        if (RESET_FLOW_STATE_EACH_CHUNK && chunkIdx > 0) {
            flowLmState = await buildVoiceConditionedState();
        }
        if (RESET_MIMI_STATE_EACH_CHUNK && chunkIdx > 0) {
            mimiState = initState(mimiDecoderSession, MIMI_DECODER_STATE_SHAPES);
        }

        const chunkText = chunks[chunkIdx];
        console.log(`Processing chunk ${chunkIdx + 1}/${chunks.length}: "${chunkText}"`);

        let isFirstAudioChunkOfTextChunk = true;

        // Tokenize this chunk
        const tokenIds = tokenizerProcessor.encodeIds(chunkText);
        console.log(`Chunk ${chunkIdx + 1} tokens:`, tokenIds.length);

        // Text conditioning for this chunk
        const textInput = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(x => BigInt(x))), [1, tokenIds.length]);
        const textCondResult = await textConditionerSession.run({ token_ids: textInput });
        let textEmb = textCondResult[textConditionerSession.outputNames[0]];

        if (textEmb.dims.length === 2) {
            textEmb = new ort.Tensor('float32', textEmb.data, [1, textEmb.dims[0], textEmb.dims[1]]);
        }

        const textCondInputs = {
            sequence: emptySeq,
            text_embeddings: textEmb,
            ...flowLmState
        };

        let condResult = await flowLmMainSession.run(textCondInputs);

        // Update state from text conditioning
        for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
            const outputName = flowLmMainSession.outputNames[i];
            if (outputName.startsWith('out_state_')) {
                const stateIdx = parseInt(outputName.replace('out_state_', ''));
                flowLmState[`state_${stateIdx}`] = condResult[outputName];
            }
        }

        // AR generation for this chunk
        const chunkLatents = [];
        let currentLatent = new ort.Tensor('float32', new Float32Array(32).fill(NaN), [1, 1, 32]);
        let chunkDecodedFrames = 0;
        const FRAMES_AFTER_EOS = 3;  // Match PyTorch behavior - generate extra frames after EOS
        let eosStep = null;

        let chunkEnded = false;
        let chunkGenTimeMs = 0;
        for (let step = 0; step < MAX_FRAMES; step++) {
            if (!isGenerating) break;

            // Yield every 4 steps to allow message processing (e.g., set_lsd)
            if (step > 0 && step % 4 === 0) {
                await new Promise(r => setTimeout(r, 0));
            }

            const arInputs = {
                sequence: currentLatent,
                text_embeddings: emptyTextEmb,
                ...flowLmState
            };

            const stepStart = performance.now();
            const arResult = await flowLmMainSession.run(arInputs);
            const stepElapsed = performance.now() - stepStart;
            chunkGenTimeMs += stepElapsed;

            const conditioning = arResult['conditioning'];
            const eosLogit = arResult['eos_logit'].data[0];
            const isEos = eosLogit > -4.0;

            // Track when EOS is first detected
            if (isEos && eosStep === null) {
                eosStep = step;
            }

            // Only stop after FRAMES_AFTER_EOS additional frames
            const shouldStop = eosStep !== null && step >= eosStep + FRAMES_AFTER_EOS;

            // Flow matching (LSD loop) - uses currentLSD which can be adjusted dynamically
            const TEMP = 0.7;
            const STD = Math.sqrt(TEMP);
            let xData = new Float32Array(32);
            for (let i = 0; i < 32; i++) {
                let u = 0, v = 0;
                while (u === 0) u = Math.random();
                while (v === 0) v = Math.random();
                xData[i] = Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v) * STD;
            }

            const lsdSteps = currentLSD;
            const dt = 1.0 / lsdSteps;

            for (let j = 0; j < lsdSteps; j++) {
                const flowInputs = {
                    c: conditioning,
                    s: stTensors[lsdSteps][j].s,
                    t: stTensors[lsdSteps][j].t,
                    x: new ort.Tensor('float32', xData, [1, 32])
                };

                const flowResult = await flowLmFlowSession.run(flowInputs);
                const v = flowResult['flow_dir'].data;

                for (let k = 0; k < 32; k++) {
                    xData[k] += v[k] * dt;
                }
            }

            totalFlowLmTime += stepElapsed;

            const latentData = xData;
            chunkLatents.push(new Float32Array(latentData));
            allGeneratedLatents.push(new Float32Array(latentData));

            // Update state
            currentLatent = new ort.Tensor('float32', latentData, [1, 1, 32]);
            for (let i = 2; i < flowLmMainSession.outputNames.length; i++) {
                const outputName = flowLmMainSession.outputNames[i];
                if (outputName.startsWith('out_state_')) {
                    const stateIdx = parseInt(outputName.replace('out_state_', ''));
                    flowLmState[`state_${stateIdx}`] = arResult[outputName];
                }
            }

            // Decode audio chunks
            const pending = chunkLatents.length - chunkDecodedFrames;
            let decodeSize = 0;

            if (shouldStop) {
                decodeSize = pending;
            } else if (isFirstAudioChunk && pending >= FIRST_CHUNK_FRAMES) {
                decodeSize = FIRST_CHUNK_FRAMES;
            } else if (pending >= NORMAL_CHUNK_FRAMES) {
                decodeSize = NORMAL_CHUNK_FRAMES;
            }

            if (decodeSize > 0) {
                const decodeLatents = new Float32Array(decodeSize * 32);
                for (let i = 0; i < decodeSize; i++) {
                    decodeLatents.set(chunkLatents[chunkDecodedFrames + i], i * 32);
                }

                const latentTensor = new ort.Tensor('float32', decodeLatents, [1, decodeSize, 32]);
                const decodeInputs = { latent: latentTensor, ...mimiState };

                const decStart = performance.now();
                const decodeResult = await mimiDecoderSession.run(decodeInputs);
                const decElapsed = performance.now() - decStart;
                totalDecodeTime += decElapsed;
                chunkGenTimeMs += decElapsed;
                const audioChunk = decodeResult[mimiDecoderSession.outputNames[0]].data;

                // Update MIMI state
                for (let i = 1; i < mimiDecoderSession.outputNames.length; i++) {
                    const outputName = mimiDecoderSession.outputNames[i];
                    const stateIdx = i - 1;
                    mimiState[`state_${stateIdx}`] = decodeResult[outputName];
                }

                chunkDecodedFrames += decodeSize;
                totalDecodedFrames += decodeSize;

                const audioFloat32 = new Float32Array(audioChunk);
                const isLastChunk = shouldStop && chunkIdx === chunks.length - 1;
                postMessage({
                    type: 'audio_chunk',
                    data: audioFloat32,
                    metrics: {
                        bbTime: 0,
                        decTime: 0,
                        chunkDuration: audioFloat32.length / SAMPLE_RATE,
                        genTimeSec: chunkGenTimeMs / 1000,
                        isFirst: isFirstAudioChunk,
                        isLast: isLastChunk,
                        chunkStart: isFirstAudioChunkOfTextChunk
                    }
                }, [audioFloat32.buffer]);

                isFirstAudioChunk = false;
                isFirstAudioChunkOfTextChunk = false;
                chunkGenTimeMs = 0;
            }

            if (shouldStop) {
                console.log(`Chunk ${chunkIdx + 1} EOS at step ${eosStep}, stopped at step ${step}, ${chunkLatents.length} frames`);
                chunkEnded = true;
                break;
            }
        }

        if (chunkEnded && isGenerating && chunkIdx < chunks.length - 1) {
            const gapSamples = Math.max(1, Math.floor(CHUNK_GAP_SEC * SAMPLE_RATE));
            const silence = new Float32Array(gapSamples);
            postMessage({
                type: 'audio_chunk',
                data: silence,
                metrics: {
                    bbTime: 0,
                    decTime: 0,
                    chunkDuration: gapSamples / SAMPLE_RATE,
                    isFirst: false,
                    isLast: false,
                    isSilence: true
                }
            }, [silence.buffer]);
        }
    }

    const totalTime = (performance.now() - arStartTime) / 1000;
    const audioSeconds = allGeneratedLatents.length * SAMPLES_PER_FRAME / SAMPLE_RATE;

    // RTFx based on actual generation time (flow LM + decoder), not including conditioning
    const genTime = (totalFlowLmTime + totalDecodeTime) / 1000;
    const rtfx = audioSeconds / genTime;

    console.log(`Generation complete: ${allGeneratedLatents.length} frames (${audioSeconds.toFixed(2)}s audio)`);
    console.log(`  Total time: ${totalTime.toFixed(2)}s`);
    console.log(`  Gen time: ${genTime.toFixed(2)}s, RTFx: ${rtfx.toFixed(2)}x`);
    console.log(`  Flow LM: ${(totalFlowLmTime / 1000).toFixed(2)}s (${(totalFlowLmTime / allGeneratedLatents.length).toFixed(1)}ms/step)`);
    console.log(`  Decoder: ${(totalDecodeTime / 1000).toFixed(2)}s`);

    postMessage({
        type: 'status',
        status: `Finished (RTFx: ${rtfx.toFixed(2)}x)`,
        state: 'idle',
        metrics: { rtfx, genTime, totalTime, audioDuration: audioSeconds }
    });
}

// Pre-allocated buffers for step counter updates (avoid GC pressure in hot loop)
const stepBuffers = {};

function updateStateSteps(state, increment) {
    // Update step counters in state dict - reuse buffers to avoid allocation
    const incBigInt = BigInt(increment);
    for (const key in state) {
        if (key.includes('step') && state[key]) {
            const tensor = state[key];
            if (tensor.data instanceof BigInt64Array) {
                // Reuse buffer if same size, otherwise create new one
                if (!stepBuffers[key] || stepBuffers[key].length !== tensor.data.length) {
                    stepBuffers[key] = new BigInt64Array(tensor.data.length);
                }
                const buf = stepBuffers[key];
                for (let i = 0; i < tensor.data.length; i++) {
                    buf[i] = tensor.data[i] + incBigInt;
                }
                state[key] = new ort.Tensor('int64', buf, tensor.dims);
            }
        }
    }
}