import type { KokoroVocab } from "./model";

/**
 * Result of turning text → model input, with word-level bookkeeping so we can
 * align per-phoneme `pred_dur` back to words for timestamps.
 */
export interface TokenizeResult {
  /** Token IDs for KModel, in order (including inserted word-boundary spaces). */
  inputIds: number[];
  /** Flat phoneme sequence, parallel to `inputIds` (the character behind each ID). */
  phonemes: string[];
  /** One entry per input word, in input order. */
  words: Array<{
    /** Devanagari source word. */
    word: string;
    /** Raw espeak-ng output for this word. */
    ipa: string;
    /** Index into `inputIds` of the first kept phoneme of this word. */
    tokenStart: number;
    /** Exclusive end index into `inputIds`; equals `tokenStart` if nothing was kept. */
    tokenEnd: number;
  }>;
  /** Characters not found in the vocab, mapped to how often each was dropped (diagnostics). */
  droppedChars: Record<string, number>;
}

/**
 * Convert per-word IPA (from phonemize.ts) into model-ready input_ids, keeping
 * track of which input_ids came from which word for downstream timestamping.
 *
 * Characters not in `vocab` are silently dropped — this mirrors KModel.forward's
 * own tokenization
 * (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
 * Every dropped character is counted in the returned `droppedChars`.
 *
 * @param wordIpas - Words paired with their IPA transcription, in reading order.
 * @param vocab - Lookup from a single phoneme character to its token ID.
 * @returns The flat token/phoneme sequences plus, for each word, the half-open
 *   range `[tokenStart, tokenEnd)` of `inputIds` it produced. The separator
 *   space inserted before a word is NOT part of that word's range.
 */
export function tokenizeByWord(
  wordIpas: Array<{ word: string; ipa: string }>,
  vocab: KokoroVocab,
): TokenizeResult {
  const inputIds: number[] = [];
  const phonemes: string[] = [];
  const words: TokenizeResult["words"] = [];
  const droppedChars: Record<string, number> = {};
  // Hoisted: the separator ID is the same for every word. If the vocab has no
  // space entry, no separators are emitted at all.
  const spaceId = vocab[" "];

  for (const [wi, { word, ipa }] of wordIpas.entries()) {
    // Insert word-boundary space BEFORE every word except the first.
    // (Trailing-space-after-last-word tends to get stripped by the predictor,
    // which throws off pred_dur ↔ input_ids alignment and drops the last
    // word from our timestamps. Leading-space avoids the issue.)
    if (wi > 0 && spaceId !== undefined) {
      inputIds.push(spaceId);
      phonemes.push(" ");
    }

    // Recorded after the separator so the word's range covers only its own phonemes.
    const tokenStart = inputIds.length;
    // for...of walks the string by Unicode code point, so each vocab lookup
    // sees one whole character rather than half of a surrogate pair.
    for (const c of ipa) {
      const id = vocab[c];
      if (id === undefined) {
        droppedChars[c] = (droppedChars[c] ?? 0) + 1;
      } else {
        inputIds.push(id);
        phonemes.push(c);
      }
    }
    const tokenEnd = inputIds.length;

    words.push({ word, ipa, tokenStart, tokenEnd });
  }

  return { inputIds, phonemes, words, droppedChars };
}