// Source: bol-tts-marathi / src/tokenize.ts — commit 2bb8806 ("release: webgpu demo + source")
import type { KokoroVocab } from "./model";
/**
 * Output of converting text into model input. Besides the flat token
 * sequence it records, per word, which slice of that sequence the word
 * produced, so per-phoneme `pred_dur` values can be mapped back onto words
 * when computing timestamps.
 */
export interface TokenizeResult {
  /** Token IDs for KModel. */
  inputIds: number[];

  /** Flat phoneme sequence, parallel to `inputIds` (the character behind each ID). */
  phonemes: string[];

  /** One entry per input word, in input order. */
  words: {
    /** Devanagari source word. */
    word: string;
    /** Raw espeak-ng output for this word. */
    ipa: string;
    /** Index into `inputIds` of the word's first kept phoneme. */
    tokenStart: number;
    /** Index into `inputIds` one past the word's last kept phoneme (exclusive). */
    tokenEnd: number;
  }[];

  /** Characters that were not in the vocab, with occurrence counts, for diagnostics. */
  droppedChars: Record<string, number>;
}
/**
 * Convert per-word IPA (from phonemize.ts) into model-ready input IDs, keeping
 * track of which IDs came from which word for downstream timestamping.
 *
 * Each word's IPA string is walked one Unicode code point at a time. Code
 * points that are not keys of `vocab` are not emitted; they are only counted
 * in `droppedChars`. This mirrors KModel.forward's own tokenization
 * (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
 *
 * Words are separated by a single `" "` token, provided `vocab` contains one.
 * The separator is emitted lazily: only once a word actually contributes its
 * first kept phoneme, and only if at least one token has already been emitted.
 * Consequently a word that yields no tokens (empty IPA, or every code point
 * dropped) never introduces a leading, trailing or doubled space.
 *
 * @param wordIpas Words in utterance order, each paired with its IPA string.
 * @param vocab Mapping from a single phoneme character to its token ID.
 * @returns The flat `inputIds` / `phonemes` sequences, one `words` entry per
 *   input word (`tokenStart === tokenEnd` when the word kept no phonemes), and
 *   the per-character drop counts.
 */
export function tokenizeByWord(
  wordIpas: Array<{ word: string; ipa: string }>,
  vocab: KokoroVocab,
): TokenizeResult {
  const inputIds: number[] = [];
  const phonemes: string[] = [];
  const words: TokenizeResult["words"] = [];
  const droppedChars: Record<string, number> = {};
  const spaceId = vocab[" "];

  for (const { word, ipa } of wordIpas) {
    // -1 means "this word has not emitted a token yet".
    let tokenStart = -1;

    for (const c of ipa) {
      const id = vocab[c];
      if (id === undefined) {
        droppedChars[c] = (droppedChars[c] ?? 0) + 1;
        continue;
      }

      if (tokenStart < 0) {
        // Word boundary goes BEFORE the word, never after it: a trailing space
        // tends to get stripped by the predictor, which throws off the
        // pred_dur ↔ input_ids alignment and drops the last word from the
        // timestamps. Deferring it to the first kept phoneme guarantees a
        // token-less word cannot leave such a dangling separator behind.
        if (spaceId !== undefined && inputIds.length > 0) {
          inputIds.push(spaceId);
          phonemes.push(" ");
        }
        tokenStart = inputIds.length;
      }

      inputIds.push(id);
      phonemes.push(c);
    }

    const tokenEnd = inputIds.length;
    words.push({
      word,
      ipa,
      // A word with no kept phonemes gets an empty range at the current end.
      tokenStart: tokenStart < 0 ? tokenEnd : tokenStart,
      tokenEnd,
    });
  }

  return { inputIds, phonemes, words, droppedChars };
}