// Page header (extraction residue): Spaces: Running · File size: 2,379 Bytes · 2bb8806
import type { KokoroVocab } from "./model";
/**
 * Outcome of converting text into model input. Carries word-level bookkeeping
 * so that per-phoneme `pred_dur` values can be aligned back to words when
 * producing timestamps.
 */
export interface TokenizeResult {
  /** Token IDs for KModel. */
  inputIds: number[];

  /** Flat phoneme sequence: the character behind each entry of `inputIds`. */
  phonemes: string[];

  /** One entry per input word, in input order. */
  words: {
    /** Devanagari source text of the word. */
    word: string;
    /** Raw espeak-ng output for this word. */
    ipa: string;
    /** Index into `inputIds` of the word's first kept phoneme. */
    tokenStart: number;
    /** End index into `inputIds` (exclusive). */
    tokenEnd: number;
  }[];

  /** Characters not found in the vocab, mapped to how often each occurred (diagnostics). */
  droppedChars: Record<string, number>;
}
/**
 * Convert per-word IPA (from phonemize.ts) into model-ready input IDs, keeping
 * track of which slice of `inputIds` came from which word for downstream
 * timestamping.
 *
 * Characters not in `vocab` are dropped from the token stream and counted in
 * `droppedChars` — this mirrors KModel.forward's own tokenization
 * (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
 *
 * Words are separated by the vocab's `" "` token (when the vocab has one). A
 * separator is emitted only between two words that each contribute at least
 * one token, so the sequence never starts or ends with a separator and never
 * contains a doubled one just because a word was empty or entirely
 * out-of-vocab.
 *
 * @param wordIpas Words paired with their IPA transcription, in spoken order.
 * @param vocab Mapping from a single phoneme character to its token ID.
 * @returns The flat token/phoneme sequences, one span per input word (a word
 *   that kept no characters gets an empty span with `tokenStart === tokenEnd`),
 *   and the per-character drop counts.
 */
export function tokenizeByWord(
  wordIpas: Array<{ word: string; ipa: string }>,
  vocab: KokoroVocab,
): TokenizeResult {
  const inputIds: number[] = [];
  const phonemes: string[] = [];
  const words: TokenizeResult["words"] = [];
  const droppedChars: Record<string, number> = {};
  const spaceId = vocab[" "];

  for (const { word, ipa } of wordIpas) {
    // First pass: decide which characters of this word survive the vocab
    // filter. `for…of` on a string walks Unicode code points, not UTF-16 units.
    const keptIds: number[] = [];
    const keptChars: string[] = [];
    for (const ch of ipa) {
      const id = vocab[ch];
      if (id === undefined) {
        droppedChars[ch] = (droppedChars[ch] ?? 0) + 1;
      } else {
        keptIds.push(id);
        keptChars.push(ch);
      }
    }

    // The word boundary goes BEFORE a word rather than after it: per the
    // original note, a trailing space after the last word tends to get
    // stripped by the predictor, which throws off pred_dur ↔ input_ids
    // alignment. For the same reason the separator is skipped when this word
    // contributes nothing (it would otherwise dangle at the end or double up)
    // and when nothing has been emitted yet (it would lead the sequence).
    if (keptIds.length > 0 && inputIds.length > 0 && spaceId !== undefined) {
      inputIds.push(spaceId);
      phonemes.push(" ");
    }

    const tokenStart = inputIds.length;
    inputIds.push(...keptIds);
    phonemes.push(...keptChars);
    words.push({ word, ipa, tokenStart, tokenEnd: inputIds.length });
  }

  return { inputIds, phonemes, words, droppedChars };
}
|