Spaces:
Running
Running
| import type { KokoroVocab } from "./model"; | |
/**
 * Output of the text → model-input conversion.
 *
 * Besides the flat token stream, it records which slice of that stream each
 * source word produced, so per-phoneme `pred_dur` values can later be mapped
 * back onto words to derive timestamps.
 */
export interface TokenizeResult {
  /** Token IDs for KModel. */
  inputIds: number[];
  /** Flat phoneme sequence: the character behind each entry of `inputIds`. */
  phonemes: string[];
  /** One entry per input word, in input order. */
  words: {
    /** Devanagari source word. */
    word: string;
    /** Raw espeak-ng output for this word. */
    ipa: string;
    /** Index into `inputIds` of the word's first kept phoneme. */
    tokenStart: number;
    /** Index into `inputIds` one past the word's last kept phoneme (exclusive). */
    tokenEnd: number;
  }[];
  /** Characters that were not in the vocab, with occurrence counts (diagnostics). */
  droppedChars: Record<string, number>;
}
/**
 * Convert per-word IPA (from phonemize.ts) into model-ready input_ids, keeping
 * track of which input_ids came from which word for downstream timestamping.
 *
 * Characters not in `vocab` are dropped and counted in `droppedChars` — this
 * mirrors KModel.forward's own tokenization
 * (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
 *
 * A single word-boundary space is emitted between consecutive words that each
 * contribute at least one token. A word that contributes nothing (empty IPA,
 * or every character missing from `vocab`) gets an empty span
 * (`tokenStart === tokenEnd`) and no separator, so the sequence never picks up
 * a doubled or trailing space because of it.
 *
 * @param wordIpas Words in reading order, each with its IPA string.
 * @param vocab Maps a single phoneme character to its token ID. If it has no
 *   entry for `" "`, no word-boundary tokens are emitted at all.
 * @returns The flat token/phoneme sequences, per-word token spans, and the
 *   counts of dropped characters.
 */
export function tokenizeByWord(
  wordIpas: Array<{ word: string; ipa: string }>,
  vocab: KokoroVocab,
): TokenizeResult {
  const inputIds: number[] = [];
  const phonemes: string[] = [];
  const words: TokenizeResult["words"] = [];
  const droppedChars: Record<string, number> = {};
  const spaceId = vocab[" "];

  for (const { word, ipa } of wordIpas) {
    // Collect this word's kept tokens first: whether a separator is needed
    // depends on the word actually contributing something.
    const keptIds: number[] = [];
    const keptChars: string[] = [];
    for (const c of ipa) {
      const id = vocab[c];
      if (id === undefined) {
        droppedChars[c] = (droppedChars[c] ?? 0) + 1;
      } else {
        keptIds.push(id);
        keptChars.push(c);
      }
    }

    // The separator goes BEFORE a word rather than after it. A trailing space
    // after the last word tends to get stripped by the predictor, which throws
    // off pred_dur ↔ input_ids alignment and drops the last word from the
    // timestamps. Emitting it only when this word has tokens and something
    // precedes it also keeps empty words from producing such a trailing (or
    // doubled) space.
    if (keptIds.length > 0 && inputIds.length > 0 && spaceId !== undefined) {
      inputIds.push(spaceId);
      phonemes.push(" ");
    }

    const tokenStart = inputIds.length;
    inputIds.push(...keptIds);
    phonemes.push(...keptChars);
    words.push({ word, ipa, tokenStart, tokenEnd: inputIds.length });
  }

  return { inputIds, phonemes, words, droppedChars };
}