File size: 2,379 Bytes
2bb8806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import type { KokoroVocab } from "./model";

/**
 * Output of converting text into model input. Alongside the flat token
 * stream it records which token range belongs to which word, so that
 * per-phoneme `pred_dur` values can be aligned back to words for timestamps.
 */
export interface TokenizeResult {
  /** Token IDs for KModel. */
  inputIds: number[];

  /** Flat phoneme sequence: the character behind each entry of `inputIds`. */
  phonemes: string[];

  /** Per-word bookkeeping, one entry per input word. */
  words: {
    /** Devanagari source word. */
    word: string;
    /** Raw espeak-ng output for this word. */
    ipa: string;
    /** Index into `inputIds` of the first kept phoneme of this word. */
    tokenStart: number;
    /** End index into `inputIds` for this word (exclusive). */
    tokenEnd: number;
  }[];

  /** Characters that were not in the vocab, with their counts (diagnostics). */
  droppedChars: Record<string, number>;
}

/**
 * Turn per-word IPA (from phonemize.ts) into model-ready input IDs while
 * remembering which slice of the ID stream each word produced, so later
 * stages can derive word-level timestamps.
 *
 * Any character missing from `vocab` is left out of the token stream and
 * tallied in `droppedChars`. This mirrors KModel.forward's own tokenization
 * (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
 *
 * @param wordIpas - Words paired with their IPA strings, in utterance order.
 * @param vocab - Mapping from a single phoneme character to its token ID.
 * @returns The flat ID/phoneme streams, per-word `[tokenStart, tokenEnd)`
 *   ranges into those streams, and counts of characters that were dropped.
 */
export function tokenizeByWord(
  wordIpas: Array<{ word: string; ipa: string }>,
  vocab: KokoroVocab,
): TokenizeResult {
  const ids: number[] = [];
  const symbols: string[] = [];
  const spans: TokenizeResult["words"] = [];
  const unknownCounts: Record<string, number> = {};

  // Looked up once; stays undefined when the vocab has no space token, in
  // which case no word separators are emitted at all.
  const separatorId = vocab[" "];

  for (const [position, { word, ipa }] of wordIpas.entries()) {
    // The separator goes BEFORE each word except the first. A trailing space
    // after the last word tends to get stripped by the predictor, which throws
    // off pred_dur ↔ input_ids alignment and drops the last word from the
    // timestamps; a leading space avoids that.
    const isFirstWord = position === 0;
    if (!isFirstWord && separatorId !== undefined) {
      ids.push(separatorId);
      symbols.push(" ");
    }

    // The word's range starts after its separator, so the space itself is
    // never attributed to any word.
    const tokenStart = ids.length;

    // Iterating a string with for...of walks it by code point.
    for (const symbol of ipa) {
      const symbolId = vocab[symbol];
      if (symbolId !== undefined) {
        ids.push(symbolId);
        symbols.push(symbol);
        continue;
      }
      // Not in the vocab: skip it, but keep a count for diagnostics.
      unknownCounts[symbol] = (unknownCounts[symbol] ?? 0) + 1;
    }

    spans.push({ word, ipa, tokenStart, tokenEnd: ids.length });
  }

  return {
    inputIds: ids,
    phonemes: symbols,
    words: spans,
    droppedChars: unknownCounts,
  };
}