/**
* TextCleaner — maps IPA phoneme characters to integer token IDs.
* Direct port of KittenTTS Python TextCleaner class.
* https://github.com/KittenML/KittenTTS
*/
// Padding symbol. It is the first entry of the table, so it gets token ID 0.
const _pad = "$";
// Punctuation marks, ending with the space character.
const _punctuation = ';:,.!?¡¿—…"«»"" ';
// ASCII letters: upper case first, then lower case.
const _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
// IPA symbols; each Unicode code point becomes its own table entry.
const _letters_ipa =
  "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";

// Flat symbol table. Spreading a string yields one element per code point,
// so the order here fixes every token ID: pad, punctuation, letters, IPA.
const symbols = [_pad, ..._punctuation, ..._letters, ..._letters_ipa];

// Reverse lookup from a symbol to its position in `symbols`.
// A symbol that occurs more than once in the table keeps the index of its
// LAST occurrence, because later assignments overwrite earlier ones.
const charToIndex: Record<string, number> = {};
symbols.forEach((symbol, position) => {
  charToIndex[symbol] = position;
});
/**
 * Translates a string into token IDs using the `charToIndex` lookup table.
 *
 * The input is walked one Unicode code point at a time. Code points that have
 * no entry in the table are skipped, so the result may be shorter than the
 * input.
 *
 * @param text - The string to convert.
 * @returns The IDs of the recognised characters, in input order.
 */
export function cleanText(text: string): number[] {
  const ids: number[] = [];
  Array.from(text).forEach((symbol) => {
    const id = charToIndex[symbol];
    if (id === undefined) {
      // Unknown character: drop it silently.
      return;
    }
    ids.push(id);
  });
  return ids;
}
/**
 * Converts a phoneme string into the framed token sequence.
 *
 * The IDs produced by `cleanText` are wrapped with fixed marker tokens:
 * a leading 0, then a trailing 10 followed by a trailing 0. The original
 * comment states that this framing matches the Python implementation.
 *
 * @param phonemes - Phoneme string to tokenize.
 * @returns `[0, ...ids, 10, 0]`, where `ids` is `cleanText(phonemes)`.
 */
export function tokenize(phonemes: string): number[] {
  const START_TOKEN = 0;
  const END_TOKENS = [10, 0];
  return [START_TOKEN, ...cleanText(phonemes), ...END_TOKENS];
}
|