Supertonic-TTS-WebGPU / src /text-preprocessor.ts
miraclemind's picture
Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation. Based on helpers in https://github.com/supertone-inc/supertonic
f76dade
raw
history blame
2.44 kB
/**
 * Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation.
 * Based on: https://github.com/supertone-inc/supertonic/blob/main/csharp/Helper.cs
 *
 * Steps, in order: NFKD normalization, emoji removal, literal character
 * substitutions, removal of combining diacritics and a few stray symbols,
 * expansion of a few known expressions, punctuation spacing fixes, quote
 * de-duplication, whitespace collapsing, and finally appending a period when
 * the text does not already end in punctuation, a quote, or a closing bracket.
 *
 * Note that empty (or whitespace-only) input yields ".", because the
 * terminal-punctuation check also applies to the empty string.
 *
 * @param text The input text to preprocess.
 * @returns The cleaned and preprocessed text.
 */
export function preprocessText(text: string): string {
  // Normalize to NFKD form to separate characters from their diacritics.
  text = text.normalize("NFKD");

  // Remove emojis (misc symbols/dingbats, private use area, and the
  // surrogate-pair ranges covered by the pattern below).
  text = text.replace(
    /([\u2600-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|\uD83E[\uDD10-\uDDFF])/g,
    "",
  );

  // Define character replacements. Keys are matched literally.
  // NOTE(review): NFKD above presumably already rewrites U+2011 (non-breaking
  // hyphen) to U+2010, so that entry may never match -- confirm against upstream.
  const replacements: Record<string, string> = {
    "—": "-", // em dash
    "–": "-", // en dash
    "‑": "-", // non-breaking hyphen
    "¯": " ", // macron
    "_": " ", // underscore
    "\u201C": '"', // left double quote
    "\u201D": '"', // right double quote
    "\u2018": "'", // left single quote
    "\u2019": "'", // right single quote
    "´": "'", // acute accent
    "`": "'", // grave accent
    "[": " ",
    "]": " ",
    "|": " ",
    "/": " ",
    "#": " ",
    "→": " ",
    "←": " ",
  };

  // Apply character replacements. split/join replaces every literal
  // occurrence without needing any regex escaping.
  for (const [key, value] of Object.entries(replacements)) {
    text = text.split(key).join(value);
  }

  // Remove combining diacritical marks (left behind by the NFKD step).
  text = text.replace(/[\u0300-\u036f]/g, "");

  // Remove special symbols that are not handled by emoji removal.
  text = text.replace(/[♥☆♡©\\]/g, "");

  // Replace known expressions.
  const exprReplacements: Record<string, string> = {
    "@": " at ",
    " e.g.": " for example, ",
    " i.e.": " that is, ",
  };
  // These keys must be matched literally: compiled as regular expressions,
  // the "." in " e.g." / " i.e." would match any character and corrupt
  // ordinary words such as " eagle", " idea" or " item".
  for (const [key, value] of Object.entries(exprReplacements)) {
    text = text.split(key).join(value);
  }

  // Fix spacing around punctuation (drop a single space before the mark).
  text = text
    .replace(/ ,/g, ",")
    .replace(/ \./g, ".")
    .replace(/ !/g, "!")
    .replace(/ \?/g, "?")
    .replace(/ ;/g, ";")
    .replace(/ :/g, ":")
    .replace(/ '/g, "'");

  // Remove duplicate quotes.
  text = text.replace(/""+/g, '"');
  text = text.replace(/''+/g, "'");
  text = text.replace(/``+/g, "`");

  // Remove extra spaces.
  text = text.replace(/\s+/g, " ").trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
  if (!/[.!?;:,'")\]}…。」』】〉》›»]$/.test(text)) {
    text += ".";
  }
  return text;
}