Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation. Based on helpers in https://github.com/supertone-inc/supertonic
f76dade
/**
 * Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation.
 * Based on: https://github.com/supertone-inc/supertonic/blob/main/csharp/Helper.cs
 *
 * Processing order:
 *  1. Map the spacing acute accent to an apostrophe (must precede NFKD, see below).
 *  2. NFKD-normalize, strip emojis, apply single-character replacements.
 *  3. Strip combining diacritical marks and a few leftover symbols.
 *  4. Expand "@", " e.g." and " i.e." (matched literally, not as regex patterns).
 *  5. Tidy spacing before punctuation, collapse repeated quotes and whitespace.
 *  6. Append a period when the text does not already end with punctuation,
 *     a quote, or a closing bracket.
 *
 * @param text The input text to preprocess.
 * @returns The cleaned and preprocessed text. Because of step 6 the result is
 *   never empty: an empty (or fully stripped) input yields ".".
 */
export function preprocessText(text: string): string {
  // Literal (non-regex) global replacement. Avoids both the need to escape
  // regex metacharacters in the keys and a dependency on String.replaceAll.
  const replaceLiteral = (input: string, search: string, replacement: string): string =>
    input.split(search).join(replacement);

  // NFKD decomposes U+00B4 (acute accent) into a space plus a combining mark,
  // so it has to be mapped to an apostrophe before normalization; afterwards
  // it would only ever surface as a stray space.
  let result = replaceLiteral(text, "\u00B4", "'");

  // Normalize to NFKD form to separate characters from their diacritics.
  result = result.normalize("NFKD");

  // Remove emojis.
  result = result.replace(
    /([\u2600-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|\uD83E[\uDD10-\uDDFF])/g,
    "",
  );

  // Single-character replacements.
  const charReplacements: Record<string, string> = {
    "\u2014": "-", // em dash
    "\u2013": "-", // en dash
    "\u2011": "-", // non-breaking hyphen
    "\u2010": "-", // hyphen (NFKD turns U+2011 into this code point)
    "\u00AF": " ", // macron
    "_": " ", // underscore
    "\u201C": '"', // left double quote
    "\u201D": '"', // right double quote
    "\u2018": "'", // left single quote
    "\u2019": "'", // right single quote
    "\u00B4": "'", // acute accent
    "`": "'", // grave accent
    "[": " ",
    "]": " ",
    "|": " ",
    "/": " ",
    "#": " ",
    "\u2192": " ", // rightwards arrow
    "\u2190": " ", // leftwards arrow
  };
  for (const [char, replacement] of Object.entries(charReplacements)) {
    result = replaceLiteral(result, char, replacement);
  }

  // Remove combining diacritical marks left behind by NFKD.
  result = result.replace(/[\u0300-\u036f]/g, "");

  // Remove special symbols that are not handled by emoji removal.
  result = result.replace(/[♥☆♡©\\]/g, "");

  // Replace known expressions. These are matched literally: treating the dots
  // as regex wildcards would also rewrite ordinary words such as " eggs" or
  // " item".
  const exprReplacements: Record<string, string> = {
    "@": " at ",
    " e.g.": " for example, ",
    " i.e.": " that is, ",
  };
  for (const [expr, replacement] of Object.entries(exprReplacements)) {
    result = replaceLiteral(result, expr, replacement);
  }

  // Fix spacing around punctuation.
  result = result
    .replace(/ ,/g, ",")
    .replace(/ \./g, ".")
    .replace(/ !/g, "!")
    .replace(/ \?/g, "?")
    .replace(/ ;/g, ";")
    .replace(/ :/g, ":")
    .replace(/ '/g, "'");

  // Remove duplicate quotes. Backticks need no handling here because every
  // backtick was already converted to an apostrophe above.
  result = result.replace(/""+/g, '"');
  result = result.replace(/''+/g, "'");

  // Remove extra spaces.
  result = result.replace(/\s+/g, " ").trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
  if (!/[.!?;:,'")\]}…。」』】〉》›»]$/.test(result)) {
    result += ".";
  }

  return result;
}