/**
 * Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation.
 * Based on: https://github.com/supertone-inc/supertonic/blob/main/csharp/Helper.cs
 *
 * Processing order:
 *  1. Map characters whose NFKD decomposition would hide them from step 4.
 *  2. Normalize to NFKD.
 *  3. Strip emojis / private-use characters.
 *  4. Map typographic characters (dashes, curly quotes, brackets, ...) to plain ones.
 *  5. Strip combining diacritical marks and a few leftover symbols.
 *  6. Expand "@", " e.g." and " i.e." (matched literally).
 *  7. Remove the space before punctuation, collapse repeated quotes and whitespace.
 *  8. Append a period when the text does not already end with closing punctuation.
 *
 * @param text The input text to preprocess.
 * @returns The cleaned and preprocessed text. Input that is empty after cleaning
 *   yields ".", because the final step always guarantees terminal punctuation.
 */
export function preprocessText(text: string): string {
  // Literal (non-regex) global replacement. Keys such as " e.g." or "[" contain
  // regex metacharacters, so they must never be compiled into a pattern as-is.
  const replaceLiteral = (input: string, search: string, replacement: string): string =>
    input.split(search).join(replacement);

  // These characters have compatibility decompositions, so they must be mapped
  // BEFORE normalization: NFKD turns U+00B4 / U+00AF into "space + combining mark"
  // and U+2011 into U+2010, after which a lookup on the original character
  // can no longer match.
  const preNormalizationReplacements: ReadonlyArray<readonly [string, string]> = [
    ["\u00B4", "'"], // acute accent
    ["\u00AF", " "], // macron
    ["\u2011", "-"], // non-breaking hyphen
  ];

  // Applied after normalization so that compatibility variants (e.g. full-width
  // brackets) have already been folded to the characters listed here.
  const characterReplacements: ReadonlyArray<readonly [string, string]> = [
    ["\u2014", "-"], // em dash
    ["\u2013", "-"], // en dash
    ["\u2010", "-"], // hyphen
    ["_", " "], // underscore
    ["\u201C", '"'], // left double quote
    ["\u201D", '"'], // right double quote
    ["\u2018", "'"], // left single quote
    ["\u2019", "'"], // right single quote
    ["`", "'"], // grave accent
    ["[", " "],
    ["]", " "],
    ["|", " "],
    ["/", " "],
    ["#", " "],
    ["\u2192", " "], // rightwards arrow
    ["\u2190", " "], // leftwards arrow
  ];

  // Known expressions that read better when spelled out.
  const expressionReplacements: ReadonlyArray<readonly [string, string]> = [
    ["@", " at "],
    [" e.g.", " for example, "],
    [" i.e.", " that is, "],
  ];

  let result = text;

  for (const [search, replacement] of preNormalizationReplacements) {
    result = replaceLiteral(result, search, replacement);
  }

  // Normalize to NFKD form to separate characters from their diacritics.
  result = result.normalize("NFKD");

  // Remove emojis.
  result = result.replace(
    /([\u2600-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|\uD83E[\uDD10-\uDDFF])/g,
    "",
  );

  for (const [search, replacement] of characterReplacements) {
    result = replaceLiteral(result, search, replacement);
  }

  // Remove combining diacritical marks left behind by the NFKD decomposition.
  result = result.replace(/[\u0300-\u036f]/g, "");

  // Remove special symbols that are not handled by emoji removal (including backslashes).
  result = result.replace(/[♥☆♡©\\]/g, "");

  for (const [search, replacement] of expressionReplacements) {
    result = replaceLiteral(result, search, replacement);
  }

  // Fix spacing around punctuation: drop a single space directly before the mark.
  result = result
    .replace(/ ,/g, ",")
    .replace(/ \./g, ".")
    .replace(/ !/g, "!")
    .replace(/ \?/g, "?")
    .replace(/ ;/g, ";")
    .replace(/ :/g, ":")
    .replace(/ '/g, "'");

  // Collapse runs of identical quotes. Backticks need no handling here: every
  // backtick was already mapped to "'" by the character replacements above.
  result = result.replace(/""+/g, '"');
  result = result.replace(/''+/g, "'");

  // Remove extra spaces.
  result = result.replace(/\s+/g, " ").trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
  if (!/[.!?;:,'")\]}…。」』】〉》›»]$/.test(result)) {
    result += ".";
  }

  return result;
}