File size: 2,441 Bytes
f76dade |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
/**
 * Characters that must be mapped BEFORE Unicode normalization.
 *
 * NFKD has a compatibility decomposition for each of these, so once the text
 * has been normalized they no longer exist as single code points and a lookup
 * for them can never match.
 */
const PRE_NORMALIZATION_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["\u2011", "-"], // non-breaking hyphen (NFKD would turn it into U+2010)
  ["\u00AF", " "], // macron (NFKD would turn it into space + U+0304)
  ["\u00B4", "'"], // acute accent (NFKD would turn it into space + U+0301)
];

/**
 * Single-character substitutions applied AFTER normalization, so that
 * compatibility variants folded by NFKD (e.g. full-width forms) are covered.
 */
const CHARACTER_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["\u2014", "-"], // em dash
  ["\u2013", "-"], // en dash
  ["_", " "], // underscore
  ["\u201C", '"'], // left double quote
  ["\u201D", '"'], // right double quote
  ["\u2018", "'"], // left single quote
  ["\u2019", "'"], // right single quote
  ["`", "'"], // grave accent
  ["[", " "],
  ["]", " "],
  ["|", " "],
  ["/", " "],
  ["#", " "],
  ["\u2192", " "], // rightwards arrow
  ["\u2190", " "], // leftwards arrow
];

/** Literal expressions that are expanded into their spoken form. */
const EXPRESSION_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ["@", " at "],
  [" e.g.", " for example, "],
  [" i.e.", " that is, "],
];

/**
 * Replaces every occurrence of each literal search string with its replacement.
 * The search strings are never interpreted as regular expressions, so
 * characters such as `.`, `[` or `|` need no escaping.
 */
function applyLiteralReplacements(
  text: string,
  table: ReadonlyArray<readonly [string, string]>,
): string {
  let result = text;
  for (const [search, replacement] of table) {
    result = result.split(search).join(replacement);
  }
  return result;
}

/**
 * Preprocesses text for TTS by normalizing, cleaning, and correcting punctuation.
 * Based on: https://github.com/supertone-inc/supertonic/blob/main/csharp/Helper.cs
 *
 * Steps, in order: map characters that NFKD would decompose, normalize to NFKD,
 * strip emojis, substitute dashes/quotes/brackets, strip combining marks and
 * leftover symbols, expand "@", " e.g." and " i.e.", remove spaces before
 * punctuation, collapse repeated quotes and whitespace, and finally append a
 * period when the text does not already end in punctuation, a quote, or a
 * closing bracket (so an empty input yields ".").
 *
 * @param text The input text to preprocess.
 * @returns The cleaned and preprocessed text.
 */
export function preprocessText(text: string): string {
  // Handle characters that would not survive normalization as single code points.
  let result = applyLiteralReplacements(text, PRE_NORMALIZATION_REPLACEMENTS);

  // Normalize to NFKD form to separate characters from their diacritics.
  result = result.normalize("NFKD");

  // Remove emojis.
  result = result.replace(
    /([\u2600-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|\uD83E[\uDD10-\uDDFF])/g,
    "",
  );

  // Apply character replacements.
  result = applyLiteralReplacements(result, CHARACTER_REPLACEMENTS);

  // Remove combining diacritical marks left behind by the decomposition.
  result = result.replace(/[\u0300-\u036f]/g, "");

  // Remove special symbols that are not handled by emoji removal.
  result = result.replace(/[♥☆♡©\\]/g, "");

  // Replace known expressions. These are matched literally: treating the dots
  // as regex wildcards would rewrite ordinary words such as " item" or " eggs".
  result = applyLiteralReplacements(result, EXPRESSION_REPLACEMENTS);

  // Fix spacing around punctuation: drop a single space directly before it.
  result = result.replace(/ ([,.!?;:'])/g, "$1");

  // Remove duplicate quotes. Backticks were already mapped to apostrophes
  // above, so only double and single quotes can repeat here.
  result = result.replace(/""+/g, '"').replace(/''+/g, "'");

  // Remove extra spaces.
  result = result.replace(/\s+/g, " ").trim();

  // If text doesn't end with punctuation, quotes, or closing brackets, add a period.
  if (!/[.!?;:,'")\]}…。」』】〉》›»]$/.test(result)) {
    result += ".";
  }
  return result;
}