/**
 * Wolof STT Normalizer Utility v2.0
 * Includes change tracking and WhatsApp message shortening.
 */
/**
 * Spelling corrections applied to Wolof speech-to-text output.
 *
 * Each key is the lowercased, punctuation-stripped form of a transcribed
 * word; the value is the spelling emitted in its place. Entries that map a
 * word to itself (sabu, omo, quartier) are never reported as changes.
 */
const NORMALIZATION_RULES: Record<string, string> = {
  damae: "damay",
  dama: "damay",
  dma: "damay",
  jai: "jaay",
  jaai: "jaay",
  jaye: "jaay",
  jendi: "jënd",
  fei: "fey",
  fay: "fey",
  yere: "yére",
  yare: "yére",
  sandwiche: "sandwich",
  pan: "mburu",
  cafe: "café",
  sabu: "sabu",
  omo: "omo",
  patat: "patas",
  ognon: "sooble",
  riz: "ceeb",
  // Place names: the value carries the capitalization to emit.
  yof: "Yoff",
  dakar: "Dakar",
  pikine: "Pikine",
  guediawaye: "Guédiawaye",
  keur: "kër",
  ker: "kër",
  // Fused transcriptions that expand to the two-word phrase "ci kër".
  sikarche: "ci kër",
  sikarshe: "ci kër",
  sikarce: "ci kër",
  sikaarché: "ci kër",
  quartier: "quartier",
  banlieu: "banlieue",
  si: "ci",
  fane: "fan",
  fana: "fan",
  lana: "lan",
  lanna: "lan",
  nakka: "naka",
  nakha: "naka",
  niak: "ñàkk",
  niakk: "ñàkk",
  dencal: "denc", // requested dencal -> denc
  limal: "lim",
  ganee: "gañ",
  gane: "gañ",
  borom: "boroom",
  xaalisou: "xaalis",
  xaliss: "xaalis",
};
/**
 * Place names in the capitalization they should be emitted with.
 * A word matches an entry when its lowercased, punctuation-stripped form
 * equals the entry's lowercase form.
 */
const CAPITALIZED_PLACES = [
  "Yoff",
  "Dakar",
  "Pikine",
  "Guédiawaye",
];
/** Outcome of a normalization pass over one piece of transcribed text. */
export interface NormalizationResult {
  /** The text after all replacements have been applied. */
  normalizedText: string;
  /**
   * Human-readable record of the replacements that were made.
   * Format: ["damae -> damay", ...]
   */
  changes: string[];
}
/**
 * Normalizes raw Wolof speech-to-text output and reports which spellings changed.
 *
 * The input is trimmed and split on runs of whitespace. Each word is looked up
 * (lowercased, with punctuation removed) first in NORMALIZATION_RULES and then
 * in CAPITALIZED_PLACES. A match replaces the word, while punctuation attached
 * to the start or end of the word is kept. Words are re-joined with single
 * spaces and the first character of the result is upper-cased.
 *
 * @param rawText - Transcribed text; empty input yields an empty result.
 * @returns The normalized text and a de-duplicated list of "from -> to" entries.
 */
export function normalizeWolof(rawText: string): NormalizationResult {
  if (!rawText) return { normalizedText: "", changes: [] };

  // Characters ignored when matching a word against the lookup tables.
  const punctuation = /[.,/#!$%^&*;:{}=\-_`~()]/g;
  // Splits a word into (leading punctuation)(core)(trailing punctuation).
  const edges = /^([.,/#!$%^&*;:{}=\-_`~()]*)(.*?)([.,/#!$%^&*;:{}=\-_`~()]*)$/;
  const hasOwn = Object.prototype.hasOwnProperty;

  const changes: string[] = [];

  // Split on any whitespace run so single tabs/newlines separate words too.
  const processedWords = rawText
    .trim()
    .split(/\s+/)
    .map((word) => {
      const parts = edges.exec(word);
      if (!parts) return word;
      const leading = parts[1] || "";
      const core = parts[2] || "";
      const trailing = parts[3] || "";
      const lookupKey = word.toLowerCase().replace(punctuation, "");

      // Own-property check: a plain object also answers for inherited keys
      // such as "constructor", which are not normalization rules.
      const replacement = hasOwn.call(NORMALIZATION_RULES, lookupKey)
        ? NORMALIZATION_RULES[lookupKey]
        : undefined;
      if (replacement) {
        // Rules that map a word to itself are applied but not reported.
        if (lookupKey !== replacement.toLowerCase()) {
          changes.push(`${lookupKey} -> ${replacement}`);
        }
        return leading + replacement + trailing;
      }

      const matchingPlace = CAPITALIZED_PLACES.find(
        (place) => place.toLowerCase() === lookupKey,
      );
      if (matchingPlace) {
        // Compare against the core so attached punctuation alone is not
        // reported as a spelling change.
        if (matchingPlace !== core) {
          changes.push(`${core} -> ${matchingPlace}`);
        }
        return leading + matchingPlace + trailing;
      }

      return word;
    });

  let normalizedText = processedWords.join(" ");
  if (normalizedText.length > 0) {
    normalizedText = normalizedText.charAt(0).toUpperCase() + normalizedText.slice(1);
  }

  // The same replacement can occur many times; report each one once.
  return { normalizedText, changes: Array.from(new Set(changes)) };
}
|