edtech / apps /whatsapp-worker /src /normalizeWolof.ts
CognxSafeTrack
Audit Final: Fix feedback message construction & Global Hardening
42c5945
/**
* Wolof STT Normalizer Utility v2.0
* Includes change tracking and WhatsApp message shortening.
*/
/**
 * Lookup table from a lower-cased, punctuation-stripped STT token to its
 * normalized spelling.
 *
 * The table lives on a prototype-less object so that an index lookup can only
 * ever see the entries listed here. On a plain object literal, a token such as
 * "constructor" resolves to an inherited `Object.prototype` member instead of
 * `undefined`.
 *
 * A few entries map a token to itself ("sabu", "omo", "quartier"); they are
 * kept as-is from the original table.
 */
const NORMALIZATION_RULES: Record<string, string> = Object.assign<
    Record<string, string>,
    Record<string, string>
>(Object.create(null), {
    // A. Frequent verb corrections
    "damae": "damay",
    "dama": "damay",
    "dma": "damay",
    "jai": "jaay",
    "jaai": "jaay",
    "jaye": "jaay",
    "jendi": "jënd",
    "fei": "fey",
    "fay": "fey",
    // B. Products / everyday business
    "yere": "yére",
    "yare": "yére",
    "sandwiche": "sandwich",
    "pan": "mburu",
    "cafe": "café",
    "sabu": "sabu",
    "omo": "omo",
    "patat": "patas",
    "ognon": "sooble",
    "riz": "ceeb",
    // C. Places
    "yof": "Yoff",
    "dakar": "Dakar",
    "pikine": "Pikine",
    "guediawaye": "Guédiawaye",
    "keur": "kër",
    "ker": "kër",
    "sikarche": "ci kër",
    "sikarshe": "ci kër",
    "sikarce": "ci kër",
    "sikaarché": "ci kër",
    "quartier": "quartier",
    "banlieu": "banlieue",
    // D. Function words
    "si": "ci",
    "fane": "fan",
    "fana": "fan",
    "lana": "lan",
    "lanna": "lan",
    "nakka": "naka",
    "nakha": "naka",
    // E. Finance
    "niak": "ñàkk",
    "niakk": "ñàkk",
    "dencal": "denc", // requested dencal -> denc
    "limal": "lim",
    "ganee": "gañ",
    "gane": "gañ",
    "borom": "boroom",
    "xaalisou": "xaalis",
    "xaliss": "xaalis",
});
/**
 * Place names in their canonical, capitalized spelling. A word whose
 * lower-cased form equals the lower-cased form of one of these is rewritten
 * to the spelling listed here.
 */
const CAPITALIZED_PLACES: string[] = [
    "Yoff",
    "Dakar",
    "Pikine",
    "Guédiawaye",
];
/**
 * Outcome of one normalization pass: the rewritten text together with a log
 * of the word-level substitutions that were applied.
 */
export interface NormalizationResult {
/** The text after all substitutions, with its first character upper-cased. */
normalizedText: string;
/** Human-readable substitution entries, each formatted as "before -> after". */
changes: string[]; // Format: ["damae -> damay", ...]
}
/** Punctuation ignored when a word is looked up in the rule table. */
const LOOKUP_PUNCTUATION = /[.,/#!?$%^&*;:{}=\-_`~()]/g;
/** A run of that same punctuation at the very start of a word. */
const LEADING_PUNCTUATION = /^[.,/#!?$%^&*;:{}=\-_`~()]+/;
/** A run of that same punctuation at the very end of a word. */
const TRAILING_PUNCTUATION = /[.,/#!?$%^&*;:{}=\-_`~()]+$/;

/**
 * Normalizes Wolof STT output to standard orthography with change tracking.
 *
 * Each whitespace-separated word is lower-cased and stripped of punctuation to
 * form a lookup key. If the key has an entry in `NORMALIZATION_RULES`, or
 * matches a `CAPITALIZED_PLACES` name case-insensitively, the word is replaced
 * by the canonical spelling. Punctuation attached to the start or end of the
 * word is kept around the replacement. Finally the first character of the
 * whole phrase is upper-cased.
 *
 * The business verbs jaay (sell), jënd (buy), fey (pay), denc (save) and
 * lim (count) are standardized through `NORMALIZATION_RULES`.
 *
 * @param rawText - Raw transcription; an empty value yields an empty result.
 * @returns The normalized text and the de-duplicated list of substitutions,
 *          each formatted as "before -> after".
 */
export function normalizeWolof(rawText: string): NormalizationResult {
    if (!rawText) return { normalizedText: '', changes: [] };

    // Collapse every whitespace run (including a lone tab or newline) so that
    // splitting on a single space really yields one token per word.
    const text = rawText.trim().replace(/\s+/g, " ");
    const changes: string[] = [];

    const processedWords = text.split(" ").map((word) => {
        const lowerWord = word.toLowerCase().replace(LOOKUP_PUNCTUATION, "");
        // Nothing but punctuation (or an empty token): leave it untouched.
        if (lowerWord === "") return word;

        // Remember the punctuation hugging the word so it survives a replacement.
        const leadingLength = word.length - word.replace(LEADING_PUNCTUATION, "").length;
        const trailingLength = word.length - word.replace(TRAILING_PUNCTUATION, "").length;
        const prefix = word.slice(0, leadingLength);
        const core = word.slice(leadingLength, word.length - trailingLength);
        const suffix = word.slice(word.length - trailingLength);

        // Own-property check: a bare index lookup would also return inherited
        // members such as `constructor`, which are not strings.
        const replacement = Object.prototype.hasOwnProperty.call(NORMALIZATION_RULES, lowerWord)
            ? NORMALIZATION_RULES[lowerWord]
            : undefined;
        if (replacement) {
            // Identity rules and pure case changes are not reported as changes.
            if (lowerWord !== replacement.toLowerCase()) {
                changes.push(`${lowerWord} -> ${replacement}`);
            }
            return prefix + replacement + suffix;
        }

        // Handle capitalized places that have no explicit rule.
        const matchingPlace = CAPITALIZED_PLACES.find((place) => place.toLowerCase() === lowerWord);
        if (matchingPlace) {
            if (matchingPlace !== core) {
                changes.push(`${core} -> ${matchingPlace}`);
            }
            return prefix + matchingPlace + suffix;
        }

        return word;
    });

    let normalizedText = processedWords.join(" ");

    // Capitalize the first character of the phrase.
    if (normalizedText.length > 0) {
        normalizedText = normalizedText.charAt(0).toUpperCase() + normalizedText.slice(1);
    }

    return { normalizedText, changes: Array.from(new Set(changes)) };
}
/** Maximum size of one outgoing message, measured with `String.length`. */
const MAX_MESSAGE_LENGTH = 280;
/** Maximum number of messages returned for one input text. */
const MAX_MESSAGES = 2;
/** Number of emoji occurrences kept; any further ones are removed. */
const MAX_EMOJIS = 2;
const EMOJI_PATTERN = /(\p{Emoji_Presentation}|\p{Emoji}\uFE0F)/gu;
const MARKETING_INTRO = /Bienvenue sur XAMLÉ, l'académie de l'entrepreneur informel/gi;
/** Whitespace that directly follows sentence-ending punctuation. */
const SENTENCE_BOUNDARY = /(?<=[.!?])\s+/;

/** Cuts a single over-long word into pieces of at most `limit` code units. */
function chunkOversizedWord(word: string, limit: number): string[] {
    const chunks: string[] = [];
    let start = 0;
    while (start < word.length) {
        let end = Math.min(start + limit, word.length);
        if (end < word.length && end - 1 > start) {
            // Do not cut between the two halves of a surrogate pair.
            const lastUnit = word.charCodeAt(end - 1);
            if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
        }
        chunks.push(word.slice(start, end));
        start = end;
    }
    return chunks;
}

/** Breaks a sentence into units that each fit within `limit`: the sentence itself, else its words, else word pieces. */
function toFittingUnits(sentence: string, limit: number): string[] {
    if (sentence.length <= limit) return [sentence];
    const units: string[] = [];
    for (const word of sentence.split(/\s+/)) {
        if (word.length <= limit) {
            units.push(word);
        } else {
            units.push(...chunkOversizedWord(word, limit));
        }
    }
    return units;
}

/**
 * WhatsApp-friendly message shortener.
 *
 * Steps, in order:
 * 1. Keep only the first two emoji occurrences.
 * 2. Replace the long marketing introduction with "Xamle".
 * 3. If the result exceeds 280 characters, pack it into messages of at most
 *    280 characters, breaking at sentence boundaries where possible and at
 *    word boundaries (or inside a word, as a last resort) otherwise.
 * 4. Return at most two messages; anything beyond that is dropped.
 *
 * @param text - The message to shorten.
 * @param _lang - Language code; accepted for interface compatibility but not
 *                read by this function.
 * @returns One or two trimmed messages, none longer than 280 characters.
 */
export function shortenForWhatsApp(text: string, _lang: string = 'FR'): string[] {
    // 1. Emoji limit: a single pass keeps the first matches and removes the rest.
    let seenEmojis = 0;
    const withFewEmojis = text.replace(EMOJI_PATTERN, (match) => {
        seenEmojis++;
        return seenEmojis <= MAX_EMOJIS ? match : '';
    });

    // 2. Remove "marketing" repetitive phrases (heuristic).
    const cleanText = withFewEmojis.replace(MARKETING_INTRO, "Xamle").trim();

    // 3. Length check and split.
    if (cleanText.length <= MAX_MESSAGE_LENGTH) return [cleanText];

    const messages: string[] = [];
    let current = "";
    for (const sentence of cleanText.split(SENTENCE_BOUNDARY)) {
        for (const unit of toFittingUnits(sentence, MAX_MESSAGE_LENGTH)) {
            if (!unit) continue;
            // Measure the message as it would actually be sent, joining space included.
            const candidate = current ? `${current} ${unit}` : unit;
            if (candidate.length > MAX_MESSAGE_LENGTH) {
                // `unit` alone always fits, so `current` is non-empty here.
                messages.push(current);
                current = unit;
            } else {
                current = candidate;
            }
        }
    }
    if (current) messages.push(current);

    // 4. Limit to 2 messages maximum.
    return messages.slice(0, MAX_MESSAGES);
}
/**
 * Builds a two-line bilingual message: the Wolof text on the first line,
 * followed on the next line by the French text prefixed with "(FR) ".
 *
 * @param wo - Wolof version of the message.
 * @param fr - French version of the message.
 * @returns Both versions joined by a single newline.
 */
export function formatBilingual(wo: string, fr: string): string {
    const frenchLine = "(FR) " + fr;
    return wo + "\n" + frenchLine;
}