Spaces:

safetrack
/

edtech

Running

edtech/apps/api/src/scripts/normalizeWolof.ts

CognxSafeTrack

chore: finalize Sprint P2 & P3 optimizations, baseline prisma migrations, and update technical audit docs

cfbb685 23 days ago

2.9 kB

	/**
	* Wolof STT Normalizer Utility v2.0
	* Includes change tracking and WhatsApp message shortening.
	*/

	/**
	 * Static replacement table: each key is a lowercase spelling variant and each
	 * value is the spelling that should replace it. Some entries map a word to
	 * itself. Entry order is kept as originally authored.
	 */
	const NORMALIZATION_RULES: Record<string, string> = {
	  damae: 'damay',
	  dama: 'damay',
	  dma: 'damay',
	  jai: 'jaay',
	  jaai: 'jaay',
	  jaye: 'jaay',
	  jendi: 'jënd',
	  fei: 'fey',
	  fay: 'fey',
	  yere: 'yére',
	  yare: 'yére',
	  sandwiche: 'sandwich',
	  pan: 'mburu',
	  cafe: 'café',
	  sabu: 'sabu',
	  omo: 'omo',
	  patat: 'patas',
	  ognon: 'sooble',
	  riz: 'ceeb',

	  // Place names and location words (values carry the final capitalization).
	  yof: 'Yoff',
	  dakar: 'Dakar',
	  pikine: 'Pikine',
	  guediawaye: 'Guédiawaye',
	  keur: 'kër',
	  ker: 'kër',
	  sikarche: 'ci kër',
	  sikarshe: 'ci kër',
	  sikarce: 'ci kër',
	  'sikaarché': 'ci kër',
	  quartier: 'quartier',
	  banlieu: 'banlieue',
	  si: 'ci',

	  fane: 'fan',
	  fana: 'fan',
	  lana: 'lan',
	  lanna: 'lan',
	  nakka: 'naka',
	  nakha: 'naka',
	  niak: 'ñàkk',
	  niakk: 'ñàkk',
	  dencal: 'denc', // mapping added on request: dencal -> denc
	  limal: 'lim',
	  ganee: 'gañ',
	  gane: 'gañ',
	  borom: 'boroom',
	  xaalisou: 'xaalis',
	  xaliss: 'xaalis',
	};

	/** Place names written with their proper capitalization. */
	const CAPITALIZED_PLACES = [
	  'Yoff',
	  'Dakar',
	  'Pikine',
	  'Guédiawaye',
	];

	/** Value returned by a normalization pass. */
	export interface NormalizationResult {
	  /** The text after all replacements have been applied. */
	  normalizedText: string;
	  /** Applied replacements, each formatted as "from -> to" (e.g. "damae -> damay"). */
	  changes: string[];
	}

	/**
	 * Normalizes a raw Wolof speech-to-text transcript word by word.
	 *
	 * Each whitespace-separated token is lower-cased and stripped of punctuation,
	 * then looked up in the rule table (built-in rules merged with `customRules`,
	 * custom entries winning) and, failing that, in the list of capitalized place
	 * names. Punctuation at the start or end of a replaced token is kept, and the
	 * first character of the result is upper-cased.
	 *
	 * @param rawText - Transcript to normalize. A falsy value yields an empty result.
	 * @param customRules - Optional extra rules (`variant -> replacement`). Keys must
	 *   be lowercase and punctuation-free to match; empty or non-string values are
	 *   ignored.
	 * @returns The normalized text and a de-duplicated list of `"from -> to"` changes.
	 */
	export function normalizeWolof(rawText: string, customRules?: Record<string, string>): NormalizationResult {
	  if (!rawText) return { normalizedText: '', changes: [] };

	  // Merge static rules with dynamic rules (dynamic takes precedence).
	  const activeRules: Record<string, string> = customRules
	    ? { ...NORMALIZATION_RULES, ...customRules }
	    : NORMALIZATION_RULES;

	  // One punctuation set in three roles: anywhere in the token (to build the
	  // lookup key) and anchored at either end (to carry surrounding punctuation
	  // over to the replacement).
	  const punctuationAnywhere = /[.,/#!$%^&*;:{}=\-_`~()]/g;
	  const leadingPunctuation = /^[.,/#!$%^&*;:{}=\-_`~()]+/;
	  const trailingPunctuation = /[.,/#!$%^&*;:{}=\-_`~()]+$/;

	  const changes: string[] = [];

	  // Split on any whitespace run so that a single tab or newline separates
	  // words just like a space does.
	  const words = rawText.trim().split(/\s+/);

	  const processedWords = words.map(word => {
	    const leadingMatch = word.match(leadingPunctuation);
	    const leading = leadingMatch ? leadingMatch[0] : '';
	    const unprefixed = word.slice(leading.length);
	    const trailingMatch = unprefixed.match(trailingPunctuation);
	    const trailing = trailingMatch ? trailingMatch[0] : '';
	    const core = unprefixed.slice(0, unprefixed.length - trailing.length);

	    const lowerWord = core.toLowerCase().replace(punctuationAnywhere, '');

	    // Own-property lookup only: a bare index would also resolve inherited
	    // Object.prototype members (e.g. "constructor") and then crash when the
	    // non-string value is treated as a replacement.
	    const candidate: unknown = Object.prototype.hasOwnProperty.call(activeRules, lowerWord)
	      ? activeRules[lowerWord]
	      : undefined;

	    if (typeof candidate === 'string' && candidate.length > 0) {
	      // Pure casing changes are not reported.
	      if (lowerWord !== candidate.toLowerCase()) {
	        changes.push(`${lowerWord} -> ${candidate}`);
	      }
	      return leading + candidate + trailing;
	    }

	    const matchingPlace = CAPITALIZED_PLACES.find(place => place.toLowerCase() === lowerWord);
	    if (matchingPlace) {
	      // Compare without the surrounding punctuation so "Yoff." is not
	      // reported as a change.
	      if (matchingPlace !== core) {
	        changes.push(`${core} -> ${matchingPlace}`);
	      }
	      return leading + matchingPlace + trailing;
	    }

	    return word;
	  });

	  let normalizedText = processedWords.join(' ');

	  // Sentence-style capitalization of the very first character.
	  if (normalizedText.length > 0) {
	    normalizedText = normalizedText.charAt(0).toUpperCase() + normalizedText.slice(1);
	  }

	  return { normalizedText, changes: Array.from(new Set(changes)) };
	}