File size: 2,678 Bytes
ef0913c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/**
 * Wolof STT Normalizer Utility v2.0
 * Includes change tracking and WhatsApp message shortening.
 */

/**
 * Rule groups in the form `[canonical spelling, [variants that map to it]]`.
 * All variants are lowercase. A variant equal to its canonical spelling
 * (e.g. "sabu") is still a rule: it maps the word to itself.
 */
const NORMALIZATION_RULE_GROUPS: ReadonlyArray<readonly [string, readonly string[]]> = [
    ["damay", ["damae", "dama", "dma"]],
    ["jaay", ["jai", "jaai", "jaye"]],
    ["jënd", ["jendi"]],
    ["fey", ["fei", "fay"]],
    ["yére", ["yere", "yare"]],
    ["sandwich", ["sandwiche"]],
    ["mburu", ["pan"]],
    ["café", ["cafe"]],
    ["sabu", ["sabu"]],
    ["omo", ["omo"]],
    ["patas", ["patat"]],
    ["sooble", ["ognon"]],
    ["ceeb", ["riz"]],
    ["Yoff", ["yof"]],
    ["Dakar", ["dakar"]],
    ["Pikine", ["pikine"]],
    ["Guédiawaye", ["guediawaye"]],
    ["kër", ["keur", "ker"]],
    ["ci kër", ["sikarche", "sikarshe", "sikarce", "sikaarché"]],
    ["quartier", ["quartier"]],
    ["banlieue", ["banlieu"]],
    ["ci", ["si"]],
    ["fan", ["fane", "fana"]],
    ["lan", ["lana", "lanna"]],
    ["naka", ["nakka", "nakha"]],
    ["ñàkk", ["niak", "niakk"]],
    ["denc", ["dencal"]], // requested dencal -> denc
    ["lim", ["limal"]],
    ["gañ", ["ganee", "gane"]],
    ["boroom", ["borom"]],
    ["xaalis", ["xaalisou", "xaliss"]],
];

/**
 * Flat lookup table: lowercase variant -> canonical spelling.
 * Built from the groups above, keeping the variants in declaration order.
 */
const NORMALIZATION_RULES: Record<string, string> = Object.fromEntries(
    NORMALIZATION_RULE_GROUPS.flatMap(([canonical, variants]) =>
        variants.map((variant): [string, string] => [variant, canonical]),
    ),
);

/**
 * Place names in their canonical, capitalized spelling.
 * Words are compared against the lowercased form of each entry.
 */
const CAPITALIZED_PLACES: string[] = [
    "Yoff",
    "Dakar",
    "Pikine",
    "Guédiawaye",
];

/**
 * Outcome of a normalization pass, as returned by `normalizeWolof`.
 */
export interface NormalizationResult {
    /** The text after all word replacements have been applied. */
    normalizedText: string;
    /** Human-readable log of the replacements that were made, one entry per distinct replacement. */
    changes: string[]; // Format: ["damae -> damay", ...]
}

/**
 * Punctuation characters that are ignored when a word is matched against the
 * rule tables.
 * NOTE(review): "?", quotes and brackets are not in this set, so a token such
 * as "lana?" is left untouched — confirm whether that is intended.
 */
const WOLOF_PUNCTUATION = /[.,/#!$%^&*;:{}=\-_`~()]/g;

/** Splits a token into (leading punctuation, core word, trailing punctuation). */
const WOLOF_TOKEN_PARTS = /^([.,/#!$%^&*;:{}=\-_`~()]*)([\s\S]*?)([.,/#!$%^&*;:{}=\-_`~()]*)$/;

/**
 * Normalizes one whitespace-free token and records any replacement in `changes`.
 * Punctuation around the word is kept; only the word itself is replaced.
 */
function normalizeWolofToken(token: string, changes: string[]): string {
    const parts = WOLOF_TOKEN_PARTS.exec(token);
    if (parts === null) return token;
    const [, leading = "", core = "", trailing = ""] = parts;

    // Lookup key: lowercase, with punctuation inside the word removed as well
    // (so "da-ma" still matches the "dama" rule).
    const key = core.toLowerCase().replace(WOLOF_PUNCTUATION, "");
    if (key === "") return token;

    // Own-property check: a plain index lookup would also resolve inherited
    // members such as "constructor" or "toString" and crash further down.
    if (Object.prototype.hasOwnProperty.call(NORMALIZATION_RULES, key)) {
        const replacement = NORMALIZATION_RULES[key];
        if (replacement) {
            // Rules that only change letter case (or nothing) are not logged.
            if (key !== replacement.toLowerCase()) {
                changes.push(`${key} -> ${replacement}`);
            }
            return leading + replacement + trailing;
        }
    }

    const place = CAPITALIZED_PLACES.find((candidate: string) => candidate.toLowerCase() === key);
    if (place !== undefined) {
        if (place !== core) {
            changes.push(`${core} -> ${place}`);
        }
        return leading + place + trailing;
    }

    return token;
}

/**
 * Normalizes raw Wolof speech-to-text output word by word.
 *
 * The input is trimmed and runs of two or more whitespace characters are
 * collapsed to a single space. Each word is then looked up (case-insensitively,
 * ignoring punctuation) in `NORMALIZATION_RULES` and, failing that, in
 * `CAPITALIZED_PLACES`. Finally the first character of the result is
 * upper-cased.
 *
 * Punctuation attached to a replaced word is preserved ("dakar," becomes
 * "Dakar,"), and a single tab or newline is kept as-is while still acting as a
 * word separator.
 *
 * @param rawText Text to normalize; an empty (or otherwise falsy) value yields an empty result.
 * @returns The normalized text and the de-duplicated list of replacements, each formatted as "from -> to".
 */
export function normalizeWolof(rawText: string): NormalizationResult {
    if (!rawText) return { normalizedText: '', changes: [] };

    const collapsed = rawText.trim().replace(/\s{2,}/g, " ");
    const changes: string[] = [];

    // Splitting on a captured separator keeps the separators in the array
    // (at the odd indices), so the original single whitespace characters
    // survive the join below.
    const joined = collapsed
        .split(/(\s)/)
        .map((token, index) => (index % 2 === 1 ? token : normalizeWolofToken(token, changes)))
        .join("");

    const normalizedText = joined.charAt(0).toUpperCase() + joined.slice(1);

    return { normalizedText, changes: Array.from(new Set(changes)) };
}