Spaces:
Sleeping
Sleeping
"""
berber_nlp.py
=============
Core NLP toolkit for the Amazigh/Berber language (Tarifit/Riffian dialect focus).
Extracted and structured from: Lamuela, X. (2005). El Berber: Estudi Comparatiu
entre la Gramàtica del Català i la del Berber o Amazig. Universitat de Girona.
Modules:
- Tokenizer
- Phonological analyzer (consonants, vowels, pharyngealization)
- Morphological analyzer (root extraction, aspect system, gender/number)
- Syntax annotator (VSO order, free state / annexed state)
- Bilingual dictionary (Catalan-Berber / Berber-Catalan)
- Transliterator (Latin IPA ↔ Tifinagh)
"""
| import re | |
| import json | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict, Optional, Tuple | |
| from pathlib import Path | |
# ─────────────────────────────────────────────────────────────────────────────
# 1. PHONOLOGICAL CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Berber consonant inventory (§2.1.1 of the book).
# Format: {grapheme: (IPA, place_of_articulation, manner, features)}
# NOTE(review): the non-ASCII graphemes and IPA symbols in this section were
# reconstructed from mis-decoded text (each key is inferred from the IPA value
# it maps to); confirm against the upstream file.
BERBER_CONSONANTS = {
    # Labials
    "b": ("b", "bilabial", "oclusiva", []),
    "f": ("f", "labiodental", "fricativa", []),
    "m": ("m", "bilabial", "nasal", []),
    # Dentals / Alveolars
    "t": ("t", "dental", "oclusiva", []),
    "d": ("d", "dental", "oclusiva", []),
    "n": ("n", "alveolar", "nasal", []),
    "l": ("l", "alveolar", "lateral", []),
    "r": ("r", "alveolar", "vibrant", []),
    "s": ("s", "alveolar", "fricativa", []),
    "z": ("z", "alveolar", "fricativa", []),
    # Emphatics (pharyngealized - §2.1.1.3)
    "ṭ": ("tˤ", "dental", "oclusiva", ["pharyngealized"]),
    "ḍ": ("dˤ", "dental", "oclusiva", ["pharyngealized"]),
    "ṣ": ("sˤ", "alveolar", "fricativa", ["pharyngealized"]),
    "ẓ": ("zˤ", "alveolar", "fricativa", ["pharyngealized"]),
    "ṛ": ("rˤ", "alveolar", "vibrant", ["pharyngealized"]),
    "ḷ": ("lˤ", "alveolar", "lateral", ["pharyngealized"]),
    "ṃ": ("mˤ", "bilabial", "nasal", ["pharyngealized"]),
    "ṇ": ("nˤ", "alveolar", "nasal", ["pharyngealized"]),
    # Palatals / Postalveolars
    "y": ("j", "palatal", "semivocal", []),
    "č": ("tʃ", "postalveolar", "africada", []),
    "j": ("dʒ", "postalveolar", "africada", []),
    # Velars
    "k": ("k", "velar", "oclusiva", []),
    "g": ("g", "velar", "oclusiva", []),
    "x": ("x", "velar", "fricativa", []),
    "γ": ("ɣ", "velar", "fricativa", []),
    # Uvulars (§2.1.1.2)
    "q": ("q", "uvular", "oclusiva", []),
    "R": ("ʁ", "uvular", "fricativa", []),
    # Pharyngeals (§2.1.1.4)
    "ḥ": ("ħ", "faringe", "fricativa", []),
    "ʕ": ("ʕ", "faringe", "aproximant", []),
    # Glottals
    "h": ("h", "glotal", "fricativa", []),
    # Labiovelar
    "w": ("w", "labiovelar", "semivocal", []),
    # Lateral fricative (§2.1.1.5); the only digraph key in the table
    "lh": ("ɬ", "alveolar", "fricativa_lat", []),
}

# Berber vowels (§2.1.4) - only 3 phonemes, plus the epenthetic schwa.
# Format: {grapheme: (IPA, height, backness, rounding)}
BERBER_VOWELS = {
    "a": ("a", "low", "central", "unrounded"),
    "i": ("i", "high", "front", "unrounded"),
    "u": ("u", "high", "back", "rounded"),
    # Schwa (epenthetic, non-phonemic in Riffian)
    "ə": ("ə", "mid", "central", "unrounded"),
}

# Emphatic harmony triggers (pharyngealization spreads in Berber).
# Every member is also a key of BERBER_CONSONANTS.
EMPHATIC_CONSONANTS = {"ṭ", "ḍ", "ṣ", "ẓ", "ṛ", "ḷ", "q", "ḥ", "ʕ"}
# ─────────────────────────────────────────────────────────────────────────────
# 2. MORPHOLOGICAL CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Verb aspect system (§2.2.2) - Berber marks ASPECT, not TENSE:
# perfectiu | imperfectiu | habitual-duratiu
# NOTE(review): non-ASCII characters in this section were reconstructed from
# mis-decoded text; confirm against the upstream file.
ASPECT_PATTERNS = {
    # root_type -> {aspect: consonantal template}
    "CC_root": {
        "perfectiu": "C1C2",
        "imperfectiu": "C1C2C2",  # gemination of final consonant
        "habitual": "ttC1C2",  # tt- prefix
    },
    "CCC_root": {
        "perfectiu": "C1C2C3",
        "imperfectiu": "C1C2C2C3",
        "habitual": "ttC1C2C3",
    },
    "CVCC_root": {
        "perfectiu": "C1aC2C3",
        "imperfectiu": "C1C2C2C3",  # vowel deletion + gemination
        "habitual": "ttC1C2C3",
    },
}

# Example verbs from the book (§2.2.2.1); one surface form per aspect.
EXAMPLE_VERBS = [
    {"root": "zl", "perfectiu": "uzzel", "imperfectiu": "izzell", "habitual": "ttuzzel", "gloss": "córrer"},
    {"root": "rz", "perfectiu": "yerza", "imperfectiu": "irezzu", "habitual": "tterza", "gloss": "buscar"},
    {"root": "kr", "perfectiu": "ikerz", "imperfectiu": "ikerrez", "habitual": "ttkerrez", "gloss": "llaurar"},
    {"root": "ks", "perfectiu": "yeksa", "imperfectiu": "ikessa", "habitual": "tteksa", "gloss": "guardar ramat"},
    {"root": "fn", "perfectiu": "ifna", "imperfectiu": "ifenna", "habitual": "ttufna", "gloss": "morir (plantes)"},
    {"root": "rw", "perfectiu": "irwa", "imperfectiu": "irrewa", "habitual": "tterwa", "gloss": "estar ple"},
    {"root": "ql", "perfectiu": "iqqel", "imperfectiu": "iqqal", "habitual": "tteqqal", "gloss": "quedar-se"},
    # Imperfectiu and habitual share the same surface form in this entry.
    {"root": "ẓẓ", "perfectiu": "ṣṣa", "imperfectiu": "tteẓẓa", "habitual": "tteẓẓa", "gloss": "cremar"},
]

# Gender markers (§2.2.3) - Berber has grammatical gender
GENDER_MARKERS = {
    "masculine": {"prefix": "", "suffix": ""},
    "feminine": {"prefix": "t", "suffix": "t"},  # t...t circumfix
}

# Number markers (§2.2.3.2)
NUMBER_PATTERNS = {
    # Singular -> plural common patterns in Berber
    "internal_plural": [
        # (singular_pattern, plural_pattern, example_sg, example_pl, gloss)
        ("aCC", "iCCan", "argaz", "irgazen", "home/homes"),
        ("taCC+t", "tiCC+in", "tafruxt", "tifruxin", "nena/nenes"),
        ("aCCaC", "iCCaCen", "amdaz", "imdazen", "missatger"),
        ("iCCi", "iCCan", "ifri", "ifran", "cova/coves"),
    ],
}

# Free State vs Annexed State (§2.2.3.1) - key Berber morphological feature
STATE_PATTERNS = {
    # Masculine nouns: a- prefix (free) → u- (annexed after verb subject)
    "masc_free": r"^a[bcdfghjklmnpqrstvwxyz]",
    "masc_annexed": "u",  # initial 'a' becomes 'u'
    # Feminine nouns: ta- prefix (free) → t- (annexed)
    "fem_free": r"^ta[bcdfghjklmnpqrstvwxyz]",
    "fem_annexed": "t",  # 'ta' → 't'
}

# Pronouns (§2.2.4); each value is a 3-tuple whose last item is the Catalan gloss.
PERSONAL_PRONOUNS = {
    "1sg": ("nk", "nekk", "jo"),
    "2sg": ("k/m", "kiyyni/miyyni", "tu (masc/fem)"),
    "3sg_m": ("t", "ntta", "ell"),
    "3sg_f": ("tt", "nttat", "ella"),
    "1pl": ("nx", "nekkni", "nosaltres"),
    "2pl": ("kn/mnt", "kenwi/menwi", "vosaltres"),
    "3pl": ("tn", "nttni/nttenti", "ells/elles"),
}

# Verbal agreement prefixes/suffixes (§2.2.2.3)
VERB_AGREEMENT = {
    # (person, number, gender): (prefix, suffix)
    ("1", "sg", ""): ("", "-x"),
    ("2", "sg", "m"): ("t", "-t"),
    ("2", "sg", "f"): ("t", "-t"),
    ("3", "sg", "m"): ("i/y", ""),
    ("3", "sg", "f"): ("t", ""),
    ("1", "pl", ""): ("n", ""),
    ("2", "pl", "m"): ("t", "-m"),
    ("2", "pl", "f"): ("t", "-mt"),
    ("3", "pl", "m"): ("", "-n"),
    ("3", "pl", "f"): ("", "-nt"),
}
# ─────────────────────────────────────────────────────────────────────────────
# 3. TOKENIZER
# ─────────────────────────────────────────────────────────────────────────────
class BerberTokenizer:
    """
    Tokenizer for Amazigh/Berber text in Latin script.

    Splits text into word and punctuation tokens and attaches cheap,
    prefix/suffix-based guesses (verb-like, noun-like, gender, state) to
    each token. The guesses are heuristics over the surface form only.
    """

    # (regex, tag, lemma) for clitics and particles. Reference data only:
    # tokenize() does not apply these patterns.
    CLITIC_PATTERNS = [
        (r"^d-", "CONJ", "d"),  # 'and / copula' (§2.3.4)
        (r"^ur\s", "NEG", "ur"),  # negation prefix
        (r"^ad\s", "MOD", "ad"),  # prospective modal
        (r"^i-", "PREP", "i"),  # dative preposition
        (r"^s-", "PREP", "s"),  # instrumental/comitative
        (r"^xef\s", "PREP", "xef"),  # 'sobre/per' preposition
        (r"^deg\s", "PREP", "deg"),  # locative 'dins'
        (r"^ger\s", "PREP", "ger"),  # 'entre'
        (r"^qbel\s", "PREP", "qbel"),  # 'abans'
    ]

    # Special (non-ASCII) Berber consonant graphemes.
    # NOTE(review): reconstructed from mis-decoded text; confirm upstream.
    _SPECIAL_CONSONANTS = "ṭḍṣẓṛḷṃṇḥγ"

    # Word = run of word characters and combining marks; anything else that
    # is neither a word character nor whitespace is a one-character token.
    _TOKEN_RE = re.compile(r"[\w\u0300-\u036f\u1e00-\u1eff]+|[^\w\s]")

    # Agreement prefix (i/y/t/n) directly followed by a consonant. The
    # consonant class includes the special Berber graphemes so that forms
    # such as "iḍes" are recognised too.
    _VERB_RE = re.compile(
        r"^[iytn][bcdfghjklmnpqrstvwxyz" + _SPECIAL_CONSONANTS + "]"
    )

    def __init__(self):
        self.vowels = set("aeiouəäïü")
        self.special_chars = set(self._SPECIAL_CONSONANTS)

    def tokenize(self, text: str) -> List[Dict]:
        """Tokenize Berber text into a list of token dicts.

        Each dict has the keys ``form``, ``lower``, ``is_verb``, ``is_noun``,
        ``gender`` and ``state``. Punctuation marks become their own tokens.
        An empty string yields an empty list.
        """
        return [
            {
                "form": tok,
                "lower": tok.lower(),
                "is_verb": self._looks_like_verb(tok),
                "is_noun": self._looks_like_noun(tok),
                "gender": self._detect_gender(tok),
                "state": self._detect_state(tok),
            }
            for tok in self._TOKEN_RE.findall(text)
        ]

    def _looks_like_verb(self, word: str) -> bool:
        """Berber verbs often start with i/y/t/n (agreement prefixes)."""
        return self._VERB_RE.match(word.lower()) is not None

    def _looks_like_noun(self, word: str) -> bool:
        """Berber nouns in free state: a- (masc) or ta-...-t (fem)."""
        w = word.lower()
        return w.startswith("a") or (w.startswith("ta") and w.endswith("t"))

    def _detect_gender(self, word: str) -> Optional[str]:
        """Guess grammatical gender: 'femení', 'masculí', or None."""
        w = word.lower()
        if w.startswith("ta") and (w.endswith("t") or len(w) > 4):
            return "femení"
        if w.startswith(("a", "u")):
            return "masculí"
        return None

    def _detect_state(self, word: str) -> Optional[str]:
        """Free state ('lliure') vs annexed state ('anex'), or None."""
        w = word.lower()
        if w.startswith(("a", "ta")):
            return "lliure"
        # Any remaining t-initial word cannot start with "ta" here.
        if w.startswith(("u", "t")):
            return "anex"
        return None
# ─────────────────────────────────────────────────────────────────────────────
# 4. MORPHOLOGICAL ANALYZER
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class MorphAnalysis:
    """Result record of a morphological analysis of one word form.

    Only ``form`` is required; every other field defaults to ``None`` (or an
    empty list for ``notes``) and is filled in by the analyzer when known.
    The ``@dataclass`` decorator is required: the fields use
    ``field(default_factory=...)`` and instances are built with keyword
    arguments.
    """

    form: str
    root: Optional[str] = None
    aspect: Optional[str] = None  # perfectiu / imperfectiu / habitual
    person: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    state: Optional[str] = None  # lliure / anex (free / annexed)
    pos: Optional[str] = None  # POS tag
    gloss: Optional[str] = None
    # Free-text remarks; a fresh list per instance.
    notes: List[str] = field(default_factory=list)
class BerberMorphAnalyzer:
    """
    Rule-based morphological analyzer for Berber verbs and nouns.
    Based on grammar description in Lamuela (2005), §2.2.

    Known verb forms (from EXAMPLE_VERBS) are looked up exactly; everything
    else is analysed with surface heuristics (prefixes, suffixes, gemination).
    """

    # One-letter agreement prefix -> (person, number, gender); "" = no gender.
    _PREFIX_AGREEMENT = {
        "i": ("3", "sg", "m"),
        "y": ("3", "sg", "m"),
        "t": ("3", "sg", "f"),
        "n": ("1", "pl", ""),
    }

    def __init__(self):
        self._build_verb_index()
        self._build_noun_patterns()
        # Reused by analyze() instead of building a tokenizer on every call.
        self._tokenizer = BerberTokenizer()

    def _build_verb_index(self):
        """Index known verbs from book examples: surface form -> entry.

        When two aspects of a verb share one surface form, the aspect listed
        later (perfectiu < imperfectiu < habitual) wins.
        """
        self.verb_index = {}
        for verb in EXAMPLE_VERBS:
            for aspect in ("perfectiu", "imperfectiu", "habitual"):
                self.verb_index[verb[aspect]] = {
                    "root": verb["root"],
                    "aspect": aspect,
                    "gloss": verb["gloss"],
                }

    def _build_noun_patterns(self):
        """Compile regex patterns for noun morphology.

        Each item is (pattern, gender, state, template); group 1 of the
        pattern is what analyze_noun() reports as the root.
        """
        self.noun_patterns = [
            (re.compile(r"^a([bcdfghjklmnpqrstvwxyz].+)$"), "masc", "lliure", "a+C..."),
            (re.compile(r"^u([bcdfghjklmnpqrstvwxyz].+)$"), "masc", "anex", "u+C..."),
            (re.compile(r"^ta(.+)t$"), "fem", "lliure", "ta...t"),
            (re.compile(r"^t([bcdfghjklmnpqrstvwxyz].+)t$"), "fem", "anex", "t...t"),
        ]

    def analyze_verb(self, form: str) -> MorphAnalysis:
        """Analyze a verb form for root, aspect, and agreement.

        A form found in the verb index gets root, aspect and gloss from the
        index and no agreement guess. Otherwise aspect is guessed from the
        'tt-' prefix or a doubled character, and person/number/gender from
        the first and last letters.
        """
        result = MorphAnalysis(form=form, pos="VERB")

        # Known verbs: exact lookup, no heuristics.
        entry = self.verb_index.get(form)
        if entry is not None:
            result.root = entry["root"]
            result.aspect = entry["aspect"]
            result.gloss = entry["gloss"]
            return result

        # Rule-based aspect detection.
        has_habitual_prefix = form.startswith("tt")
        if has_habitual_prefix:
            result.aspect = "habitual"
            result.notes.append("Prefix 'tt-' indica aspecte habitual/duratiu (§2.2.2)")
        elif re.search(r"(.)\1", form):  # doubled character (geminate)
            result.aspect = "imperfectiu"
            result.notes.append("Consonant geminada indica aspecte imperfectiu (§2.2.2)")
        else:
            result.aspect = "perfectiu"

        # Agreement prefix detection. An initial "tt" is the habitual aspect
        # marker, so its "t" must not be read as the 3sg.f agreement prefix.
        if not has_habitual_prefix:
            agreement = self._PREFIX_AGREEMENT.get(form[:1])
            if agreement is not None:
                person, number, gender = agreement
                result.person = person
                result.number = number
                result.gender = gender or None

        # Suffix detection overrides the prefix guess as a whole, including
        # gender, so no stale prefix gender leaks into the result.
        if form.endswith("x"):
            result.person = "1"
            result.number = "sg"
            result.gender = None
        elif form.endswith("m"):
            result.person = "2"
            result.number = "pl"
            result.gender = "m"
        elif form.endswith("n") and not form.endswith("an"):
            result.person = "3"
            result.number = "pl"
            # VERB_AGREEMENT: bare -n is 3pl masculine (feminine is -nt).
            result.gender = "m"
        return result

    def analyze_noun(self, form: str) -> MorphAnalysis:
        """Analyze a noun for gender, state, and plurality markers.

        The first matching noun pattern sets gender, state and root; number
        is 'pl' when the form ends in -en/-an, otherwise 'sg'.
        """
        result = MorphAnalysis(form=form, pos="NOM")
        for pattern, gender, state, template in self.noun_patterns:
            match = pattern.match(form)
            if match:
                result.gender = gender
                result.state = state
                result.root = match.group(1)
                result.notes.append(
                    f"Plantilla morfològica: {template} → Gènere: {gender}, Estat: {state} (§2.2.3)"
                )
                break

        # Detect plural by -en / -an suffix.
        if form.endswith(("en", "an")):
            result.number = "pl"
            result.notes.append("Sufix plural -(e/a)n (§2.2.3.2)")
        else:
            result.number = "sg"
        return result

    def analyze(self, form: str) -> MorphAnalysis:
        """Auto-detect POS and analyze.

        Order of checks: exact known-verb lookup, verb-like surface form,
        noun-like surface form, any compiled noun pattern (this is what makes
        annexed "u+C" nouns reachable), else POS 'DESCONEGUT'.
        """
        # Known verb forms such as "uzzel" do not look verb-like to the
        # surface heuristic, so consult the index first.
        if form in self.verb_index:
            return self.analyze_verb(form)
        tok = self._tokenizer
        if tok._looks_like_verb(form):
            return self.analyze_verb(form)
        if tok._looks_like_noun(form):
            return self.analyze_noun(form)
        if any(pattern.match(form) for pattern, _g, _s, _t in self.noun_patterns):
            return self.analyze_noun(form)
        return MorphAnalysis(form=form, pos="DESCONEGUT")
# ─────────────────────────────────────────────────────────────────────────────
# 5. SYNTAX ANNOTATOR
# ─────────────────────────────────────────────────────────────────────────────
class BerberSyntaxAnnotator:
    """
    Annotates basic syntactic structure of Berber clauses.
    Key feature: VSO (Verb-Subject-Object) order (§2.3.1).
    """

    # Berber particles and function words (§2.3)
    NEGATION = {"ur", "ulac"}
    PROSPECTIVE = {"ad"}  # prospective mood marker
    COPULA = {"d"}  # copula / conjunction
    QUESTION = {"is", "ma"}  # question particles
    DEMONSTRATIVES = {
        "wa": "m.sg", "ta": "f.sg",
        "wi": "m.pl", "ti": "f.pl",
        "wagi": "m.sg.prox", "tagi": "f.sg.prox",
    }
    PREPOSITIONS = {
        "i": "datiu",
        "s": "instrumental/comitatiu",
        "deg": "locatiu (dins)",
        "ger": "entre",
        "xef": "sobre/per",
        "qbel": "abans",
        "deffir": "darrere",
        "zzat": "davant",
        "-er": "directiu",
    }

    def _particle_role(self, form: str) -> Optional[str]:
        """Return the role of a lower-cased function word, or None."""
        if form in self.NEGATION:
            return "NEG"
        if form in self.PROSPECTIVE:
            return "MOD:prosp"
        if form in self.COPULA:
            return "COP"
        if form in self.QUESTION:
            return "Q"
        if form in self.PREPOSITIONS:
            return f"PREP:{self.PREPOSITIONS[form]}"
        return None

    def annotate(self, tokens: List[Dict]) -> List[Dict]:
        """Add syntactic role annotations to a token list.

        Returns new dicts (copies of the input tokens) with a ``synt_role``
        key. Function words are tagged first. A verb-like token opens a
        clause ("VERB"); the next noun-like token is its "SUBJ" and the
        noun-like token after that its "OBJ". Nouns outside a clause are
        "NOM"; anything else is "?".

        Each token must have a ``form`` key; ``is_verb`` / ``is_noun`` are
        read with ``.get`` and may be absent.
        """
        annotated = []
        # Which argument the current clause is still waiting for:
        # None (no open clause), "SUBJ" (verb just seen) or "OBJ".
        expected = None
        for tok in tokens:
            ann = dict(tok)
            role = self._particle_role(tok["form"].lower())
            if role is None:
                # A second verb-like token directly after a verb is not
                # re-tagged as VERB until a subject has been consumed.
                if tok.get("is_verb") and expected != "SUBJ":
                    role = "VERB"
                    expected = "SUBJ"
                elif tok.get("is_noun"):
                    if expected == "SUBJ":
                        role = "SUBJ"  # post-verbal subject in VSO
                        expected = "OBJ"
                    elif expected == "OBJ":
                        role = "OBJ"  # second post-verbal noun
                        expected = None
                    else:
                        role = "NOM"
                else:
                    role = "?"
            ann["synt_role"] = role
            annotated.append(ann)
        return annotated

    def check_vso_order(self, tokens: List[Dict]) -> Dict:
        """
        Verify VSO order in a sentence and flag deviations.
        In Berber, subject normally follows verb (§2.3.1.1).

        Uses the first VERB, SUBJ and OBJ roles found in ``tokens`` (missing
        roles are reported as position -1). ``order`` is "VSO" or "SVO" only
        when both a verb and a subject are present, otherwise "?";
        ``is_canonical`` is then False.
        """
        roles = [t.get("synt_role", "?") for t in tokens]
        verb_idx = next((i for i, r in enumerate(roles) if r == "VERB"), -1)
        subj_idx = next((i for i, r in enumerate(roles) if r == "SUBJ"), -1)
        obj_idx = next((i for i, r in enumerate(roles) if "OBJ" in r), -1)

        # -1 means "not found"; comparing it as a position would make a
        # verb-only sentence look like SVO.
        comparable = verb_idx >= 0 and subj_idx >= 0
        if not comparable:
            order = "?"
        elif verb_idx < subj_idx:
            order = "VSO"
        else:
            order = "SVO"

        return {
            "order": order,
            "verb_pos": verb_idx,
            "subj_pos": subj_idx,
            "obj_pos": obj_idx,
            "is_canonical": (
                comparable
                and verb_idx < subj_idx
                and (obj_idx < 0 or subj_idx < obj_idx)
            ),
            "note": "L'ordre canònic del berber és VSO (§2.3.1)",
        }
# ─────────────────────────────────────────────────────────────────────────────
# 6. PHONOLOGICAL ANALYZER
# ─────────────────────────────────────────────────────────────────────────────
class BerberPhonologyAnalyzer:
    """
    Phonological analysis: consonant inventory, pharyngealization,
    syllable structure, and vowel system for Berber.
    """

    # Vowel letters used by every method of this class (case-insensitive).
    # NOTE(review): non-ASCII characters in this class were reconstructed
    # from mis-decoded text; confirm against the upstream file.
    _VOWELS = frozenset("aeiouəäïü")
    _NON_VOWEL_RUN_RE = re.compile(r"[^aeiouəäïü]+")

    # Orthography -> IPA. Digraphs and single graphemes live in one table and
    # are applied in a single left-to-right pass, longest match first, so the
    # output of one rule is never rewritten by another (e.g. the "j" produced
    # from "y" is not turned into "dʒ").
    _IPA_MAP = {
        # Multi-character graphemes
        "lh": "ɬ", "gh": "ɣ", "kh": "x", "ch": "tʃ", "dj": "dʒ",
        # Emphatics
        "ṭ": "tˤ", "ḍ": "dˤ", "ṣ": "sˤ", "ẓ": "zˤ",
        "ṛ": "rˤ", "ḷ": "lˤ", "ṃ": "mˤ", "ṇ": "nˤ",
        # Others whose IPA symbol differs from the grapheme
        "ḥ": "ħ", "γ": "ɣ", "č": "tʃ", "j": "dʒ", "y": "j",
    }
    _IPA_RE = re.compile(
        "|".join(re.escape(g) for g in sorted(_IPA_MAP, key=len, reverse=True))
    )

    def __init__(self):
        self.consonants = BERBER_CONSONANTS
        self.vowels = BERBER_VOWELS
        self.emphatics = EMPHATIC_CONSONANTS

    def syllabify(self, word: str) -> List[str]:
        """
        Basic syllabification for Berber.

        The word is cut after every vowel letter, so each syllable is a
        (possibly empty) consonant run plus one vowel; consonants left over
        at the end are attached to the last syllable. A word without vowels
        (including the empty string) is returned as a single chunk.
        Berber allows complex consonant clusters (§2.1.4.2).
        """
        syllables: List[str] = []
        current = ""
        for char in word:
            current += char
            if char.lower() in self._VOWELS:
                syllables.append(current)
                current = ""
        if not syllables:
            return [word]
        if current:
            syllables[-1] += current  # trailing consonants form the coda
        return syllables

    def has_pharyngealization(self, word: str) -> bool:
        """Check if a word contains any character listed in ``self.emphatics``."""
        return any(c in self.emphatics for c in word)

    def get_consonant_cluster(self, word: str) -> List[str]:
        """Extract consonant clusters (important for Berber root detection).

        Returns every run of two or more non-vowel characters of the
        lower-cased word, in order of appearance.
        """
        runs = self._NON_VOWEL_RUN_RE.findall(word.lower())
        return [run for run in runs if len(run) > 1]

    def to_ipa(self, word: str) -> str:
        """Transcribe Berber Latin orthography to IPA.

        Graphemes not listed in the mapping table are copied unchanged.
        Matching is case-sensitive.
        """
        table = self._IPA_MAP
        return self._IPA_RE.sub(lambda m: table[m.group(0)], word)

    def analyze_word_phonology(self, word: str) -> Dict:
        """Full phonological profile of a word.

        Returns a dict with the keys ``form``, ``ipa``, ``syllables``,
        ``n_syllables``, ``has_emphatics``, ``consonant_clusters``,
        ``vowel_count`` and ``notes`` (empty string when there are no
        emphatic consonants).
        """
        syllables = self.syllabify(word)
        emphatic = self.has_pharyngealization(word)
        return {
            "form": word,
            "ipa": self.to_ipa(word),
            "syllables": syllables,
            "n_syllables": len(syllables),
            "has_emphatics": emphatic,
            "consonant_clusters": self.get_consonant_cluster(word),
            "vowel_count": sum(1 for c in word if c.lower() in self._VOWELS),
            "notes": (
                "Conté consonants emfàtiques (faringalitzades) (§2.1.1.3)"
                if emphatic else ""
            ),
        }
# ─────────────────────────────────────────────────────────────────────────────
# 7. BILINGUAL DICTIONARY
# ─────────────────────────────────────────────────────────────────────────────
class BerberDictionary:
    """
    Bilingual Catalan-Berber / Berber-Catalan dictionary.
    Parsed from vocabulary sections of Lamuela (2005), §4.

    Entries are dicts with the keys ``form_ca``, ``form_ber``, ``pos`` and
    ``notes``. They are indexed twice, by lower-cased Catalan form and by
    lower-cased Berber form; each index maps a form to a list of entries.
    """

    def __init__(self, data_path: Optional[Path] = None):
        """Load vocabulary from ``data_path`` if given, else the embedded subset."""
        self.ca_to_ber: Dict[str, List[Dict]] = {}
        self.ber_to_ca: Dict[str, List[Dict]] = {}
        if data_path:
            self.load_from_files(data_path)
        else:
            self._load_embedded_vocab()

    def _add_entry(self, ca: str, ber: str, pos: str, notes: str) -> None:
        """Register one entry in both lookup indexes."""
        entry = {"form_ca": ca, "form_ber": ber, "pos": pos, "notes": notes}
        # Lookups lower-case the query, so the keys are lower-cased as well.
        self.ca_to_ber.setdefault(ca.lower(), []).append(entry)
        self.ber_to_ca.setdefault(ber.lower(), []).append(entry)

    def load_from_files(self, data_path) -> None:
        """Load entries from a JSON file, or from every ``*.json`` in a directory.

        NOTE(review): the on-disk format is not defined anywhere in this
        file; this reads the format produced by ``to_json()`` (a JSON list
        of objects with ``form_ca`` and ``form_ber``, optional ``pos`` and
        ``notes``). Confirm against the real data files.

        Raises:
            FileNotFoundError: if ``data_path`` does not exist.
            ValueError: if a file does not contain a list of such objects.
        """
        root = Path(data_path)
        if not root.exists():
            raise FileNotFoundError(f"Dictionary data not found: {root}")
        files = sorted(root.glob("*.json")) if root.is_dir() else [root]
        for file in files:
            with file.open(encoding="utf-8") as fh:
                records = json.load(fh)
            if not isinstance(records, list):
                raise ValueError(f"{file}: expected a JSON list of entries")
            for rec in records:
                try:
                    ca, ber = rec["form_ca"], rec["form_ber"]
                except (KeyError, TypeError) as err:
                    raise ValueError(
                        f"{file}: entry without 'form_ca'/'form_ber': {rec!r}"
                    ) from err
                self._add_entry(ca, ber, rec.get("pos", ""), rec.get("notes", ""))

    def _load_embedded_vocab(self):
        """Load a curated subset of vocabulary from the book."""
        # Format: (catalan, berber, pos, notes)
        # NOTE(review): non-ASCII characters were reconstructed from
        # mis-decoded text; confirm against the upstream file.
        vocab = [
            # Basic nouns
            ("home", "argaz", "n.m", "pl: irgazen"),
            ("dona", "tamghart", "n.f", "pl: timghartin; ta-...-t circumfix"),
            ("nen", "aqcic", "n.m", "pl: iqcicen"),
            ("nena", "tafruxt", "n.f", "pl: tifruxin"),
            ("pare", "baba", "n.m", "terme vocatiu; cf. 'yiwa'"),
            ("mare", "yemma", "n.f", "terme vocatiu"),
            ("fill", "aryaz", "n.m", "pl: irgazen (homonim amb 'home')"),
            ("filla", "taryaz", "n.f", ""),
            ("germà", "gma", "n.m", "possessiu inherent"),
            ("germana", "ultma", "n.f", "possessiu inherent"),
            # Body parts
            ("cap", "ixf", "n.m", ""),
            ("pit", "abbuc", "n.m", ""),
            ("ull", "tiṭ", "n.f", "pl: tiṭṭawin"),
            ("mà", "afus", "n.m", "pl: ifassen"),
            ("peu", "aḍar", "n.m", "pl: iḍaren; consonant emfàtica"),
            # Colors (§2.4.2)
            ("negre", "aberkan", "adj", "f: taberkanṭ; pl: iberkan"),
            ("blanc", "acemlal", "adj", "f: tacemlalt; pl: icemlalen"),
            ("vermell", "azegzaw", "adj", "verd també"),
            ("groc", "awraγ", "adj", "f: tawraγt"),
            # Common verbs
            ("dir", "ini", "v", "perf: yini; imperf: ittini"),
            ("venir", "as", "v", "perf: yusa; imperf: ittusa; irregular"),
            ("anar", "ddu", "v", "perf: yedda; imperf: itteddu"),
            ("menjar", "ecc", "v", "perf: yecca; imperf: ittecca"),
            ("beure", "sw", "v", "perf: yeswa; imperf: itteswa"),
            # NOTE(review): same Berber form as "dir" above, with unrelated
            # perf/imperf notes; looks like a data-entry slip. Verify.
            ("dormir", "ini", "v", "perf: yudda; imperf: ittudda"),
            ("saber", "ssen", "v", "perf: yessen; imperf: ittessen"),
            ("voler", "iri", "v", "perf: yira; imperf: ittira"),
            ("poder", "zḍer", "v", "perf: yezḍer; imperf: ittezḍer"),
            # Numbers
            ("un", "yan", "num", "f: yat"),
            ("dos", "sin", "num", "f: snat"),
            ("tres", "kraḍ", "num", "f: kraṭt"),
            ("quatre", "kuẓ", "num", "f: kuẓṭ"),
            ("cinc", "semmus", "num", ""),
            ("sis", "sḍis", "num", ""),
            ("set", "sa", "num", "f: sat"),
            ("vuit", "tam", "num", "f: tamt"),
            ("nou", "tẓa", "num", ""),
            ("deu", "mraw", "num", ""),
            # Time
            ("avui", "ass-a", "adv", ""),
            ("ahir", "iḍelli", "adv", ""),
            ("demà", "azekka", "adv", ""),
            ("ara", "tura", "adv", ""),
            ("sempre", "dima", "adv", "arabisme freqüent"),
            # Greetings
            ("hola", "azul", "interj", ""),
            ("gràcies", "tanemmirt", "interj", ""),
            ("sí", "ih", "part", ""),
            ("no", "uhu", "part", ""),
            # Prepositions
            ("a/per a", "i", "prep", "marca datiu"),
            ("amb", "d", "prep", "comitatiu; també copula"),
            ("dins", "deg", "prep", "locatiu"),
            ("sobre", "xef", "prep", ""),
            ("davant", "zzat", "prep", ""),
            ("darrere", "deffir", "prep", ""),
            ("entre", "ger", "prep", ""),
            ("abans", "qbel", "prep", ""),
            ("després", "deffir", "prep", ""),
        ]
        for ca, ber, pos, notes in vocab:
            self._add_entry(ca, ber, pos, notes)

    def lookup_ca(self, word: str) -> List[Dict]:
        """Look up a Catalan word → Berber translation(s); [] if unknown."""
        return self.ca_to_ber.get(word.lower(), [])

    def lookup_ber(self, word: str) -> List[Dict]:
        """Look up a Berber word → Catalan translation(s); [] if unknown."""
        return self.ber_to_ca.get(word.lower(), [])

    def search(self, query: str, lang: str = "ca") -> List[Dict]:
        """Case-insensitive substring search over the dictionary keys.

        ``lang="ca"`` searches Catalan forms; any other value searches
        Berber forms. Returns the entries of every key containing ``query``.
        """
        source = self.ca_to_ber if lang == "ca" else self.ber_to_ca
        needle = query.lower()
        results: List[Dict] = []
        for key, entries in source.items():
            if needle in key.lower():
                results.extend(entries)
        return results

    def to_json(self) -> str:
        """Export the full dictionary as a JSON list of entries."""
        all_entries = [
            entry for entries in self.ca_to_ber.values() for entry in entries
        ]
        return json.dumps(all_entries, ensure_ascii=False, indent=2)
# ─────────────────────────────────────────────────────────────────────────────
# 8. ERROR ANALYSIS (Catalan learner errors by Berber speakers)
# ─────────────────────────────────────────────────────────────────────────────
class BerberLearnerErrorAnalyzer:
    """
    Analyzes typical errors made by Berber speakers learning Catalan.
    Based on §3 of Lamuela (2005) — error corpus with linguistic explanations.
    """

    # Error patterns documented in the book.
    # NOTE(review): accented characters and arrows were reconstructed from
    # mis-decoded text; confirm against the upstream file.
    ERROR_PATTERNS = [
        {
            "id": "ERR-GEN-01",
            "category": "Gènere nominal",
            "description": "Confusió de gènere en noms Catalans",
            "berber_cause": "El berber té gènere (masc/fem) però amb marcadors molt diferents (t-...-t)",
            "example_error": "té una cotxe",
            "correction": "té un cotxe",
            "reference": "§2.2.3.1",
        },
        {
            "id": "ERR-ART-01",
            "category": "Article definit",
            "description": "Omissió o ús incorrecte de l'article",
            "berber_cause": "El berber no té article definit; l'estat anex fa funció similar",
            "example_error": "vaig a mercat",
            "correction": "vaig al mercat",
            "reference": "§2.2.3.1 (estat lliure vs anex)",
        },
        {
            "id": "ERR-TEMPS-01",
            "category": "Temps verbal",
            "description": "Confusió entre temps de passat (perfecte/imperfet)",
            "berber_cause": "El berber organitza el verb per ASPECTE (perfectiu/imperfectiu) no per TEMPS",
            "example_error": "quan era petit, vaig anar cada dia a l'escola",
            "correction": "quan era petit, anava cada dia a l'escola",
            "reference": "§2.2.2.1 (sistema aspectual berber)",
        },
        {
            "id": "ERR-TEMPS-02",
            "category": "Temps verbal",
            "description": "Ús del perfet simple on caldria imperfet d'indicatiu",
            "berber_cause": "Aspecte perfectiu berber → perfet català; aspecte imperfectiu → imperfet",
            "example_error": "ahir vaig tenir molt fred",
            "correction": "ahir tenia molt fred (estat)",
            "reference": "§2.2.2",
        },
        {
            "id": "ERR-PREP-01",
            "category": "Règim preposicional",
            "description": "Preposició incorrecta amb verbs de moviment",
            "berber_cause": "El berber usa partícules direccionals lligades al verb (§2.3.2)",
            "example_error": "vaig a la meva casa",
            "correction": "vaig cap a casa meva",
            "reference": "§2.3.2",
        },
        {
            "id": "ERR-NEG-01",
            "category": "Negació",
            "description": "Negació doble o posició incorrecta del negatiu",
            "berber_cause": "La negació en berber és 'ur...ara' (circumfixa al verb) (§2.3.5)",
            "example_error": "no vinc no",
            "correction": "no vinc",
            "reference": "§2.3.5",
        },
        {
            "id": "ERR-PRON-01",
            "category": "Pronoms febles",
            "description": "Omissió de pronoms febles clítics",
            "berber_cause": "En berber els pronoms objecte s'incorporen com a sufixos verbals",
            "example_error": "he vist ahir",
            "correction": "l'he vist ahir",
            "reference": "§2.2.4",
        },
        {
            "id": "ERR-ORD-01",
            "category": "Ordre de la frase",
            "description": "Subjecte postverbal en frases declaratives",
            "berber_cause": "L'ordre canònic del berber és VSO; el subjecte va darrere del verb (§2.3.1)",
            "example_error": "ha vingut el meu germà ahir",
            "correction": "El meu germà ha vingut ahir (en català normatiu, SV és preferible)",
            "reference": "§2.3.1.1",
        },
    ]

    # Error id -> record, for constant-time lookup.
    _BY_ID = {pattern["id"]: pattern for pattern in ERROR_PATTERNS}

    # (compiled regex over the lower-cased sentence, error id).
    _CHECKS = [
        # Every noun listed is masculine, so only the feminine article "una"
        # is an error; "un cotxe" must not be flagged.
        (re.compile(r"\buna (cotxe|problema|tema|mapa|dia)\b"), "ERR-GEN-01"),
        # Bare preposition "a" directly before a noun: missing article.
        (re.compile(r"\ba (mercat|escola|feina|treball)\b"), "ERR-ART-01"),
    ]

    def analyze_sentence(self, sentence: str) -> List[Dict]:
        """Flag potential errors in a Catalan sentence by a Berber speaker.

        Matching is case-insensitive. Returns one copy of the matching
        ERROR_PATTERNS record per triggered check, in check order; an empty
        list when nothing matches.
        """
        text = sentence.lower()
        return [
            dict(self._BY_ID[err_id])  # copy: callers cannot alter class data
            for pattern, err_id in self._CHECKS
            if err_id in self._BY_ID and pattern.search(text)
        ]

    def get_error_by_category(self, category: str) -> List[Dict]:
        """Filter errors by linguistic category (case-insensitive substring)."""
        wanted = category.lower()
        return [e for e in self.ERROR_PATTERNS if wanted in e["category"].lower()]
# ─────────────────────────────────────────────────────────────────────────────
# 9. PIPELINE (all-in-one)
# ─────────────────────────────────────────────────────────────────────────────
class BerberNLPPipeline:
    """
    Facade that wires together every Berber analysis module of this file:
    tokenizer, morphology, phonology, syntax, dictionary and learner-error
    analysis.
    """

    def __init__(self, data_path: Optional[Path] = None):
        """Create one instance of each module; ``data_path`` goes to the dictionary."""
        self.tokenizer = BerberTokenizer()
        self.morph = BerberMorphAnalyzer()
        self.phonology = BerberPhonologyAnalyzer()
        self.syntax = BerberSyntaxAnnotator()
        self.dictionary = BerberDictionary(data_path)
        self.error_analyzer = BerberLearnerErrorAnalyzer()

    def analyze(self, text: str, lang: str = "ber") -> Dict:
        """
        Run every analysis layer that applies to ``text``.

        Args:
            text: Input text.
            lang: 'ber' for Berber analysis, 'ca' for Catalan error analysis.

        Returns:
            Dict that always holds ``input`` and ``lang``. For 'ber' it also
            holds ``tokens``, ``syntax``, ``vso_check``, ``morphology`` and
            ``phonology``; for 'ca' it also holds ``learner_errors``. Any
            other ``lang`` value adds nothing.
        """
        report = {"input": text, "lang": lang}

        if lang == "ca":
            report["learner_errors"] = self.error_analyzer.analyze_sentence(text)
            return report

        if lang != "ber":
            return report

        tokens = self.tokenizer.tokenize(text)
        syntax_layer = self.syntax.annotate(tokens)
        order_check = self.syntax.check_vso_order(syntax_layer)

        # Morphology and phonology are computed per token, in token order.
        morphology_layer = []
        phonology_layer = []
        for surface in (token["form"] for token in tokens):
            morphology_layer.append(vars(self.morph.analyze(surface)))
            phonology_layer.append(self.phonology.analyze_word_phonology(surface))

        report["tokens"] = tokens
        report["syntax"] = syntax_layer
        report["vso_check"] = order_check
        report["morphology"] = morphology_layer
        report["phonology"] = phonology_layer
        return report

    def translate(self, word: str, src: str = "ca") -> List[Dict]:
        """Dictionary lookup: Catalan→Berber when ``src`` is 'ca', else Berber→Catalan."""
        lookup = self.dictionary.lookup_ca if src == "ca" else self.dictionary.lookup_ber
        return lookup(word)
# ─────────────────────────────────────────────────────────────────────────────
# DEMO
# ─────────────────────────────────────────────────────────────────────────────
def main() -> None:
    """Run a console demo of every module on a few sample inputs.

    NOTE(review): accented characters and symbols in the printed strings
    were reconstructed from mis-decoded text; confirm against upstream.
    """
    rule = "=" * 60
    print(rule)
    print(" BERBER NLP TOOLKIT — Demo")
    print(" Basat en: Lamuela (2005), El Berber")
    print(rule)
    pipeline = BerberNLPPipeline()

    # 1. Analyze a Berber sentence
    sentence = "ikerz argaz akal"  # 'the man plowed the field' (VSO)
    print(f"\n1. ANÀLISI DE FRASE BERBER: '{sentence}'")
    result = pipeline.analyze(sentence, lang="ber")
    for tok in result["tokens"]:
        print(f" {tok['form']:15} gender={str(tok.get('gender','?')):8} state={str(tok.get('state','?'))}")
    print(f" Ordre VSO: {result['vso_check']['order']} (canònic: {result['vso_check']['is_canonical']})")

    # 2. Morphological analysis
    print("\n2. ANÀLISI MORFOLÒGICA:")
    for form in ["ikerrez", "tafruxt", "irgazen", "ttuzzel"]:
        ana = pipeline.morph.analyze(form)
        print(f" {form:15} POS={ana.pos:6} aspect={ana.aspect or '-':12} gender={ana.gender or '-':6}")

    # 3. Phonological analysis
    print("\n3. ANÀLISI FONOLÒGICA:")
    for word in ["argaz", "tafruxt", "ṭeffeγt", "amdaz"]:
        phon = pipeline.phonology.analyze_word_phonology(word)
        print(f" {word:15} IPA={phon['ipa']:18} síl·labes={phon['syllables']} emfàtiques={phon['has_emphatics']}")

    # 4. Dictionary lookup
    print("\n4. DICCIONARI CA→BERBER:")
    for word in ["home", "dir", "negre", "gràcies"]:
        for e in pipeline.translate(word, src="ca"):
            print(f" {word:15} → {e['form_ber']:15} ({e['pos']}) {e['notes']}")

    # 5. Learner error analysis (shows the first three documented patterns)
    print("\n5. ANÀLISI D'ERRORS (catalanòfons berbers):")
    for e in pipeline.error_analyzer.ERROR_PATTERNS[:3]:
        print(f" [{e['id']}] {e['category']}: {e['description']}")
        print(f" Error: '{e['example_error']}' → Correcció: '{e['correction']}'")

    print(f"\n{rule}")
    print(" Tots els mòduls operatius. ✓")
    print(rule)


if __name__ == "__main__":
    main()