berber_nlp / berber_nlp.py
jamalinu's picture
Upload berber_nlp.py
b9b82dc verified
"""
berber_nlp.py
=============
Core NLP toolkit for the Amazigh/Berber language (Tarifit/Riffian dialect focus).
Extracted and structured from: Lamuela, X. (2005). El Berber: Estudi Comparatiu
entre la Gramàtica del Català i la del Berber o Amazig. Universitat de Girona.
Modules:
- Tokenizer
- Phonological analyzer (consonants, vowels, pharyngealization)
- Morphological analyzer (root extraction, aspect system, gender/number)
- Syntax annotator (VSO order, free state / annexed state)
- Bilingual dictionary (Catalan-Berber / Berber-Catalan)
- Transliterator (Latin IPA ↔ Tifinagh)
"""
import re
import json
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple
from pathlib import Path
# ─────────────────────────────────────────────────────────────────────────────
# 1. PHONOLOGICAL CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Berber consonant inventory (from §2.1.1 of the book).
# Format: {grapheme: (IPA, place_of_articulation, manner, features)}
# Every key is a single precomposed code point except the digraph "lh", so
# per-character lookups against this table work on NFC-normalized text.
# NOTE(review): pharyngealization is written with U+02C0 (ˀ) as elsewhere in
# this file; standard IPA uses U+02E4 (ˤ) -- confirm which one is intended.
BERBER_CONSONANTS = {
    # Labials
    "b": ("b", "bilabial", "oclusiva", []),
    "f": ("f", "labiodental", "fricativa", []),
    "m": ("m", "bilabial", "nasal", []),
    # Dentals / Alveolars
    "t": ("t", "dental", "oclusiva", []),
    "d": ("d", "dental", "oclusiva", []),
    "n": ("n", "alveolar", "nasal", []),
    "l": ("l", "alveolar", "lateral", []),
    "r": ("r", "alveolar", "vibrant", []),
    "s": ("s", "alveolar", "fricativa", []),
    "z": ("z", "alveolar", "fricativa", []),
    # Emphatics (pharyngealized - §2.1.1.3)
    "ṭ": ("tˀ", "dental", "oclusiva", ["pharyngealized"]),
    "ḍ": ("dˀ", "dental", "oclusiva", ["pharyngealized"]),
    "ṣ": ("sˀ", "alveolar", "fricativa", ["pharyngealized"]),
    "ẓ": ("zˀ", "alveolar", "fricativa", ["pharyngealized"]),
    "ṛ": ("rˀ", "alveolar", "vibrant", ["pharyngealized"]),
    "ḷ": ("lˀ", "alveolar", "lateral", ["pharyngealized"]),
    "ṃ": ("mˀ", "bilabial", "nasal", ["pharyngealized"]),
    "ṇ": ("nˀ", "alveolar", "nasal", ["pharyngealized"]),
    # Palatals / Postalveolars
    "y": ("j", "palatal", "semivocal", []),
    "č": ("tʃ", "postalveolar", "africada", []),
    "j": ("dʒ", "postalveolar", "africada", []),
    # Velars
    "k": ("k", "velar", "oclusiva", []),
    "g": ("g", "velar", "oclusiva", []),
    "x": ("x", "velar", "fricativa", []),
    "γ": ("ɣ", "velar", "fricativa", []),
    # Uvulars (§2.1.1.2)
    "q": ("q", "uvular", "oclusiva", []),
    "R": ("ʁ", "uvular", "fricativa", []),
    # Pharyngeals (§2.1.1.4)
    "ḥ": ("ħ", "faringe", "fricativa", []),
    "ʕ": ("ʕ", "faringe", "aproximant", []),
    # Glottals
    "h": ("h", "glotal", "fricativa", []),
    # Labiovelar
    "w": ("w", "labiovelar", "semivocal", []),
    # Lateral fricative (§2.1.1.5)
    "lh": ("ɬ", "alveolar", "fricativa_lat", []),
}
# Berber vowels (§2.1.4): only three phonemic vowels, plus an epenthetic schwa.
# Format: {grapheme: (IPA, height, backness, rounding)}
BERBER_VOWELS = {
    "a": ("a", "low", "central", "unrounded"),
    "i": ("i", "high", "front", "unrounded"),
    "u": ("u", "high", "back", "rounded"),
    # Schwa (epenthetic, non-phonemic in Riffian)
    "ə": ("ə", "mid", "central", "unrounded"),
}
# Emphatic harmony triggers (pharyngealization spreads in Berber).
# Members are single code points so that a per-character membership test
# (`any(c in EMPHATIC_CONSONANTS for c in word)`) can actually match them.
EMPHATIC_CONSONANTS = {"ṭ", "ḍ", "ṣ", "ẓ", "ṛ", "ḷ", "q", "ḥ", "ʕ"}
# ─────────────────────────────────────────────────────────────────────────────
# 2. MORPHOLOGICAL CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Verb aspect system (§2.2.2): Berber marks ASPECT, not TENSE.
# The three aspects are: perfectiu | imperfectiu | habitual-duratiu.
# Each root shape maps to one template string per aspect; C1, C2, C3 presumably
# stand for the root consonants in order (see the gemination comments below).
ASPECT_PATTERNS = {
    # Layout: {root_type: {"perfectiu": tpl, "imperfectiu": tpl, "habitual": tpl}}
    "CC_root": {
        "perfectiu": "C1C2",
        "imperfectiu": "C1C2C2", # gemination of final consonant
        "habitual": "ttC1C2", # tt- prefix
    },
    "CCC_root": {
        "perfectiu": "C1C2C3",
        "imperfectiu": "C1C2C2C3",
        "habitual": "ttC1C2C3",
    },
    "CVCC_root": {
        "perfectiu": "C1aC2C3",
        "imperfectiu": "C1C2C2C3", # vowel deletion + gemination
        "habitual": "ttC1C2C3",
    }
}
# Example verbs from the book (§2.2.2.1).
# One record per verb: consonantal root, one surface form per aspect, and a
# Catalan gloss. The surface forms are what BerberMorphAnalyzer indexes.
EXAMPLE_VERBS = [
    {"root": "zl", "perfectiu": "uzzel", "imperfectiu": "izzell", "habitual": "ttuzzel", "gloss": "córrer"},
    {"root": "rz", "perfectiu": "yerza", "imperfectiu": "irezzu", "habitual": "tterza", "gloss": "buscar"},
    {"root": "kr", "perfectiu": "ikerz", "imperfectiu": "ikerrez", "habitual": "ttkerrez", "gloss": "llaurar"},
    {"root": "ks", "perfectiu": "yeksa", "imperfectiu": "ikessa", "habitual": "tteksa", "gloss": "guardar ramat"},
    {"root": "fn", "perfectiu": "ifna", "imperfectiu": "ifenna", "habitual": "ttufna", "gloss": "morir (plantes)"},
    {"root": "rw", "perfectiu": "irwa", "imperfectiu": "irrewa", "habitual": "tterwa", "gloss": "estar ple"},
    {"root": "ql", "perfectiu": "iqqel", "imperfectiu": "iqqal", "habitual": "tteqqal", "gloss": "quedar-se"},
    # NOTE(review): imperfectiu and habitual share the same form here, so a
    # form -> aspect index can only keep one of them -- confirm against the book.
    {"root": "ẓẓ", "perfectiu": "ṣṣa", "imperfectiu": "tteẓẓa", "habitual": "tteẓẓa", "gloss": "cremar"},
]
# Gender markers (§2.2.3): Berber has grammatical gender.
# Maps each gender to the affixes that mark it on a noun; masculine is unmarked.
GENDER_MARKERS = {
    "masculine": {"prefix": "", "suffix": ""},
    "feminine": {"prefix": "t", "suffix": "t"}, # t...t circumfix
}
# Number markers (§2.2.3.2).
NUMBER_PATTERNS = {
    # Common singular -> plural patterns in Berber.
    "internal_plural": [
        # (singular_pattern, plural_pattern, example_sg, example_pl, gloss)
        # NOTE(review): the first example plural ends in -en while its pattern
        # says -an -- confirm against the book.
        ("aCC", "iCCan", "argaz", "irgazen", "home/homes"),
        ("taCC+t", "tiCC+in", "tafruxt", "tifruxin", "nena/nenes"),
        ("aCCaC", "iCCaCen", "amdaz", "imdazen", "missatger"),
        ("iCCi", "iCCan", "ifri", "ifran", "cova/coves"),
    ],
}
# Free State vs Annexed State (§2.2.3.1): a key Berber morphological feature.
# The "*_free" values are regex sources matching a free-state noun onset
# (prefix followed by a consonant letter); the "*_annexed" values are the
# replacement prefixes used in the annexed state.
STATE_PATTERNS = {
    # Masculine nouns: a- prefix (free) -> u- (annexed after verb subject)
    "masc_free": r"^a[bcdfghjklmnpqrstvwxyz]",
    "masc_annexed": "u", # initial 'a' becomes 'u'
    # Feminine nouns: ta- prefix (free) -> t- (annexed)
    "fem_free": r"^ta[bcdfghjklmnpqrstvwxyz]",
    "fem_annexed": "t", # 'ta' -> 't'
}
# Pronouns (§2.2.4).
# Keyed by person/number(/gender) tag. Each value is a 3-tuple whose last item
# is the Catalan gloss; the first two are presumably the short (clitic) form
# and the independent form -- TODO confirm against the book.
PERSONAL_PRONOUNS = {
    "1sg": ("nk", "nekk", "jo"),
    "2sg": ("k/m", "kiyyni/miyyni", "tu (masc/fem)"),
    "3sg_m": ("t", "ntta", "ell"),
    "3sg_f": ("tt", "nttat", "ella"),
    "1pl": ("nx", "nekkni", "nosaltres"),
    "2pl": ("kn/mnt", "kenwi/menwi", "vosaltres"),
    "3pl": ("tn", "nttni/nttenti", "ells/elles"),
}
# Verbal agreement prefixes/suffixes (§2.2.2.3).
# An empty gender string in the key means the form is not gender-specific;
# an empty prefix/suffix means no affix on that side. Suffixes carry a leading
# hyphen, prefixes do not.
VERB_AGREEMENT = {
    # (person, number, gender): (prefix, suffix)
    ("1", "sg", ""): ("", "-x"),
    ("2", "sg", "m"): ("t", "-t"),
    ("2", "sg", "f"): ("t", "-t"),
    ("3", "sg", "m"): ("i/y",""),
    ("3", "sg", "f"): ("t", ""),
    ("1", "pl", ""): ("n", ""),
    ("2", "pl", "m"): ("t", "-m"),
    ("2", "pl", "f"): ("t", "-mt"),
    ("3", "pl", "m"): ("", "-n"),
    ("3", "pl", "f"): ("", "-nt"),
}
# ─────────────────────────────────────────────────────────────────────────────
# 3. TOKENIZER
# ─────────────────────────────────────────────────────────────────────────────
class BerberTokenizer:
    """
    Tokenizer for Amazigh/Berber text in Latin script (Tifinagh optional).

    Splits text into word and punctuation tokens and attaches cheap,
    prefix-based heuristics (verb/noun guess, gender, state) to each token.
    """

    # (regex, tag, clitic form). Catalogue of proclitics and particles; it is
    # not consulted by tokenize() in this class.
    CLITIC_PATTERNS = [
        (r"^d-", "CONJ", "d"),        # 'and / copula' (§2.3.4)
        (r"^ur\s", "NEG", "ur"),      # negation prefix
        (r"^ad\s", "MOD", "ad"),      # prospective modal
        (r"^i-", "PREP", "i"),        # dative preposition
        (r"^s-", "PREP", "s"),        # instrumental/comitative
        (r"^xef\s", "PREP", "xef"),   # 'sobre/per' preposition
        (r"^deg\s", "PREP", "deg"),   # locative 'dins'
        (r"^ger\s", "PREP", "ger"),   # 'entre'
        (r"^qbel\s", "PREP", "qbel"),  # 'abans'
    ]

    # Word = run of word characters, combining marks or Latin Extended
    # Additional letters; anything else that is not whitespace is emitted as a
    # one-character punctuation token.
    _TOKEN_RE = re.compile(r"[\w\u0300-\u036f\u1e00-\u1eff]+|[^\w\s]")
    # An agreement prefix (i/y/t/n) immediately followed by a consonant letter.
    _VERB_RE = re.compile(r"^[iytn][bcdfghjklmnpqrstvwxyz]")

    def __init__(self):
        # NOTE(review): "À" is upper-case in an otherwise lower-case set;
        # presumably a lower-case accented vowel was intended -- confirm.
        self.vowels = set("aeiouəÀïü")
        self.special_chars = set("ṭḍṣẓṛḷṃṇḥγ")

    def tokenize(self, text: str) -> List[Dict]:
        """Tokenize Berber text into a list of token dicts.

        Each dict has the keys ``form``, ``lower``, ``is_verb``, ``is_noun``,
        ``gender`` and ``state``. Empty input yields an empty list.
        """
        return [
            {
                "form": tok,
                "lower": tok.lower(),
                "is_verb": self._looks_like_verb(tok),
                "is_noun": self._looks_like_noun(tok),
                "gender": self._detect_gender(tok),
                "state": self._detect_state(tok),
            }
            for tok in self._TOKEN_RE.findall(text)
        ]

    def _looks_like_verb(self, word: str) -> bool:
        """Berber verbs often start with i/y/t/n (agreement prefixes)."""
        return self._VERB_RE.match(word.lower()) is not None

    def _looks_like_noun(self, word: str) -> bool:
        """Berber nouns in free state: a- (masc) or ta-...-t (fem)."""
        w = word.lower()
        return w.startswith("a") or (w.startswith("ta") and w.endswith("t"))

    def _detect_gender(self, word: str) -> Optional[str]:
        """Return "femení", "masculí" or None from the word's prefix/suffix."""
        w = word.lower()
        if w.startswith("ta") and (w.endswith("t") or len(w) > 4):
            return "femení"
        if w.startswith(("a", "u")):
            return "masculí"
        return None

    def _detect_state(self, word: str) -> Optional[str]:
        """Free state ("lliure") vs annexed state ("anex"); None if neither."""
        w = word.lower()
        if w.startswith(("a", "ta")):
            return "lliure"
        # Any remaining t- onset cannot be ta- (handled above), so u- / t-
        # are read as the annexed-state prefixes.
        if w.startswith(("u", "t")):
            return "anex"
        return None
# ─────────────────────────────────────────────────────────────────────────────
# 4. MORPHOLOGICAL ANALYZER
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class MorphAnalysis:
    """Result record of a morphological analysis of one surface form.

    Only ``form`` is required; every other attribute defaults to ``None``
    (or an empty list for ``notes``) until an analyzer fills it in.
    """

    # The analyzed surface form, exactly as passed in.
    form: str
    # Consonantal root or stem, when one could be identified.
    root: Optional[str] = None
    # One of: perfectiu / imperfectiu / habitual.
    aspect: Optional[str] = None
    person: Optional[str] = None
    number: Optional[str] = None
    gender: Optional[str] = None
    # Free vs annexed state: lliure / anex.
    state: Optional[str] = None
    # Part-of-speech tag.
    pos: Optional[str] = None
    gloss: Optional[str] = None
    # Free-text remarks; a fresh list per instance.
    notes: List[str] = field(default_factory=list)
class BerberMorphAnalyzer:
    """
    Rule-based morphological analyzer for Berber verbs and nouns.
    Based on grammar description in Lamuela (2005), §2.2.

    Known verb forms are looked up in an index built from EXAMPLE_VERBS;
    everything else is analyzed with prefix/suffix heuristics.
    """

    _ASPECTS = ("perfectiu", "imperfectiu", "habitual")

    # First character of a verb form -> (person, number, gender).
    # An empty gender string means "not gender-specific".
    _PREFIX_AGREEMENT = {
        "i": ("3", "sg", "m"),
        "y": ("3", "sg", "m"),
        "t": ("3", "sg", "f"),
        "n": ("1", "pl", ""),
    }

    def __init__(self):
        self._build_verb_index()
        self._build_noun_patterns()

    def _build_verb_index(self):
        """Index known verbs from book examples (surface form -> analysis)."""
        self.verb_index = {}
        for verb in EXAMPLE_VERBS:
            for aspect in self._ASPECTS:
                # If two aspects share a form, the later aspect wins.
                self.verb_index[verb[aspect]] = {
                    "root": verb["root"],
                    "aspect": aspect,
                    "gloss": verb["gloss"],
                }

    def _build_noun_patterns(self):
        """Compile regex patterns for noun morphology."""
        consonant = "[bcdfghjklmnpqrstvwxyz]"
        # (pattern, gender, state, template); tried in order, first match wins.
        self.noun_patterns = [
            (re.compile(rf"^a({consonant}.+)$"), "masc", "lliure", "a+C..."),
            (re.compile(rf"^u({consonant}.+)$"), "masc", "anex", "u+C..."),
            (re.compile(r"^ta(.+)t$"), "fem", "lliure", "ta...t"),
            (re.compile(rf"^t({consonant}.+)t$"), "fem", "anex", "t...t"),
        ]

    def analyze_verb(self, form: str) -> MorphAnalysis:
        """Analyze a verb form for root, aspect, and agreement.

        Indexed forms return root, aspect and gloss only. Other forms get a
        heuristic aspect plus person/number/gender read from the first
        character and, overriding it, from the final character.
        """
        result = MorphAnalysis(form=form, pos="VERB")

        # Check known verbs
        entry = self.verb_index.get(form)
        if entry is not None:
            result.root = entry["root"]
            result.aspect = entry["aspect"]
            result.gloss = entry["gloss"]
            return result

        # Rule-based aspect detection
        if form.startswith("tt"):
            result.aspect = "habitual"
            result.notes.append("Prefix 'tt-' indica aspecte habitual/duratiu (§2.2.2)")
        elif re.search(r"(.)\1", form):  # any doubled character, read as gemination
            result.aspect = "imperfectiu"
            result.notes.append("Consonant geminada indica aspecte imperfectiu (§2.2.2)")
        else:
            result.aspect = "perfectiu"

        # Agreement prefix detection (all prefixes are one character long)
        agreement = self._PREFIX_AGREEMENT.get(form[:1])
        if agreement is not None:
            result.person, result.number, gender = agreement
            result.gender = gender or None

        # Suffix detection; a matching suffix overrides the prefix reading
        if form.endswith("x"):
            result.person = "1"
            result.number = "sg"
        elif form.endswith("m"):
            result.person = "2"
            result.number = "pl"
            result.gender = "m"
        elif form.endswith("n") and not form.endswith("an"):
            result.person = "3"
            result.number = "pl"
        return result

    def analyze_noun(self, form: str) -> MorphAnalysis:
        """Analyze a noun for gender, state, and plurality markers."""
        result = MorphAnalysis(form=form, pos="NOM")
        for pattern, gender, state, template in self.noun_patterns:
            m = pattern.match(form)
            if m:
                result.gender = gender
                result.state = state
                result.root = m.group(1)
                result.notes.append(
                    f"Plantilla morfològica: {template} — Gènere: {gender}, Estat: {state} (§2.2.3)"
                )
                break

        # Detect plural by -en / -an suffix
        if form.endswith(("en", "an")):
            result.number = "pl"
            result.notes.append("Sufix plural -(e/a)n (§2.2.3.2)")
        else:
            result.number = "sg"
        return result

    def analyze(self, form: str) -> MorphAnalysis:
        """Auto-detect POS and analyze.

        The verb heuristic is tried before the noun heuristic; forms matching
        neither come back tagged "DESCONEGUT".
        """
        tok = BerberTokenizer()
        if tok._looks_like_verb(form):
            return self.analyze_verb(form)
        if tok._looks_like_noun(form):
            return self.analyze_noun(form)
        return MorphAnalysis(form=form, pos="DESCONEGUT")
# ─────────────────────────────────────────────────────────────────────────────
# 5. SYNTAX ANNOTATOR
# ─────────────────────────────────────────────────────────────────────────────
class BerberSyntaxAnnotator:
    """
    Annotates basic syntactic structure of Berber clauses.
    Key feature: VSO (Verb-Subject-Object) order (§2.3.1).
    """

    # Berber particles and function words (§2.3)
    NEGATION = {"ur", "ulac"}
    PROSPECTIVE = {"ad"}  # prospective mood marker
    COPULA = {"d"}  # copula / conjunction
    QUESTION = {"is", "ma"}  # question particles
    DEMONSTRATIVES = {
        "wa": "m.sg", "ta": "f.sg",
        "wi": "m.pl", "ti": "f.pl",
        "wagi": "m.sg.prox", "tagi": "f.sg.prox",
    }
    PREPOSITIONS = {
        "i": "datiu",
        "s": "instrumental/comitatiu",
        "deg": "locatiu (dins)",
        "ger": "entre",
        "xef": "sobre/per",
        "qbel": "abans",
        "deffir": "darrere",
        "zzat": "davant",
        "-er": "directiu",
    }

    def annotate(self, tokens: List[Dict]) -> List[Dict]:
        """Add syntactic role annotations to a token list.

        Args:
            tokens: Token dicts with at least a ``form`` key; ``is_verb`` and
                ``is_noun`` flags are used when present.

        Returns:
            Shallow copies of the tokens, each with an added ``synt_role``:
            NEG, MOD:prosp, COP, Q, PREP:<label>, VERB, SUBJ, OBJ, NOM or "?".
            After a VERB the next noun is SUBJ and the noun after that is OBJ;
            nouns outside that window are NOM.
        """
        annotated = []
        # Which argument the current clause is still waiting for:
        # None (no open verb), "SUBJ" (verb seen) or "OBJ" (subject seen).
        expecting = None
        for tok in tokens:
            ann = dict(tok)
            form = tok["form"].lower()
            if form in self.NEGATION:
                role = "NEG"
            elif form in self.PROSPECTIVE:
                role = "MOD:prosp"
            elif form in self.COPULA:
                role = "COP"
            elif form in self.QUESTION:
                role = "Q"
            elif form in self.PREPOSITIONS:
                role = f"PREP:{self.PREPOSITIONS[form]}"
            elif tok.get("is_verb") and expecting != "SUBJ":
                # A verb-like token opens a clause unless a verb is already
                # waiting for its subject.
                role = "VERB"
                expecting = "SUBJ"
            elif tok.get("is_noun"):
                if expecting == "SUBJ":
                    role = "SUBJ"  # post-verbal subject in VSO
                    expecting = "OBJ"
                elif expecting == "OBJ":
                    role = "OBJ"  # second post-verbal noun
                    expecting = None
                else:
                    role = "NOM"
            else:
                role = "?"
            ann["synt_role"] = role
            annotated.append(ann)
        return annotated

    def check_vso_order(self, tokens: List[Dict]) -> Dict:
        """
        Verify VSO order in a sentence and flag deviations.
        In Berber, subject normally follows verb (§2.3.1.1).

        Args:
            tokens: Annotated tokens (``synt_role`` is read; missing -> "?").

        Returns:
            Dict with ``order`` ("VSO", "SVO" or "?"), the first positions of
            verb/subject/object (-1 when absent), ``is_canonical`` and a note.
            ``order`` is "?" and ``is_canonical`` is False unless both a verb
            and a subject were found.
        """
        roles = [t.get("synt_role", "?") for t in tokens]
        verb_idx = next((i for i, r in enumerate(roles) if r == "VERB"), -1)
        subj_idx = next((i for i, r in enumerate(roles) if r == "SUBJ"), -1)
        obj_idx = next((i for i, r in enumerate(roles) if "OBJ" in r), -1)

        # -1 is a "not found" sentinel, not a position: never compare with it.
        if verb_idx < 0 or subj_idx < 0:
            order = "?"
            is_canonical = False
        else:
            verb_first = verb_idx < subj_idx
            order = "VSO" if verb_first else "SVO"
            is_canonical = verb_first and (obj_idx < 0 or subj_idx < obj_idx)

        return {
            "order": order,
            "verb_pos": verb_idx,
            "subj_pos": subj_idx,
            "obj_pos": obj_idx,
            "is_canonical": is_canonical,
            "note": "L'ordre canònic del berber és VSO (§2.3.1)",
        }
# ─────────────────────────────────────────────────────────────────────────────
# 6. PHONOLOGICAL ANALYZER
# ─────────────────────────────────────────────────────────────────────────────
class BerberPhonologyAnalyzer:
    """
    Phonological analysis: consonant inventory, pharyngealization,
    syllable structure, and vowel system for Berber.
    """

    # Characters treated as syllable nuclei.
    # NOTE(review): "À" is upper-case in an otherwise lower-case set;
    # presumably a lower-case accented vowel was intended -- confirm.
    _VOWELS = "aeiouəÀïü"
    # Characters counted by the "vowel_count" field (plain vowels + schwa).
    _COUNTED_VOWELS = "aeiouə"
    # Maximal runs of non-vowel characters.
    _NON_VOWEL_RUN = re.compile("[^" + _VOWELS + "]+")

    # Orthography -> IPA, applied in order; digraphs come first so they are
    # consumed before any single-letter rule could touch their parts.
    # NOTE(review): pharyngealization uses U+02C0 (ˀ) as elsewhere in this
    # file; standard IPA uses U+02E4 (ˤ) -- confirm which one is intended.
    _IPA_REPLACEMENTS = (
        ("lh", "ɬ"), ("gh", "ɣ"), ("kh", "x"),
        ("ch", "tʃ"), ("dj", "dʒ"),
        ("ṭ", "tˀ"), ("ḍ", "dˀ"), ("ṣ", "sˀ"),
        ("ẓ", "zˀ"), ("ṛ", "rˀ"), ("ḷ", "lˀ"),
        ("ḥ", "ħ"), ("γ", "ɣ"),
        ("y", "j"),
    )

    def __init__(self):
        self.consonants = BERBER_CONSONANTS
        self.vowels = BERBER_VOWELS
        self.emphatics = EMPHATIC_CONSONANTS

    def syllabify(self, word: str) -> List[str]:
        """
        Basic syllabification for Berber.
        Rule: Berber allows complex consonant clusters (§2.1.4.2).

        Every vowel closes a syllable consisting of the consonants collected
        since the previous vowel plus that vowel; trailing consonants are
        attached to the last syllable. A word without vowels (including the
        empty string) is returned as a single syllable.
        """
        syllables = []
        current = ""
        for char in word:
            current += char
            if char in self._VOWELS:
                syllables.append(current)
                current = ""
        if current:
            if syllables:
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]

    def has_pharyngealization(self, word: str) -> bool:
        """Check if a word contains pharyngealized (emphatic) consonants."""
        return any(c in self.emphatics for c in word)

    def get_consonant_cluster(self, word: str) -> List[str]:
        """Extract consonant clusters (important for Berber root detection).

        Returns every run of two or more consecutive non-vowel characters of
        the lower-cased word, in order of appearance.
        """
        runs = self._NON_VOWEL_RUN.findall(word.lower())
        return [run for run in runs if len(run) > 1]

    def to_ipa(self, word: str) -> str:
        """Transcribe Berber Latin orthography to IPA.

        Characters without a rule (including upper-case letters) pass through
        unchanged.
        """
        ipa = word
        for src, tgt in self._IPA_REPLACEMENTS:
            ipa = ipa.replace(src, tgt)
        return ipa

    def analyze_word_phonology(self, word: str) -> Dict:
        """Full phonological profile of a word.

        Returns a dict with the keys ``form``, ``ipa``, ``syllables``,
        ``n_syllables``, ``has_emphatics``, ``consonant_clusters``,
        ``vowel_count`` and ``notes`` (empty unless emphatics are present).
        """
        syllables = self.syllabify(word)
        emphatic = self.has_pharyngealization(word)
        return {
            "form": word,
            "ipa": self.to_ipa(word),
            "syllables": syllables,
            "n_syllables": len(syllables),
            "has_emphatics": emphatic,
            "consonant_clusters": self.get_consonant_cluster(word),
            "vowel_count": sum(1 for c in word if c in self._COUNTED_VOWELS),
            "notes": (
                "Conté consonants emfàtiques (faringalitzades) (§2.1.1.3)"
                if emphatic else ""
            ),
        }
# ─────────────────────────────────────────────────────────────────────────────
# 7. BILINGUAL DICTIONARY
# ─────────────────────────────────────────────────────────────────────────────
class BerberDictionary:
    """
    Bilingual Catalan-Berber / Berber-Catalan dictionary.
    Parsed from vocabulary sections of Lamuela (2005), §4.

    Entries are dicts with the keys ``form_ca``, ``form_ber``, ``pos`` and
    ``notes``; each entry is indexed under both its Catalan and Berber form.
    """

    def __init__(self, data_path: Optional[Path] = None):
        """Build the dictionary from ``data_path`` or, if omitted, from the
        embedded vocabulary."""
        self.ca_to_ber: Dict[str, List[Dict]] = {}
        self.ber_to_ca: Dict[str, List[Dict]] = {}
        if data_path:
            self.load_from_files(data_path)
        else:
            self._load_embedded_vocab()

    def _add_entry(self, ca: str, ber: str, pos: str, notes: str) -> None:
        """Register one entry under both directions (keys are lower-cased to
        match the lower-cased queries used by the lookup methods)."""
        entry = {"form_ca": ca, "form_ber": ber, "pos": pos, "notes": notes}
        self.ca_to_ber.setdefault(ca.lower(), []).append(entry)
        self.ber_to_ca.setdefault(ber.lower(), []).append(entry)

    def load_from_files(self, data_path: Path) -> None:
        """Load entries from a JSON file, or from every ``*.json`` file in a
        directory (in sorted name order).

        The on-disk format is not specified anywhere in this module; this
        loader accepts what ``to_json()`` emits: a JSON array of objects with
        ``form_ca`` and ``form_ber`` (required) and ``pos`` / ``notes``
        (optional, default ""). NOTE(review): confirm against real data files.

        Raises:
            FileNotFoundError: if ``data_path`` does not exist.
            KeyError: if a record lacks ``form_ca`` or ``form_ber``.
        """
        path = Path(data_path)
        files = sorted(path.glob("*.json")) if path.is_dir() else [path]
        for file in files:
            with open(file, encoding="utf-8") as handle:
                records = json.load(handle)
            for rec in records:
                self._add_entry(
                    rec["form_ca"], rec["form_ber"],
                    rec.get("pos", ""), rec.get("notes", ""),
                )

    def _load_embedded_vocab(self):
        """Load a curated subset of vocabulary from the book."""
        # Format: (catalan, berber, pos, notes)
        vocab = [
            # Basic nouns
            ("home", "argaz", "n.m", "pl: irgazen"),
            ("dona", "tamghart", "n.f", "pl: timghartin; ta-...-t circumfix"),
            ("nen", "aqcic", "n.m", "pl: iqcicen"),
            ("nena", "tafruxt", "n.f", "pl: tifruxin"),
            ("pare", "baba", "n.m", "terme vocatiu; cf. 'yiwa'"),
            ("mare", "yemma", "n.f", "terme vocatiu"),
            ("fill", "aryaz", "n.m", "pl: irgazen (homonim amb 'home')"),
            ("filla", "taryaz", "n.f", ""),
            ("germà", "gma", "n.m", "possessiu inherent"),
            ("germana", "ultma", "n.f", "possessiu inherent"),
            # Body parts
            ("cap", "ixf", "n.m", ""),
            ("pit", "abbuc", "n.m", ""),
            ("ull", "tiṭ", "n.f", "pl: tiṭṭawin"),
            ("mà", "afus", "n.m", "pl: ifassen"),
            ("peu", "aḍar", "n.m", "pl: iḍaren; consonant emfàtica"),
            # Colors (§2.4.2)
            ("negre", "aberkan", "adj", "f: taberkanṭ; pl: iberkan"),
            ("blanc", "acemlal", "adj", "f: tacemlalt; pl: icemlalen"),
            ("vermell", "azegzaw", "adj", "verd també"),
            ("groc", "awraγ", "adj", "f: tawraγt"),
            # Common verbs
            ("dir", "ini", "v", "perf: yini; imperf: ittini"),
            ("venir", "as", "v", "perf: yusa; imperf: ittusa; irregular"),
            ("anar", "ddu", "v", "perf: yedda; imperf: itteddu"),
            ("menjar", "ecc", "v", "perf: yecca; imperf: ittecca"),
            ("beure", "sw", "v", "perf: yeswa; imperf: itteswa"),
            # NOTE(review): same Berber headword as "dir" above but different
            # aspect forms -- looks like a data slip; confirm against the book.
            ("dormir", "ini", "v", "perf: yudda; imperf: ittudda"),
            ("saber", "ssen", "v", "perf: yessen; imperf: ittessen"),
            ("voler", "iri", "v", "perf: yira; imperf: ittira"),
            ("poder", "zḍer", "v", "perf: yezḍer; imperf: ittezḍer"),
            # Numbers
            ("un", "yan", "num", "f: yat"),
            ("dos", "sin", "num", "f: snat"),
            ("tres", "kraḍ", "num", "f: kraṭt"),
            ("quatre", "kuẓ", "num", "f: kuẓṭ"),
            ("cinc", "semmus", "num", ""),
            ("sis", "sḍis", "num", ""),
            ("set", "sa", "num", "f: sat"),
            ("vuit", "tam", "num", "f: tamt"),
            ("nou", "tẓa", "num", ""),
            ("deu", "mraw", "num", ""),
            # Time
            ("avui", "ass-a", "adv", ""),
            ("ahir", "iḍelli", "adv", ""),
            ("demà", "azekka", "adv", ""),
            ("ara", "tura", "adv", ""),
            ("sempre", "dima", "adv", "arabisme freqüent"),
            # Greetings
            ("hola", "azul", "interj", ""),
            ("gràcies", "tanemmirt", "interj", ""),
            ("sí", "ih", "part", ""),
            ("no", "uhu", "part", ""),
            # Prepositions
            ("a/per a", "i", "prep", "marca datiu"),
            ("amb", "d", "prep", "comitatiu; també copula"),
            ("dins", "deg", "prep", "locatiu"),
            ("sobre", "xef", "prep", ""),
            ("davant", "zzat", "prep", ""),
            ("darrere", "deffir", "prep", ""),
            ("entre", "ger", "prep", ""),
            ("abans", "qbel", "prep", ""),
            ("després", "deffir", "prep", ""),
        ]
        for ca, ber, pos, notes in vocab:
            self._add_entry(ca, ber, pos, notes)

    def lookup_ca(self, word: str) -> List[Dict]:
        """Look up a Catalan word -> Berber translation(s); [] if unknown."""
        return self.ca_to_ber.get(word.lower(), [])

    def lookup_ber(self, word: str) -> List[Dict]:
        """Look up a Berber word -> Catalan translation(s); [] if unknown."""
        return self.ber_to_ca.get(word.lower(), [])

    def search(self, query: str, lang: str = "ca") -> List[Dict]:
        """Case-insensitive substring search over the headwords.

        Args:
            query: Text to look for inside headwords.
            lang: "ca" searches Catalan headwords; any other value searches
                Berber headwords.
        """
        source = self.ca_to_ber if lang == "ca" else self.ber_to_ca
        q = query.lower()
        results = []
        for key, entries in source.items():
            if q in key.lower():
                results.extend(entries)
        return results

    def to_json(self) -> str:
        """Export full dictionary as JSON (array of entries, UTF-8 text,
        grouped by Catalan headword)."""
        all_entries = []
        for entries in self.ca_to_ber.values():
            all_entries.extend(entries)
        return json.dumps(all_entries, ensure_ascii=False, indent=2)
# ─────────────────────────────────────────────────────────────────────────────
# 8. ERROR ANALYSIS (Catalan learner errors by Berber speakers)
# ─────────────────────────────────────────────────────────────────────────────
class BerberLearnerErrorAnalyzer:
    """
    Analyzes typical errors made by Berber speakers learning Catalan.
    Based on §3 of Lamuela (2005) — error corpus with linguistic explanations.
    """

    # Error patterns documented in the book
    ERROR_PATTERNS = [
        {
            "id": "ERR-GEN-01",
            "category": "Gènere nominal",
            "description": "Confusió de gènere en noms Catalans",
            "berber_cause": "El berber té gènere (masc/fem) però amb marcadors molt diferents (t-...-t)",
            "example_error": "té una cotxe",
            "correction": "té un cotxe",
            "reference": "§2.2.3.1",
        },
        {
            "id": "ERR-ART-01",
            "category": "Article definit",
            "description": "Omissió o ús incorrecte de l'article",
            "berber_cause": "El berber no té article definit; l'estat anex fa funció similar",
            "example_error": "vaig a mercat",
            "correction": "vaig al mercat",
            "reference": "§2.2.3.1 (estat lliure vs anex)",
        },
        {
            "id": "ERR-TEMPS-01",
            "category": "Temps verbal",
            "description": "Confusió entre temps de passat (perfecte/imperfet)",
            "berber_cause": "El berber organitza el verb per ASPECTE (perfectiu/imperfectiu) no per TEMPS",
            "example_error": "quan era petit, vaig anar cada dia a l'escola",
            "correction": "quan era petit, anava cada dia a l'escola",
            "reference": "§2.2.2.1 (sistema aspectual berber)",
        },
        {
            "id": "ERR-TEMPS-02",
            "category": "Temps verbal",
            "description": "Ús del perfet simple on caldria imperfet d'indicatiu",
            "berber_cause": "Aspecte perfectiu berber ≠ perfet català; aspecte imperfectiu ≠ imperfet",
            "example_error": "ahir vaig tenir molt fred",
            "correction": "ahir tenia molt fred (estat)",
            "reference": "§2.2.2",
        },
        {
            "id": "ERR-PREP-01",
            "category": "Règim preposicional",
            "description": "Preposició incorrecta amb verbs de moviment",
            "berber_cause": "El berber usa partícules direccionals lligades al verb (§2.3.2)",
            "example_error": "vaig a la meva casa",
            "correction": "vaig cap a casa meva",
            "reference": "§2.3.2",
        },
        {
            "id": "ERR-NEG-01",
            "category": "Negació",
            "description": "Negació doble o posició incorrecta del negatiu",
            "berber_cause": "La negació en berber és 'ur...ara' (circumfixa al verb) (§2.3.5)",
            "example_error": "no vinc no",
            "correction": "no vinc",
            "reference": "§2.3.5",
        },
        {
            "id": "ERR-PRON-01",
            "category": "Pronoms febles",
            "description": "Omissió de pronoms febles clítics",
            "berber_cause": "En berber els pronoms objecte s'incorporen com a sufixos verbals",
            "example_error": "he vist ahir",
            "correction": "l'he vist ahir",
            "reference": "§2.2.4",
        },
        {
            "id": "ERR-ORD-01",
            "category": "Ordre de la frase",
            "description": "Subjecte postverbal en frases declaratives",
            "berber_cause": "L'ordre canònic del berber és VSO; el subjecte va darrere del verb (§2.3.1)",
            "example_error": "ha vingut el meu germà ahir",
            "correction": "El meu germà ha vingut ahir (en català normatiu, SV és preferible)",
            "reference": "§2.3.1.1",
        },
    ]

    # (compiled pattern over the lower-cased sentence, ERROR_PATTERNS id).
    # The gender check lists masculine nouns, so only the feminine article
    # "una" is an error; "un cotxe" is the correct form and must not match.
    # A bare Berber negator (ur / ulac) in Catalan text has no catalogue entry
    # and is therefore not reported.
    _CHECKS = (
        (re.compile(r"\buna (cotxe|problema|tema|mapa|dia)\b"), "ERR-GEN-01"),
        (re.compile(r"\ba (mercat|escola|feina|treball)\b"), "ERR-ART-01"),
    )

    def analyze_sentence(self, sentence: str) -> List[Dict]:
        """Flag potential errors in a Catalan sentence by a Berber speaker.

        Returns the matching ERROR_PATTERNS entries (at most one per check, in
        check order); an empty list means nothing was detected.
        """
        s = sentence.lower()
        by_id = {e["id"]: e for e in self.ERROR_PATTERNS}
        return [
            by_id[err_id]
            for pattern, err_id in self._CHECKS
            if err_id in by_id and pattern.search(s)
        ]

    def get_error_by_category(self, category: str) -> List[Dict]:
        """Filter errors by linguistic category (case-insensitive substring
        of the ``category`` field)."""
        wanted = category.lower()
        return [e for e in self.ERROR_PATTERNS if wanted in e["category"].lower()]
# ─────────────────────────────────────────────────────────────────────────────
# 9. PIPELINE (all-in-one)
# ─────────────────────────────────────────────────────────────────────────────
class BerberNLPPipeline:
    """
    Facade that wires the tokenizer, analyzers, dictionary and learner-error
    checker together behind two entry points: analyze() and translate().
    """

    def __init__(self, data_path: Optional[Path] = None):
        """Create one instance of every component; ``data_path`` is handed to
        the dictionary unchanged."""
        self.tokenizer = BerberTokenizer()
        self.morph = BerberMorphAnalyzer()
        self.phonology = BerberPhonologyAnalyzer()
        self.syntax = BerberSyntaxAnnotator()
        self.dictionary = BerberDictionary(data_path)
        self.error_analyzer = BerberLearnerErrorAnalyzer()

    def analyze(self, text: str, lang: str = "ber") -> Dict:
        """
        Run every analysis layer that applies to ``lang`` over ``text``.

        Args:
            text: Input text.
            lang: 'ber' for Berber analysis, 'ca' for Catalan error analysis.

        Returns:
            Dict that always holds ``input`` and ``lang``. For 'ber' it also
            holds ``tokens``, ``syntax``, ``vso_check``, ``morphology`` and
            ``phonology``; for 'ca' it also holds ``learner_errors``. Any
            other ``lang`` adds nothing.
        """
        report = {"input": text, "lang": lang}

        if lang == "ca":
            report["learner_errors"] = self.error_analyzer.analyze_sentence(text)
            return report
        if lang != "ber":
            return report

        tokens = self.tokenizer.tokenize(text)
        annotated = self.syntax.annotate(tokens)
        forms = [tok["form"] for tok in tokens]

        report["tokens"] = tokens
        report["syntax"] = annotated
        report["vso_check"] = self.syntax.check_vso_order(annotated)
        # Dataclass results are flattened to plain dicts.
        report["morphology"] = [vars(self.morph.analyze(form)) for form in forms]
        report["phonology"] = [
            self.phonology.analyze_word_phonology(form) for form in forms
        ]
        return report

    def translate(self, word: str, src: str = "ca") -> List[Dict]:
        """Dictionary lookup: Catalan headword when ``src`` is "ca", Berber
        headword for any other value."""
        lookup = self.dictionary.lookup_ca if src == "ca" else self.dictionary.lookup_ber
        return lookup(word)
# ─────────────────────────────────────────────────────────────────────────────
# DEMO
# ─────────────────────────────────────────────────────────────────────────────
# Demo: exercises every module once and prints the results (Catalan UI text).
if __name__ == "__main__":
    print("=" * 60)
    print(" BERBER NLP TOOLKIT — Demo")
    print(" Basat en: Lamuela (2005), El Berber")
    print("=" * 60)
    pipeline = BerberNLPPipeline()

    # 1. Analyze a Berber sentence
    sentence = "ikerz argaz akal"  # 'the man plowed the field' (VSO)
    print(f"\n1. ANÀLISI DE FRASE BERBER: '{sentence}'")
    result = pipeline.analyze(sentence, lang="ber")
    for tok in result["tokens"]:
        print(f" {tok['form']:15} gender={str(tok.get('gender','?')):8} state={str(tok.get('state','?'))}")
    print(f" Ordre VSO: {result['vso_check']['order']} (canònic: {result['vso_check']['is_canonical']})")

    # 2. Morphological analysis
    print("\n2. ANÀLISI MORFOLÒGICA:")
    for form in ["ikerrez", "tafruxt", "irgazen", "ttuzzel"]:
        ana = pipeline.morph.analyze(form)
        print(f" {form:15} POS={ana.pos:6} aspect={ana.aspect or '-':12} gender={ana.gender or '-':6}")

    # 3. Phonological analysis
    print("\n3. ANÀLISI FONOLÒGICA:")
    for word in ["argaz", "tafruxt", "ṭeffeγt", "amdaz"]:
        phon = pipeline.phonology.analyze_word_phonology(word)
        print(f" {word:15} IPA={phon['ipa']:18} síl·labes={phon['syllables']} emfàtiques={phon['has_emphatics']}")

    # 4. Dictionary lookup
    print("\n4. DICCIONARI CA→BERBER:")
    for word in ["home", "dir", "negre", "gràcies"]:
        entries = pipeline.translate(word, src="ca")
        for e in entries:
            print(f" {word:15} → {e['form_ber']:15} ({e['pos']}) {e['notes']}")

    # 5. Learner error analysis (prints the first three catalogue entries)
    print("\n5. ANÀLISI D'ERRORS (catalanòfons berbers):")
    errors = pipeline.error_analyzer.ERROR_PATTERNS[:3]
    for e in errors:
        print(f" [{e['id']}] {e['category']}: {e['description']}")
        print(f" Error: '{e['example_error']}' → Correcció: '{e['correction']}'")

    print(f"\n{'='*60}")
    print(" Tots els mòduls operatius. ✓")
    print("=" * 60)