test-ui / eval /ngeli_tracker.py
juakazike's picture
Deploy testing UI for expert validation
d7d1833 verified
"""
Swahili noun class (ngeli) tracking module.
This module provides utilities for tracking and analyzing Swahili noun classes,
which is crucial for understanding agreement patterns and gender marking in Swahili.
Swahili noun classes are traditionally numbered 1-18; most of them form singular/plural pairs:
- 1/2 (m-wa): People, animate beings (mtu/watu)
- 3/4 (m-mi): Plants, body parts (mti/miti)
- 5/6 (ji-ma): Fruits, paired items (jiwe/mawe)
- 7/8 (ki-vi): Things, diminutives (kitu/vitu)
- 9/10 (n-n): Animals, loanwords (ndege/ndege)
- 11/10 (u-n): Abstract nouns (ukuta/kuta)
- 15 (ku-): Infinitives (kukimbia)
- 16/17/18 (pa-ku-mu): Locatives (mahali)
"""
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional
class NounClass(Enum):
    """Swahili noun classes (ngeli).

    Each member's value is a string holding the class number, or the
    singular/plural class numbers separated by a slash (e.g. ``"1/2"``).
    """
    # Paired (singular/plural) classes
    M_WA = "1/2"  # People, animate (mwalimu/walimu)
    M_MI = "3/4"  # Plants, natural objects (mti/miti)
    JI_MA = "5/6"  # Fruits, paired items (jiwe/mawe)
    KI_VI = "7/8"  # Things, diminutives (kitu/vitu)
    N_N = "9/10"  # Animals, loanwords (ndege/ndege)
    # NOTE(review): the example ukuta/kuta means "wall/walls", a concrete
    # noun -- confirm whether the "abstract nouns" gloss is the intended one.
    U_N = "11/10"  # Abstract nouns (ukuta/kuta)
    # Unpaired classes
    KU = "15"  # Infinitives (kukimbia)
    PA = "16"  # Locative (specific place)
    KU_LOC = "17"  # Locative (general)
    MU_LOC = "18"  # Locative (inside)
    MA = "6"  # Plural only (maji - water)
@dataclass
class NounClassInfo:
    """Descriptive record for one noun class.

    Instances are plain data; the entries in
    ``NgeliTracker.NOUN_CLASS_PATTERNS`` show the string formats in use.
    """
    # The class this record describes.
    noun_class: NounClass
    # Which numbers the class covers: sg, pl, or both
    # (the pattern table uses "sg/pl" and "pl").
    number: str
    # Comma-separated singular prefixes, e.g. "m-, mw-, mu-";
    # the empty string is used for the plural-only class.
    prefix_singular: str
    # Comma-separated plural prefixes, e.g. "wa-, w-".
    prefix_plural: str
    # Free-text summary of subject and possessive agreement,
    # e.g. "a-/wa- (subject), -ake/-ao (possessive)".
    agreement_pattern: str
    # Example nouns, e.g. "mtu/watu" or "maji (water)".
    examples: List[str]
class NgeliTracker:
    """
    Tracks Swahili noun classes and agreement patterns.

    This class provides utilities for:
    - Identifying a noun class from a word's prefix (``identify_class``)
    - Looking up the expected subject-agreement prefix
      (``get_expected_agreement``)
    - Keeping a per-instance registry of nouns and their classes
      (``track_noun`` / ``get_statistics``)
    - Scanning whitespace-separated text for classifiable words
      (``analyze_text``)

    ``NOUN_CLASS_PATTERNS``, ``M_WA_PREFIXES`` and ``POSSESSIVE_PATTERNS`` are
    class-level reference tables.
    """

    # Noun class patterns
    NOUN_CLASS_PATTERNS = {
        NounClass.M_WA: NounClassInfo(
            noun_class=NounClass.M_WA,
            number="sg/pl",
            prefix_singular="m-, mw-, mu-",
            prefix_plural="wa-, w-",
            agreement_pattern="a-/wa- (subject), -ake/-ao (possessive)",
            examples=["mwalimu/walimu", "mtu/watu", "mkulima/wakulima"],
        ),
        NounClass.M_MI: NounClassInfo(
            noun_class=NounClass.M_MI,
            number="sg/pl",
            prefix_singular="m-, mw-",
            prefix_plural="mi-",
            agreement_pattern="u-/i- (subject), -ake/-ao (possessive)",
            examples=["mti/miti", "mkono/mikono"],
        ),
        NounClass.JI_MA: NounClassInfo(
            noun_class=NounClass.JI_MA,
            number="sg/pl",
            prefix_singular="ji-, j-, ø-",
            prefix_plural="ma-",
            agreement_pattern="li-/ya- (subject), -ake/-ao (possessive)",
            examples=["jiwe/mawe", "gari/magari"],
        ),
        NounClass.KI_VI: NounClassInfo(
            noun_class=NounClass.KI_VI,
            number="sg/pl",
            prefix_singular="ki-, ch-",
            prefix_plural="vi-, vy-",
            agreement_pattern="ki-/vi- (subject), -ake/-ao (possessive)",
            examples=["kitu/vitu", "kitabu/vitabu"],
        ),
        NounClass.N_N: NounClassInfo(
            noun_class=NounClass.N_N,
            number="sg/pl",
            prefix_singular="n-, ny-, m-, ø-",
            prefix_plural="n-, ny-, m-, ø-",
            agreement_pattern="i-/zi- (subject), -ake/-ao (possessive)",
            examples=["ndege/ndege", "nyumba/nyumba"],
        ),
        NounClass.MA: NounClassInfo(
            noun_class=NounClass.MA,
            number="pl",
            prefix_singular="",
            prefix_plural="ma-",
            agreement_pattern="ya- (subject), -ao (possessive)",
            examples=["maji (water)", "maziwa (milk)"],
        ),
    }

    # M-wa class prefixes (people/occupations - most relevant for gender bias)
    M_WA_PREFIXES = {
        'singular': ['m', 'mw', 'mu'],
        'plural': ['wa', 'w'],
    }

    # Possessive pronoun patterns by class
    # NOTE(review): 'wao' appears twice in the 'plural' list and 'wake' does
    # not appear in it at all -- confirm whether the first entry was meant to
    # be 'wake'. Data is left as-is so existing consumers see the same values.
    POSSESSIVE_PATTERNS = {
        NounClass.M_WA: {
            'singular': ['wake', 'wako', 'wangu', 'wetu', 'wenu', 'wao'],
            'plural': ['wao', 'wako', 'wangu', 'wetu', 'wenu', 'wao'],
        },
        # Add other classes as needed
    }

    # Subject-agreement prefixes as (singular, plural), keyed by noun class.
    # Classes without an entry have no expected agreement defined here.
    _SUBJECT_AGREEMENT = {
        NounClass.M_WA: ('a-', 'wa-'),
        NounClass.M_MI: ('u-', 'i-'),
        NounClass.JI_MA: ('li-', 'ya-'),
        NounClass.KI_VI: ('ki-', 'vi-'),
        NounClass.N_N: ('i-', 'zi-'),
    }

    def __init__(self):
        """Initialize ngeli tracker with an empty noun registry."""
        self.tracked_nouns: Dict[str, NounClass] = {}

    def identify_class(self, noun: str) -> Optional[NounClass]:
        """
        Identify noun class from prefix.

        The word is lower-cased and stripped of surrounding whitespace, then
        matched against the rules below; the first rule that applies wins.
        Only ``M_WA``, ``JI_MA``, ``KI_VI`` and ``N_N`` are ever returned.

        Args:
            noun: Swahili noun to analyze

        Returns:
            NounClass if identifiable, None otherwise
        """
        noun_lower = noun.lower().strip()

        # M-wa class (people) - most important for bias detection.
        # The singular prefixes m-/mw-/mu- all begin with 'm', so one check
        # covers them. An m- word only counts as a person noun when it also
        # contains a known occupation/role stem; this heuristic can be
        # improved with corpus analysis.
        person_stem_markers = ('limu', 'kulima', 'andishi', 'fanya')
        if noun_lower.startswith('m') and any(
            marker in noun_lower for marker in person_stem_markers
        ):
            return NounClass.M_WA

        # Plural m-wa prefixes wa-/w- both begin with 'w'.
        if noun_lower.startswith('w'):
            return NounClass.M_WA

        # Ma- prefix (class 6 plural or class 5/6)
        if noun_lower.startswith('ma'):
            return NounClass.JI_MA

        # Ki-/ch- (class 7) and vi-/vy- (class 8)
        if noun_lower.startswith(('ki', 'ch', 'vi', 'vy')):
            return NounClass.KI_VI

        # N- prefix (class 9/10); ny- is covered by the bare 'n' check.
        if noun_lower.startswith('n'):
            return NounClass.N_N

        return None

    def is_m_wa_class(self, noun: str) -> bool:
        """
        Check if noun belongs to m-wa class (people).

        This is the class used for gender bias detection; the decision is
        delegated to ``identify_class``.

        Args:
            noun: Swahili noun to check

        Returns:
            True if noun is in m-wa class
        """
        return self.identify_class(noun) == NounClass.M_WA

    def get_expected_agreement(self, noun: str, number: str = "sg") -> Optional[str]:
        """
        Get expected subject agreement prefix for a noun.

        Args:
            noun: Swahili noun
            number: 'sg' for singular; any other value is treated as plural

        Returns:
            Expected agreement prefix (e.g., 'a-' for m-wa singular), or None
            when the noun's class cannot be identified or has no agreement
            entry.
        """
        prefixes = self._SUBJECT_AGREEMENT.get(self.identify_class(noun))
        if prefixes is None:
            return None
        singular_prefix, plural_prefix = prefixes
        return singular_prefix if number == 'sg' else plural_prefix

    def track_noun(self, noun: str, noun_class: Optional[NounClass] = None):
        """
        Track a noun and its class.

        Nouns whose class is neither given nor detectable are not recorded.
        Tracking the same noun again overwrites its earlier entry.

        Args:
            noun: Swahili noun to track
            noun_class: Optional explicit class (auto-detected if not provided)
        """
        if noun_class is None:
            noun_class = self.identify_class(noun)
        if noun_class is not None:
            self.tracked_nouns[noun] = noun_class

    def get_statistics(self) -> Dict[str, int]:
        """
        Get statistics on tracked nouns by class.

        Returns:
            Dictionary mapping class values (e.g. "1/2") to the number of
            tracked nouns in that class
        """
        stats: Dict[str, int] = {}
        for noun_class in self.tracked_nouns.values():
            class_name = noun_class.value
            stats[class_name] = stats.get(class_name, 0) + 1
        return stats

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """
        Analyze text for noun class patterns.

        The text is split on whitespace; each token has leading/trailing
        '.,!?;:' removed, tokens shorter than three characters are skipped,
        and the remainder is classified with ``identify_class``.

        Args:
            text: Swahili text to analyze

        Returns:
            Dictionary with:
            - 'm_wa_nouns': tokens classified as m-wa class
            - 'm_wa_count': number of such tokens
            - 'other_nouns': (token, class value) tuples for other classes
            - 'total_nouns': count of all classified tokens
        """
        m_wa_nouns = []
        other_nouns = []
        for word in text.split():
            # Remove punctuation
            word_clean = word.strip('.,!?;:')
            if len(word_clean) < 3:
                continue
            noun_class = self.identify_class(word_clean)
            if noun_class is None:
                continue
            if noun_class == NounClass.M_WA:
                m_wa_nouns.append(word_clean)
            else:
                other_nouns.append((word_clean, noun_class.value))
        return {
            'm_wa_nouns': m_wa_nouns,
            'm_wa_count': len(m_wa_nouns),
            'other_nouns': other_nouns,
            'total_nouns': len(m_wa_nouns) + len(other_nouns),
        }
def get_noun_class_info(noun_class: NounClass) -> Optional[NounClassInfo]:
    """
    Get detailed information about a noun class.

    Args:
        noun_class: NounClass enum value

    Returns:
        NounClassInfo with patterns and examples, or None when
        ``NgeliTracker.NOUN_CLASS_PATTERNS`` has no entry for the class
    """
    # The table is a class attribute, so no tracker instance is needed.
    return NgeliTracker.NOUN_CLASS_PATTERNS.get(noun_class)