test-ui / eval /context_checker.py
juakazike's picture
Deploy testing UI for expert validation
d7d1833 verified
"""
Context-Aware Correction Checker for Gender Bias Detection
This module implements context detection to prevent over-correction of legitimate
gender references. It checks for conditions where bias correction should be skipped:
- Quoted text (historical quotes, citations)
- Proper nouns (organization names, titles)
- Historical context (past references, dates)
- Biographical context (specific person references)
- Statistical context (factual gender-specific data)
- Medical context (biological/health accuracy)
- Counter-stereotypes (positive challenges to stereotypes)
Based on industry best practices from:
- MBIAS: Mitigating Bias While Retaining Context
- SC2: Content Preservation in Long Text Style Transfer
- Token-Level Disentanglement approaches
"""
import re
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
class ContextCondition(Enum):
    """Kinds of context that can block a bias correction.

    Each member's value is the lowercase token accepted in a lexicon
    ``avoid_when`` field (tokens are pipe-separated there).
    """

    # Term appears inside quotation marks or next to a reported-speech verb.
    QUOTE = "quote"
    # Term appears near a year or a historical / past-tense marker.
    HISTORICAL = "historical"
    # Term is part of a name, title or organization name.
    PROPER_NOUN = "proper_noun"
    # Term describes a specific, named person.
    BIOGRAPHICAL = "biographical"
    # Term appears near percentages or statistical vocabulary.
    STATISTICAL = "statistical"
    # Term appears in a pregnancy / health-care setting.
    MEDICAL = "medical"
    # Text pairs a gender with a role that runs against the stereotype.
    COUNTER_STEREOTYPE = "counter_stereotype"
    # Term appears near legal or official-document vocabulary.
    LEGAL = "legal"
    # Term appears near creative-work vocabulary (song, film, character...).
    ARTISTIC = "artistic"
    # Term appears near a known organization or acronym.
    ORGANIZATION = "organization"
@dataclass
class ContextCheckResult:
    """Outcome of one context check.

    ``should_correct`` is the verdict; the remaining fields explain it and
    all have defaults, so a bare ``ContextCheckResult(should_correct=True)``
    is valid.
    """

    # True when the correction may proceed, False when context blocks it.
    should_correct: bool
    # The condition that blocked the correction; None when nothing blocked.
    blocked_by: Optional[ContextCondition] = None
    # Human-readable explanation of the verdict.
    reason: str = ""
    # Confidence attached to the verdict (defaults to 1.0).
    confidence: float = 1.0
    # The pattern (or a short label for it) that produced a block, if any.
    matched_pattern: str = ""
class ContextChecker:
    """
    Checks text context to determine if bias correction should be applied.

    A correction is skipped when the text around the biased term matches one
    of the context patterns below, e.g. when the gender reference is:
    - Historically accurate
    - Part of proper nouns/organization names
    - Quoting someone directly
    - Providing statistical facts
    - Medically/biologically necessary
    """

    # Context detection patterns organized by condition type.
    #
    # Two kinds of entries live in these lists:
    #   * Templates containing ``{term}`` are passed through ``str.format``
    #     with the regex-escaped biased term, so every regex quantifier brace
    #     in them MUST be doubled (``{{0,30}}``).
    #   * Entries without ``{term}`` are compiled verbatim, so their
    #     quantifier braces MUST be single (``{0,30}``). They describe the
    #     text as a whole and fire regardless of which term is being checked.
    #
    # Every pattern is compiled with re.IGNORECASE so terms and keywords match
    # in any case. Fragments that depend on real capitalisation (names,
    # acronyms) are wrapped in ``(?-i:...)``; without that, ``[A-Z][a-z]+``
    # would match any lowercase word and block almost every correction.
    CONTEXT_PATTERNS: Dict[ContextCondition, List[str]] = {
        ContextCondition.QUOTE: [
            # Direct quotes - various quote styles (ASCII and Unicode)
            r'"[^"]{{0,100}}{term}[^"]{{0,100}}"',  # "term"
            r"'[^']{{0,100}}{term}[^']{{0,100}}'",  # 'term'
            r'«[^»]{{0,100}}{term}[^»]{{0,100}}»',  # «term» French
            # German style: opens with U+201E, closes with U+201C/U+201D or ASCII "
            r'\u201e[^\u201c\u201d"]{{0,100}}{term}[^\u201c\u201d"]{{0,100}}[\u201c\u201d"]',
            # Typographic "smart" quotes: U+201C ... U+201D
            r'\u201c[^\u201d]{{0,100}}{term}[^\u201d]{{0,100}}\u201d',
            r'\"[^\"]{{0,100}}{term}[^\"]{{0,100}}\"',  # \"term\" escaped
            # Reported speech markers (Swahili & English)
            r'\b(alisema|anasema|walisema|said|says|stated|wrote|claimed)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(alisema|anasema|said|says)\b',
        ],
        ContextCondition.HISTORICAL: [
            # Year references
            r'\b(mwaka\s+)?\d{{4}}\b.{{0,50}}{term}',  # "mwaka 1990" or "1990"
            r'{term}.{{0,50}}\b(mwaka\s+)?\d{{4}}\b',
            r'\bin\s+\d{{4}}\b.{{0,30}}{term}',  # "in 1990"
            # Historical markers (Swahili)
            r'\b(kihistoria|historia|zamani|kale|enzi)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(kihistoria|historia|zamani)\b',
            # Historical markers (English)
            r'\b(historically|history|ancient|traditional|formerly)\b.{{0,50}}{term}',
            # Past tense markers
            r'\b(ilikuwa|walikuwa|alikuwa|was|were|used\s+to)\b.{{0,30}}{term}',
        ],
        ContextCondition.PROPER_NOUN: [
            # NOTE(review): this template uses a variable-width look-behind,
            # which `re` rejects ("look-behind requires fixed-width pattern");
            # _get_pattern_for_term therefore returns None and the entry is
            # skipped. Kept unchanged because its intended meaning is unclear
            # (the comment said "not at sentence start" but the look-behind
            # requires a sentence start) - confirm intent before repairing.
            r'(?<=[.!?]\s{{1,5}}|\A)(?![A-Z])\b{term}\s+[A-Z][a-z]+',
            # Mid-sentence term followed by a capitalised name ("mama Robert")
            r'(?<=[a-z])\s+{term}\s+(?-i:[A-Z][a-z]+)',
            # Swahili naming convention: Mama/Baba + Name (min 3 char name).
            # Term-less, so single braces; name must really be capitalised.
            r'\b[Mm]ama\s+(?-i:[A-Z][a-z]{2,})',  # "Mama Robert"
            r'\b[Bb]aba\s+(?-i:[A-Z][a-z]{2,})',  # "Baba Kanumba"
            # Capitalized title + term (not sentence start)
            r'(?<=[a-z.,;:]\s)(?-i:[A-Z][a-z]+)\s+{term}',  # "Chairman Mao" mid-sentence
            # Organization markers (Swahili)
            r'\b(Chama\s+cha|Shirika\s+la|Taasisi\s+ya|Kampuni\s+ya)\b.{{0,30}}{term}',
            # Organization markers (English)
            r'\b(Organization|Company|Association|Foundation|Institute)\s+.{{0,20}}{term}',
            r'{term}.{{0,20}}\b(Inc|Ltd|LLC|Corp|Foundation)\b',
            # Title patterns
            r'\b(Mheshimiwa|Dkt\.|Dr\.|Prof\.|Mr\.|Mrs\.|Ms\.)\s+.{{0,20}}{term}',
        ],
        ContextCondition.BIOGRAPHICAL: [
            # Specific person reference (Swahili)
            r'\b(yeye|huyu|yule)\s+(ni|alikuwa|amekuwa).{{0,30}}{term}',
            r'{term}\s+wa\s+kwanza',  # "first [role]"
            r'\baliyekuwa\b.{{0,20}}{term}',  # "who was [role]"
            r'\balikuwa\b.{{0,20}}{term}',  # "alikuwa mke wa" pattern
            # Specific person reference (English)
            r'\b(she|he)\s+(is|was|became|served\s+as).{{0,30}}{term}',
            r'\bthe\s+first\s+(female|male|woman|man)\s+{term}',
            # The "Name + role" patterns need case-sensitive names and are
            # handled separately in _check_condition.
        ],
        ContextCondition.STATISTICAL: [
            # Percentage patterns - term can be before or after with any separator
            r'\d+(\.\d+)?%\s*.{{0,30}}{term}',  # "70% of women"
            r'\d+(\.\d+)?%.{{0,30}}{term}',  # "70%... women" (any chars)
            r'{term}.{{0,30}}\d+(\.\d+)?%',
            # Statistical markers (Swahili)
            r'\b(takwimu|idadi|asilimia|wastani)\b.{{0,30}}{term}',
            # Statistical markers (English)
            r'\b(statistics|data|survey|study|research|percent|majority|minority)\b.{{0,30}}{term}',
            # Numeric context
            r'\b\d+\s+(kati\s+ya|out\s+of|of\s+the)\s+\d+\b.{{0,30}}{term}',
        ],
        ContextCondition.MEDICAL: [
            # Pregnancy/birth (Swahili) - term can be before or after
            r'\b(mjamzito|ujauzito|uzazi|kujifungua|mimba)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(mjamzito|ujauzito|uzazi|kujifungua)\b',
            # "Mama mjamzito" pattern
            r'\b{term}\s+mjamzito\b',
            r'\bmjamzito.{{0,10}}{term}',
            # Pregnancy/birth (English). The last two alternatives are stems,
            # so they take \w* - a bare stem followed by \b would never match
            # "obstetrics" or "gynecology".
            r'\b(pregnant|pregnancy|childbirth|maternal|obstetric\w*|gynecolog\w*)\b.{{0,50}}{term}',
            # Medical procedure context
            r'\b(saratani\s+ya\s+shingo|cervical\s+cancer|breast\s+cancer|prostate)\b.{{0,50}}{term}',
            # Healthcare setting markers
            r'\b(hospitali|clinic|daktari|nurse|doctor|hospital)\b.{{0,30}}{term}',
        ],
        ContextCondition.COUNTER_STEREOTYPE: [
            # Role reversal patterns (Swahili) - no term placeholder, single braces
            r'\b(mwanamke|mama)\b.{0,30}\b(mhandisi|rubani|fundi|mkurugenzi|daktari)\b',
            r'\b(mwanamume|baba)\b.{0,30}\b(muuguzi|mkunga|mlezi|mpishi)\b',
            # Role reversal patterns (English)
            r'\b(female|woman|she)\b.{0,30}\b(engineer|pilot|mechanic|CEO|surgeon)\b',
            r'\b(male|man|he)\b.{0,30}\b(nurse|secretary|nanny|caregiver)\b',
            # "First female/male" achievements
            r'\b(wa\s+kwanza|first)\b.{0,20}\b(wa\s+kike|wa\s+kiume|female|male)\b',
        ],
        ContextCondition.LEGAL: [
            # Legal document markers (Swahili)
            r'\b(sheria|mahakama|kesi|mshtakiwa|mlalamikaji)\b.{{0,30}}{term}',
            # Legal document markers (English)
            r'\b(court|legal|plaintiff|defendant|witness|law|statute)\b.{{0,30}}{term}',
            # Official document context
            r'\b(hati|certificate|document|official|sworn)\b.{{0,30}}{term}',
        ],
        ContextCondition.ARTISTIC: [
            # Creative work markers
            r'\b(wimbo|filamu|kitabu|hadithi|mchezo)\b.{{0,30}}{term}',
            r'\b(song|film|movie|book|novel|play|poem|lyrics)\b.{{0,30}}{term}',
            # Character/role context
            r'\b(mhusika|character|role|actor|actress)\b.{{0,30}}{term}',
        ],
        ContextCondition.ORGANIZATION: [
            # Organization name patterns (Swahili)
            r'\b(TAWOMA|BAWATA|TAMWA|UWT)\b',  # Known women's orgs
            r'\bChama\s+cha\s+\w+\s+{term}',
            # Organization acronyms near term (acronym must be upper-case)
            r'\b(?-i:[A-Z]{{2,6}})\b.{{0,20}}{term}',
        ],
    }

    # Swahili-specific patterns for common false positive scenarios.
    # These are searched without flags, so the name part is case-sensitive.
    SWAHILI_PRESERVE_PATTERNS = [
        # "Mama [Name]" - common Swahili naming convention (teknonym)
        r'\b[Mm]ama\s+[A-Z][a-z]+\b',
        # "Baba [Name]" - common Swahili naming convention
        r'\b[Bb]aba\s+[A-Z][a-z]+\b',
        # Religious/cultural titles
        r'\b(Bibi|Babu|Shangazi|Mjomba)\s+[A-Z][a-z]+\b',
    ]

    def __init__(self, strict_mode: bool = False):
        """
        Initialize the context checker.

        Args:
            strict_mode: If True, any context match blocks correction.
                If False, uses confidence scoring. The flag is stored on the
                instance; none of the methods in this class read it yet.
        """
        self.strict_mode = strict_mode
        self._compiled_patterns: Dict[ContextCondition, List[re.Pattern]] = {}
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Pre-compile the term-less patterns of every condition.

        Templates containing ``{term}`` cannot be compiled until the term is
        known and are handled by ``_get_pattern_for_term``. A term-less
        pattern that fails to compile is skipped (best effort).
        """
        for condition, patterns in self.CONTEXT_PATTERNS.items():
            compiled: List[re.Pattern] = []
            for pattern in patterns:
                if '{term}' in pattern:
                    continue
                try:
                    compiled.append(re.compile(pattern, re.IGNORECASE | re.UNICODE))
                except re.error:
                    continue
            self._compiled_patterns[condition] = compiled

    def _get_pattern_for_term(self, pattern_template: str, term: str) -> Optional[re.Pattern]:
        """Create a compiled pattern with the specific term inserted.

        Args:
            pattern_template: A template containing ``{term}``.
            term: The biased term; it is regex-escaped before insertion.

        Returns:
            The compiled pattern, or None when the template cannot be
            formatted or the resulting regex is invalid.
        """
        try:
            pattern = pattern_template.format(term=re.escape(term))
            return re.compile(pattern, re.IGNORECASE | re.UNICODE)
        except (re.error, KeyError, IndexError, ValueError):
            # KeyError/IndexError/ValueError: a quantifier brace in the
            # template was not doubled, so str.format tripped over it.
            return None

    def check_context(
        self,
        text: str,
        biased_term: str,
        avoid_when: str = "",
        constraints: str = ""
    ) -> ContextCheckResult:
        """
        Check if correction should be applied based on context.

        Args:
            text: Full text being analyzed
            biased_term: The specific biased term found
            avoid_when: Pipe-separated list of conditions from lexicon
            constraints: Additional constraints from lexicon (accepted for
                interface compatibility; not used by this method)

        Returns:
            ContextCheckResult indicating whether to proceed with correction.
            The first blocking condition wins.
        """
        # Parse avoid_when conditions from lexicon
        conditions_to_check = self._parse_avoid_when(avoid_when)

        # If no specific conditions, check all common ones
        if not conditions_to_check:
            conditions_to_check = [
                ContextCondition.QUOTE,
                ContextCondition.PROPER_NOUN,
                ContextCondition.BIOGRAPHICAL,
            ]

        for condition in conditions_to_check:
            result = self._check_condition(text, biased_term, condition)
            if not result.should_correct:
                return result

        # Swahili-specific preservation patterns. Every occurrence is
        # inspected: the term may sit inside a later "Mama X" even when the
        # first "Mama Y" in the text does not contain it.
        term_lower = biased_term.lower()
        for pattern in self.SWAHILI_PRESERVE_PATTERNS:
            for full_match in re.finditer(pattern, text):
                if term_lower in full_match.group(0).lower():
                    return ContextCheckResult(
                        should_correct=False,
                        blocked_by=ContextCondition.PROPER_NOUN,
                        reason=f"Term is part of Swahili naming convention: {full_match.group(0)}",
                        confidence=0.9,
                        matched_pattern=pattern
                    )

        # All checks passed - proceed with correction
        return ContextCheckResult(
            should_correct=True,
            reason="No blocking context detected",
            confidence=1.0
        )

    def _parse_avoid_when(self, avoid_when: str) -> List[ContextCondition]:
        """Parse the avoid_when field into ContextCondition enums.

        Parts are split on ``|``, stripped and lower-cased; parts that do not
        name a ContextCondition value are ignored.
        """
        if not avoid_when or avoid_when.strip() == "":
            return []

        conditions = []
        for part in avoid_when.split('|'):
            try:
                conditions.append(ContextCondition(part.strip().lower()))
            except ValueError:
                # Unknown condition, skip
                continue
        return conditions

    def _check_condition(
        self,
        text: str,
        term: str,
        condition: ContextCondition
    ) -> ContextCheckResult:
        """Check a specific context condition.

        Args:
            text: Full text being analyzed.
            term: The biased term found in the text.
            condition: The condition whose patterns should be tried.

        Returns:
            A blocking ContextCheckResult (should_correct=False, confidence
            0.85) for the first pattern that matches, otherwise a passing
            result.
        """
        termless_checked = False
        for pattern_template in self.CONTEXT_PATTERNS.get(condition, []):
            if '{term}' in pattern_template:
                pattern = self._get_pattern_for_term(pattern_template, term)
                if pattern is not None and pattern.search(text):
                    return ContextCheckResult(
                        should_correct=False,
                        blocked_by=condition,
                        reason=f"Detected {condition.value} context",
                        confidence=0.85,
                        matched_pattern=pattern_template
                    )
            elif not termless_checked:
                # The pre-compiled list already holds every term-less pattern
                # of this condition, so one pass at the position of the first
                # term-less template is enough.
                termless_checked = True
                for cp in self._compiled_patterns.get(condition, []):
                    if cp.search(text):
                        return ContextCheckResult(
                            should_correct=False,
                            blocked_by=condition,
                            reason=f"Detected {condition.value} context",
                            confidence=0.85,
                            matched_pattern=cp.pattern
                        )

        # Special check for biographical: names must be truly capitalised, so
        # these patterns are compiled WITHOUT IGNORECASE; only the term itself
        # is matched case-insensitively via a scoped (?i:...) group.
        if condition == ContextCondition.BIOGRAPHICAL:
            term_ci = '(?i:' + re.escape(term) + ')'

            # "FirstName LastName ... term"
            name_pattern = re.compile(
                r'[A-Z][a-z]+\s+[A-Z][a-z]+.{0,30}' + term_ci,
                re.UNICODE
            )
            if name_pattern.search(text):
                return ContextCheckResult(
                    should_correct=False,
                    blocked_by=condition,
                    reason=f"Detected {condition.value} context (name reference)",
                    confidence=0.85,
                    matched_pattern="[Name] + term"
                )

            # "term + Name" (e.g., "mke wa Nelson Mandela")
            term_name_pattern = re.compile(
                term_ci + r'\s+(wa\s+)?[A-Z][a-z]+(\s+[A-Z][a-z]+)?',
                re.UNICODE
            )
            if term_name_pattern.search(text):
                return ContextCheckResult(
                    should_correct=False,
                    blocked_by=condition,
                    reason=f"Detected {condition.value} context (name reference)",
                    confidence=0.85,
                    matched_pattern="term + [Name]"
                )

        # No match found for this condition
        return ContextCheckResult(
            should_correct=True,
            reason=f"No {condition.value} context detected",
            confidence=1.0
        )

    def is_in_quotes(self, text: str, term: str) -> bool:
        """Quick check if term appears within ASCII double or single quotes."""
        escaped = re.escape(term)
        quote_patterns = [
            r'"[^"]*' + escaped + r'[^"]*"',
            r"'[^']*" + escaped + r"[^']*'",
        ]
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in quote_patterns)

    def extract_proper_nouns(self, text: str) -> List[str]:
        """
        Extract potential proper nouns from text.

        Heuristic: a word counts when it starts with an upper-case character,
        is not the first word of its sentence, and is longer than one
        character once non-word characters are removed.

        Returns:
            The unique candidates, sorted so the result is deterministic.
        """
        proper_nouns = set()
        for sentence in re.split(r'[.!?]\s+', text):
            # Skip the first word: it is capitalised just for starting the sentence.
            for word in sentence.split()[1:]:
                if not word[0].isupper():
                    continue
                clean_word = re.sub(r'[^\w]', '', word)
                if len(clean_word) > 1:
                    proper_nouns.add(clean_word)
        return sorted(proper_nouns)

    def get_preservation_entities(self, text: str) -> List[str]:
        """
        Get entities that should be preserved during correction.

        Combines the proper-noun heuristic with upper-case acronyms (2-6
        letters) and capitalised two-word names found anywhere in the text.

        Returns:
            The unique entities, sorted so the result is deterministic.
        """
        entities = set(self.extract_proper_nouns(text))

        org_patterns = [
            r'\b[A-Z]{2,6}\b',  # Acronyms
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Two-word names
        ]
        for pattern in org_patterns:
            entities.update(re.findall(pattern, text))

        return sorted(entities)
# Convenience function for quick context check
def should_apply_correction(
    text: str,
    biased_term: str,
    avoid_when: str = "",
    constraints: str = ""
) -> Tuple[bool, str]:
    """
    One-shot wrapper around ``ContextChecker.check_context``.

    A fresh ``ContextChecker`` with default settings is built on every call.

    Args:
        text: Full text being analyzed
        biased_term: The biased term found
        avoid_when: Conditions from lexicon
        constraints: Additional constraints

    Returns:
        Tuple of (should_correct: bool, reason: str)
    """
    outcome = ContextChecker().check_context(
        text, biased_term, avoid_when, constraints
    )
    return (outcome.should_correct, outcome.reason)
if __name__ == "__main__":
    # Manual smoke run: each entry is (text, biased term, avoid_when) and the
    # comment above it states the verdict the author expects.
    checker = ContextChecker()
    test_cases = [
        # Should NOT correct - proper noun (Swahili naming)
        ("Mama Robert alisema watoto wapate elimu", "mama Robert", "proper_noun"),
        # Should NOT correct - historical quote
        ('"Mwanamke anapaswa kukaa nyumbani" alisema mtu zamani', "mwanamke anapaswa", "quote|historical"),
        # Should NOT correct - biographical
        ("Winnie Mandela alikuwa mke wa Nelson Mandela", "mke wa", "biographical"),
        # Should NOT correct - statistical
        ("70% ya wanawake wanafanya kazi", "wanawake", "statistical"),
        # Should NOT correct - medical
        ("Mama mjamzito anahitaji huduma", "mama", "medical"),
        # SHOULD correct - general stereotype
        ("Wanawake hawafai kuongoza", "wanawake", ""),
        # SHOULD correct - general bias
        ("Mwanamke anapaswa kupika", "mwanamke anapaswa", ""),
    ]

    print("Context Checker Test Results")
    print("=" * 60)

    for sample_text, sample_term, sample_avoid in test_cases:
        verdict = checker.check_context(sample_text, sample_term, sample_avoid)
        if verdict.should_correct:
            label = "CORRECT"
        else:
            label = "SKIP"
        print(f"\n[{label}] Term: '{sample_term}'")
        print(f" Text: {sample_text[:60]}...")
        print(f" Reason: {verdict.reason}")
        if verdict.blocked_by:
            print(f" Blocked by: {verdict.blocked_by.value}")