# Spaces: juakazike / test-ui (Running)
# File size: 21,187 Bytes
# d7d1833

"""

Context-Aware Correction Checker for Gender Bias Detection



This module implements context detection to prevent over-correction of legitimate

gender references. It checks for conditions where bias correction should be skipped:

- Quoted text (historical quotes, citations)

- Proper nouns (organization names, titles)

- Historical context (past references, dates)

- Biographical context (specific person references)

- Statistical context (factual gender-specific data)

- Medical context (biological/health accuracy)

- Counter-stereotypes (positive challenges to stereotypes)



Based on industry best practices from:

- MBIAS: Mitigating Bias While Retaining Context

- SC2: Content Preservation in Long Text Style Transfer

- Token-Level Disentanglement approaches

"""

import re
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum


class ContextCondition(Enum):
    """Context conditions that may prevent correction.

    Each member's value is the lowercase token that ``_parse_avoid_when``
    accepts in a pipe-separated ``avoid_when`` string (e.g. ``"quote|historical"``).
    """
    # Term appears inside quotation marks or next to a reported-speech verb.
    QUOTE = "quote"
    # Term appears near a year or a historical / past-tense marker.
    HISTORICAL = "historical"
    # Term is part of a name, title or organization-style phrase.
    PROPER_NOUN = "proper_noun"
    # Term describes a specific, named or pronoun-referenced person.
    BIOGRAPHICAL = "biographical"
    # Term appears near percentages, counts or statistical vocabulary.
    STATISTICAL = "statistical"
    # Term appears near pregnancy, medical or healthcare vocabulary.
    MEDICAL = "medical"
    # Text pairs a gendered word with a role that runs against the stereotype.
    COUNTER_STEREOTYPE = "counter_stereotype"
    # Term appears near legal or official-document vocabulary.
    LEGAL = "legal"
    # Term appears near creative-work or character vocabulary.
    ARTISTIC = "artistic"
    # Text mentions a known organization or an acronym near the term.
    ORGANIZATION = "organization"


@dataclass
class ContextCheckResult:
    """Result of a context check."""
    # True when correction may proceed; False when a context blocked it.
    should_correct: bool
    # The condition that blocked correction; None when nothing blocked it.
    blocked_by: Optional[ContextCondition] = None
    # Human-readable explanation of the outcome.
    reason: str = ""
    # Confidence attached to the outcome (ContextChecker uses 0.85 for pattern
    # matches, 0.9 for the Swahili naming convention and 1.0 when not blocked).
    confidence: float = 1.0
    # The regex template (or a short label) that triggered the block, if any.
    matched_pattern: str = ""


class ContextChecker:
    """
    Checks text context to determine if bias correction should be applied.

    This helps preserve meaning in cases where gender references are:
    - Historically accurate
    - Part of proper nouns/organization names
    - Quoting someone directly
    - Providing statistical facts
    - Medically/biologically necessary

    All context patterns are compiled with ``re.IGNORECASE``. Sub-patterns that
    only make sense case-sensitively (capitalized names, acronyms) are wrapped
    in ``(?-i:...)`` so that ordinary lowercase words do not pass for names.
    """

    # Context detection patterns organized by condition type.
    #
    # Templates containing the {term} placeholder are passed through
    # str.format(), so their regex quantifier braces MUST be doubled
    # ({{0,50}}). Templates WITHOUT {term} are compiled verbatim and MUST use
    # single braces ({0,30}); they fire regardless of which term is checked.
    CONTEXT_PATTERNS: Dict[ContextCondition, List[str]] = {
        ContextCondition.QUOTE: [
            # Direct quotes - various quote styles (ASCII and Unicode).
            # Unicode quotes are written as \uXXXX escapes (understood by the
            # re module) so they survive editors that normalize quote marks.
            r'"[^"]{{0,100}}{term}[^"]{{0,100}}"',           # "term"
            r"'[^']{{0,100}}{term}[^']{{0,100}}'",           # 'term'
            r'«[^»]{{0,100}}{term}[^»]{{0,100}}»',           # «term» French
            # German low-9 opening quote, closed by a curly or ASCII double quote
            r'\u201e[^\u201c\u201d"]{{0,100}}{term}[^\u201c\u201d"]{{0,100}}[\u201c\u201d"]',
            # English curly double quotes
            r'\u201c[^\u201d]{{0,100}}{term}[^\u201d]{{0,100}}\u201d',
            # Reported speech markers (Swahili & English)
            r'\b(alisema|anasema|walisema|said|says|stated|wrote|claimed)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(alisema|anasema|said|says)\b',
        ],

        ContextCondition.HISTORICAL: [
            # Year references
            r'\b(mwaka\s+)?\d{{4}}\b.{{0,50}}{term}',        # "mwaka 1990" or "1990"
            r'{term}.{{0,50}}\b(mwaka\s+)?\d{{4}}\b',
            r'\bin\s+\d{{4}}\b.{{0,30}}{term}',              # "in 1990"
            # Historical markers (Swahili)
            r'\b(kihistoria|historia|zamani|kale|enzi)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(kihistoria|historia|zamani)\b',
            # Historical markers (English)
            r'\b(historically|history|ancient|traditional|formerly)\b.{{0,50}}{term}',
            # Past tense markers
            r'\b(ilikuwa|walikuwa|alikuwa|was|were|used\s+to)\b.{{0,30}}{term}',
        ],

        ContextCondition.PROPER_NOUN: [
            # Proper noun after term, mid-sentence (e.g. "... mama Robert").
            # The name must really be capitalized, hence (?-i:...).
            # (A former sentence-start variant used a variable-width
            # look-behind, which `re` rejects, so it never took effect.)
            r'(?<=[a-z])\s+{term}\s+(?-i:[A-Z][a-z]+)',
            # Swahili naming convention: Mama/Baba + Name (min 3 char name).
            # No {term} placeholder here, so quantifiers use single braces.
            r'\b[Mm]ama\s+(?-i:[A-Z][a-z]{2,})',            # "Mama Robert"
            r'\b[Bb]aba\s+(?-i:[A-Z][a-z]{2,})',            # "Baba Kanumba"
            # Capitalized title + term (not sentence start)
            r'(?<=[a-z.,;:]\s)(?-i:[A-Z][a-z]+)\s+{term}',  # "Chairman Mao" mid-sentence
            # Organization markers (Swahili)
            r'\b(Chama\s+cha|Shirika\s+la|Taasisi\s+ya|Kampuni\s+ya)\b.{{0,30}}{term}',
            # Organization markers (English)
            r'\b(Organization|Company|Association|Foundation|Institute)\s+.{{0,20}}{term}',
            r'{term}.{{0,20}}\b(Inc|Ltd|LLC|Corp|Foundation)\b',
            # Title patterns
            r'\b(Mheshimiwa|Dkt\.|Dr\.|Prof\.|Mr\.|Mrs\.|Ms\.)\s+.{{0,20}}{term}',
        ],

        ContextCondition.BIOGRAPHICAL: [
            # Specific person reference (Swahili)
            r'\b(yeye|huyu|yule)\s+(ni|alikuwa|amekuwa).{{0,30}}{term}',
            r'{term}\s+wa\s+kwanza',                     # "first [role]"
            r'\baliyekuwa\b.{{0,20}}{term}',               # "who was [role]"
            r'\balikuwa\b.{{0,20}}{term}',                 # "alikuwa mke wa" pattern
            # Specific person reference (English)
            r'\b(she|he)\s+(is|was|became|served\s+as).{{0,30}}{term}',
            r'\bthe\s+first\s+(female|male|woman|man)\s+{term}',
            # Name + role patterns need case-sensitive names, so they are
            # handled separately in _check_condition.
        ],

        ContextCondition.STATISTICAL: [
            # Percentage patterns - term can be before or after with any separator
            r'\d+(\.\d+)?%\s*.{{0,30}}{term}',             # "70% of women"
            r'\d+(\.\d+)?%.{{0,30}}{term}',                # "70%... women" (any chars)
            r'{term}.{{0,30}}\d+(\.\d+)?%',
            # Statistical markers (Swahili)
            r'\b(takwimu|idadi|asilimia|wastani)\b.{{0,30}}{term}',
            # Statistical markers (English)
            r'\b(statistics|data|survey|study|research|percent|majority|minority)\b.{{0,30}}{term}',
            # Numeric context
            r'\b\d+\s+(kati\s+ya|out\s+of|of\s+the)\s+\d+\b.{{0,30}}{term}',
        ],

        ContextCondition.MEDICAL: [
            # Pregnancy/birth (Swahili) - term can be before or after
            r'\b(mjamzito|ujauzito|uzazi|kujifungua|mimba)\b.{{0,50}}{term}',
            r'{term}.{{0,50}}\b(mjamzito|ujauzito|uzazi|kujifungua)\b',
            # "Mama mjamzito" pattern - very common in Swahili health contexts
            r'\b{term}\s+mjamzito\b',
            r'\bmjamzito.{{0,10}}{term}',
            # Pregnancy/birth (English). The two stems take \w* so that
            # "obstetrics", "gynecology", "gynecologist" etc. match too.
            r'\b(pregnant|pregnancy|childbirth|maternal|obstetric\w*|gynecolog\w*)\b.{{0,50}}{term}',
            # Medical procedure context
            r'\b(saratani\s+ya\s+shingo|cervical\s+cancer|breast\s+cancer|prostate)\b.{{0,50}}{term}',
            # Healthcare setting markers
            r'\b(hospitali|clinic|daktari|nurse|doctor|hospital)\b.{{0,30}}{term}',
        ],

        ContextCondition.COUNTER_STEREOTYPE: [
            # Role reversal patterns (Swahili) - no term placeholder, single braces
            r'\b(mwanamke|mama)\b.{0,30}\b(mhandisi|rubani|fundi|mkurugenzi|daktari)\b',
            r'\b(mwanamume|baba)\b.{0,30}\b(muuguzi|mkunga|mlezi|mpishi)\b',
            # Role reversal patterns (English)
            r'\b(female|woman|she)\b.{0,30}\b(engineer|pilot|mechanic|CEO|surgeon)\b',
            r'\b(male|man|he)\b.{0,30}\b(nurse|secretary|nanny|caregiver)\b',
            # "First female/male" achievements
            r'\b(wa\s+kwanza|first)\b.{0,20}\b(wa\s+kike|wa\s+kiume|female|male)\b',
        ],

        ContextCondition.LEGAL: [
            # Legal document markers (Swahili)
            r'\b(sheria|mahakama|kesi|mshtakiwa|mlalamikaji)\b.{{0,30}}{term}',
            # Legal document markers (English)
            r'\b(court|legal|plaintiff|defendant|witness|law|statute)\b.{{0,30}}{term}',
            # Official document context
            r'\b(hati|certificate|document|official|sworn)\b.{{0,30}}{term}',
        ],

        ContextCondition.ARTISTIC: [
            # Creative work markers
            r'\b(wimbo|filamu|kitabu|hadithi|mchezo)\b.{{0,30}}{term}',
            r'\b(song|film|movie|book|novel|play|poem|lyrics)\b.{{0,30}}{term}',
            # Character/role context
            r'\b(mhusika|character|role|actor|actress)\b.{{0,30}}{term}',
        ],

        ContextCondition.ORGANIZATION: [
            # Organization name patterns (Swahili)
            r'\b(TAWOMA|BAWATA|TAMWA|UWT)\b',           # Known women's orgs
            r'\bChama\s+cha\s+\w+\s+{term}',
            # Organization acronyms near term; must be real upper-case letters,
            # otherwise any short word would count as an acronym.
            r'\b(?-i:[A-Z]{{2,6}})\b.{{0,20}}{term}',
        ],
    }

    # Swahili-specific patterns for common false positive scenarios.
    # These are searched case-sensitively (no flags) in check_context.
    SWAHILI_PRESERVE_PATTERNS = [
        # "Mama [Name]" - common Swahili naming convention (teknonym)
        r'\b[Mm]ama\s+[A-Z][a-z]+\b',
        # "Baba [Name]" - common Swahili naming convention
        r'\b[Bb]aba\s+[A-Z][a-z]+\b',
        # Religious/cultural titles
        r'\b(Bibi|Babu|Shangazi|Mjomba)\s+[A-Z][a-z]+\b',
    ]

    def __init__(self, strict_mode: bool = False):
        """
        Initialize the context checker.

        Args:
            strict_mode: If True, any context match blocks correction.
                        If False, uses confidence scoring.
                        NOTE: the flag is stored on the instance, but no
                        method of this class currently reads it.
        """
        self.strict_mode = strict_mode
        self._compiled_patterns: Dict[ContextCondition, List[re.Pattern]] = {}
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Pre-compile the term-independent patterns (those without {term})."""
        for condition, patterns in self.CONTEXT_PATTERNS.items():
            compiled: List[re.Pattern] = []
            for pattern in patterns:
                # Templates with {term} can only be compiled once the term is known.
                if '{term}' in pattern:
                    continue
                try:
                    compiled.append(re.compile(pattern, re.IGNORECASE | re.UNICODE))
                except re.error:
                    # Best effort: an invalid pattern is skipped, not fatal.
                    continue
            self._compiled_patterns[condition] = compiled

    def _get_pattern_for_term(self, pattern_template: str, term: str) -> Optional[re.Pattern]:
        """
        Create a compiled pattern with the specific term inserted.

        Args:
            pattern_template: Regex template containing a {term} placeholder.
            term: Literal term; it is regex-escaped before insertion.

        Returns:
            The compiled pattern, or None if the template cannot be formatted
            or does not compile.
        """
        try:
            pattern = pattern_template.format(term=re.escape(term))
            return re.compile(pattern, re.IGNORECASE | re.UNICODE)
        except (re.error, KeyError, IndexError, ValueError):
            # str.format raises KeyError/IndexError/ValueError on templates
            # whose quantifier braces were not doubled.
            return None

    def check_context(
        self,
        text: str,
        biased_term: str,
        avoid_when: str = "",
        constraints: str = ""
    ) -> ContextCheckResult:
        """
        Check if correction should be applied based on context.

        Args:
            text: Full text being analyzed
            biased_term: The specific biased term found
            avoid_when: Pipe-separated list of conditions from lexicon
            constraints: Additional constraints from lexicon (accepted for
                interface compatibility; not used by this method)

        Returns:
            ContextCheckResult indicating whether to proceed with correction
        """
        # Parse avoid_when conditions from lexicon; fall back to the common
        # ones when none are given (or none are recognized).
        conditions_to_check = self._parse_avoid_when(avoid_when)
        if not conditions_to_check:
            conditions_to_check = [
                ContextCondition.QUOTE,
                ContextCondition.PROPER_NOUN,
                ContextCondition.BIOGRAPHICAL,
            ]

        # The first blocking condition wins.
        for condition in conditions_to_check:
            result = self._check_condition(text, biased_term, condition)
            if not result.should_correct:
                return result

        # Swahili-specific preservation patterns. Every occurrence is
        # inspected, not just the first, because the text may contain several
        # name phrases and the term may belong to a later one.
        needle = biased_term.lower()
        for pattern in self.SWAHILI_PRESERVE_PATTERNS:
            for match in re.finditer(pattern, text):
                phrase = match.group(0)
                if needle in phrase.lower():
                    return ContextCheckResult(
                        should_correct=False,
                        blocked_by=ContextCondition.PROPER_NOUN,
                        reason=f"Term is part of Swahili naming convention: {phrase}",
                        confidence=0.9,
                        matched_pattern=pattern
                    )

        # All checks passed - proceed with correction
        return ContextCheckResult(
            should_correct=True,
            reason="No blocking context detected",
            confidence=1.0
        )

    def _parse_avoid_when(self, avoid_when: str) -> List[ContextCondition]:
        """
        Parse the avoid_when field into ContextCondition enums.

        Args:
            avoid_when: Pipe-separated condition values, e.g. "quote|historical".

        Returns:
            The recognized conditions in input order; unknown tokens are skipped.
        """
        if not avoid_when or not avoid_when.strip():
            return []

        conditions = []
        for part in avoid_when.split('|'):
            try:
                conditions.append(ContextCondition(part.strip().lower()))
            except ValueError:
                # Unknown condition, skip
                continue

        return conditions

    def _check_condition(
        self,
        text: str,
        term: str,
        condition: ContextCondition
    ) -> ContextCheckResult:
        """
        Check a specific context condition.

        Args:
            text: Full text being analyzed.
            term: The biased term found in the text.
            condition: The condition whose patterns are tried.

        Returns:
            A blocking result (should_correct=False) for the first pattern
            that matches, otherwise a result with should_correct=True.
        """
        templates = self.CONTEXT_PATTERNS.get(condition, [])
        # Term-independent patterns were compiled up front; index them by
        # their source so each template is evaluated exactly once, in order.
        precompiled = {
            compiled.pattern: compiled
            for compiled in self._compiled_patterns.get(condition, [])
        }

        for template in templates:
            if '{term}' in template:
                pattern = self._get_pattern_for_term(template, term)
            else:
                pattern = precompiled.get(template)
            if pattern is not None and pattern.search(text):
                return ContextCheckResult(
                    should_correct=False,
                    blocked_by=condition,
                    reason=f"Detected {condition.value} context",
                    confidence=0.85,
                    matched_pattern=template
                )

        # Special check for biographical: names must be capitalized, so these
        # patterns are compiled WITHOUT IGNORECASE. Only the term itself is
        # matched case-insensitively, like everywhere else in this class.
        if condition == ContextCondition.BIOGRAPHICAL:
            term_rx = '(?i:' + re.escape(term) + ')'
            name_checks = [
                # "FirstName LastName ... term"
                (r'[A-Z][a-z]+\s+[A-Z][a-z]+.{0,30}' + term_rx, "[Name] + term"),
                # "term + Name" (e.g., "mke wa Nelson Mandela")
                (term_rx + r'\s+(wa\s+)?[A-Z][a-z]+(\s+[A-Z][a-z]+)?', "term + [Name]"),
            ]
            for source, label in name_checks:
                if re.search(source, text, re.UNICODE):
                    return ContextCheckResult(
                        should_correct=False,
                        blocked_by=condition,
                        reason=f"Detected {condition.value} context (name reference)",
                        confidence=0.85,
                        matched_pattern=label
                    )

        # No match found for this condition
        return ContextCheckResult(
            should_correct=True,
            reason=f"No {condition.value} context detected",
            confidence=1.0
        )

    def is_in_quotes(self, text: str, term: str) -> bool:
        """
        Quick check if term appears within quotes.

        Recognizes ASCII double quotes, ASCII single quotes and curly double
        quotes; the term is matched case-insensitively.
        """
        escaped = re.escape(term)
        quote_patterns = (
            r'"[^"]*' + escaped + r'[^"]*"',
            r"'[^']*" + escaped + r"[^']*'",
            r'\u201c[^\u201d]*' + escaped + r'[^\u201d]*\u201d',
        )
        return any(
            re.search(pattern, text, re.IGNORECASE) for pattern in quote_patterns
        )

    def extract_proper_nouns(self, text: str) -> List[str]:
        """
        Extract potential proper nouns from text.

        Useful for preserving entities during ML fallback correction.

        Heuristic: a capitalized word that is not the first word of its
        sentence. Punctuation is stripped and single-character results are
        dropped.

        Returns:
            Unique candidates in order of first appearance.
        """
        found: Dict[str, None] = {}

        # Split into sentences
        for sentence in re.split(r'[.!?]\s+', text):
            # Skip the first word: it is capitalized simply for starting the sentence.
            for word in sentence.split()[1:]:
                if not word[0].isupper():
                    continue
                clean_word = re.sub(r'[^\w]', '', word)
                if len(clean_word) > 1:
                    found.setdefault(clean_word, None)

        # A dict keeps insertion order, so the result is deterministic.
        return list(found)

    def get_preservation_entities(self, text: str) -> List[str]:
        """
        Get entities that should be preserved during correction.

        Combines proper nouns, organization names, and other key entities.

        Returns:
            Unique entities in order of first appearance: proper nouns first,
            then acronyms, then two-word capitalized names.
        """
        entities: Dict[str, None] = dict.fromkeys(self.extract_proper_nouns(text))

        # Add organization patterns
        org_patterns = [
            r'\b[A-Z]{2,6}\b',  # Acronyms
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Two-word names
        ]
        for pattern in org_patterns:
            for match in re.findall(pattern, text):
                entities.setdefault(match, None)

        return list(entities)


# Convenience function for quick context check
def should_apply_correction(
    text: str,
    biased_term: str,
    avoid_when: str = "",
    constraints: str = ""
) -> Tuple[bool, str]:
    """
    One-shot helper: build a default checker and run a single context check.

    Args:
        text: Full text being analyzed
        biased_term: The biased term found
        avoid_when: Conditions from lexicon
        constraints: Additional constraints

    Returns:
        Tuple of (should_correct: bool, reason: str)
    """
    # A fresh checker with default settings is created on every call.
    outcome = ContextChecker().check_context(
        text, biased_term, avoid_when, constraints
    )
    verdict: Tuple[bool, str] = (outcome.should_correct, outcome.reason)
    return verdict


if __name__ == "__main__":
    # Test examples
    checker = ContextChecker()

    test_cases = [
        # Should NOT correct - proper noun (Swahili naming)
        ("Mama Robert alisema watoto wapate elimu", "mama Robert", "proper_noun"),

        # Should NOT correct - historical quote
        ('"Mwanamke anapaswa kukaa nyumbani" alisema mtu zamani', "mwanamke anapaswa", "quote|historical"),

        # Should NOT correct - biographical
        ("Winnie Mandela alikuwa mke wa Nelson Mandela", "mke wa", "biographical"),

        # Should NOT correct - statistical
        ("70% ya wanawake wanafanya kazi", "wanawake", "statistical"),

        # Should NOT correct - medical
        ("Mama mjamzito anahitaji huduma", "mama", "medical"),

        # SHOULD correct - general stereotype
        ("Wanawake hawafai kuongoza", "wanawake", ""),

        # SHOULD correct - general bias
        ("Mwanamke anapaswa kupika", "mwanamke anapaswa", ""),
    ]

    print("Context Checker Test Results")
    print("=" * 60)

    for text, term, avoid_when in test_cases:
        result = checker.check_context(text, term, avoid_when)
        status = "SKIP" if not result.should_correct else "CORRECT"
        print(f"\n[{status}] Term: '{term}'")
        print(f"  Text: {text[:60]}...")
        print(f"  Reason: {result.reason}")
        if result.blocked_by:
            print(f"  Blocked by: {result.blocked_by.value}")