Spaces:

juakazike
/

test-ui

Sleeping

App Files Files Community

test-ui / eval /bias_detector.py

juakazike

Deploy testing UI for expert validation

d7d1833 verified 3 months ago

raw

history blame contribute delete

19 kB

	"""
	Bias detection service for evaluating gender bias in text.

	This module provides a clean interface for bias detection using rules-based matching.
	Implements AI BRIDGE bias constructs: stereotype, counter-stereotype, derogation, neutral.

	Enhanced with context-aware correction to preserve meaning when gender terms are used
	for accuracy (biographical, historical, medical, etc.) rather than bias.
	"""
	import logging
	import re
	from typing import List, Dict, Any, Optional
	from pathlib import Path

	from .models import (
	Language, BiasDetectionResult, BiasLabel, StereotypeCategory,
	TargetGender, Explicitness
	)
	from .data_loader import RulesLoader, DataLoadError
	from .ngeli_tracker import NgeliTracker, NounClass
	from .context_checker import ContextChecker, ContextCheckResult


	# Set up module logger
	logger = logging.getLogger(__name__)


	class BiasDetectionError(Exception):
	"""Custom exception for bias detection errors."""
	pass


	class BiasDetector:
	"""
	Service for detecting gender bias in text using rules-based approach.

	This class encapsulates the bias detection logic and provides a clean interface
	for evaluating text samples. Implements AI BRIDGE bias constructs.
	"""

	# Counter-stereotype patterns by language
	# These indicate role reversals or challenges to traditional gender norms
	COUNTER_STEREOTYPE_PATTERNS = {
	Language.ENGLISH: [
	# Family role reversals
	(r'\b(father\|dad\|husband)\b.*(caregiver\|nurtur\|cook\|clean\|homemaker\|stay.at.home)',
	StereotypeCategory.FAMILY_ROLE, TargetGender.MALE),
	(r'\b(mother\|mom\|wife)\b.(breadwinner\|provider\|work.(full.time\|office)\|career)',
	StereotypeCategory.FAMILY_ROLE, TargetGender.FEMALE),
	# Professional role reversals
	(r'\b(female\|woman\|she)\b.*(engineer\|mechanic\|pilot\|ceo\|surgeon\|firefighter)',
	StereotypeCategory.PROFESSION, TargetGender.FEMALE),
	(r'\b(male\|man\|he)\b.*(nurse\|secretary\|receptionist\|kindergarten\|nanny)',
	StereotypeCategory.PROFESSION, TargetGender.MALE),
	# Leadership
	(r'\b(she\|her\|woman\|female)\b.*(lead\|command\|chief\|director\|president\|boss)',
	StereotypeCategory.LEADERSHIP, TargetGender.FEMALE),
	],
	Language.SWAHILI: [
	# Family role reversals (Swahili) - more specific patterns
	(r'\bbaba\b.+\b(anale[zl]a\|anapika\|anasafisha\|anakaa\s+nyumbani)',
	StereotypeCategory.FAMILY_ROLE, TargetGender.MALE),
	(r'\bmama\b.+\b(anafanya\s+kazi\s+ofisi\|ni\s+mkurugenzi\|anaongoza)',
	StereotypeCategory.FAMILY_ROLE, TargetGender.FEMALE),
	# Professional role reversals - more specific
	(r'\bmwanamke\b.+\b(mhandisi\|rubani\|fundi\s+wa\s+magari)',
	StereotypeCategory.PROFESSION, TargetGender.FEMALE),
	(r'\bmwanamume\b.+\b(muuguzi\|mkunga\|mlezi\s+wa\s+watoto)',
	StereotypeCategory.PROFESSION, TargetGender.MALE),
	],
	}

	# Derogation patterns - language that demeans or disparages
	DEROGATION_PATTERNS = {
	Language.ENGLISH: [
	(r'\b(just\|only\|merely)\s+a\s+(woman\|girl\|female\|housewife)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(woman\|women\|female\|girl).*(can\'t\|cannot\|unable\|incapable\|shouldn\'t\|could\s+never)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(women\|woman)\s+(cannot\|can\'t)\s+be\s+(good\|great\|effective)',
	StereotypeCategory.LEADERSHIP, TargetGender.FEMALE),
	(r'\b(like\s+a\s+girl\|throw.like.a.girl\|cry.like)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(too\s+emotional\|hysterical\|overreact)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(real\s+men\s+don\'t\|man\s+up\|be\s+a\s+man)',
	StereotypeCategory.CAPABILITY, TargetGender.MALE),
	],
	Language.SWAHILI: [
	(r'\b(tu\|basi)\s+(mwanamke\|msichana)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(mwanamke\|msichana).*(hawezi\|haiwezekani\|dhaifu)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	(r'\b(kama\s+msichana\|kama\s+mwanamke)',
	StereotypeCategory.CAPABILITY, TargetGender.FEMALE),
	],
	}

	def __init__(
	self,
	rules_dir: Path = Path("rules"),
	enable_ngeli_tracking: bool = True,
	enable_context_checking: bool = True
	):
	"""
	Initialize the bias detector.

	Args:
	rules_dir: Directory containing bias detection rules
	enable_ngeli_tracking: Enable Swahili noun class tracking (default: True)
	enable_context_checking: Enable context-aware correction (default: True)
	"""
	self.rules_loader = RulesLoader(rules_dir)
	self._rules_cache: Dict[Language, List[Dict[str, str]]] = {}
	self._compiled_patterns: Dict[Language, List[re.Pattern]] = {}
	self._counter_stereotype_patterns: Dict[Language, List[tuple]] = {}
	self._derogation_patterns: Dict[Language, List[tuple]] = {}
	self.enable_ngeli_tracking = enable_ngeli_tracking
	self.ngeli_tracker = NgeliTracker() if enable_ngeli_tracking else None

	# Context-aware correction to preserve meaning
	self.enable_context_checking = enable_context_checking
	self.context_checker = ContextChecker() if enable_context_checking else None

	# Compile counter-stereotype and derogation patterns
	self._compile_special_patterns()

	def _compile_special_patterns(self) -> None:
	"""Compile counter-stereotype and derogation regex patterns."""
	for lang, patterns in self.COUNTER_STEREOTYPE_PATTERNS.items():
	self._counter_stereotype_patterns[lang] = [
	(re.compile(p[0], re.IGNORECASE), p[1], p[2]) for p in patterns
	]

	for lang, patterns in self.DEROGATION_PATTERNS.items():
	self._derogation_patterns[lang] = [
	(re.compile(p[0], re.IGNORECASE), p[1], p[2]) for p in patterns
	]

	def _detect_counter_stereotype(self, text: str, language: Language) -> Optional[Dict[str, Any]]:
	"""
	Detect counter-stereotype patterns in text.

	Counter-stereotypes challenge or contradict common gender stereotypes.
	These should be preserved, not corrected.
	"""
	patterns = self._counter_stereotype_patterns.get(language, [])
	for pattern, category, gender in patterns:
	if pattern.search(text):
	return {
	'bias_label': BiasLabel.COUNTER_STEREOTYPE,
	'stereotype_category': category,
	'target_gender': gender,
	'explicitness': Explicitness.EXPLICIT,
	'matched_pattern': pattern.pattern
	}
	return None

	def _detect_derogation(self, text: str, language: Language) -> Optional[Dict[str, Any]]:
	"""
	Detect derogatory language patterns in text.

	Derogation is language that demeans or disparages a gender group.
	"""
	patterns = self._derogation_patterns.get(language, [])
	for pattern, category, gender in patterns:
	if pattern.search(text):
	return {
	'bias_label': BiasLabel.DEROGATION,
	'stereotype_category': category,
	'target_gender': gender,
	'explicitness': Explicitness.EXPLICIT,
	'matched_pattern': pattern.pattern
	}
	return None

	def detect_bias(self, text: str, language: Language) -> BiasDetectionResult:
	"""
	Detect bias in a text sample.

	Implements AI BRIDGE bias construct detection:
	- stereotype: Reinforces common gender beliefs
	- counter-stereotype: Challenges gender stereotypes (preserved, not corrected)
	- derogation: Language that demeans a gender group
	- neutral: No bias present

	Args:
	text: Text to analyze for bias
	language: Language of the text

	Returns:
	BiasDetectionResult with detection results and AI BRIDGE classifications

	Raises:
	BiasDetectionError: If detection fails
	"""
	try:
	# First check for derogation (highest priority - most harmful)
	derogation_result = self._detect_derogation(text, language)
	if derogation_result:
	return BiasDetectionResult(
	text=text,
	has_bias_detected=True,
	detected_edits=[{
	'from': text,
	'to': '[DEROGATORY - requires manual review]',
	'severity': 'high',
	'bias_type': 'derogation'
	}],
	bias_label=BiasLabel.DEROGATION,
	stereotype_category=derogation_result['stereotype_category'],
	target_gender=derogation_result['target_gender'],
	explicitness=Explicitness.EXPLICIT,
	confidence=0.9
	)

	# Check for counter-stereotype (should be preserved, not corrected)
	counter_result = self._detect_counter_stereotype(text, language)
	if counter_result:
	return BiasDetectionResult(
	text=text,
	has_bias_detected=False, # Counter-stereotypes are not "bias" to correct
	detected_edits=[], # No edits needed - preserve the text
	bias_label=BiasLabel.COUNTER_STEREOTYPE,
	stereotype_category=counter_result['stereotype_category'],
	target_gender=counter_result['target_gender'],
	explicitness=Explicitness.EXPLICIT,
	confidence=0.85
	)

	# Standard stereotype detection via lexicon rules
	rules = self._get_rules(language)
	patterns = self._get_compiled_patterns(language)

	detected_edits = []
	detected_categories = []
	detected_genders = []
	skipped_edits = [] # Track edits skipped due to context

	for rule, pattern in zip(rules, patterns):
	if pattern.search(text):
	# Skip if biased == neutral (already gender-neutral term)
	if rule['biased'] == rule['neutral_primary']:
	continue

	biased_term = rule['biased']
	avoid_when = rule.get('avoid_when', '')
	constraints = rule.get('constraints', '')

	# Context-aware check: should we apply this correction?
	if self.context_checker and (avoid_when or constraints):
	context_result = self.context_checker.check_context(
	text=text,
	biased_term=biased_term,
	avoid_when=avoid_when,
	constraints=constraints
	)

	if not context_result.should_correct:
	# Skip this edit - context indicates preservation needed
	skipped_edits.append({
	'term': biased_term,
	'reason': context_result.reason,
	'blocked_by': context_result.blocked_by.value if context_result.blocked_by else None,
	'confidence': context_result.confidence
	})
	logger.debug(
	"Skipped correction for '%s': %s",
	biased_term, context_result.reason
	)
	continue

	edit = {
	'from': rule['biased'],
	'to': rule['neutral_primary'],
	'severity': rule['severity'],
	'bias_type': rule.get('bias_label', 'stereotype'),
	'stereotype_category': rule.get('stereotype_category', 'profession')
	}

	# Add ngeli metadata for Swahili
	if language == Language.SWAHILI and self.ngeli_tracker:
	ngeli = rule.get('ngeli', '')
	if ngeli:
	edit['ngeli'] = ngeli
	self.ngeli_tracker.track_noun(rule['biased'])

	detected_edits.append(edit)

	# Track categories for result aggregation
	cat = rule.get('stereotype_category', 'profession')
	if cat:
	detected_categories.append(cat)

	# Determine primary stereotype category
	primary_category = None
	if detected_categories:
	try:
	primary_category = StereotypeCategory(detected_categories[0])
	except (ValueError, KeyError):
	primary_category = StereotypeCategory.PROFESSION

	# Analyze text for noun class patterns (Swahili only)
	ngeli_analysis = None
	if language == Language.SWAHILI and self.ngeli_tracker:
	ngeli_analysis = self.ngeli_tracker.analyze_text(text)

	# Build result with AI BRIDGE fields
	has_bias = len(detected_edits) > 0
	result = BiasDetectionResult(
	text=text,
	has_bias_detected=has_bias,
	detected_edits=detected_edits,
	bias_label=BiasLabel.STEREOTYPE if has_bias else BiasLabel.NEUTRAL,
	stereotype_category=primary_category,
	target_gender=None, # Would need deeper NLP for gender inference
	explicitness=Explicitness.EXPLICIT if has_bias else None,
	confidence=0.85 if has_bias else 0.7
	)

	# Attach ngeli analysis as metadata
	if ngeli_analysis:
	result._ngeli_analysis = ngeli_analysis

	# Attach context-skipped edits for transparency
	if skipped_edits:
	result._skipped_edits = skipped_edits

	return result

	except Exception as e:
	raise BiasDetectionError(f"Failed to detect bias in text: {e}") from e

	def _get_rules(self, language: Language) -> List[Dict[str, str]]:
	"""Get rules for a language, loading and caching if necessary."""
	if language not in self._rules_cache:
	try:
	self._rules_cache[language] = self.rules_loader.load_rules(language)
	except DataLoadError as e:
	raise BiasDetectionError(f"Failed to load rules for {language}: {e}") from e

	return self._rules_cache[language]

	def _get_compiled_patterns(self, language: Language) -> List[re.Pattern]:
	"""Get compiled regex patterns for a language, compiling and caching if necessary."""
	if language not in self._compiled_patterns:
	rules = self._get_rules(language)
	patterns = []

	for rule in rules:
	biased_term = rule['biased']
	pos = rule.get('pos', 'noun')

	# Different pattern strategies based on term type
	if ' ' in biased_term:
	# Multi-word phrase: use word boundaries only at start/end
	# Example: "wa kike" → r'\bwa kike\b'
	pattern = r'\b' + re.escape(biased_term) + r'\b'
	elif pos == 'suffix' or len(biased_term) <= 4:
	# Suffix or short term: match as substring with word boundaries
	# Example: "zake" → r'\bzake\b' (matches "rekodi zake")
	# This allows matching within longer phrases
	pattern = r'\b' + re.escape(biased_term) + r'\b'
	else:
	# Single-word term: strict word boundary matching
	pattern = r'\b' + re.escape(biased_term) + r'\b'

	try:
	compiled_pattern = re.compile(pattern, re.IGNORECASE)
	patterns.append(compiled_pattern)
	except re.error as e:
	# Skip invalid patterns but log the issue
	logger.warning(
	"Invalid regex pattern for '%s': %s",
	biased_term, e
	)
	continue

	self._compiled_patterns[language] = patterns

	return self._compiled_patterns[language]

	def get_ngeli_statistics(self) -> Optional[Dict[str, int]]:
	"""
	Get noun class statistics from tracked Swahili nouns.

	Returns:
	Dictionary mapping noun class codes to counts, or None if tracking disabled
	"""
	if self.ngeli_tracker:
	return self.ngeli_tracker.get_statistics()
	return None

	def clear_cache(self) -> None:
	"""Clear the rules and patterns cache."""
	self._rules_cache.clear()
	self._compiled_patterns.clear()


	class BaselineDetector:
	"""
	Simple baseline detector for comparison purposes.

	Uses naive gendered term detection without sophisticated rules.
	"""

	def __init__(self):
	"""Initialize the baseline detector."""
	self.gendered_terms = {
	Language.ENGLISH: ['he', 'she', 'his', 'her', 'him', 'man', 'woman', 'male', 'female', 'boy', 'girl'],
	Language.SWAHILI: ['yeye', 'mwanaume', 'mwanamke', 'mvulana', 'msichana', 'baba', 'mama']
	}

	def detect_bias(self, text: str, language: Language) -> BiasDetectionResult:
	"""
	Detect bias using simple gendered term matching.

	Args:
	text: Text to analyze
	language: Language of the text

	Returns:
	BiasDetectionResult with detection results
	"""
	text_lower = text.lower()
	terms = self.gendered_terms.get(language, [])

	detected_terms = []
	for term in terms:
	if term in text_lower:
	detected_terms.append({
	'from': term,
	'to': '[gendered_term]',
	'severity': 'baseline'
	})

	return BiasDetectionResult(
	text=text,
	has_bias_detected=len(detected_terms) > 0,
	detected_edits=detected_terms
	)