| """ | |
| Data Loader Module | |
| Handles loading and preprocessing of bias detection datasets. | |
| """ | |
| import json | |
| import os | |
| from typing import List, Dict, Tuple | |
| import pandas as pd | |


class DataLoader:
    """Load and preprocess bias detection datasets."""

    def __init__(self, data_dir='data'):
        """
        Initialize the data loader.

        Args:
            data_dir: Directory containing the datasets
        """
        self.data_dir = data_dir

    def load_winobias(self, subset='dev') -> List[Dict]:
        """
        Load the WinoBias English dataset.

        Args:
            subset: 'dev' or 'test'. The bundled sample file is not split
                by subset, so this argument is accepted for API
                compatibility but does not change which file is loaded.

        Returns:
            List of sentences with bias information
        """
        file_path = os.path.join(self.data_dir, 'english', 'winobias_sample.json')
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found. Using default samples.")
            return self._get_default_english_samples()
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data.get('sentences', [])
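
    # Expected on-disk shape for winobias_sample.json, inferred from the
    # loader above and the built-in defaults (illustrative, not normative):
    #
    #   {
    #     "sentences": [
    #       {"text": "...", "bias_type": "gender",
    #        "stereotype": "male_doctor", "label": "stereotype"}
    #     ]
    #   }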

    def load_arabic_dataset(self) -> List[Dict]:
        """
        Load the Arabic bias dataset.

        Returns:
            List of Arabic sentences with bias information
        """
        file_path = os.path.join(self.data_dir, 'arabic', 'arabic_bias_sample.json')
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found. Using default samples.")
            return self._get_default_arabic_samples()
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data.get('sentences', [])

    def _get_default_english_samples(self) -> List[Dict]:
        """Get default English bias samples."""
        return [
            {
                "text": "The doctor asked the nurse to help him with the procedure.",
                "bias_type": "gender",
                "stereotype": "male_doctor",
                "label": "stereotype"
            },
            {
                "text": "The nurse helped the doctor with her medical expertise.",
                "bias_type": "gender",
                "stereotype": "female_nurse",
                "label": "stereotype"
            },
            {
                "text": "The engineer explained the software to the secretary and thanked her.",
                "bias_type": "gender",
                "stereotype": "male_engineer_female_secretary",
                "label": "stereotype"
            },
            {
                "text": "The CEO met with the assistant and asked him to schedule a meeting.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "The software developer presented her innovative solution to the team.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "The construction worker completed his shift and went home.",
                "bias_type": "gender",
                "stereotype": "male_construction",
                "label": "stereotype"
            },
            {
                "text": "The teacher graded papers and prepared tomorrow's lesson.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            },
            {
                "text": "The pilot safely landed the plane after checking all systems.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            }
        ]

    def _get_default_arabic_samples(self) -> List[Dict]:
        """Get default Arabic bias samples (mirroring the English set)."""
        return [
            {
                # "The doctor asked the nurse to help him with the procedure."
                "text": "طلب الطبيب من الممرضة أن تساعده في الإجراء.",
                "bias_type": "gender",
                "stereotype": "male_doctor_female_nurse",
                "label": "stereotype"
            },
            {
                # "The nurse helped the doctor with her medical expertise."
                "text": "ساعدت الممرضة الطبيب بخبرتها الطبية.",
                "bias_type": "gender",
                "stereotype": "female_nurse",
                "label": "stereotype"
            },
            {
                # "The engineer explained the software to the secretary and thanked her."
                "text": "شرح المهندس البرنامج للسكرتيرة وشكرها.",
                "bias_type": "gender",
                "stereotype": "male_engineer_female_secretary",
                "label": "stereotype"
            },
            {
                # "The CEO met the assistant and asked him to schedule a meeting."
                "text": "قابل المدير التنفيذي المساعد وطلب منه جدولة اجتماع.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                # "The software developer (feminine) presented her innovative solution to the team."
                "text": "قدمت مطورة البرمجيات حلها المبتكر للفريق.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                # "The construction worker completed his shift and went home."
                "text": "أكمل عامل البناء وردية عمله وعاد إلى المنزل.",
                "bias_type": "gender",
                "stereotype": "male_construction",
                "label": "stereotype"
            },
            {
                # "The teacher graded the papers and prepared tomorrow's lesson."
                "text": "قام المعلم بتصحيح الأوراق وإعداد درس الغد.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            },
            {
                # "The pilot landed the plane safely after checking all systems."
                "text": "هبط الطيار بالطائرة بأمان بعد فحص جميع الأنظمة.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            }
        ]

    def load_dataset(self, language='english') -> List[Dict]:
        """
        Load the dataset for the specified language.

        Args:
            language: 'english' or 'arabic'

        Returns:
            List of sentences
        """
        if language.lower() == 'arabic':
            return self.load_arabic_dataset()
        return self.load_winobias()

    def create_filtered_version(self, sentences: List[Dict], bias_detector) -> List[Dict]:
        """
        Create filtered versions of sentences with reduced bias.

        Args:
            sentences: List of sentence dictionaries
            bias_detector: BiasDetector instance (only its .language attribute is used)

        Returns:
            List of filtered sentences
        """
        filtered = []
        for item in sentences:
            text = item['text']
            # Simple filtering: replace gendered pronouns with neutral alternatives
            filtered_text = self._neutralize_gender(text, bias_detector.language)
            filtered_item = item.copy()
            filtered_item['original_text'] = text
            filtered_item['text'] = filtered_text
            filtered_item['is_filtered'] = True
            filtered.append(filtered_item)
        return filtered

    def _neutralize_gender(self, text: str, language: str) -> str:
        """
        Apply simple, rule-based gender neutralization to text.

        Args:
            text: Input text
            language: 'english' or 'arabic'

        Returns:
            Neutralized text
        """
        if language == 'english':
            # Word-boundary patterns so pronouns are matched even when followed
            # by punctuation (a plain ' her ' replacement would miss "thanked her.").
            replacements = [
                (r'\bhe\b', 'they'),
                (r'\bshe\b', 'they'),
                (r'\bhim\b', 'them'),
                (r'\bher\b', 'them'),
                (r'\bhis\b', 'their'),
                (r'\bhers\b', 'theirs'),
                (r'\bHe\b', 'They'),
                (r'\bShe\b', 'They'),
                (r'\bHim\b', 'Them'),
                (r'\bHer\b', 'Them'),
                (r'\bHis\b', 'Their'),
            ]
        else:  # Arabic - basic replacements only
            replacements = [
                (r'\bهو\b', 'هم'),
                (r'\bهي\b', 'هم'),
                (r'\bله\b', 'لهم'),
                (r'\bلها\b', 'لهم'),
            ]
        filtered_text = text
        for pattern, replacement in replacements:
            filtered_text = re.sub(pattern, replacement, filtered_text)
        return filtered_text
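
    # Example (English rules above):
    #   _neutralize_gender("The engineer thanked her.", 'english')
    #   -> "The engineer thanked them."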

    def save_results(self, results: List[Dict], output_path: str):
        """
        Save analysis results to a JSON file.

        Args:
            results: List of analysis results
            output_path: Path to the output file
        """
        # Ensure the output directory exists; dirname() is '' for a bare
        # filename, and os.makedirs('') would raise, so guard against it.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"Results saved to {output_path}")

    def export_to_csv(self, results: List[Dict], output_path: str):
        """
        Export results to CSV format.

        Args:
            results: List of analysis results
            output_path: Path to the output CSV file
        """
        # Flatten nested dictionaries for CSV export
        flattened = []
        for result in results:
            flat_result = {
                'text': result.get('text', ''),
                'language': result.get('language', ''),
                'overall_bias_score': result.get('overall_bias_score', 0),
                'is_biased': result.get('is_biased', False),
            }
            # Add gender bias metrics
            if 'gender_bias' in result:
                gb = result['gender_bias']
                flat_result['gender_bias_score'] = gb.get('bias_score', 0)
                flat_result['gender_bias_direction'] = gb.get('bias_direction', '')
                flat_result['gender_severity'] = gb.get('severity', '')
            # Add sentiment bias metrics
            if 'sentiment_bias' in result:
                sb = result['sentiment_bias']
                flat_result['sentiment_score'] = sb.get('sentiment_score', 0)
                flat_result['sentiment_bias_type'] = sb.get('bias_type', '')
            flattened.append(flat_result)
        df = pd.DataFrame(flattened)
        # utf-8-sig writes a BOM so Excel detects UTF-8 (keeps Arabic text intact)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Results exported to {output_path}")