"""
Data Loader Module
Handles loading and preprocessing of bias detection datasets.
"""

import json
import os
from typing import List, Dict, Tuple
import pandas as pd


class DataLoader:
    """
    Load and preprocess bias detection datasets.
    """
    
    def __init__(self, data_dir='data'):
        """
        Initialize data loader.
        
        Args:
            data_dir: Directory containing datasets
        """
        self.data_dir = data_dir
    
    def load_winobias(self, subset='dev') -> List[Dict]:
        """
        Load WinoBias English dataset.
        
        Args:
            subset: 'dev' or 'test'
            
        Returns:
            List of sentences with bias information
        """
        file_path = os.path.join(self.data_dir, 'english', 'winobias_sample.json')
        
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found. Using default samples.")
            return self._get_default_english_samples()
        
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        return data.get('sentences', [])
    
    def load_arabic_dataset(self) -> List[Dict]:
        """
        Load Arabic bias dataset.
        
        Returns:
            List of Arabic sentences with bias information
        """
        file_path = os.path.join(self.data_dir, 'arabic', 'arabic_bias_sample.json')
        
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found. Using default samples.")
            return self._get_default_arabic_samples()
        
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        return data.get('sentences', [])
    
    def _get_default_english_samples(self) -> List[Dict]:
        """Get default English bias samples."""
        return [
            {
                "text": "The doctor asked the nurse to help him with the procedure.",
                "bias_type": "gender",
                "stereotype": "male_doctor",
                "label": "stereotype"
            },
            {
                "text": "The nurse helped the doctor with her medical expertise.",
                "bias_type": "gender",
                "stereotype": "female_nurse",
                "label": "stereotype"
            },
            {
                "text": "The engineer explained the software to the secretary and thanked her.",
                "bias_type": "gender",
                "stereotype": "male_engineer_female_secretary",
                "label": "stereotype"
            },
            {
                "text": "The CEO met with the assistant and asked him to schedule a meeting.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "The software developer presented her innovative solution to the team.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "The construction worker completed his shift and went home.",
                "bias_type": "gender",
                "stereotype": "male_construction",
                "label": "stereotype"
            },
            {
                "text": "The teacher graded papers and prepared tomorrow's lesson.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            },
            {
                "text": "The pilot safely landed the plane after checking all systems.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            }
        ]
    
    def _get_default_arabic_samples(self) -> List[Dict]:
        """Get default Arabic bias samples."""
        return [
            {
                "text": "طلب الطبيب من الممرضة أن تساعده في الإجراء.",
                "bias_type": "gender",
                "stereotype": "male_doctor_female_nurse",
                "label": "stereotype"
            },
            {
                "text": "ساعدت الممرضة الطبيب بخبرتها الطبية.",
                "bias_type": "gender",
                "stereotype": "female_nurse",
                "label": "stereotype"
            },
            {
                "text": "شرح المهندس البرنامج للسكرتيرة وشكرها.",
                "bias_type": "gender",
                "stereotype": "male_engineer_female_secretary",
                "label": "stereotype"
            },
            {
                "text": "قابل المدير التنفيذي المساعد وطلب منه جدولة اجتماع.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "قدمت مطورة البرمجيات حلها المبتكر للفريق.",
                "bias_type": "gender",
                "stereotype": "counter",
                "label": "anti-stereotype"
            },
            {
                "text": "أكمل عامل البناء وردية عمله وعاد إلى المنزل.",
                "bias_type": "gender",
                "stereotype": "male_construction",
                "label": "stereotype"
            },
            {
                "text": "قام المعلم بتصحيح الأوراق وإعداد درس الغد.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            },
            {
                "text": "هبط الطيار بالطائرة بأمان بعد فحص جميع الأنظمة.",
                "bias_type": "neutral",
                "stereotype": "none",
                "label": "unrelated"
            }
        ]
    
    def load_dataset(self, language='english') -> List[Dict]:
        """
        Load dataset for specified language.
        
        Args:
            language: 'english' or 'arabic'
            
        Returns:
            List of sentences
        """
        if language.lower() == 'arabic':
            return self.load_arabic_dataset()
        else:
            return self.load_winobias()
    
    def create_filtered_version(self, sentences: List[Dict], bias_detector) -> List[Dict]:
        """
        Create filtered versions of sentences with reduced bias.
        
        Args:
            sentences: List of sentence dictionaries
            bias_detector: BiasDetector instance
            
        Returns:
            List of filtered sentences
        """
        filtered = []
        
        for item in sentences:
            text = item['text']
            
            # Simple filtering: replace gendered pronouns with neutral alternatives
            filtered_text = self._neutralize_gender(text, bias_detector.language)
            
            filtered_item = item.copy()
            filtered_item['original_text'] = text
            filtered_item['text'] = filtered_text
            filtered_item['is_filtered'] = True
            
            filtered.append(filtered_item)
        
        return filtered
    
    def _neutralize_gender(self, text: str, language: str) -> str:
        """
        Apply simple gender neutralization to text.
        
        Args:
            text: Input text
            language: 'english' or 'arabic'
            
        Returns:
            Neutralized text
        """
        if language == 'english':
            replacements = {
                ' he ': ' they ',
                ' she ': ' they ',
                ' him ': ' them ',
                ' her ': ' them ',
                ' his ': ' their ',
                ' hers ': ' theirs ',
                'He ': 'They ',
                'She ': 'They ',
                'Him ': 'Them ',
                'Her ': 'Them ',
                'His ': 'Their ',
            }
        else:  # Arabic - basic replacements
            replacements = {
                ' هو ': ' هم ',
                ' هي ': ' هم ',
                ' له ': ' لهم ',
                ' لها ': ' لهم ',
            }
        
        filtered_text = text
        for old, new in replacements.items():
            filtered_text = filtered_text.replace(old, new)
        
        return filtered_text
    
    def save_results(self, results: List[Dict], output_path: str):
        """
        Save analysis results to file.
        
        Args:
            results: List of analysis results
            output_path: Path to save file
        """
        # Ensure directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        print(f"Results saved to {output_path}")
    
    def export_to_csv(self, results: List[Dict], output_path: str):
        """
        Export results to CSV format.
        
        Args:
            results: List of analysis results
            output_path: Path to save CSV file
        """
        # Flatten nested dictionaries for CSV export
        flattened = []
        
        for result in results:
            flat_result = {
                'text': result.get('text', ''),
                'language': result.get('language', ''),
                'overall_bias_score': result.get('overall_bias_score', 0),
                'is_biased': result.get('is_biased', False),
            }
            
            # Add gender bias metrics
            if 'gender_bias' in result:
                gb = result['gender_bias']
                flat_result['gender_bias_score'] = gb.get('bias_score', 0)
                flat_result['gender_bias_direction'] = gb.get('bias_direction', '')
                flat_result['gender_severity'] = gb.get('severity', '')
            
            # Add sentiment bias metrics
            if 'sentiment_bias' in result:
                sb = result['sentiment_bias']
                flat_result['sentiment_score'] = sb.get('sentiment_score', 0)
                flat_result['sentiment_bias_type'] = sb.get('bias_type', '')
            
            flattened.append(flat_result)
        
        df = pd.DataFrame(flattened)
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        
        print(f"Results exported to {output_path}")