Spaces:
Sleeping
Sleeping
| """ | |
| Evaluator Module | |
| Comprehensive evaluation framework for bias detection system. | |
| """ | |
| import os | |
| from typing import List, Dict | |
| from datetime import datetime | |
| import json | |
| class Evaluator: | |
| """ | |
| Evaluate bias detection system performance. | |
| """ | |
| def __init__(self, bias_detector, fairness_metrics, data_loader): | |
| """ | |
| Initialize evaluator. | |
| Args: | |
| bias_detector: BiasDetector instance | |
| fairness_metrics: FairnessMetrics instance | |
| data_loader: DataLoader instance | |
| """ | |
| self.bias_detector = bias_detector | |
| self.fairness_metrics = fairness_metrics | |
| self.data_loader = data_loader | |
| def run_benchmark(self, language='english') -> Dict: | |
| """ | |
| Run comprehensive benchmark on dataset. | |
| Args: | |
| language: 'english' or 'arabic' | |
| Returns: | |
| Benchmark results dictionary | |
| """ | |
| print(f"\n{'='*60}") | |
| print(f"Running {language.upper()} Benchmark") | |
| print(f"{'='*60}\n") | |
| # Load dataset | |
| print("Loading dataset...") | |
| dataset = self.data_loader.load_dataset(language) | |
| print(f"Loaded {len(dataset)} sentences\n") | |
| # Analyze unfiltered data | |
| print("Analyzing unfiltered text...") | |
| unfiltered_results = [] | |
| for item in dataset: | |
| result = self.bias_detector.analyze_text(item['text']) | |
| result['original_label'] = item.get('label', 'unknown') | |
| result['stereotype_type'] = item.get('stereotype', 'unknown') | |
| unfiltered_results.append(result) | |
| # Create and analyze filtered data | |
| print("Creating filtered versions...") | |
| filtered_dataset = self.data_loader.create_filtered_version(dataset, self.bias_detector) | |
| print("Analyzing filtered text...") | |
| filtered_results = [] | |
| for item in filtered_dataset: | |
| result = self.bias_detector.analyze_text(item['text']) | |
| result['original_label'] = item.get('label', 'unknown') | |
| result['stereotype_type'] = item.get('stereotype', 'unknown') | |
| result['original_text'] = item.get('original_text', '') | |
| filtered_results.append(result) | |
| # Calculate fairness metrics | |
| print("\nCalculating fairness metrics...") | |
| unfiltered_fairness = self.fairness_metrics.calculate_fairness_score(unfiltered_results) | |
| filtered_fairness = self.fairness_metrics.calculate_fairness_score(filtered_results) | |
| # Compare filtered vs unfiltered | |
| comparison = self.fairness_metrics.compare_filtered_unfiltered( | |
| unfiltered_results, | |
| filtered_results | |
| ) | |
| # Calculate StereoSet score | |
| stereoset_unfiltered = self.fairness_metrics.calculate_stereoset_score(dataset) | |
| # Compile results | |
| benchmark_results = { | |
| 'language': language, | |
| 'timestamp': datetime.now().isoformat(), | |
| 'dataset_size': len(dataset), | |
| 'unfiltered_analysis': { | |
| 'results': unfiltered_results, | |
| 'fairness_metrics': unfiltered_fairness, | |
| 'stereoset_metrics': stereoset_unfiltered | |
| }, | |
| 'filtered_analysis': { | |
| 'results': filtered_results, | |
| 'fairness_metrics': filtered_fairness | |
| }, | |
| 'comparison': comparison, | |
| 'summary': self._generate_summary(comparison, unfiltered_fairness, filtered_fairness) | |
| } | |
| print("\n" + "="*60) | |
| print("Benchmark Complete!") | |
| print("="*60) | |
| return benchmark_results | |
| def evaluate_weat(self, language='english') -> Dict: | |
| """ | |
| Run WEAT evaluation. | |
| Args: | |
| language: 'english' or 'arabic' | |
| Returns: | |
| WEAT results | |
| """ | |
| from modules.fairness_metrics import get_default_weat_word_sets | |
| print(f"\nRunning WEAT for {language}...") | |
| word_sets = get_default_weat_word_sets(language) | |
| weat_result = self.fairness_metrics.calculate_weat( | |
| target_words_1=word_sets['male_names'], | |
| target_words_2=word_sets['female_names'], | |
| attribute_words_1=word_sets['career_words'], | |
| attribute_words_2=word_sets['family_words'] | |
| ) | |
| print(f"WEAT Score: {weat_result['weat_score']:.4f}") | |
| print(f"Effect Size: {weat_result['effect_size']:.4f}") | |
| print(f"Interpretation: {weat_result['interpretation']}") | |
| print(f"Statistically Significant: {weat_result['is_significant']}") | |
| return weat_result | |
| def _generate_summary(self, comparison: Dict, | |
| unfiltered_fairness: Dict, | |
| filtered_fairness: Dict) -> Dict: | |
| """Generate human-readable summary.""" | |
| return { | |
| 'original_fairness_score': unfiltered_fairness['overall_fairness_score'], | |
| 'filtered_fairness_score': filtered_fairness['overall_fairness_score'], | |
| 'improvement': comparison['improvement'], | |
| 'improvement_percentage': comparison['improvement_percentage'], | |
| 'bias_reduction': comparison['bias_reduction'], | |
| 'original_grade': unfiltered_fairness['grade'], | |
| 'filtered_grade': filtered_fairness['grade'], | |
| 'status': comparison['status'], | |
| 'recommendation': self._get_recommendation(comparison) | |
| } | |
| def _get_recommendation(self, comparison: Dict) -> str: | |
| """Generate recommendation based on results.""" | |
| improvement = comparison['improvement'] | |
| if improvement > 10: | |
| return "Excellent improvement! The filtering significantly reduced bias." | |
| elif improvement > 5: | |
| return "Good improvement. The filtering reduced bias noticeably." | |
| elif improvement > 0: | |
| return "Modest improvement. Consider additional filtering strategies." | |
| elif improvement == 0: | |
| return "No improvement detected. Review filtering approach." | |
| else: | |
| return "Warning: Filtering may have introduced new biases." | |
| def generate_report(self, benchmark_results: Dict, output_dir='results') -> str: | |
| """ | |
| Generate detailed evaluation report. | |
| Args: | |
| benchmark_results: Results from run_benchmark | |
| output_dir: Directory to save report | |
| Returns: | |
| Path to generated report | |
| """ | |
| os.makedirs(output_dir, exist_ok=True) | |
| language = benchmark_results['language'] | |
| timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') | |
| # Save JSON report | |
| json_path = os.path.join(output_dir, f'benchmark_{language}_{timestamp}.json') | |
| with open(json_path, 'w', encoding='utf-8') as f: | |
| json.dump(benchmark_results, f, ensure_ascii=False, indent=2) | |
| # Generate text report | |
| report_path = os.path.join(output_dir, f'report_{language}_{timestamp}.txt') | |
| with open(report_path, 'w', encoding='utf-8') as f: | |
| f.write("="*70 + "\n") | |
| f.write(f"BIAS DETECTION EVALUATION REPORT - {language.upper()}\n") | |
| f.write("="*70 + "\n\n") | |
| summary = benchmark_results['summary'] | |
| f.write("SUMMARY\n") | |
| f.write("-"*70 + "\n") | |
| f.write(f"Dataset Size: {benchmark_results['dataset_size']} sentences\n") | |
| f.write(f"Language: {language}\n") | |
| f.write(f"Timestamp: {benchmark_results['timestamp']}\n\n") | |
| f.write("FAIRNESS SCORES\n") | |
| f.write("-"*70 + "\n") | |
| f.write(f"Original (Unfiltered):\n") | |
| f.write(f" - Fairness Score: {summary['original_fairness_score']:.2f}/100\n") | |
| f.write(f" - Grade: {summary['original_grade']}\n\n") | |
| f.write(f"Filtered:\n") | |
| f.write(f" - Fairness Score: {summary['filtered_fairness_score']:.2f}/100\n") | |
| f.write(f" - Grade: {summary['filtered_grade']}\n\n") | |
| f.write("IMPROVEMENT METRICS\n") | |
| f.write("-"*70 + "\n") | |
| f.write(f"Absolute Improvement: {summary['improvement']:.2f} points\n") | |
| f.write(f"Relative Improvement: {summary['improvement_percentage']:.2f}%\n") | |
| f.write(f"Bias Reduction: {summary['bias_reduction']:.3f}\n") | |
| f.write(f"Status: {summary['status'].upper()}\n\n") | |
| f.write("RECOMMENDATION\n") | |
| f.write("-"*70 + "\n") | |
| f.write(f"{summary['recommendation']}\n\n") | |
| # StereoSet metrics | |
| stereoset = benchmark_results['unfiltered_analysis']['stereoset_metrics'] | |
| f.write("STEREOSET METRICS\n") | |
| f.write("-"*70 + "\n") | |
| f.write(f"Stereotype Count: {stereoset['stereotype_count']}\n") | |
| f.write(f"Anti-Stereotype Count: {stereoset['anti_stereotype_count']}\n") | |
| f.write(f"Unrelated Count: {stereoset['unrelated_count']}\n") | |
| f.write(f"StereoSet Score: {stereoset['stereoset_score']:.2f}/100\n") | |
| f.write(f"Interpretation: {stereoset['interpretation']}\n\n") | |
| f.write("="*70 + "\n") | |
| print(f"\nReport saved to: {report_path}") | |
| print(f"JSON data saved to: {json_path}") | |
| return report_path | |
| def quick_test(self, text: str) -> Dict: | |
| """ | |
| Quick test on a single text. | |
| Args: | |
| text: Text to analyze | |
| Returns: | |
| Analysis results | |
| """ | |
| result = self.bias_detector.analyze_text(text) | |
| print("\n" + "="*60) | |
| print("BIAS ANALYSIS RESULTS") | |
| print("="*60) | |
| print(f"\nText: {text}") | |
| print(f"\nLanguage: {result['language']}") | |
| print(f"Overall Bias Score: {result['overall_bias_score']:.3f}") | |
| print(f"Is Biased: {result['is_biased']}") | |
| print(f"\nGender Bias:") | |
| gb = result['gender_bias'] | |
| print(f" - Score: {gb['bias_score']:.3f}") | |
| print(f" - Direction: {gb['bias_direction']}") | |
| print(f" - Severity: {gb['severity']}") | |
| print(f"\nSentiment Bias:") | |
| sb = result['sentiment_bias'] | |
| print(f" - Score: {sb['sentiment_score']:.3f}") | |
| print(f" - Type: {sb['bias_type']}") | |
| print("\n" + "="*60) | |
| return result | |