| """ |
| ML model evaluation comparing rules-based vs ML vs hybrid approaches |
| """ |
| import csv |
| from typing import Dict, List |
| from .bias_detector import BiasDetector |
| from .ml_detector import MLBiasDetector |
| from .hybrid_detector import HybridBiasDetector |
| from .models import Language, EvaluationMetrics |
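# Assumed interface for the local detector modules: each detector exposes
# detect_bias(text, language) and returns a result object with a boolean
# has_bias_detected attribute (see _evaluate_approach below).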


class MLEvaluationFramework:
    """Evaluate and compare different detection approaches"""

    def __init__(self):
        self.rules_detector = BiasDetector()
        self.ml_detector = MLBiasDetector()
        self.hybrid_detector = HybridBiasDetector()

    def run_comparative_evaluation(self) -> Dict:
        """Run evaluation across all approaches and languages"""
        results = {}

        for language in Language:
            print(f"\nEvaluating {language.value}...")

            # Load the labelled samples for this language.
            ground_truth = self._load_ground_truth(language)

            # Score every approach against the same ground truth.
            rules_metrics = self._evaluate_approach(self.rules_detector, ground_truth, language)
            ml_metrics = self._evaluate_approach(self.ml_detector, ground_truth, language)
            hybrid_metrics = self._evaluate_approach(self.hybrid_detector, ground_truth, language)

            results[language.value] = {
                'rules_based': rules_metrics,
                'ml_based': ml_metrics,
                'hybrid': hybrid_metrics,
                'sample_count': len(ground_truth),
            }

            self._print_comparison(language, rules_metrics, ml_metrics, hybrid_metrics)

        return results

    def _evaluate_approach(self, detector, ground_truth: List[Dict], language: Language) -> EvaluationMetrics:
        """Evaluate a single detection approach"""
        tp = fp = fn = tn = 0

        for sample in ground_truth:
            result = detector.detect_bias(sample['text'], language)
            predicted = result.has_bias_detected
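            # csv.DictReader yields every field as a string, so the label
            # arrives as the text 'True' / 'False' rather than as a bool.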
            actual = sample['has_bias'] == 'True'

            if predicted and actual:
                tp += 1
            elif predicted and not actual:
                fp += 1
            elif not predicted and actual:
                fn += 1
            else:
                tn += 1

        # The `else 0.0` guards avoid division by zero when a class is empty.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
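        # Worked example with hypothetical counts: tp=8, fp=2, fn=2 gives
        # precision = 8/10 = 0.8, recall = 8/10 = 0.8, and
        # F1 = 2 * 0.8 * 0.8 / (0.8 + 0.8) = 0.8.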

        return EvaluationMetrics(
            precision=precision,
            recall=recall,
            f1_score=f1,
            true_positives=tp,
            false_positives=fp,
            false_negatives=fn,
            true_negatives=tn,
        )

    def _load_ground_truth(self, language: Language) -> List[Dict]:
        """Load ground truth data for a language"""
        filename = f"eval/ground_truth_{language.value}.csv"
        ground_truth = []
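        # Assumed file layout, inferred from the columns read in
        # _evaluate_approach: a 'text' column plus a 'has_bias' column
        # holding the strings 'True' or 'False', for example:
        #
        #   text,has_bias
        #   "Some sample sentence",True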

        try:
            with open(filename, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                ground_truth = list(reader)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {filename} not found")

        return ground_truth

    def _print_comparison(self, language: Language, rules: EvaluationMetrics,
                          ml: EvaluationMetrics, hybrid: EvaluationMetrics):
        """Print a comparison table for a language"""
        print(f"\n{language.value.upper()} COMPARISON:")
        print("Approach    | F1    | Precision | Recall")
        print("-" * 40)
        print(f"Rules-based | {rules.f1_score:.3f} | {rules.precision:.3f}     | {rules.recall:.3f}")
        print(f"ML-based    | {ml.f1_score:.3f} | {ml.precision:.3f}     | {ml.recall:.3f}")
        print(f"Hybrid      | {hybrid.f1_score:.3f} | {hybrid.precision:.3f}     | {hybrid.recall:.3f}")


# Note: this module uses relative imports, so run it from the package root
# with `python -m <package>.<module>` (names depend on the project layout)
# rather than as a plain script.
if __name__ == "__main__":
    evaluator = MLEvaluationFramework()
    results = evaluator.run_comparative_evaluation()

    print("\n" + "=" * 60)
    print("SUMMARY: Best F1 Scores by Language")
    print("=" * 60)

    for lang, metrics in results.items():
        # Rank approaches by F1 score; ties resolve in rules -> ml -> hybrid
        # order because max() keeps the first of equal values, matching the
        # original chained comparison.
        scores = {
            'rules': metrics['rules_based'].f1_score,
            'ml': metrics['ml_based'].f1_score,
            'hybrid': metrics['hybrid'].f1_score,
        }
        best_approach, best_f1 = max(scores.items(), key=lambda item: item[1])
        print(f"{lang}: {best_f1:.3f} ({best_approach})")