# test-ui/eval/ml_evaluation.py
"""
ML model evaluation comparing rules-based vs ML vs hybrid approaches
"""
import csv
from typing import Dict, List
from .bias_detector import BiasDetector
from .ml_detector import MLBiasDetector
from .hybrid_detector import HybridBiasDetector
from .models import Language, EvaluationMetrics
class MLEvaluationFramework:
    """Evaluate and compare rules-based, ML, and hybrid bias-detection approaches.

    For each supported ``Language`` it loads a ground-truth CSV, scores each
    detector against it, and prints a side-by-side comparison table.
    """

    # CSV cell values (case-insensitive, stripped) treated as a positive label.
    _TRUTHY = frozenset({"true", "1", "yes"})

    def __init__(self):
        self.rules_detector = BiasDetector()
        self.ml_detector = MLBiasDetector()
        self.hybrid_detector = HybridBiasDetector()

    def run_comparative_evaluation(self) -> Dict:
        """Run the evaluation across all approaches and languages.

        Returns:
            Mapping of language value -> dict with per-approach
            ``EvaluationMetrics`` ('rules_based', 'ml_based', 'hybrid')
            and the ground-truth 'sample_count'.
        """
        results = {}
        for language in Language:
            print(f"\nEvaluating {language.value}...")
            # Load ground truth for this language (empty list if file missing).
            ground_truth = self._load_ground_truth(language)
            # Evaluate each approach on the same samples.
            rules_metrics = self._evaluate_approach(self.rules_detector, ground_truth, language)
            ml_metrics = self._evaluate_approach(self.ml_detector, ground_truth, language)
            hybrid_metrics = self._evaluate_approach(self.hybrid_detector, ground_truth, language)
            results[language.value] = {
                'rules_based': rules_metrics,
                'ml_based': ml_metrics,
                'hybrid': hybrid_metrics,
                'sample_count': len(ground_truth),
            }
            self._print_comparison(language, rules_metrics, ml_metrics, hybrid_metrics)
        return results

    def _evaluate_approach(self, detector, ground_truth: List, language: Language) -> EvaluationMetrics:
        """Score one detector against the ground-truth samples.

        Args:
            detector: Object exposing ``detect_bias(text, language)`` whose
                result has a boolean ``has_bias_detected`` attribute.
            ground_truth: Rows with 'text' and 'has_bias' keys (CSV strings).
            language: Language the samples belong to.

        Returns:
            ``EvaluationMetrics`` with precision/recall/F1 and the confusion
            counts. Metrics default to 0.0 when a denominator would be zero
            (e.g. empty ground truth).
        """
        tp = fp = fn = tn = 0
        for sample in ground_truth:
            result = detector.detect_bias(sample['text'], language)
            predicted = result.has_bias_detected
            # Fix: CSV labels are strings — accept common truthy spellings
            # ('true', 'TRUE', ' 1 ', ...), not only the exact 'True'.
            actual = str(sample['has_bias']).strip().lower() in self._TRUTHY
            if predicted and actual:
                tp += 1
            elif predicted and not actual:
                fp += 1
            elif not predicted and actual:
                fn += 1
            else:
                tn += 1
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        return EvaluationMetrics(
            precision=precision,
            recall=recall,
            f1_score=f1,
            true_positives=tp,
            false_positives=fp,
            false_negatives=fn,
            true_negatives=tn,
        )

    def _load_ground_truth(self, language: Language) -> List[Dict]:
        """Load ground-truth rows for *language* from eval/ground_truth_<lang>.csv.

        Returns an empty list (after printing a warning) when the file
        does not exist.
        """
        filename = f"eval/ground_truth_{language.value}.csv"
        ground_truth = []
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                ground_truth = list(csv.DictReader(f))
        except FileNotFoundError:
            # Bug fix: original printed the literal placeholder "(unknown)"
            # instead of interpolating the missing file's path.
            print(f"Warning: Ground truth file {filename} not found")
        return ground_truth

    def _print_comparison(self, language: Language, rules: EvaluationMetrics,
                          ml: EvaluationMetrics, hybrid: EvaluationMetrics):
        """Print a fixed-width comparison table (F1/precision/recall) for one language."""
        print(f"\n{language.value.upper()} COMPARISON:")
        print("Approach | F1 | Precision | Recall")
        print("-" * 40)
        print(f"Rules-based | {rules.f1_score:.3f} | {rules.precision:.3f} | {rules.recall:.3f}")
        print(f"ML-based | {ml.f1_score:.3f} | {ml.precision:.3f} | {ml.recall:.3f}")
        print(f"Hybrid | {hybrid.f1_score:.3f} | {hybrid.precision:.3f} | {hybrid.recall:.3f}")
if __name__ == "__main__":
    # Run the full cross-language comparison, then summarize the winner per language.
    evaluator = MLEvaluationFramework()
    results = evaluator.run_comparative_evaluation()

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY: Best F1 Scores by Language")
    print(banner)
    for lang, metrics in results.items():
        # Order matters: ties resolve to the earliest approach in this list.
        scored = [
            ('rules', metrics['rules_based'].f1_score),
            ('ml', metrics['ml_based'].f1_score),
            ('hybrid', metrics['hybrid'].f1_score),
        ]
        best_f1 = max(score for _, score in scored)
        best_approach = next(name for name, score in scored if score == best_f1)
        print(f"{lang}: {best_f1:.3f} ({best_approach})")