# test-ui/eval/ml_evaluation.py
"""
ML model evaluation comparing rules-based vs ML vs hybrid approaches
"""
import csv
from typing import Dict, List
from .bias_detector import BiasDetector
from .ml_detector import MLBiasDetector
from .hybrid_detector import HybridBiasDetector
from .models import Language, EvaluationMetrics
class MLEvaluationFramework:
    """Evaluate and compare rules-based, ML, and hybrid bias-detection approaches.

    For each supported ``Language`` it loads a ground-truth CSV, scores each
    detector against it, and prints a side-by-side comparison table.
    """

    # CSV cell values (case-insensitive, stripped) treated as a positive label.
    _TRUTHY = frozenset({"true", "1", "yes"})

    def __init__(self):
        self.rules_detector = BiasDetector()
        self.ml_detector = MLBiasDetector()
        self.hybrid_detector = HybridBiasDetector()

    def run_comparative_evaluation(self) -> Dict:
        """Run the evaluation across all approaches and languages.

        Returns:
            Mapping of language value -> dict with per-approach
            ``EvaluationMetrics`` ('rules_based', 'ml_based', 'hybrid')
            and the ground-truth 'sample_count'.
        """
        results = {}
        for language in Language:
            print(f"\nEvaluating {language.value}...")
            # Load ground truth for this language (empty list if file missing).
            ground_truth = self._load_ground_truth(language)
            # Evaluate each approach on the same samples.
            rules_metrics = self._evaluate_approach(self.rules_detector, ground_truth, language)
            ml_metrics = self._evaluate_approach(self.ml_detector, ground_truth, language)
            hybrid_metrics = self._evaluate_approach(self.hybrid_detector, ground_truth, language)
            results[language.value] = {
                'rules_based': rules_metrics,
                'ml_based': ml_metrics,
                'hybrid': hybrid_metrics,
                'sample_count': len(ground_truth),
            }
            self._print_comparison(language, rules_metrics, ml_metrics, hybrid_metrics)
        return results

    def _evaluate_approach(self, detector, ground_truth: List, language: Language) -> EvaluationMetrics:
        """Score one detector against the ground-truth samples.

        Args:
            detector: Object exposing ``detect_bias(text, language)`` whose
                result has a boolean ``has_bias_detected`` attribute.
            ground_truth: Rows with 'text' and 'has_bias' keys (CSV strings).
            language: Language the samples belong to.

        Returns:
            ``EvaluationMetrics`` with precision/recall/F1 and the confusion
            counts. Metrics default to 0.0 when a denominator would be zero
            (e.g. empty ground truth).
        """
        tp = fp = fn = tn = 0
        for sample in ground_truth:
            result = detector.detect_bias(sample['text'], language)
            predicted = result.has_bias_detected
            # Fix: CSV labels are strings — accept common truthy spellings
            # ('true', 'TRUE', ' 1 ', ...), not only the exact 'True'.
            actual = str(sample['has_bias']).strip().lower() in self._TRUTHY
            if predicted and actual:
                tp += 1
            elif predicted and not actual:
                fp += 1
            elif not predicted and actual:
                fn += 1
            else:
                tn += 1
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        return EvaluationMetrics(
            precision=precision,
            recall=recall,
            f1_score=f1,
            true_positives=tp,
            false_positives=fp,
            false_negatives=fn,
            true_negatives=tn,
        )

    def _load_ground_truth(self, language: Language) -> List[Dict]:
        """Load ground-truth rows for *language* from eval/ground_truth_<lang>.csv.

        Returns an empty list (after printing a warning) when the file
        does not exist.
        """
        filename = f"eval/ground_truth_{language.value}.csv"
        ground_truth = []
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                ground_truth = list(csv.DictReader(f))
        except FileNotFoundError:
            # Bug fix: original printed the literal placeholder "(unknown)"
            # instead of interpolating the missing file's path.
            print(f"Warning: Ground truth file {filename} not found")
        return ground_truth

    def _print_comparison(self, language: Language, rules: EvaluationMetrics,
                          ml: EvaluationMetrics, hybrid: EvaluationMetrics):
        """Print a fixed-width comparison table (F1/precision/recall) for one language."""
        print(f"\n{language.value.upper()} COMPARISON:")
        print("Approach | F1 | Precision | Recall")
        print("-" * 40)
        print(f"Rules-based | {rules.f1_score:.3f} | {rules.precision:.3f} | {rules.recall:.3f}")
        print(f"ML-based | {ml.f1_score:.3f} | {ml.precision:.3f} | {ml.recall:.3f}")
        print(f"Hybrid | {hybrid.f1_score:.3f} | {hybrid.precision:.3f} | {hybrid.recall:.3f}")
if __name__ == "__main__":
    # Run the full cross-language comparison, then summarize the winner per language.
    evaluator = MLEvaluationFramework()
    results = evaluator.run_comparative_evaluation()

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY: Best F1 Scores by Language")
    print(banner)
    for lang, metrics in results.items():
        # Order matters: ties resolve to the earliest approach in this list.
        scored = [
            ('rules', metrics['rules_based'].f1_score),
            ('ml', metrics['ml_based'].f1_score),
            ('hybrid', metrics['hybrid'].f1_score),
        ]
        best_f1 = max(score for _, score in scored)
        best_approach = next(name for name, score in scored if score == best_f1)
        print(f"{lang}: {best_f1:.3f} ({best_approach})")