""" Main evaluation orchestrator for bias detection framework. This module coordinates the evaluation process and provides the main interface for running evaluations. """ from datetime import datetime from pathlib import Path from typing import List, Optional from .models import Language, LanguageEvaluationResult from .data_loader import GroundTruthLoader, ResultsWriter, DataLoadError from .bias_detector import BiasDetector, BiasDetectionError from .metrics_calculator import MetricsCalculator, MetricsFormatter class EvaluationError(Exception): """Custom exception for evaluation errors.""" pass class BiasEvaluationOrchestrator: """ Main orchestrator for bias detection evaluation. Coordinates data loading, bias detection, metrics calculation, and result output. Provides a clean interface for running complete evaluations. """ def __init__( self, data_dir: Path = Path("eval"), rules_dir: Path = Path("rules"), results_dir: Path = Path("eval/results") ): """ Initialize the evaluation orchestrator. Args: data_dir: Directory containing ground truth data rules_dir: Directory containing bias detection rules results_dir: Directory for writing results """ self.ground_truth_loader = GroundTruthLoader(data_dir) self.bias_detector = BiasDetector(rules_dir) self.metrics_calculator = MetricsCalculator() self.metrics_formatter = MetricsFormatter() self.results_writer = ResultsWriter(results_dir) def run_evaluation( self, languages: Optional[List[Language]] = None, save_results: bool = True ) -> List[LanguageEvaluationResult]: """ Run complete bias detection evaluation. Args: languages: List of languages to evaluate (defaults to English and Swahili) save_results: Whether to save results to files Returns: List of evaluation results for each language Raises: EvaluationError: If evaluation fails """ if languages is None: # JuaKazi languages: EN (production), SW (foundation), FR/KI (pending validation) languages = [Language.ENGLISH, Language.SWAHILI, Language.FRENCH, Language.GIKUYU] results = [] try: for language in languages: print(f"Evaluating {language.value}...") result = self._evaluate_language(language) results.append(result) # Print immediate results lang_names = { Language.ENGLISH: "English", Language.SWAHILI: "Swahili", Language.FRENCH: "French", Language.GIKUYU: "Gikuyu" } lang_name = lang_names.get(language, language.value) print(f"{lang_name} Results:") print(f" Overall F1: {result.overall_metrics.f1_score:.3f}") print(f" Precision: {result.overall_metrics.precision:.3f}") print(f" Recall: {result.overall_metrics.recall:.3f}") print() if save_results: self._save_results(results) return results except Exception as e: raise EvaluationError(f"Evaluation failed: {e}") from e def _evaluate_language(self, language: Language) -> LanguageEvaluationResult: """Evaluate bias detection for a single language.""" try: # Load ground truth data ground_truth = self.ground_truth_loader.load_ground_truth(language) # Run bias detection on all samples predictions = [] for sample in ground_truth: prediction = self.bias_detector.detect_bias(sample.text, language) predictions.append(prediction) # Calculate metrics result = self.metrics_calculator.calculate_language_metrics( ground_truth, predictions, language ) return result except (DataLoadError, BiasDetectionError) as e: raise EvaluationError(f"Failed to evaluate {language}: {e}") from e def _save_results(self, results: List[LanguageEvaluationResult]) -> None: """Save evaluation results to files.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") try: # Save CSV 
report csv_data = self.metrics_formatter.format_for_csv(results) csv_filename = f"f1_report_{timestamp}.csv" csv_path = self.results_writer.write_csv_report(csv_data, csv_filename) print(f"Report saved to: {csv_path}") except Exception as e: print(f"Warning: Failed to save results: {e}") def main() -> None: """Main entry point for evaluation script.""" try: print("Running bias detection evaluation...") orchestrator = BiasEvaluationOrchestrator() results = orchestrator.run_evaluation() print("Evaluation completed successfully!") except EvaluationError as e: print(f"Evaluation failed: {e}") exit(1) except KeyboardInterrupt: print("\nEvaluation interrupted by user") exit(1) except Exception as e: print(f"Unexpected error: {e}") exit(1) if __name__ == "__main__": main()
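
# Illustrative usage sketch: the import paths below are placeholders (the actual
# package name depends on where this module lives in the project), and only the
# public API defined above is exercised.
#
#     from pathlib import Path
#     from bias_eval.evaluation import BiasEvaluationOrchestrator  # placeholder package path
#     from bias_eval.models import Language                        # placeholder package path
#
#     orchestrator = BiasEvaluationOrchestrator(
#         data_dir=Path("eval"),
#         rules_dir=Path("rules"),
#         results_dir=Path("eval/results"),
#     )
#     # Evaluate only the production and foundation languages, without writing a CSV report.
#     results = orchestrator.run_evaluation(
#         languages=[Language.ENGLISH, Language.SWAHILI],
#         save_results=False,
#     )
#     for result in results:
#         print(f"{result.overall_metrics.f1_score:.3f}")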