#!/usr/bin/env python3
"""
Simple script to run model evaluation on CNN/DailyMail dataset
"""
import logging
import os
import sys
from pathlib import Path

# Add project root to path so the local `evaluation` package resolves when
# this script is executed directly (not via `python -m`).
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from evaluation.dataset_loader import CNNDailyMailLoader
from evaluation.model_evaluator import ModelEvaluator
from evaluation.results_analyzer import ResultsAnalyzer

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
SAMPLE_SIZE = 50        # Number of samples to evaluate overall
MIN_TOPIC_SIZE = 5      # Only evaluate topics with at least this many articles
TOPIC_MAX_SAMPLES = 20  # Per-topic evaluation cap
OUTPUT_DIR = "evaluation_results"


def _prepare_data(loader):
    """Load the dataset, build the evaluation subset, and split it by topic.

    Returns:
        tuple: (eval_data, categorized_data) where categorized_data maps
        topic name -> list of articles. Both are also saved as JSON files
        under OUTPUT_DIR as a side effect.
    """
    logger.info("Step 1: Loading CNN/DailyMail dataset...")
    loader.load_dataset()

    logger.info("Step 2: Creating evaluation subset...")
    eval_data = loader.create_evaluation_subset(size=SAMPLE_SIZE)
    loader.save_evaluation_data(eval_data, f"{OUTPUT_DIR}/eval_data.json")

    logger.info("Step 3: Categorizing by topics...")
    categorized_data = loader.categorize_by_topic(eval_data)
    # Persist each non-empty topic bucket for later inspection.
    for topic, data in categorized_data.items():
        if data:
            loader.save_evaluation_data(data, f"{OUTPUT_DIR}/data_{topic}.json")
            logger.info(" %s: %d articles", topic, len(data))
    return eval_data, categorized_data


def _evaluate_overall(evaluator, eval_data):
    """Run every model over the full evaluation subset and print a summary.

    Saves detailed JSON results plus a CSV comparison table under OUTPUT_DIR.

    Returns:
        The detailed results object produced by evaluate_all_models.
    """
    logger.info("Step 5: Running overall evaluation...")
    overall_results = evaluator.evaluate_all_models(eval_data, max_samples=SAMPLE_SIZE)
    evaluator.save_results(overall_results, f"{OUTPUT_DIR}/results_overall.json")

    comparison_df = evaluator.compare_models(overall_results)
    comparison_df.to_csv(f"{OUTPUT_DIR}/comparison_overall.csv", index=False)
    print("\n" + "=" * 60)
    print("OVERALL EVALUATION RESULTS")
    print("=" * 60)
    print(comparison_df.to_string(index=False))
    return overall_results


def _evaluate_topics(evaluator, categorized_data):
    """Evaluate each topic bucket that has enough articles.

    Topics smaller than MIN_TOPIC_SIZE are skipped; each evaluated topic is
    capped at TOPIC_MAX_SAMPLES. Per-topic JSON and CSV files are written to
    OUTPUT_DIR.

    Returns:
        dict: topic name -> detailed results (empty if no topic qualified).
    """
    logger.info("Step 6: Running topic-based evaluation...")
    topic_results = {}
    for topic, data in categorized_data.items():
        if len(data) < MIN_TOPIC_SIZE:
            # Too few samples for a meaningful per-topic score.
            continue
        logger.info(" Evaluating topic: %s", topic)
        results = evaluator.evaluate_all_models(data, max_samples=TOPIC_MAX_SAMPLES)
        topic_results[topic] = results
        evaluator.save_results(results, f"{OUTPUT_DIR}/results_{topic}.json")

        topic_comparison = evaluator.compare_models(results)
        topic_comparison.to_csv(f"{OUTPUT_DIR}/comparison_{topic}.csv", index=False)
        print(f"\n{topic.upper()} TOPIC RESULTS:")
        print("-" * 40)
        print(topic_comparison.to_string(index=False))
    return topic_results


def _report(overall_results, topic_results):
    """Create charts and the detailed report, then print the output manifest."""
    logger.info("Step 7: Creating analysis and visualizations...")
    analyzer = ResultsAnalyzer()
    analyzer.create_performance_charts(overall_results, OUTPUT_DIR)
    # Topic analysis only makes sense when at least one topic was evaluated.
    if topic_results:
        analyzer.analyze_topic_performance(topic_results, OUTPUT_DIR)
    analyzer.create_detailed_report(overall_results, OUTPUT_DIR)

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETE")
    print("=" * 60)
    print(f"Results saved to: {OUTPUT_DIR}/")
    print("Files created:")
    print(" - results_overall.json (detailed results)")
    print(" - comparison_overall.csv (summary table)")
    print(" - performance_comparison.png (charts)")
    print(" - evaluation_report.md (detailed report)")
    if topic_results:
        print(" - topic_performance_heatmap.png (topic analysis)")
        print(" - topic_summary.csv (topic breakdown)")


def main():
    """Run comprehensive evaluation.

    Orchestrates the full pipeline: data preparation, model initialization,
    overall and per-topic evaluation, and report generation. Any failure is
    logged with its traceback and re-raised so the process exits non-zero.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info("Starting Smart Summarizer Evaluation")
    logger.info("Sample size: %d", SAMPLE_SIZE)

    try:
        loader = CNNDailyMailLoader()
        eval_data, categorized_data = _prepare_data(loader)

        logger.info("Step 4: Initializing models...")
        evaluator = ModelEvaluator()
        evaluator.initialize_models()

        overall_results = _evaluate_overall(evaluator, eval_data)
        topic_results = _evaluate_topics(evaluator, categorized_data)
        _report(overall_results, topic_results)
    except Exception as e:
        # logger.exception records the traceback, not just the message.
        logger.exception("Evaluation failed: %s", e)
        raise


if __name__ == "__main__":
    main()