Spaces:
Running
Running
| """Run full RAGAS evaluation on generated responses and produce comparison table.""" | |
| import json | |
| import os | |
| import sys | |
| from pathlib import Path | |
| import yaml | |
| from loguru import logger | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from src.evaluation.ragas_eval import run_ragas_evaluation | |
| from src.evaluation.report import generate_report | |
| from src.evaluation.classifier_eval import generate_comparison_table, measure_inference_time | |
| from src.models import baseline as baseline_mod | |
| from src.models.intent_classifier import IntentClassifier | |
| def _get_model_size_mb(path: str) -> float: | |
| """Return total size of all files in a directory in MB.""" | |
| total = 0 | |
| for p in Path(path).rglob("*"): | |
| if p.is_file(): | |
| try: | |
| total += p.stat().st_size | |
| except OSError: | |
| pass | |
| return total / (1024 * 1024) | |
def _load_json_if_exists(path: Path):
    """Return the parsed JSON document stored at *path*, or ``None`` if the file is absent."""
    if not path.exists():
        return None
    # JSON interchange is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _log_target_checks(cfg: dict, ragas_output: dict) -> None:
    """Log a PASS/FAIL line for each RAGAS target and for the flagged-response rate.

    A metric missing from ``ragas_output["aggregate"]`` is skipped silently.
    A missing ``pct_flagged`` value defaults to 100.0 and is therefore
    reported as FAIL.
    """
    agg = ragas_output.get("aggregate", {})
    targets = (
        ("faithfulness", cfg["evaluation"]["target_faithfulness"]),
        ("answer_relevancy", cfg["evaluation"]["target_answer_relevancy"]),
    )
    for metric, target in targets:
        if metric not in agg:
            continue
        mean = agg[metric]["mean"]
        status = "PASS" if mean >= target else "FAIL"
        logger.info(f"[{status}] {metric}: {mean:.4f} (target >= {target})")

    pct_flagged = ragas_output.get("pct_flagged", 100.0)
    flag_status = "PASS" if pct_flagged <= 5.0 else "FAIL"
    logger.info(f"[{flag_status}] Flagged responses: {pct_flagged:.1f}% (target <= 5%)")


def main() -> None:
    """Run RAGAS evaluation and generate comparison table.

    Steps, in order:

    1. Read ``config/config.yaml`` and the generation results; exit with
       status 1 if ``generation_results.json`` is missing.
    2. Subsample the results to ``evaluation.ragas_sample_size`` (fixed seed)
       and run the RAGAS evaluation on the sample.
    3. Load whichever classification reports exist and time inference for
       the corresponding models on the test split.
    4. When both reports exist, write the baseline-vs-DistilBERT comparison
       table.
    5. Generate the final report and log PASS/FAIL against the configured
       targets.

    All relative paths (``logs/``, ``config/``) are resolved against the
    current working directory.
    """
    Path("logs").mkdir(exist_ok=True)
    logger.add("logs/run_evaluation.log", rotation="10 MB")

    with open("config/config.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    results_dir = cfg["paths"]["results"]

    # Load generation results
    results_path = Path(results_dir) / "generation_results.json"
    if not results_path.exists():
        logger.error(f"Generation results not found at {results_path}. Run run_generation.py first.")
        sys.exit(1)
    with open(results_path, encoding="utf-8") as f:
        results = json.load(f)

    # Subsample to target size for RAGAS (it can be slow)
    n = cfg["evaluation"]["ragas_sample_size"]
    if len(results) > n:
        import random

        # Fixed seed so repeated runs evaluate the same subset.
        random.seed(42)
        results_sample = random.sample(results, n)
    else:
        results_sample = results

    # Run RAGAS
    ragas_output = run_ragas_evaluation(
        results=results_sample,
        results_dir=results_dir,
        faithfulness_threshold=cfg["evaluation"]["faithfulness_flag_threshold"],
    )

    # Load classification reports (either may be absent)
    baseline_report = _load_json_if_exists(
        Path(results_dir) / "baseline_classification_report.json"
    )
    distilbert_report = _load_json_if_exists(
        Path(results_dir) / "classification_report.json"
    )

    # Measure inference times. The test split is only needed for timing, so it
    # is loaded only when at least one report is present; a missing processed
    # dataset then cannot abort a RAGAS-only run.
    b_time_ms, d_time_ms = 0.0, 0.0
    if baseline_report or distilbert_report:
        from src.data.dataset import load_splits

        _, _, test_df = load_splits(cfg["paths"]["data_processed"])
        texts = test_df["text"].tolist()

        if baseline_report:
            try:
                pipeline = baseline_mod.load_pipeline(cfg["paths"]["models_baseline"])
                b_time_ms = measure_inference_time(pipeline.predict, texts)
            except Exception as e:
                # Timing is best effort; the table falls back to 0.0 ms.
                logger.warning(f"Could not measure baseline inference time: {e}")

        if distilbert_report:
            try:
                model_dir = str(Path(cfg["paths"]["models_distilbert"]) / "best")
                clf = IntentClassifier(model_dir=model_dir, max_length=cfg["classifier"]["max_length"])
                d_time_ms = measure_inference_time(clf.predict_batch, texts)
            except Exception as e:
                # Timing is best effort; the table falls back to 0.0 ms.
                logger.warning(f"Could not measure DistilBERT inference time: {e}")

    # Comparison table (requires both reports); model sizes are only needed here
    if baseline_report and distilbert_report:
        generate_comparison_table(
            baseline_report=baseline_report,
            distilbert_report=distilbert_report,
            baseline_inference_ms=b_time_ms,
            distilbert_inference_ms=d_time_ms,
            baseline_size_mb=_get_model_size_mb(cfg["paths"]["models_baseline"]),
            distilbert_size_mb=_get_model_size_mb(cfg["paths"]["models_distilbert"]),
            results_dir=results_dir,
        )

    # Final report
    generate_report(results_dir=results_dir, ragas_output=ragas_output)

    # Check RAGAS targets
    _log_target_checks(cfg, ragas_output)
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()