Spaces:
Running
Running
File size: 4,717 Bytes
"""Run full RAGAS evaluation on generated responses and produce a comparison table."""
import json
import os
import sys
from pathlib import Path
import yaml
from loguru import logger
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.evaluation.ragas_eval import run_ragas_evaluation
from src.evaluation.report import generate_report
from src.evaluation.classifier_eval import generate_comparison_table, measure_inference_time
from src.models import baseline as baseline_mod
from src.models.intent_classifier import IntentClassifier
def _get_model_size_mb(path: str) -> float:
    """Return the on-disk size of a model artifact in MB (bytes / 1024**2).

    Args:
        path: Either a directory, in which case every regular file beneath
            it is summed recursively, or a single file (for example a model
            serialized to one file), in which case that file's size is used.

    Returns:
        The total size in bytes divided by ``1024 * 1024``. A path that is
        neither a file nor a directory (e.g. it does not exist) gives ``0.0``.
    """
    root = Path(path)
    if root.is_file():
        # rglob() yields nothing for a non-directory, so a single-file model
        # would otherwise be reported as 0.0 MB.
        candidates = [root]
    elif root.is_dir():
        candidates = root.rglob("*")
    else:
        return 0.0

    total_bytes = 0
    for entry in candidates:
        try:
            if entry.is_file():
                total_bytes += entry.stat().st_size
        except OSError:
            # Best effort: an entry that disappears or cannot be stat'ed is
            # skipped instead of aborting the size computation.
            continue
    return total_bytes / (1024 * 1024)
def _read_json(path: Path):
    """Parse the JSON file at *path* (read as UTF-8) and return the result."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _read_json_if_exists(path: Path):
    """Return the parsed JSON at *path*, or ``None`` when the file is absent."""
    return _read_json(path) if path.exists() else None


def _subsample(results: list, n: int, seed: int = 42) -> list:
    """Return *results* unchanged if it has at most *n* items, else *n* of them.

    The draw uses a private ``random.Random(seed)`` so it is reproducible
    without reseeding the process-wide generator; it selects the same items
    as ``random.seed(seed); random.sample(results, n)``.
    """
    if len(results) <= n:
        return results
    import random

    return random.Random(seed).sample(results, n)


def _log_target_checks(ragas_output: dict, eval_cfg: dict) -> None:
    """Log PASS/FAIL lines for the RAGAS targets and the flagged-response rate.

    Args:
        ragas_output: Mapping read for the ``"aggregate"`` and
            ``"pct_flagged"`` keys.
        eval_cfg: The ``evaluation`` section of the config; supplies
            ``target_faithfulness`` and ``target_answer_relevancy``.
    """
    aggregate = ragas_output.get("aggregate", {})
    targets = (
        ("faithfulness", eval_cfg["target_faithfulness"]),
        ("answer_relevancy", eval_cfg["target_answer_relevancy"]),
    )
    for metric, target in targets:
        # Metrics absent from the aggregate are skipped, not reported as FAIL.
        if metric not in aggregate:
            continue
        mean = aggregate[metric]["mean"]
        status = "PASS" if mean >= target else "FAIL"
        logger.info(f"[{status}] {metric}: {mean:.4f} (target >= {target})")

    # A missing "pct_flagged" defaults to 100.0 and is therefore logged as FAIL.
    pct_flagged = ragas_output.get("pct_flagged", 100.0)
    flag_status = "PASS" if pct_flagged <= 5.0 else "FAIL"
    logger.info(f"[{flag_status}] Flagged responses: {pct_flagged:.1f}% (target <= 5%)")


def main() -> None:
    """Run RAGAS evaluation and generate comparison table.

    Steps, in order:
      1. Add a rotating log sink and load ``config/config.yaml``.
      2. Load ``generation_results.json`` from the configured results
         directory and subsample it to ``evaluation.ragas_sample_size``.
      3. Run the RAGAS evaluation on the sample.
      4. Load whichever classification reports exist, time inference for the
         corresponding models on the test split and, when both reports are
         present, write the baseline-vs-DistilBERT comparison table.
      5. Generate the final report and log PASS/FAIL against the targets.

    Raises:
        SystemExit: With code 1 when the generation results file is missing.
    """
    Path("logs").mkdir(exist_ok=True)
    logger.add("logs/run_evaluation.log", rotation="10 MB")

    with open("config/config.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    results_dir = cfg["paths"]["results"]

    # Load generation results
    results_path = Path(results_dir) / "generation_results.json"
    if not results_path.exists():
        logger.error(f"Generation results not found at {results_path}. Run run_generation.py first.")
        sys.exit(1)
    results = _read_json(results_path)

    # Subsample to target size for RAGAS (it can be slow)
    eval_cfg = cfg["evaluation"]
    results_sample = _subsample(results, eval_cfg["ragas_sample_size"])

    # Run RAGAS
    ragas_output = run_ragas_evaluation(
        results=results_sample,
        results_dir=results_dir,
        faithfulness_threshold=eval_cfg["faithfulness_flag_threshold"],
    )

    # Load classification reports (either may be missing)
    baseline_report = _read_json_if_exists(
        Path(results_dir) / "baseline_classification_report.json"
    )
    distilbert_report = _read_json_if_exists(
        Path(results_dir) / "classification_report.json"
    )

    # Measure inference times. The test split is only needed for timing, so
    # it is loaded only when at least one report is available.
    b_time_ms, d_time_ms = 0.0, 0.0
    if baseline_report or distilbert_report:
        from src.data.dataset import load_splits

        _, _, test_df = load_splits(cfg["paths"]["data_processed"])
        texts = test_df["text"].tolist()

        if baseline_report:
            try:
                pipeline = baseline_mod.load_pipeline(cfg["paths"]["models_baseline"])
                b_time_ms = measure_inference_time(pipeline.predict, texts)
            except Exception as e:
                # Timing is best effort; the table falls back to 0.0 ms.
                logger.warning(f"Could not measure baseline inference time: {e}")

        if distilbert_report:
            try:
                model_dir = str(Path(cfg["paths"]["models_distilbert"]) / "best")
                clf = IntentClassifier(
                    model_dir=model_dir,
                    max_length=cfg["classifier"]["max_length"],
                )
                d_time_ms = measure_inference_time(clf.predict_batch, texts)
            except Exception as e:
                # Timing is best effort; the table falls back to 0.0 ms.
                logger.warning(f"Could not measure DistilBERT inference time: {e}")

    # Comparison table (model sizes are only consumed here)
    if baseline_report and distilbert_report:
        generate_comparison_table(
            baseline_report=baseline_report,
            distilbert_report=distilbert_report,
            baseline_inference_ms=b_time_ms,
            distilbert_inference_ms=d_time_ms,
            baseline_size_mb=_get_model_size_mb(cfg["paths"]["models_baseline"]),
            distilbert_size_mb=_get_model_size_mb(cfg["paths"]["models_distilbert"]),
            results_dir=results_dir,
        )

    # Final report
    generate_report(results_dir=results_dir, ragas_output=ragas_output)

    # Check RAGAS targets
    _log_target_checks(ragas_output, eval_cfg)
# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|