Spaces:

pro580
/

customer-support-agent

Running

App Files Files Community

customer-support-agent / src /evaluation /report.py

pro580

Fix rate limiter to use X-Forwarded-For header behind HF proxy

e323466 3 months ago

Raw

History Blame Contribute Delete

3.49 kB

	"""Generate a combined evaluation report summarizing all results."""

	import json
	from pathlib import Path
	from typing import Dict, Optional

	from loguru import logger


	def generate_report(
	    results_dir: str,
	    baseline_report: Optional[Dict] = None,
	    distilbert_report: Optional[Dict] = None,
	    ragas_output: Optional[Dict] = None,
	) -> str:
	    """Generate a markdown summary report of all evaluation results.

	    The report is written to ``<results_dir>/evaluation_report.md`` as UTF-8
	    and is also returned. A section is omitted when its data is unavailable
	    (argument is None/empty and no artifact file is found).

	    Args:
	        results_dir: Directory containing result artifacts. Created if it
	            does not exist so the report can always be written.
	        baseline_report: Baseline classification report dict (optional, loads
	            from ``baseline_classification_report.json`` if None).
	        distilbert_report: DistilBERT classification report dict (optional,
	            loads from ``classification_report.json`` if None).
	        ragas_output: RAGAS evaluation output dict (optional, loads from
	            ``ragas_scores.json`` if None).

	    Returns:
	        Markdown report string.

	    Raises:
	        json.JSONDecodeError: If an artifact file exists but is not valid JSON.
	    """
	    base = Path(results_dir)

	    # Explicit ``is None`` checks: an empty dict passed by the caller is
	    # respected and never replaced by file contents.
	    if baseline_report is None:
	        baseline_report = _load_json_if_exists(
	            base / "baseline_classification_report.json"
	        )
	    if distilbert_report is None:
	        distilbert_report = _load_json_if_exists(base / "classification_report.json")
	    if ragas_output is None:
	        ragas_output = _load_json_if_exists(base / "ragas_scores.json")

	    lines = [
	        "# Customer Support Agent — Evaluation Report",
	        "",
	        "## Classification Results",
	        "",
	    ]
	    lines += _classification_section(
	        "### Baseline (TF-IDF + Logistic Regression)", baseline_report
	    )
	    lines += _classification_section("### DistilBERT Fine-tuned", distilbert_report)

	    if ragas_output and "aggregate" in ragas_output:
	        # A JSON ``"aggregate": null`` would otherwise break ``.items()``.
	        agg = ragas_output["aggregate"] or {}
	        # Missing key keeps the historical "0.0%" rendering; a null or
	        # non-numeric value is shown as-is instead of raising.
	        pct_flagged = _format_metric(ragas_output.get("pct_flagged", 0.0), 1)
	        lines += [
	            "## RAGAS Evaluation",
	            "",
	            f"- Queries evaluated: {ragas_output.get('n_evaluated', 'N/A')}",
	            f"- Flagged (low faithfulness): {ragas_output.get('n_flagged', 'N/A')} "
	            f"({pct_flagged}%)",
	            "",
	        ]
	        for metric, stats in agg.items():
	            # ``.get`` so that an absent or null statistic renders as "N/A"
	            # rather than aborting the whole report.
	            mean, median, std, low, high = (
	                _format_metric(stats.get(key), 4)
	                for key in ("mean", "median", "std", "min", "max")
	            )
	            lines += [
	                f"### {str(metric).replace('_', ' ').title()}",
	                f"- Mean: {mean}",
	                f"- Median: {median}",
	                f"- Std: {std}",
	                f"- Min / Max: {low} / {high}",
	                "",
	            ]

	    report = "\n".join(lines)
	    # The directory may not exist yet when every report was passed in-memory.
	    base.mkdir(parents=True, exist_ok=True)
	    path = base / "evaluation_report.md"
	    # Explicit UTF-8: the title contains a non-ASCII em dash, so relying on
	    # the platform default encoding is not portable.
	    path.write_text(report, encoding="utf-8")
	    logger.info(f"Saved evaluation report → {path}")
	    return report


	def _load_json_if_exists(path: Path) -> Optional[Dict]:
	    """Return the parsed JSON content of *path*, or None if the file is absent."""
	    try:
	        with path.open(encoding="utf-8") as f:
	            return json.load(f)
	    except FileNotFoundError:
	        return None


	def _format_metric(value: object, digits: int = 4) -> str:
	    """Render *value* with *digits* decimals if numeric, else as plain text."""
	    if value is None:
	        return "N/A"
	    # bool is a subclass of int but is not a metric value; print it verbatim.
	    if isinstance(value, (int, float)) and not isinstance(value, bool):
	        return f"{value:.{digits}f}"
	    return str(value)


	def _classification_section(heading: str, report: Optional[Dict]) -> list:
	    """Build the markdown lines for one classifier, or [] if *report* is empty."""
	    if not report:
	        return []
	    # Reads the "weighted avg" -> "f1-score" and top-level "accuracy" keys;
	    # presumably the sklearn classification_report(output_dict=True) layout —
	    # confirm against the code that produces these artifacts.
	    weighted = report.get("weighted avg") or {}
	    f1 = _format_metric(weighted.get("f1-score", "N/A"))
	    acc = _format_metric(report.get("accuracy", "N/A"))
	    return [
	        heading,
	        f"- Weighted F1: {f1}",
	        f"- Accuracy: {acc}",
	        "",
	    ]