Spaces:

XQ
/

Dokumentassistent

Running

Dokumentassistent / scripts / render_eval_table.py

Add evaluation and update README

a493f04 about 1 month ago

5.75 kB

	"""Render an evaluation matrix as a Markdown table from RAGAS result JSONs.

	Reads one or more ``eval/runs/<ts>_<cell>.json`` (or ``.rejudged.json``)
	files, extracts the per-cell ``config`` + ``aggregate`` blocks, and prints a
	single comparison table suitable for pasting into a README.

	The cell with the highest score for each metric is bolded in the rendered
	table, so the "winning" configuration on each axis is visible at a glance.

	Usage:
	# Pass file paths directly (preserves the order you provide):
	python -m scripts.render_eval_table \\
	eval/runs/20260409_031715_fixed_react.rejudged.json \\
	eval/runs/20260409_031853_recursive_react.rejudged.json \\
	eval/runs/20260409_031911_semantic_react.rejudged.json \\
	eval/runs/20260409_031933_recursive_pipeline.rejudged.json

	# Or pass a shell glob via --pattern (sorted by filename):
	python -m scripts.render_eval_table --pattern 'eval/runs/20260409_03*.rejudged.json'
	"""

	import argparse
	import glob
	import json
	import os
	import sys
	from typing import Any

	# Ordered (RAGAS metric key, short column header) pairs for the metric
	# columns of the rendered table. The list order is the column order.
	# The first element is the key looked up in each cell's ``aggregate``
	# block; the second is the header text printed for that column.
	# Metrics not listed here are skipped from the table; the raw JSON still
	# contains them.
	_METRIC_DISPLAY: list[tuple[str, str]] = [
	("faithfulness", "Faith"),
	("answer_relevancy", "Ans.Rel"),
	("llm_context_precision_with_reference", "Ctx.Prec"),
	("context_recall", "Ctx.Recall"),
	("answer_correctness", "Ans.Corr"),
	("factual_correctness(mode=f1)", "Fact.Corr"),
	]

	# Headers of the leading, non-metric columns. They are filled from each
	# cell's ``config`` block (name, chunking, router, top_k) and always come
	# before the metric columns, so len(_FIXED_HEADERS) is the index of the
	# first metric column in every row.
	_FIXED_HEADERS: list[str] = ["Config", "Chunking", "Router", "top_k"]


	def _load_cells(paths: list[str]) -> list[dict[str, Any]]:
	    """Load and validate the requested result JSON files.

	    Each file must contain a JSON object with ``config`` and ``aggregate``
	    keys. Anything else (a missing or unreadable path, invalid JSON, a
	    top-level value that is not an object, or a missing key) is reported
	    on stderr and skipped, so one bad file never aborts the whole table.

	    Args:
	        paths: List of JSON file paths.

	    Returns:
	        List of cell dicts with keys ``path``, ``config``, ``aggregate``,
	        in the same order as ``paths`` minus the skipped files.
	    """
	    cells: list[dict[str, Any]] = []
	    for path in paths:
	        # Open first and handle the failure, rather than checking for
	        # existence up front: this also covers directories, permission
	        # errors and files that vanish between the check and the open.
	        try:
	            with open(path, "r", encoding="utf-8") as fh:
	                data = json.load(fh)
	        except FileNotFoundError:
	            print(f"WARN: skipping missing file {path}", file=sys.stderr)
	            continue
	        except (OSError, ValueError) as exc:
	            # ValueError covers both json.JSONDecodeError (invalid JSON)
	            # and UnicodeDecodeError (file is not valid UTF-8).
	            print(
	                f"WARN: skipping unreadable file {path}: {exc}",
	                file=sys.stderr,
	            )
	            continue

	        # A top-level list/number/string cannot carry the two blocks, so
	        # treat it the same way as an object that lacks them.
	        has_blocks = (
	            isinstance(data, dict)
	            and "config" in data
	            and "aggregate" in data
	        )
	        if not has_blocks:
	            print(
	                f"WARN: {path} is missing 'config' or 'aggregate' — skipping",
	                file=sys.stderr,
	            )
	            continue

	        cells.append(
	            {
	                "path": path,
	                "config": data["config"],
	                "aggregate": data["aggregate"],
	            }
	        )
	    return cells


	def _format_table(cells: list[dict[str, Any]]) -> str:
	    """Format cells as a Markdown comparison table with per-metric winners bolded.

	    Each cell becomes one row: the fixed config columns first (missing
	    config keys render as ``?``), then one column per entry of
	    ``_METRIC_DISPLAY``. Numeric metric values are shown with three
	    decimals; a missing or non-numeric metric renders as an em dash.

	    The winner of a metric column is decided on the *displayed* value, so
	    every row that shows the top three-decimal score is wrapped in
	    ``**...**``; ties are all bolded.

	    Args:
	        cells: List of cell dicts produced by ``_load_cells``.

	    Returns:
	        Markdown string ready to paste into a README, or the placeholder
	        ``(no cells to render)`` when ``cells`` is empty.
	    """
	    if not cells:
	        return "(no cells to render)"

	    header = list(_FIXED_HEADERS) + [display for _, display in _METRIC_DISPLAY]

	    # Build rows AND track the best displayed score per metric so we can
	    # bold the winner afterwards.
	    rows: list[list[str]] = []
	    best_shown: dict[str, float] = {}
	    for cell in cells:
	        cfg = cell["config"]
	        agg = cell["aggregate"]
	        row: list[str] = [
	            str(cfg.get("name", "?")),
	            str(cfg.get("chunking", "?")),
	            str(cfg.get("router", "?")),
	            str(cfg.get("top_k", "?")),
	        ]
	        for metric_key, _display in _METRIC_DISPLAY:
	            value = agg.get(metric_key)
	            if isinstance(value, (int, float)):
	                text = f"{value:.3f}"
	                row.append(text)
	                # Track the rounded value, not the raw one: the second
	                # pass only sees the formatted text, and comparing it to
	                # an unrounded best (0.823 vs 0.8234) would never match.
	                shown = float(text)
	                if shown > best_shown.get(metric_key, float("-inf")):
	                    best_shown[metric_key] = shown
	            else:
	                row.append("—")
	        rows.append(row)

	    # Bold the winning cell(s) for each metric column.
	    first_metric_col = len(_FIXED_HEADERS)
	    for row in rows:
	        for col_idx, (metric_key, _display) in enumerate(_METRIC_DISPLAY):
	            cell_idx = first_metric_col + col_idx
	            value_str = row[cell_idx]
	            if value_str == "—":
	                continue
	            # .get() yields None when no row had a comparable value for
	            # this metric (e.g. only NaN), in which case nothing is bolded.
	            if float(value_str) == best_shown.get(metric_key):
	                row[cell_idx] = f"**{value_str}**"

	    # Plain pipes: a backslash-escaped pipe would be emitted literally and
	    # Markdown would no longer recognise the lines as a table.
	    lines: list[str] = [
	        "| " + " | ".join(header) + " |",
	        "|" + "|".join("---" for _ in header) + "|",
	    ]
	    lines.extend("| " + " | ".join(row) + " |" for row in rows)
	    return "\n".join(lines)


	def parse_args() -> argparse.Namespace:
	    """Build the command-line interface and parse ``sys.argv``.

	    Returns:
	        Namespace with ``files`` (list of positional paths, possibly
	        empty) and ``pattern`` (glob string, empty when not given).
	    """
	    cli = argparse.ArgumentParser(
	        description="Render a RAGAS evaluation matrix as a Markdown table."
	    )

	    # Positional paths: zero or more, kept in the order typed.
	    cli.add_argument(
	        "files",
	        help="Result JSON files to include (preserves the order given).",
	        nargs="*",
	    )

	    # Optional glob alternative to listing the files one by one.
	    cli.add_argument(
	        "--pattern",
	        help="Shell glob to match files (sorted by filename).",
	        default="",
	        type=str,
	    )

	    namespace = cli.parse_args()
	    return namespace


	def main() -> None:
	    """Resolve the input files, load them, and print the Markdown table.

	    When ``--pattern`` is given it takes precedence over positional
	    paths and the glob matches are sorted by filename; otherwise the
	    positional paths are used in the order given. Exits with status 1
	    (message on stderr) when there are no input files or none of them
	    yields a valid cell.
	    """
	    options = parse_args()

	    selected = (
	        sorted(glob.glob(options.pattern))
	        if options.pattern
	        else list(options.files)
	    )
	    if not selected:
	        print(
	            "ERROR: no files provided. Pass file paths as arguments or use --pattern.",
	            file=sys.stderr,
	        )
	        sys.exit(1)

	    loaded = _load_cells(selected)
	    if not loaded:
	        print("ERROR: no valid cells loaded.", file=sys.stderr)
	        sys.exit(1)

	    table = _format_table(loaded)
	    print(table)


	# Script entry point: call main() only when this module is executed
	# directly (e.g. ``python -m scripts.render_eval_table``), not on import.
	if __name__ == "__main__":
	main()