# Dokumentassistent / scripts/render_eval_table.py
# Commit a493f04 by XQ: "Add evaluation and update README" (5.75 kB)
"""Render an evaluation matrix as a Markdown table from RAGAS result JSONs.
Reads one or more ``eval/runs/<ts>_<cell>.json`` (or ``.rejudged.json``)
files, extracts the per-cell ``config`` + ``aggregate`` blocks, and prints a
single comparison table suitable for pasting into a README.
The cell with the highest score for each metric is bolded in the rendered
table, so the "winning" configuration on each axis is visible at a glance.
Usage:
# Pass file paths directly (preserves the order you provide):
python -m scripts.render_eval_table \\
eval/runs/20260409_031715_fixed_react.rejudged.json \\
eval/runs/20260409_031853_recursive_react.rejudged.json \\
eval/runs/20260409_031911_semantic_react.rejudged.json \\
eval/runs/20260409_031933_recursive_pipeline.rejudged.json
# Or pass a shell glob via --pattern (sorted by filename):
python -m scripts.render_eval_table --pattern 'eval/runs/20260409_03*.rejudged.json'
"""
import argparse
import glob
import json
import os
import sys
from typing import Any
# Metric columns, in the order they should appear in the rendered table.
# Each entry pairs the metric's key in a cell's ``aggregate`` block (the RAGAS
# metric column name) with the short header shown for that column.
# Metrics not listed here are omitted from the table; the raw JSON still
# contains them.
_METRIC_DISPLAY: list[tuple[str, str]] = [
    ("faithfulness", "Faith"),
    ("answer_relevancy", "Ans.Rel"),
    ("llm_context_precision_with_reference", "Ctx.Prec"),
    ("context_recall", "Ctx.Recall"),
    ("answer_correctness", "Ans.Corr"),
    ("factual_correctness(mode=f1)", "Fact.Corr"),
]
# Headers of the leading configuration columns. ``_format_table`` fills these
# from the ``name``, ``chunking``, ``router`` and ``top_k`` keys of each cell's
# ``config`` block, in that order.
_FIXED_HEADERS: list[str] = ["Config", "Chunking", "Router", "top_k"]
def _load_cells(paths: list[str]) -> list[dict[str, Any]]:
    """Load and validate the requested result JSON files.

    Args:
        paths: List of JSON file paths.

    Returns:
        List of cell dicts with keys ``path``, ``config``, ``aggregate``, in
        the same order as ``paths``. A file is skipped with a warning on
        stderr (never an exception) when it is missing, cannot be read, is
        not valid UTF-8 JSON, is not a JSON object, or lacks object-valued
        ``config`` / ``aggregate`` entries.
    """
    cells: list[dict[str, Any]] = []
    for path in paths:
        if not os.path.exists(path):
            print(f"WARN: skipping missing file {path}", file=sys.stderr)
            continue

        try:
            with open(path, "r", encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as exc:
            # OSError: path is a directory, permission denied, ...
            # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
            print(
                f"WARN: skipping unreadable or malformed file {path}: {exc}",
                file=sys.stderr,
            )
            continue

        # A top-level list/number/null cannot carry the expected blocks; the
        # isinstance check also keeps the ``in`` tests from raising TypeError.
        if (
            not isinstance(data, dict)
            or "config" not in data
            or "aggregate" not in data
        ):
            print(
                f"WARN: {path} is missing 'config' or 'aggregate' — skipping",
                file=sys.stderr,
            )
            continue

        config = data["config"]
        aggregate = data["aggregate"]
        # The table renderer calls ``.get`` on both blocks, so anything other
        # than a JSON object here would crash later; reject it up front.
        if not isinstance(config, dict) or not isinstance(aggregate, dict):
            print(
                f"WARN: {path} has a non-object 'config' or 'aggregate' — skipping",
                file=sys.stderr,
            )
            continue

        cells.append({"path": path, "config": config, "aggregate": aggregate})
    return cells
def _format_table(cells: list[dict[str, Any]]) -> str:
    """Format cells as a Markdown comparison table with per-metric winners bolded.

    Args:
        cells: List of cell dicts produced by ``_load_cells``.

    Returns:
        Markdown string ready to paste into a README, or the placeholder
        ``"(no cells to render)"`` when ``cells`` is empty. Metrics that are
        absent or non-numeric in a cell render as ``—``. For each metric the
        highest score is bolded; cells that tie with it at the displayed
        three-decimal precision are bolded as well.
    """
    if not cells:
        return "(no cells to render)"

    header = list(_FIXED_HEADERS) + [display for _, display in _METRIC_DISPLAY]

    # First pass: collect the configuration columns and the raw (unrounded)
    # scores, tracking the best raw score per metric.
    config_columns: list[list[str]] = []
    score_columns: list[list[Any]] = []
    best_score: dict[str, float] = {}
    for cell in cells:
        cfg = cell["config"]
        agg = cell["aggregate"]
        config_columns.append(
            [
                # A literal pipe would terminate the Markdown cell early.
                str(cfg.get(key, "?")).replace("|", "\\|")
                for key in ("name", "chunking", "router", "top_k")
            ]
        )
        scores: list[Any] = []
        for metric_key, _display in _METRIC_DISPLAY:
            value = agg.get(metric_key)
            if isinstance(value, (int, float)):
                scores.append(value)
                # NaN never compares greater, so it can never become the best.
                if value > best_score.get(metric_key, float("-inf")):
                    best_score[metric_key] = value
            else:
                scores.append(None)
        score_columns.append(scores)

    lines: list[str] = [
        "| " + " | ".join(header) + " |",
        "|" + "|".join("---" for _ in header) + "|",
    ]

    # Second pass: render scores and bold the winners. The comparison is done
    # on the formatted text: parsing the rounded text back to a float and
    # comparing it with the unrounded best (e.g. 0.877 vs 0.87654) would
    # almost never match, leaving the winner unbolded.
    for config_cells, scores in zip(config_columns, score_columns):
        rendered: list[str] = []
        for (metric_key, _display), value in zip(_METRIC_DISPLAY, scores):
            if value is None:
                rendered.append("—")
                continue
            text = f"{value:.3f}"
            top = best_score.get(metric_key)
            if top is not None and text == f"{top:.3f}":
                text = f"**{text}**"
            rendered.append(text)
        lines.append("| " + " | ".join(config_cells + rendered) + " |")

    return "\n".join(lines)
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        Namespace with ``files`` (positional paths in the order given,
        possibly empty) and ``pattern`` (glob string, ``""`` when omitted).
    """
    cli = argparse.ArgumentParser(
        description="Render a RAGAS evaluation matrix as a Markdown table.",
    )
    # Declared as data and registered in order, so --help lists the
    # positional paths before the option.
    argument_specs = (
        (
            "files",
            {
                "nargs": "*",
                "help": "Result JSON files to include (preserves the order given).",
            },
        ),
        (
            "--pattern",
            {
                "type": str,
                "default": "",
                "help": "Shell glob to match files (sorted by filename).",
            },
        ),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> None:
    """Render the requested cells as a Markdown table to stdout.

    Exits with status 1, after an error message on stderr, when no input
    files were selected or none of them yielded a valid cell.
    """
    args = parse_args()
    # A non-empty --pattern takes precedence over positional paths; glob
    # matches are sorted by filename, positional paths keep their given order.
    selected = sorted(glob.glob(args.pattern)) if args.pattern else list(args.files)

    if selected:
        cells = _load_cells(selected)
        if cells:
            print(_format_table(cells))
            return
        problem = "ERROR: no valid cells loaded."
    else:
        problem = (
            "ERROR: no files provided. Pass file paths as arguments or use --pattern."
        )

    print(problem, file=sys.stderr)
    sys.exit(1)
# Run the CLI only when executed as a script (e.g. via
# ``python -m scripts.render_eval_table``), not when imported as a module.
if __name__ == "__main__":
    main()