Spaces:
Running
Running
File size: 5,748 Bytes
a493f04 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | """Render an evaluation matrix as a Markdown table from RAGAS result JSONs.
Reads one or more ``eval/runs/<ts>_<cell>.json`` (or ``.rejudged.json``)
files, extracts the per-cell ``config`` + ``aggregate`` blocks, and prints a
single comparison table suitable for pasting into a README.
The cell with the highest score for each metric is bolded in the rendered
table, so the "winning" configuration on each axis is visible at a glance.
Usage:
# Pass file paths directly (preserves the order you provide):
python -m scripts.render_eval_table \\
eval/runs/20260409_031715_fixed_react.rejudged.json \\
eval/runs/20260409_031853_recursive_react.rejudged.json \\
eval/runs/20260409_031911_semantic_react.rejudged.json \\
eval/runs/20260409_031933_recursive_pipeline.rejudged.json
# Or pass a shell glob via --pattern (sorted by filename):
python -m scripts.render_eval_table --pattern 'eval/runs/20260409_03*.rejudged.json'
"""
import argparse
import glob
import json
import os
import sys
from typing import Any
# Display labels for metric columns, in the order they should appear in the
# rendered table. Maps the RAGAS metric column name to a short header.
# Metrics not in this map are skipped from the table; the raw JSON still
# contains them.
_METRIC_DISPLAY: list[tuple[str, str]] = [
("faithfulness", "Faith"),
("answer_relevancy", "Ans.Rel"),
("llm_context_precision_with_reference", "Ctx.Prec"),
("context_recall", "Ctx.Recall"),
("answer_correctness", "Ans.Corr"),
("factual_correctness(mode=f1)", "Fact.Corr"),
]
_FIXED_HEADERS: list[str] = ["Config", "Chunking", "Router", "top_k"]
def _load_cells(paths: list[str]) -> list[dict[str, Any]]:
"""Load and validate the requested result JSON files.
Args:
paths: List of JSON file paths.
Returns:
List of cell dicts with keys ``path``, ``config``, ``aggregate``.
Files that are missing or malformed are skipped with a warning.
"""
cells: list[dict[str, Any]] = []
for path in paths:
if not os.path.exists(path):
print(f"WARN: skipping missing file {path}", file=sys.stderr)
continue
with open(path, "r", encoding="utf-8") as fh:
data = json.load(fh)
if "config" not in data or "aggregate" not in data:
print(
f"WARN: {path} is missing 'config' or 'aggregate' — skipping",
file=sys.stderr,
)
continue
cells.append(
{
"path": path,
"config": data["config"],
"aggregate": data["aggregate"],
}
)
return cells
def _format_table(cells: list[dict[str, Any]]) -> str:
"""Format cells as a Markdown comparison table with per-metric winners bolded.
Args:
cells: List of cell dicts produced by ``_load_cells``.
Returns:
Markdown string ready to paste into a README.
"""
if not cells:
return "(no cells to render)"
header = list(_FIXED_HEADERS) + [display for _, display in _METRIC_DISPLAY]
# Build rows AND track the best score per metric so we can bold the winner.
rows: list[list[str]] = []
best_score: dict[str, float] = {}
for cell in cells:
cfg = cell["config"]
agg = cell["aggregate"]
row: list[str] = [
str(cfg.get("name", "?")),
str(cfg.get("chunking", "?")),
str(cfg.get("router", "?")),
str(cfg.get("top_k", "?")),
]
for metric_key, _display in _METRIC_DISPLAY:
value = agg.get(metric_key)
if isinstance(value, (int, float)):
row.append(f"{value:.3f}")
if value > best_score.get(metric_key, float("-inf")):
best_score[metric_key] = value
else:
row.append("—")
rows.append(row)
# Bold the winning cell for each metric column.
for row in rows:
for col_idx, (metric_key, _display) in enumerate(_METRIC_DISPLAY):
cell_idx = len(_FIXED_HEADERS) + col_idx
value_str = row[cell_idx]
if value_str == "—":
continue
if float(value_str) == best_score.get(metric_key):
row[cell_idx] = f"**{value_str}**"
lines: list[str] = []
lines.append("| " + " | ".join(header) + " |")
lines.append("|" + "|".join("---" for _ in header) + "|")
for row in rows:
lines.append("| " + " | ".join(row) + " |")
return "\n".join(lines)
def parse_args() -> argparse.Namespace:
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(
description="Render a RAGAS evaluation matrix as a Markdown table.",
)
parser.add_argument(
"files",
nargs="*",
help="Result JSON files to include (preserves the order given).",
)
parser.add_argument(
"--pattern",
type=str,
default="",
help="Shell glob to match files (sorted by filename).",
)
return parser.parse_args()
def main() -> None:
"""Render the requested cells as a Markdown table to stdout."""
args = parse_args()
if args.pattern:
files = sorted(glob.glob(args.pattern))
else:
files = list(args.files)
if not files:
print(
"ERROR: no files provided. Pass file paths as arguments or use --pattern.",
file=sys.stderr,
)
sys.exit(1)
cells = _load_cells(files)
if not cells:
print("ERROR: no valid cells loaded.", file=sys.stderr)
sys.exit(1)
print(_format_table(cells))
if __name__ == "__main__":
main()
|