"""Render an evaluation matrix as a Markdown table from RAGAS result JSONs. Reads one or more ``eval/runs/_.json`` (or ``.rejudged.json``) files, extracts the per-cell ``config`` + ``aggregate`` blocks, and prints a single comparison table suitable for pasting into a README. The cell with the highest score for each metric is bolded in the rendered table, so the "winning" configuration on each axis is visible at a glance. Usage: # Pass file paths directly (preserves the order you provide): python -m scripts.render_eval_table \\ eval/runs/20260409_031715_fixed_react.rejudged.json \\ eval/runs/20260409_031853_recursive_react.rejudged.json \\ eval/runs/20260409_031911_semantic_react.rejudged.json \\ eval/runs/20260409_031933_recursive_pipeline.rejudged.json # Or pass a shell glob via --pattern (sorted by filename): python -m scripts.render_eval_table --pattern 'eval/runs/20260409_03*.rejudged.json' """ import argparse import glob import json import os import sys from typing import Any # Display labels for metric columns, in the order they should appear in the # rendered table. Maps the RAGAS metric column name to a short header. # Metrics not in this map are skipped from the table; the raw JSON still # contains them. _METRIC_DISPLAY: list[tuple[str, str]] = [ ("faithfulness", "Faith"), ("answer_relevancy", "Ans.Rel"), ("llm_context_precision_with_reference", "Ctx.Prec"), ("context_recall", "Ctx.Recall"), ("answer_correctness", "Ans.Corr"), ("factual_correctness(mode=f1)", "Fact.Corr"), ] _FIXED_HEADERS: list[str] = ["Config", "Chunking", "Router", "top_k"] def _load_cells(paths: list[str]) -> list[dict[str, Any]]: """Load and validate the requested result JSON files. Args: paths: List of JSON file paths. Returns: List of cell dicts with keys ``path``, ``config``, ``aggregate``. Files that are missing or malformed are skipped with a warning. """ cells: list[dict[str, Any]] = [] for path in paths: if not os.path.exists(path): print(f"WARN: skipping missing file {path}", file=sys.stderr) continue with open(path, "r", encoding="utf-8") as fh: data = json.load(fh) if "config" not in data or "aggregate" not in data: print( f"WARN: {path} is missing 'config' or 'aggregate' — skipping", file=sys.stderr, ) continue cells.append( { "path": path, "config": data["config"], "aggregate": data["aggregate"], } ) return cells def _format_table(cells: list[dict[str, Any]]) -> str: """Format cells as a Markdown comparison table with per-metric winners bolded. Args: cells: List of cell dicts produced by ``_load_cells``. Returns: Markdown string ready to paste into a README. """ if not cells: return "(no cells to render)" header = list(_FIXED_HEADERS) + [display for _, display in _METRIC_DISPLAY] # Build rows AND track the best score per metric so we can bold the winner. rows: list[list[str]] = [] best_score: dict[str, float] = {} for cell in cells: cfg = cell["config"] agg = cell["aggregate"] row: list[str] = [ str(cfg.get("name", "?")), str(cfg.get("chunking", "?")), str(cfg.get("router", "?")), str(cfg.get("top_k", "?")), ] for metric_key, _display in _METRIC_DISPLAY: value = agg.get(metric_key) if isinstance(value, (int, float)): row.append(f"{value:.3f}") if value > best_score.get(metric_key, float("-inf")): best_score[metric_key] = value else: row.append("—") rows.append(row) # Bold the winning cell for each metric column. for row in rows: for col_idx, (metric_key, _display) in enumerate(_METRIC_DISPLAY): cell_idx = len(_FIXED_HEADERS) + col_idx value_str = row[cell_idx] if value_str == "—": continue if float(value_str) == best_score.get(metric_key): row[cell_idx] = f"**{value_str}**" lines: list[str] = [] lines.append("| " + " | ".join(header) + " |") lines.append("|" + "|".join("---" for _ in header) + "|") for row in rows: lines.append("| " + " | ".join(row) + " |") return "\n".join(lines) def parse_args() -> argparse.Namespace: """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Render a RAGAS evaluation matrix as a Markdown table.", ) parser.add_argument( "files", nargs="*", help="Result JSON files to include (preserves the order given).", ) parser.add_argument( "--pattern", type=str, default="", help="Shell glob to match files (sorted by filename).", ) return parser.parse_args() def main() -> None: """Render the requested cells as a Markdown table to stdout.""" args = parse_args() if args.pattern: files = sorted(glob.glob(args.pattern)) else: files = list(args.files) if not files: print( "ERROR: no files provided. Pass file paths as arguments or use --pattern.", file=sys.stderr, ) sys.exit(1) cells = _load_cells(files) if not cells: print("ERROR: no valid cells loaded.", file=sys.stderr) sys.exit(1) print(_format_table(cells)) if __name__ == "__main__": main()