|
|
|
|
|
""" |
|
|
Cross-Track Comparison — loads results from all tracks and produces
comparative tables and charts.

Usage:
    python -m tracks.shared.compare --dataset medqa [--json]
|
|
""" |
|
|
from __future__ import annotations |
|
|
|
|
|
import json |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional |
|
|
|
|
|
|
|
|
# Backend root: three directory levels above this module's resolved location.
BACKEND_DIR = Path(__file__).resolve().parent.parent.parent

# Put the backend root at the front of sys.path, but only if it is not
# already listed (presumably so backend packages import when run directly).
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))

# Results directory for each track id. Track A lives under validation/,
# the remaining tracks under tracks/<name>/.
TRACK_DIRS: Dict[str, Path] = {
    "A": BACKEND_DIR.joinpath("validation", "results"),
    "B": BACKEND_DIR.joinpath("tracks", "rag_variants", "results"),
    "C": BACKEND_DIR.joinpath("tracks", "iterative", "results"),
    "D": BACKEND_DIR.joinpath("tracks", "arbitrated", "results"),
}
|
|
|
|
|
|
|
|
def load_latest_result(track_id: str, dataset: str = "medqa") -> Optional[dict]:
    """Return the parsed JSON of the newest result file for a track and dataset.

    "Newest" is the matching file whose path sorts last; modification times
    are not consulted.

    Args:
        track_id: Key into ``TRACK_DIRS`` (e.g. ``"A"``).
        dataset: Dataset name expected inside the result file names.

    Returns:
        The decoded JSON content, or ``None`` when the track id is unknown,
        its results directory does not exist, or no file matches.
    """
    directory = TRACK_DIRS.get(track_id)
    if not (directory and directory.exists()):
        return None

    # Track A files only need to contain the dataset name; every other track
    # uses a "track<ID>_<dataset>" file-name prefix.
    if track_id == "A":
        glob_pattern = f"*{dataset}*.json"
    else:
        glob_pattern = f"track{track_id}_{dataset}*.json"

    candidates = list(directory.glob(glob_pattern))
    if not candidates:
        return None

    # The greatest path is the one a descending sort would put first.
    newest = max(candidates)
    with newest.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
|
|
|
|
def load_all_results(dataset: str = "medqa") -> Dict[str, Optional[dict]]:
    """Collect the latest result for every track listed in ``TRACK_DIRS``.

    Args:
        dataset: Dataset name passed through to ``load_latest_result``.

    Returns:
        A mapping from track id to that track's latest result, with ``None``
        for tracks where ``load_latest_result`` found nothing.
    """
    latest_by_track: Dict[str, Optional[dict]] = {}
    for track_id in TRACK_DIRS:
        latest_by_track[track_id] = load_latest_result(track_id, dataset)
    return latest_by_track
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Display labels for the known tracks; unknown track ids are shown as-is.
_TRACK_LABELS = {
    "A": "A: Baseline",
    "B": "B: RAG Variants",
    "C": "C: Iterative",
    "D": "D: Arbitrated",
}

# Cell text used whenever a value is absent, negative, or not a number.
_MISSING_CELL = "--"


def _dict_at(container: object, key: str) -> dict:
    """Return ``container[key]`` when both are dicts, otherwise an empty dict."""
    value = container.get(key) if isinstance(container, dict) else None
    return value if isinstance(value, dict) else {}


def _format_ratio(value: object) -> str:
    """Format a non-negative number as a one-decimal percentage, else ``--``."""
    if isinstance(value, (int, float)) and value >= 0:
        return f"{value:.1%}"
    return _MISSING_CELL


def _format_cost(value: object) -> str:
    """Format a non-negative number as dollars with four decimals, else ``--``."""
    if isinstance(value, (int, float)) and value >= 0:
        return f"${value:.4f}"
    return _MISSING_CELL


def _format_row(label: str, data: object) -> str:
    """Render one table row for a track's result payload (or lack of one)."""
    record = data if isinstance(data, dict) else {}

    # Prefer top-level "metrics"; fall back to "summary" -> "metrics". The
    # fallback is only consulted when needed, so an unrelated non-dict
    # "summary" value cannot break a file that has top-level metrics.
    metrics = record.get("metrics")
    if not isinstance(metrics, dict):
        metrics = _dict_at(_dict_at(record, "summary"), "metrics")

    # "parse_success" wins over "pipeline_success" whenever the key is present.
    if "parse_success" in metrics:
        pipeline = metrics["parse_success"]
    else:
        pipeline = metrics.get("pipeline_success")

    # Same lazy precedence for cost: top-level key first, then the nested
    # "cost" section (ignored unless it is actually a dict).
    if "total_cost_usd" in record:
        cost = record["total_cost_usd"]
    else:
        cost = _dict_at(record, "cost").get("total_cost_usd")

    top1 = _format_ratio(metrics.get("top1_accuracy"))
    top3 = _format_ratio(metrics.get("top3_accuracy"))
    mentioned = _format_ratio(metrics.get("mentioned_accuracy"))
    return (
        f"{label:<22} {top1:>7} {top3:>7} {mentioned:>10} "
        f"{_format_ratio(pipeline):>9} {_format_cost(cost):>10}"
    )


def compare_tracks(
    dataset: str = "medqa",
    *,
    results: Optional[Dict[str, Optional[dict]]] = None,
) -> str:
    """
    Generate a comparison table across all tracks.

    Each row shows top-1, top-3 and "mentioned" accuracy, the pipeline/parse
    success rate and the total cost. Any value that is missing, negative or
    not a number is rendered as ``--`` rather than raising.

    Args:
        dataset: Dataset name; used for the title and, when ``results`` is
            not supplied, to load each track's latest result.
        results: Optional pre-loaded mapping of track id to result payload
            (``None`` for a track without results). When omitted, the
            results are loaded with ``load_all_results(dataset)``.

    Returns:
        A formatted text table suitable for console or markdown.
    """
    if results is None:
        results = load_all_results(dataset)

    header = f"{'Track':<22} {'Top-1':>7} {'Top-3':>7} {'Mentioned':>10} {'Pipeline':>9} {'Cost':>10}"
    rule = "-" * len(header)
    lines = [f"\nCross-Track Comparison: {dataset.upper()}", rule, header, rule]
    lines.extend(
        _format_row(_TRACK_LABELS.get(track_id, track_id), data)
        for track_id, data in results.items()
    )
    lines.append(rule)
    return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: print the comparison as a table or as JSON.

    Reads ``--dataset`` (default ``medqa``) and the ``--json`` flag from the
    command line. Without ``--json`` the text table from ``compare_tracks``
    is printed; with it, the loaded results are dumped as indented JSON.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Compare results across experimental tracks")
    cli.add_argument("--dataset", default="medqa", help="Dataset to compare (default: medqa)")
    cli.add_argument("--json", action="store_true", help="Output as JSON instead of table")
    options = cli.parse_args()

    if not options.json:
        print(compare_tracks(options.dataset))
        return

    # Tracks for which nothing was loaded are left out of the JSON output.
    available = {}
    for track_id, payload in load_all_results(options.dataset).items():
        if payload is not None:
            available[track_id] = payload
    print(json.dumps(available, indent=2))
|
|
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script or via ``-m``.
if __name__ == "__main__":
    main()
|
|
|