|
|
from __future__ import annotations

import argparse
import json
import shlex
import subprocess
import sys
from pathlib import Path
|
|
|
|
|
|
|
|
def run_cmd(cmd: list[str]) -> None:
    """Echo a command line, then execute it and fail loudly on error.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # shlex.join quotes arguments that contain spaces or shell metacharacters,
    # so the echoed line is unambiguous and can be pasted back into a shell.
    # flush=True keeps the echo ahead of the child's own output when stdout is
    # block-buffered (e.g. redirected to a file or pipe).
    print(">>", shlex.join(cmd), flush=True)
    subprocess.run(cmd, check=True)
|
|
|
|
|
|
|
|
# Recommender variants evaluated for every topn value, in execution order.
_RECOMMENDERS = ("bm25", "vector", "hybrid_rrf")


def _parse_topn_list(raw: str) -> list[int]:
    """Parse the comma-separated ``--topn-list`` value into integers.

    Blank items (e.g. from a trailing comma) are ignored. A non-integer item
    raises ``argparse.ArgumentTypeError`` so argparse reports a usage error
    instead of an uncaught ``ValueError`` traceback.
    """
    values: list[int] = []
    for item in raw.split(","):
        item = item.strip()
        if not item:
            continue
        try:
            values.append(int(item))
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid topn value: {item!r}"
            ) from None
    return values


def _build_eval_cmd(
    args: argparse.Namespace, recommender: str, topn: int, out_dir: Path
) -> list[str]:
    """Build the ``python -m eval.run_eval`` command line for one variant."""
    cmd = [
        sys.executable,
        "-m",
        "eval.run_eval",
        "--catalog",
        args.catalog,
        "--train",
        args.train,
        "--recommender",
        recommender,
    ]
    # Only the vector-backed variants receive the index / id-map / model flags.
    if recommender in ("vector", "hybrid_rrf"):
        cmd += [
            "--vector-index",
            args.vector_index,
            "--assessment-ids",
            args.assessment_ids,
            "--model",
            args.model,
        ]
    cmd += ["--topn-candidates", str(topn)]
    if recommender == "hybrid_rrf":
        cmd += ["--rrf-k", str(args.rrf_k)]
    cmd += ["--out-dir", str(out_dir)]
    return cmd


def _read_summary_row(variant: str, metrics_path: Path) -> dict | None:
    """Load one run's ``metrics.json`` and reduce it to a summary row.

    Returns ``None`` (after a warning on stderr) when the file is missing.
    The file is expected to contain ``train`` and ``val`` objects, each with
    ``recall@10`` and ``mrr@10`` entries; a missing key raises ``KeyError``.
    """
    if not metrics_path.exists():
        print(
            f"warning: {metrics_path} not found; skipping {variant}",
            file=sys.stderr,
        )
        return None
    with open(metrics_path, encoding="utf-8") as f:
        metrics = json.load(f)
    return {
        "variant": variant,
        "train_recall@10": metrics["train"]["recall@10"],
        "val_recall@10": metrics["val"]["recall@10"],
        "train_mrr@10": metrics["train"]["mrr@10"],
        "val_mrr@10": metrics["val"]["mrr@10"],
    }


def main(argv: list[str] | None = None) -> None:
    """Run the retrieval ablation suite and write a JSON summary.

    For every value in ``--topn-list`` the ``bm25``, ``vector`` and
    ``hybrid_rrf`` recommenders are evaluated by invoking ``eval.run_eval``
    as a subprocess, each writing to ``runs/ablation/<variant>_top<topn>``.
    The ``recall@10`` / ``mrr@10`` figures from each run's ``metrics.json``
    are then collected into ``runs/ablation/ablation_summary.json``.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, matching a plain ``main()`` call.

    Raises:
        SystemExit: On invalid command-line arguments (raised by argparse).
        subprocess.CalledProcessError: If any evaluation run fails.
    """
    parser = argparse.ArgumentParser(description="Run ablation suite for retrieval settings.")
    parser.add_argument("--catalog", default="data/catalog_docs.jsonl")
    parser.add_argument("--train", default="data/Gen_AI Dataset.xlsx")
    parser.add_argument("--vector-index", default="data/faiss_index/index.faiss")
    parser.add_argument("--assessment-ids", default="data/embeddings/assessment_ids.json")
    parser.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    parser.add_argument(
        "--topn-list",
        default="50,100,200,400",
        type=_parse_topn_list,  # argparse also applies this to the string default
        help="Comma-separated topn candidates to test",
    )
    parser.add_argument(
        "--rrf-k",
        type=int,
        default=60,
        help="Value forwarded as --rrf-k to the hybrid_rrf runs",
    )
    args = parser.parse_args(argv)

    runs_dir = Path("runs/ablation")
    runs_dir.mkdir(parents=True, exist_ok=True)
    summary = []

    for topn in args.topn_list:
        # Run all three variants first, then collect their metrics, so a
        # failing run aborts before any partial rows for this topn are added.
        for name in _RECOMMENDERS:
            run_cmd(_build_eval_cmd(args, name, topn, runs_dir / f"{name}_top{topn}"))

        for name in _RECOMMENDERS:
            variant = f"{name}_top{topn}"
            row = _read_summary_row(variant, runs_dir / variant / "metrics.json")
            if row is not None:
                summary.append(row)

    summary_path = runs_dir / "ablation_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"Ablation summary written to {summary_path.as_posix()}")
|
|
|
|
|
|
|
|
# Script entry point: run the ablation suite only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|