"""Verify retrieval quality against golden dataset.

Runs the Day 4 gate check: for each positive golden question,
does hybrid retrieval return the expected source in top-5?

Usage:
    python scripts/verify_retrieval.py
    python scripts/verify_retrieval.py --store-path .cache/store --output docs/retrieval_gate.md
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

# Prepend the directory two levels above this file to the import search path;
# presumably so the `agent_bench` imports below resolve when this file is run
# directly as a script rather than from an installed package.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent_bench.rag.embedder import Embedder
from agent_bench.rag.store import HybridStore


def verify(
    store_path: str = ".cache/store",
    golden_path: str = "agent_bench/evaluation/datasets/tech_docs_golden.json",
    model_name: str = "all-MiniLM-L6-v2",
    cache_dir: str = ".cache/embeddings",
    output_path: str | None = None,
) -> bool:
    """Run the retrieval gate check and report it as a Markdown table.

    Every golden question is embedded and sent through a hybrid top-5 search.
    For questions that list expected sources, recall is the fraction of those
    sources present among the retrieved chunk sources; questions with no
    expected sources are listed as ``N/A`` and excluded from the average.

    Args:
        store_path: Location passed to ``HybridStore.load``.
        golden_path: JSON file holding a list of question objects. Each object
            must provide the keys ``id``, ``question``, ``expected_sources``
            and ``category``.
        model_name: Model name forwarded to ``Embedder``.
        cache_dir: Cache directory forwarded to ``Embedder``.
        output_path: When non-empty, the report is also written to this file
            (parent directories are created as needed).

    Returns:
        ``True`` when the average recall over scorable questions is at least
        0.5, ``False`` otherwise (including when no question is scorable).
    """
    store = HybridStore.load(store_path)
    embedder = Embedder(model_name=model_name, cache_dir=cache_dir)

    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would mis-decode non-ASCII questions on some systems.
    with open(golden_path, encoding="utf-8") as f:
        questions = json.load(f)

    # Query the store statistics once for the header line.
    stats = store.stats()
    lines: list[str] = [
        "# Retrieval Gate Check",
        "",
        f"**Store:** {stats.total_chunks} chunks, {stats.unique_sources} sources",
        "",
        "| ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |",
        "|-----|----------|----------------|---------------|----------|--------|",
    ]

    total_recall = 0.0
    scorable = 0

    for q in questions:
        qid = q["id"]
        question = q["question"]
        expected = set(q["expected_sources"])
        category = q["category"]

        vec = embedder.embed(question)
        results = store.search(vec, question, top_k=5, strategy="hybrid")
        retrieved = [r.chunk.source for r in results]

        if expected:
            recall = len(expected.intersection(retrieved)) / len(expected)
            total_recall += recall
            scorable += 1
            result = "PASS" if recall >= 0.5 else "FAIL"
            expected_str = ", ".join(sorted(expected))
            recall_str = f"{recall:.2f}"
        else:
            # Nothing to match against: the row is shown but not scored.
            result = "N/A"
            expected_str = "(none)"
            recall_str = "n/a"

        # Show every distinct retrieved source in rank order. The column is
        # labelled "Top-5 Sources" and recall is computed over all results, so
        # truncating the display could hide the source that produced a hit.
        retrieved_str = ", ".join(dict.fromkeys(retrieved))
        lines.append(
            f"| {qid} | {category} | {expected_str} | {retrieved_str} | {recall_str} | {result} |"
        )

    # max(..., 1) avoids dividing by zero; with no scorable questions the
    # average is 0.0 and the gate therefore fails.
    avg_recall = total_recall / max(scorable, 1)
    gate_pass = avg_recall >= 0.5

    lines.append("")
    lines.append(f"**Avg Recall@5 (positive only):** {avg_recall:.2f}")
    lines.append(f"**Gate:** {'PASS' if gate_pass else 'FAIL'} (threshold >= 0.5)")

    report = "\n".join(lines)
    print(report)

    if output_path:
        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly so the saved report matches the input encoding.
        destination.write_text(report + "\n", encoding="utf-8")
        print(f"\nSaved to {output_path}")

    return gate_pass


def main() -> None:
    """Parse command-line options, run the gate check, and exit with its status.

    The process exits with status 0 when ``verify`` returns ``True`` and with
    status 1 otherwise. The embedding model and its cache directory can be
    overridden from the command line; their defaults equal those of ``verify``.
    """
    parser = argparse.ArgumentParser(description="Verify retrieval against golden dataset")
    parser.add_argument(
        "--store-path",
        default=".cache/store",
        help="location of the saved store to load",
    )
    parser.add_argument(
        "--golden-path",
        default="agent_bench/evaluation/datasets/tech_docs_golden.json",
        help="JSON file with the golden questions",
    )
    parser.add_argument(
        "--model-name",
        default="all-MiniLM-L6-v2",
        help="embedding model name passed to the embedder",
    )
    parser.add_argument(
        "--cache-dir",
        default=".cache/embeddings",
        help="embedding cache directory passed to the embedder",
    )
    parser.add_argument(
        "--output",
        default="docs/retrieval_gate.md",
        help="file the Markdown report is written to",
    )
    args = parser.parse_args()

    passed = verify(
        store_path=args.store_path,
        golden_path=args.golden_path,
        model_name=args.model_name,
        cache_dir=args.cache_dir,
        output_path=args.output,
    )
    # Non-zero exit status lets callers (e.g. CI) detect a failed gate.
    sys.exit(0 if passed else 1)


# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()