"""Embed all documents and persist to a Chroma collection at data/chroma/.

Run once. Re-run when the corpus or the embedding model changes.

Uses the same LangChain HuggingFaceEmbeddings wrapper as the live retrieval
path (`recsys.pipeline.get_embeddings`), so document and query vectors live
in the same space.
"""
from __future__ import annotations

import json
from pathlib import Path

import chromadb
from tqdm import tqdm

from recsys.pipeline import get_embeddings

# Repository root: two directory levels above this script.
ROOT = Path(__file__).resolve().parent.parent
# JSONL corpus read by main(); one JSON document per line.
DOCS = ROOT.joinpath("data", "documents.jsonl")
# Directory handed to the persistent Chroma client.
DB_DIR = ROOT.joinpath("data", "chroma")
# Name of the collection that main() drops and recreates.
COLLECTION = "shl_baseline"


def main() -> None:
    """Rebuild the Chroma collection from the JSONL corpus.

    Parses every non-blank line of ``DOCS`` as a JSON object, drops any
    existing ``COLLECTION`` stored under ``DB_DIR``, recreates it with
    cosine distance, and adds the documents in fixed-size batches together
    with their embeddings.

    Each record must provide the keys ``"id"``, ``"text"`` and
    ``"metadata"``.

    Raises:
        OSError: if ``DOCS`` cannot be opened or read.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    # Iterate the file object instead of str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029/U+0085, which may legally appear unescaped
    # inside a JSON string and would cut a record in half.
    with DOCS.open(encoding="utf-8") as fh:
        docs = [json.loads(line) for line in fh if line.strip()]
    print(f"loaded {len(docs)} documents")

    embedder = get_embeddings()
    # Placeholder when the embedder object exposes no model_name attribute.
    model_name = getattr(embedder, "model_name", "unknown")
    print(f"embedding model: {model_name}")

    client = chromadb.PersistentClient(path=str(DB_DIR))
    try:
        client.delete_collection(COLLECTION)
    except Exception as exc:
        # Best-effort reset: the exception type raised for a missing
        # collection differs between chromadb releases, so the catch stays
        # broad, but the reason is reported rather than silently dropped.
        # If the collection does still exist, create_collection below fails.
        print(
            f"no previous collection '{COLLECTION}' dropped "
            f"({type(exc).__name__}: {exc})"
        )
    coll = client.create_collection(
        name=COLLECTION,
        metadata={"hnsw:space": "cosine", "embed_model": model_name},
    )

    batch_size = 64
    for start in tqdm(range(0, len(docs), batch_size), desc="embed+index"):
        chunk = docs[start : start + batch_size]
        ids = [d["id"] for d in chunk]
        texts = [d["text"] for d in chunk]
        metas = [d["metadata"] for d in chunk]
        # embed_documents (no query prefix) for indexing
        embs = embedder.embed_documents(texts)
        coll.add(ids=ids, embeddings=embs, metadatas=metas, documents=texts)

    indexed = coll.count()
    print(f"\ncollection '{COLLECTION}' has {indexed} items at {DB_DIR}")
    if indexed != len(docs):
        # NOTE(review): presumably caused by ids repeated across batches,
        # which Chroma is documented to skip without raising -- confirm.
        print(
            f"warning: loaded {len(docs)} documents but indexed {indexed}; "
            "check the corpus for duplicate ids"
        )


# Run the indexing job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()