#!/usr/bin/env python3
"""
Ingest GAIA questions into ChromaDB for RAG retrieval.

Each document stores the question text, a "Possible answer" hint (the
ground-truth "Final answer" field, stored verbatim), and task metadata
(tools used, number of steps, number of tools).
The possible answer gives the agent a useful starting point while the tool
description makes clear it must verify via web/wiki/arxiv search.

Usage:
    python ingest.py
    python ingest.py --questions path/to/file.jsonl
"""
import argparse
import json
from pathlib import Path

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

# Default JSON Lines input; used when --questions is not given on the command line.
QUESTIONS_FILE = Path("gaia_questions.jsonl")
# Directory handed to Chroma as its persist_directory.
CHROMA_DIR = "./chroma_db"
# Name of the Chroma collection that holds the ingested questions.
COLLECTION_NAME = "gaia_questions"
# Model name passed to HuggingFaceEmbeddings for embedding the documents.
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"


def load_questions(path: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed record per non-blank line.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        The decoded JSON value of every line that is not empty or
        whitespace-only, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly instead of relying on the platform's locale encoding.
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which json.loads would otherwise reject on the first line.
    with open(path, encoding="utf-8-sig") as f:
        return [json.loads(line) for line in f if line.strip()]


def _blank_if_none(value):
    """Return ``""`` in place of ``None``; pass any other value through."""
    return "" if value is None else value


def make_document(q: dict) -> Document:
    """Build a LangChain Document from a GAIA question record.

    Stored fields:
      - page_content : question text, followed by a "Possible answer" line
                       when the record has a non-empty "Final answer"
      - metadata     : task_id, level, file_name, tools, n_steps, n_tools
                       and the possible answer

    Optional fields that are missing or explicitly null in the record are
    stored as empty strings.

    Args:
        q: One decoded question record. Must contain "Question", "task_id"
           and "Level".

    Returns:
        The Document for this record.

    Raises:
        KeyError: If a required key is absent from ``q``.
    """
    # A `.get(key, {})` default only covers a missing key; a JSON null would
    # come back as None and break the `.get` calls below.
    meta = q.get("Annotator Metadata") or {}
    possible_answer = _blank_if_none(q.get("Final answer"))

    page_content = q["Question"]
    if possible_answer:
        page_content += f"\n\nPossible answer: {possible_answer}"

    # Nulls are normalised to "" so they do not reach the store as None (or
    # as the literal string "None" after str()).
    # NOTE(review): Chroma may reject None metadata values - confirm for the
    # installed version.
    return Document(
        page_content=page_content,
        metadata={
            "task_id": q["task_id"],
            "level": q["Level"],
            "file_name": _blank_if_none(q.get("file_name")),
            "tools": _blank_if_none(meta.get("Tools")),
            "n_steps": str(_blank_if_none(meta.get("Number of steps"))),
            "n_tools": str(_blank_if_none(meta.get("Number of tools"))),
            "possible_answer": possible_answer,
        },
    )


def main(questions_file: Path = QUESTIONS_FILE) -> None:
    """Replace the contents of the Chroma collection with the given questions.

    Loads the records, converts them to documents, removes every document
    currently in the collection, then adds the new documents keyed by
    ``task_id``. Progress is reported on stdout.

    Args:
        questions_file: Path to the JSON Lines file of question records.
    """
    print(f"Loading questions from {questions_file} ...")
    questions = load_questions(questions_file)
    print(f"Loaded {len(questions)} questions.")

    # Convert every record before touching the store: a malformed record
    # then aborts the run while the existing collection is still intact,
    # instead of after it has been wiped.
    docs = [make_document(q) for q in questions]
    ids = [q["task_id"] for q in questions]

    print(f"Initialising embeddings ({EMBED_MODEL}) ...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    print(f"Connecting to ChromaDB at {CHROMA_DIR!r} ...")
    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
        persist_directory=CHROMA_DIR,
    )

    # Wipe and re-ingest so re-runs are idempotent
    existing_ids = vector_store.get()["ids"]
    if existing_ids:
        print(f"Removing {len(existing_ids)} existing documents ...")
        vector_store.delete(existing_ids)

    print(f"Ingesting {len(docs)} documents ...")
    # Skip the call for an empty batch rather than rely on the store
    # accepting empty document/id lists.
    if docs:
        vector_store.add_documents(documents=docs, ids=ids)

    # NOTE(review): `_collection` is a private attribute of the wrapper;
    # kept because it is what the original used to report the final size.
    count = vector_store._collection.count()
    print(f"Done. '{COLLECTION_NAME}' now contains {count} documents.")


if __name__ == "__main__":
    # Script entry point: read the optional --questions path and run the ingest.
    cli = argparse.ArgumentParser(description="Ingest GAIA questions into ChromaDB.")
    cli.add_argument(
        "--questions",
        default=QUESTIONS_FILE,
        type=Path,
        help=f"Path to the .jsonl questions file (default: {QUESTIONS_FILE})",
    )
    main(cli.parse_args().questions)