"""
Build or update the ChromaDB vectorstore from philosophical texts.

    python ingest.py           # incremental: skips already-indexed sources
    python ingest.py --rebuild # wipes and rebuilds from scratch
"""

import sys
import time
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import (
    DATA_DIR, VECTORSTORE_DIR,
    EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, SOURCES, DEVICE
)

# URL template for a book's plain-text file; `{id}` is filled with the Gutenberg id.
GUTENBERG_URL = "https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt"
# Number of documents passed to the vectorstore per `add_documents` call.
BATCH_SIZE = 50
# Seconds to sleep between consecutive batches of the same source.
SLEEP_BETWEEN_BATCHES = 2


def download_gutenberg(gutenberg_id: int, title: str) -> str:
    """Fetch the plain-text file of a Project Gutenberg book.

    Args:
        gutenberg_id: Gutenberg ebook id, substituted into ``GUTENBERG_URL``.
        title: Title of the work. Not used by the download itself; kept so
            existing callers keep working.

    Returns:
        The response body as text, or ``""`` when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    url = GUTENBERG_URL.format(id=gutenberg_id)
    print(f"  Downloading {url}")
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        # Best-effort download: report the failure and let the caller skip
        # this source. Only request/HTTP failures are handled here so that
        # unrelated programming errors are not silently hidden.
        print(f"  ERROR: {e}")
        return ""
    return resp.text


def strip_gutenberg_boilerplate(text: str) -> str:
    """Return the body of a Gutenberg text without its header and footer.

    The body starts on the line after the first recognised "START OF ..."
    marker and ends just before the first recognised "END OF ..." marker that
    follows it. If a marker is missing, the corresponding edge of ``text`` is
    used instead.

    Args:
        text: Full text of a Project Gutenberg file.

    Returns:
        The text between the markers, with surrounding whitespace stripped.
    """
    start_markers = [
        "*** START OF THE PROJECT GUTENBERG",
        "***START OF THE PROJECT GUTENBERG",
        "*** START OF THIS PROJECT GUTENBERG",
    ]
    end_markers = [
        "*** END OF THE PROJECT GUTENBERG",
        "***END OF THE PROJECT GUTENBERG",
        "*** END OF THIS PROJECT GUTENBERG",
    ]

    start_idx = 0
    for marker in start_markers:
        idx = text.find(marker)
        if idx != -1:
            newline = text.find("\n", idx)
            # A marker on the last line without a trailing newline has no
            # body after it; do not fall back to offset 0 (which would keep
            # the whole header).
            start_idx = len(text) if newline == -1 else newline + 1
            break

    end_idx = len(text)
    for marker in end_markers:
        # Search only after the body start so an END marker that occurs
        # before the START marker cannot produce an empty slice.
        idx = text.find(marker, start_idx)
        if idx != -1:
            end_idx = idx
            break

    return text[start_idx:end_idx].strip()


def get_embeddings() -> HuggingFaceEmbeddings:
    """Build the HuggingFace embedding model configured in ``config``.

    Documents are encoded with the "document" prompt and queries with the
    "query" prompt; both request normalized embeddings.
    """
    print(f"Loading embedding model on {DEVICE}...")
    document_kwargs = {"prompt_name": "document", "normalize_embeddings": True}
    query_kwargs = {"prompt_name": "query", "normalize_embeddings": True}
    device_kwargs = {"device": DEVICE}
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs=device_kwargs,
        encode_kwargs=document_kwargs,
        query_encode_kwargs=query_kwargs,
    )


def get_indexed_titles(vectorstore: Chroma) -> set[str]:
    """Return the distinct ``title`` metadata values stored in the vectorstore.

    Args:
        vectorstore: Store whose ``get(include=["metadatas"])`` result is read.

    Returns:
        The set of non-empty titles found. Entries with no metadata or no
        title are ignored rather than contributing an empty-string title.
    """
    result = vectorstore.get(include=["metadatas"])
    # Both the list itself and individual entries may be missing/None.
    metadatas = result["metadatas"] or []
    return {m["title"] for m in metadatas if m and m.get("title")}


def ingest_source(source: dict, vectorstore: Chroma, splitter: RecursiveCharacterTextSplitter) -> int:
    """Download one source, cache its cleaned text, and index it in chunks.

    Args:
        source: Mapping with at least the keys ``"gutenberg_id"``,
            ``"title"`` and ``"philosopher"``.
        vectorstore: Store that receives the chunk documents.
        splitter: Text splitter used to cut the cleaned text into chunks.

    Returns:
        The number of chunk documents added, or 0 when the download failed.
    """
    raw = download_gutenberg(source["gutenberg_id"], source["title"])
    if not raw:
        return 0

    cleaned = strip_gutenberg_boilerplate(raw)

    # Cache the cleaned text locally.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    safe_name = f"{source['philosopher']}_{source['title'][:40].replace(' ', '_')}.txt"
    # Characters such as "/" or ":" in a title or name would create a nested
    # or invalid path; map them to "_" so the cache write cannot fail on them.
    # Names that were already valid are left unchanged.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    safe_name = safe_name.translate(unsafe_chars)
    (DATA_DIR / safe_name).write_text(cleaned, encoding="utf-8")

    chunks = splitter.split_text(cleaned)
    docs = [
        Document(
            page_content=chunk,
            metadata={
                "philosopher": source["philosopher"],
                "title": source["title"],
                "source": f"{source['philosopher']} — *{source['title']}*",
            },
        )
        for chunk in chunks
    ]

    # Add in fixed-size batches, pausing between batches but not after the last.
    for i in range(0, len(docs), BATCH_SIZE):
        vectorstore.add_documents(docs[i : i + BATCH_SIZE])
        if i + BATCH_SIZE < len(docs):
            time.sleep(SLEEP_BETWEEN_BATCHES)

    return len(docs)


def main() -> None:
    """Run the ingestion: optionally wipe the store, then index new sources.

    With ``--rebuild`` on the command line the vectorstore directory is
    deleted and recreated and every source is ingested; otherwise sources
    whose title is already present in the store are skipped.
    """
    wipe_first = "--rebuild" in sys.argv

    VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

    embeddings = get_embeddings()
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )

    if wipe_first and VECTORSTORE_DIR.exists():
        import shutil

        # Remove the persisted store before the Chroma client opens it.
        shutil.rmtree(VECTORSTORE_DIR)
        VECTORSTORE_DIR.mkdir()
        print("Vectorstore wiped for rebuild.")

    store = Chroma(
        persist_directory=str(VECTORSTORE_DIR),
        embedding_function=embeddings,
        collection_name="philosophers",
    )

    # After a rebuild nothing is indexed, so there is nothing to skip.
    known_titles = set() if wipe_first else get_indexed_titles(store)

    total_new = 0
    for source in SOURCES:
        print(f"\n[{source['philosopher']}] {source['title']}")

        if source["title"] in known_titles:
            print("  SKIPPED (already indexed)")
            continue

        n = ingest_source(source, store, splitter)
        if n:
            print(f"  -> {n} chunks added")
            total_new += n
        time.sleep(1)

    if not total_new:
        print("\nNothing new to index.")
        return
    print(f"\nDone. {total_new} new chunks added to vectorstore.")


# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    main()