#!/usr/bin/env python3
"""Build a section-aware Chroma vector store from pre-processed legal .txt files.

Intended to be Spaces-friendly: it reads processed text committed to the repo,
chunks it, tags each chunk with document-type and section metadata, and persists
the embeddings to a local Chroma collection named "legal_documents".
"""
import os
import re
import shutil
import sys
from pathlib import Path

from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings


def extract_section_reference(text: str) -> str:
    # Pull the first statutory reference (Section/Article/Part/Chapter) out of a chunk.
    patterns = [
        r"(Section\s+\d+[A-Za-z0-9\-]*)",
        r"(Article\s+\d+[A-Za-z0-9\-]*)",
        r"(Part\s+[IVXLC]+)",
        r"(Chapter\s+\d+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.IGNORECASE)
        if m:
            return m.group(1).strip()
    return "Unknown Section"


def _discover_processed_dirs(project_root: Path):
    # Processed text may live in either location depending on how the repo is laid out.
    candidates = [
        project_root / "data" / "processed",
        project_root / "src" / "data" / "processed",
    ]
    return [p for p in candidates if p.exists()]


def main():
    print("=== INGEST: Section-aware build (Spaces-friendly) ===")
    project_root = Path(__file__).resolve().parent.parent
    print(f"[dbg] project_root: {project_root}")
    load_dotenv()

    processed_dirs = _discover_processed_dirs(project_root)
    if not processed_dirs:
        print("ERROR: No processed directories found.")
        print("Expected one of: ./data/processed or ./src/data/processed")
        sys.exit(1)

    # Collect every processed .txt file across the discovered directories.
    text_files = []
    for d in processed_dirs:
        text_files += list(d.glob("*.txt"))
    text_files = sorted(text_files)
    if not text_files:
        print("ERROR: No .txt files found in processed directories.")
        print("Make sure you committed your processed text files to the repo.")
        sys.exit(1)

    print(f"Found {len(text_files)} processed files:")
    for f in text_files:
        try:
            rel = f.relative_to(project_root)
        except Exception:
            rel = f
        print(" -", rel)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=["\n\n", "\n", ". ", " "],
    )

    docs = []
    for tf in text_files:
        try:
            content = tf.read_text(encoding="utf-8")
        except Exception as e:
            print(f"[warn] Could not read {tf}: {e}")
            continue
        if not content.strip():
            print(f"[warn] Empty file, skipping: {tf}")
            continue

        chunks = splitter.split_text(content)
        base = tf.stem
        source_pdfish = base.replace("_text", "").replace("_TXT", "")

        # Classify the source document by filename so chunks can be filtered later.
        lowname = tf.name.lower()
        if "constitution" in lowname:
            doc_type = "constitution"
        elif "labour" in lowname:
            doc_type = "labour_law"
        elif "fccpa" in lowname:
            doc_type = "consumer_protection"
        elif "data_protection" in lowname or "ndpr" in lowname:
            doc_type = "data_protection"
        else:
            doc_type = "general"

        for i, ch in enumerate(chunks):
            ch = ch.strip()
            if len(ch) < 25:
                # Skip fragments too short to carry meaning.
                continue
            section = extract_section_reference(ch)
            docs.append(
                Document(
                    page_content=ch,
                    metadata={
                        "document_type": doc_type,
                        "section": section,
                        "source": source_pdfish,
                        "chunk_index": i,
                        "total_chunks": len(chunks),
                        "file_path": str(tf.relative_to(project_root)),
                        "content_length": len(ch),
                    },
                )
            )

    if not docs:
        print("ERROR: No chunks prepared. Check your .txt content.")
        sys.exit(1)
    print(f"Prepared {len(docs)} chunks total.")

    print("Initializing embeddings...")
    embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"})
    test = embed.embed_query("hello")
    print(f"[dbg] embedding dim: {len(test)}")

    # Rebuild the vector store from scratch on every run.
    persist_dir = Path(os.getenv("VECTOR_DB_DIR", "vector_db"))
    if persist_dir.exists():
        shutil.rmtree(persist_dir)
        print("[dbg] removed existing vector_db")
    persist_dir.mkdir(parents=True, exist_ok=True)

    print(f"Building Chroma at: {persist_dir}")
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embed,
        persist_directory=str(persist_dir),
        collection_name="legal_documents",
    )

    count = vectordb._collection.count()
    print(f"✅ Ingestion complete. Stored {count} chunks in 'legal_documents'.")
    if count == 0:
        print("ERROR: Zero chunks after build. Investigate your input files.")
        sys.exit(1)


if __name__ == "__main__":
    main()