import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ebooklib import epub
from bs4 import BeautifulSoup
import pdfplumber
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)

from embed_store import get_embeddings, store_embeddings, get_qdrant_client


# --------------------------
# LOAD PDF
# --------------------------

def load_pdf(file_path):
    """Extract a PDF's text as one document dict per page.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of dicts with the keys ``content`` (page text), ``source``
        (the given path), ``book`` (the file's base name) and ``type``
        (always ``"book"``). Pages with no extractable text are left out.
        Any error while reading is printed rather than raised, and whatever
        was collected up to that point is returned.
    """
    page_docs = []
    try:
        with pdfplumber.open(file_path) as pdf:
            page_total = len(pdf.pages)
            print(f"   → PDF has {page_total} pages")

            for idx, page in enumerate(pdf.pages):
                # Progress line on the first page and every 20th after it.
                if idx % 20 == 0:
                    print(f"     Processing page {idx+1}/{page_total}")

                page_text = page.extract_text()
                if not page_text:
                    # Nothing extractable on this page.
                    continue

                page_docs.append(
                    dict(
                        content=page_text,
                        source=file_path,
                        book=os.path.basename(file_path),
                        type="book",
                    )
                )
    except Exception as err:
        print(f"❌ Error reading PDF {file_path}: {err}")

    print(f"   → Extracted {len(page_docs)} pages from PDF")
    return page_docs


# --------------------------
# LOAD EPUB
# --------------------------

def load_epub(file_path):
    """Extract an EPUB's text as one document dict per content section.

    Each document item in the book is parsed as HTML, stripped of
    ``<script>``/``<style>`` tags, and flattened to plain text.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        A list of dicts with the keys ``content`` (section text), ``source``
        (the given path), ``book`` (the file's base name) and ``type``
        (always ``"book"``). Sections of 50 characters or fewer are dropped
        as junk. Failures are printed rather than raised: an item that
        cannot be parsed is skipped, and a book that cannot be opened yields
        an empty list.
    """
    docs = []
    try:
        # The item-type constants are defined on the top-level ``ebooklib``
        # package; looking ITEM_DOCUMENT up on ``ebooklib.epub`` can raise
        # AttributeError, which the per-item handler below would swallow and
        # so skip every section. Imported inside the ``try`` so an import
        # problem is reported like any other load failure.
        import ebooklib

        book = epub.read_epub(file_path)
        book_name = os.path.basename(file_path)
        skipped = 0

        for item in book.get_items():
            try:
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue

                soup = BeautifulSoup(item.get_content(), "lxml")

                # remove scripts/styles
                for tag in soup(["script", "style"]):
                    tag.decompose()

                text = soup.get_text(separator=" ", strip=True)
            except Exception as e:
                # Best effort: one bad item must not abort the whole book,
                # but make the skip visible instead of dropping it silently.
                skipped += 1
                item_name = getattr(item, "file_name", "<unknown>")
                print(f"   ⚠️ Skipped EPUB item {item_name}: {e}")
                continue

            if len(text) > 50:  # filter junk
                docs.append({
                    "content": text,
                    "source": file_path,
                    "book": book_name,
                    "type": "book"
                })

        print(f"   → Extracted {len(docs)} sections from EPUB")
        if skipped:
            print(f"   → Skipped {skipped} unreadable EPUB items")

    except Exception as e:
        print(f"❌ Failed EPUB {file_path}: {e}")

    return docs

# --------------------------
# LOAD ALL BOOKS
# --------------------------

def load_books(folder_path="knowledge"):
    """Load every supported book (EPUB or PDF) found directly in a folder.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered.

    Returns:
        The concatenated list of document dicts produced by ``load_epub``
        and ``load_pdf`` for each supported file, in file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example, it does
            not exist).
    """
    all_docs = []
    # os.listdir returns entries in arbitrary order; sort so progress output
    # and the resulting document order are reproducible between runs.
    files = sorted(os.listdir(folder_path))
    total = len(files)

    print(f"📚 Found {total} files in '{folder_path}'")

    for i, file in enumerate(files, start=1):
        full_path = os.path.join(folder_path, file)

        print(f"\n📖 [{i}/{total}] Loading: {file}")

        # Match extensions case-insensitively so "Book.PDF" / "Novel.EPUB"
        # are not skipped as unsupported.
        lowered = file.lower()

        if lowered.endswith(".epub"):
            docs = load_epub(full_path)

        elif lowered.endswith(".pdf"):
            docs = load_pdf(full_path)

        else:
            print("   → Skipped (unsupported)")
            continue

        all_docs.extend(docs)

    print(f"\n✅ Total extracted documents: {len(all_docs)}")
    return all_docs

# --------------------------
# CHUNKING
# --------------------------

def chunk_documents(documents, chunk_size=500, chunk_overlap=100, min_tail_chars=50):
    """Split document dicts into smaller chunk dicts for embedding.

    Args:
        documents: Sequence of dicts with the keys ``content``, ``source``,
            ``book`` and ``type``.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
        min_tail_chars: A final chunk shorter than this is appended to the
            chunk before it instead of being emitted on its own.

    Returns:
        A list of dicts with the same four keys, where ``content`` is one
        chunk and the other keys are copied from the originating document.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunks = []
    total = len(documents)
    print(f"Chunking {total} documents...")

    for i, doc in enumerate(documents, start=1):
        split_texts = splitter.split_text(doc["content"])

        # Fold a very short trailing piece into the previous chunk so it is
        # kept without becoming a tiny chunk of its own. A lone short chunk
        # has nothing to merge into and is left untouched.
        if len(split_texts) > 1 and len(split_texts[-1]) < min_tail_chars:
            tail = split_texts.pop()
            split_texts[-1] += " " + tail

        print(f"→ Processing doc {i}/{total} | chunks: {len(split_texts)}")

        chunks.extend(
            {
                "content": chunk,
                "source": doc["source"],
                "book": doc["book"],
                "type": doc["type"],
            }
            for chunk in split_texts
        )

    print(f"Total chunks created: {len(chunks)}")
    return chunks

# --------------------------
# MAIN INGEST FUNCTION
# --------------------------

def ingest_books(folder_path="knowledge", collection_name="psychology_books"):
    """Load, chunk, embed and store all books from a folder.

    The ingest is skipped when the target collection already reports
    stored points.

    Args:
        folder_path: Directory containing the EPUB/PDF files to ingest.
        collection_name: Name of the collection to check and to store into.

    Returns:
        None.
    """
    client = get_qdrant_client()

    # ✅ Skip if already ingested
    try:
        info = client.get_collection(collection_name)
    except Exception as e:
        # Best effort: presumably the collection does not exist yet, so fall
        # through to a fresh ingest. Print the reason so a real connection
        # problem is not hidden.
        print(f"Collection '{collection_name}' not available ({e}); ingesting.")
    else:
        # points_count may be missing or None; treat both as "nothing stored".
        if (getattr(info, "points_count", None) or 0) > 0:
            print("Embeddings already exist. Skipping ingest.")
            return

    docs = load_books(folder_path)
    chunks = chunk_documents(docs)

    if not chunks:
        # Nothing was extracted; avoid loading the embedding model and
        # calling the store with an empty batch.
        print("No chunks produced. Nothing to ingest.")
        return

    embeddings = get_embeddings()
    store_embeddings(chunks, embeddings, collection_name)

    print(f"Ingested {len(chunks)} chunks from books.")


# Script entry point: when run directly (not imported), ingest the books
# found in the "knowledge" folder.
if __name__ == "__main__":
    ingest_books("knowledge")