import os

from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document

from embeddings.embedding_model import embedding_model
from rag.chunking import split_documents
from config import VECTOR_DB_PATH

# Presumably the default folder for source PDFs.
# NOTE(review): nothing in the code visible here references this constant;
# build_vector_store() walks the whole "documents" tree instead — confirm
# whether other modules import it before removing.
PDF_FOLDER = "documents/pdfs"


def _ocr_pdf(pdf_path: str) -> list[Document]:
    """
    Fallback OCR using PyMuPDF (fitz) + Tesseract.
    No Poppler required — PyMuPDF handles PDF-to-image conversion natively.
    Requires: pip install pymupdf pytesseract pillow
              Tesseract installed: winget install UB-Mannheim.TesseractOCR
    """
    try:
        import fitz  # PyMuPDF
        import pytesseract
        from PIL import Image
        import io
    except ImportError as e:
        print(f"  [OCR] Missing package: {e}. Run: pip install pymupdf pytesseract pillow")
        return []

    print(f"  [OCR] Running Tesseract OCR on: {os.path.basename(pdf_path)}")

    try:
        pdf_doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"  [OCR] Could not open PDF with PyMuPDF: {e}")
        return []

    documents = []
    for i, page in enumerate(pdf_doc):
        # Render page to image at 200 DPI
        matrix = fitz.Matrix(200 / 72, 200 / 72)
        pix = page.get_pixmap(matrix=matrix)
        img_bytes = pix.tobytes("png")
        img = Image.open(io.BytesIO(img_bytes))

        try:
            text = pytesseract.image_to_string(img)
        except Exception as e:
            print(f"  [OCR] Tesseract failed on page {i+1}: {e}")
            print("  [OCR] Is Tesseract installed? Run: winget install UB-Mannheim.TesseractOCR")
            break

        char_count = len(text.strip())
        print(f"  [OCR] Page {i+1}: extracted {char_count} chars")

        if text.strip():
            documents.append(Document(
                page_content=text,
                metadata={"source": pdf_path, "page": i}
            ))

    pdf_doc.close()
    return documents


def _load_pdf(pdf_path: str) -> list[Document]:
    """
    Load a PDF using PyPDFLoader. If all pages are empty (scanned PDF) or loader fails,
    automatically fall back to Tesseract OCR.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        The pages that contain non-blank text when PyPDFLoader extracts any
        (blank pages are dropped); otherwise whatever the OCR fallback
        returns, which may be an empty list.
    """
    filename = os.path.basename(pdf_path)

    try:
        docs = PyPDFLoader(pdf_path).load()
    except Exception as e:
        # Loader failure already announces the OCR attempt; do not also print
        # the "no text extracted" warning, which describes a different case.
        print(f"  [WARNING] PyPDFLoader failed to read '{filename}': {e} — attempting OCR...")
    else:
        print(f"Loaded {len(docs)} page(s) from {filename}")

        # Check if any page has real text
        text_docs = [d for d in docs if d.page_content.strip()]
        if text_docs:
            return text_docs

        # Loader succeeded but every page was blank → likely a scanned PDF.
        print(f"  [WARNING] No text extracted from '{filename}' — attempting OCR...")

    ocr_docs = _ocr_pdf(pdf_path)

    if not ocr_docs:
        print(
            f"  [ERROR] OCR also failed for '{filename}'.\n"
            "  Make sure Tesseract is installed and on PATH:\n"
            "    winget install UB-Mannheim.TesseractOCR\n"
        )

    return ocr_docs

def _load_docx(docx_path: str) -> list[Document]:
    """
    Load a Word (.docx) file and return a list of Documents.
    """
    from docx import Document as DocxDocument
    filename = os.path.basename(docx_path)
    print(f"Loading Word document: {filename}")
    try:
        doc = DocxDocument(docx_path)
        full_text = []
        for para in doc.paragraphs:
            if para.text.strip():
                full_text.append(para.text.strip())
        
        # Also extract text from tables
        for table in doc.tables:
            for row in table.rows:
                row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                if row_text:
                    full_text.append(" | ".join(row_text))

        text_content = "\n".join(full_text)
        if text_content.strip():
            return [Document(
                page_content=text_content,
                metadata={"source": docx_path}
            )]
    except Exception as e:
        print(f"  [ERROR] Failed to read Word file '{filename}': {e}")
    return []


def _load_excel(excel_path: str) -> list[Document]:
    """
    Load an Excel (.xlsx, .xls) file using pandas and openpyxl,
    returning a text representation of the tables.
    """
    import pandas as pd
    filename = os.path.basename(excel_path)
    print(f"Loading Excel spreadsheet: {filename}")
    try:
        with pd.ExcelFile(excel_path) as xls:
            documents = []
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)
                if df.empty:
                    continue
                
                # Convert sheet to string representation
                sheet_text = f"Sheet: {sheet_name}\n"
                sheet_text += df.to_string(index=False)
                
                if sheet_text.strip():
                    documents.append(Document(
                        page_content=sheet_text,
                        metadata={"source": excel_path, "sheet": sheet_name}
                    ))
            return documents
    except Exception as e:
        print(f"  [ERROR] Failed to read Excel file '{filename}': {e}")
    return []


def load_any_document(file_path: str) -> list[Document]:
    """
    Dispatch a file to the matching loader based on its extension.

    The extension is compared case-insensitively. ".pdf" goes to the PDF
    loader, ".docx" to the Word loader and ".xlsx"/".xls" to the Excel
    loader; any other extension (or none) yields an empty list.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix in (".xlsx", ".xls"):
        return _load_excel(file_path)
    if suffix == ".docx":
        return _load_docx(file_path)
    if suffix == ".pdf":
        return _load_pdf(file_path)

    # Unsupported file type: nothing to load.
    return []


def build_vector_store():
    """
    Build a FAISS index from every supported file under "documents/".

    The folder is created when it does not exist. All files below it are
    visited recursively, skipping names that start with "~$" (Office
    temp/lock files such as ~$Doc.docx). The loaded documents are chunked,
    embedded, indexed and saved to VECTOR_DB_PATH.

    Returns:
        The new FAISS vector store, or None when no documents could be
        loaded or chunking produced nothing.
    """
    source_root = "documents"
    if not os.path.exists(source_root):
        os.makedirs(source_root)

    collected = []
    for current_dir, _subdirs, filenames in os.walk(source_root):
        for name in filenames:
            if name.startswith("~$"):
                # Office lock/temp file, not a real document.
                continue
            loaded = load_any_document(os.path.join(current_dir, name))
            collected.extend(loaded or [])

    if not collected:
        print("No documents found or extracted in documents/ folder.")
        return None

    pieces = split_documents(collected)
    if not pieces:
        print("Chunking produced no results. Check CHUNK_SIZE / CHUNK_OVERLAP in config.py.")
        return None

    print(f"Building FAISS index from {len(pieces)} chunk(s)...")

    store = FAISS.from_documents(pieces, embedding_model)
    store.save_local(VECTOR_DB_PATH)

    print(f"FAISS vector database created and saved to '{VECTOR_DB_PATH}'.")
    return store


def load_vector_store():
    """
    Return the FAISS vector store saved under VECTOR_DB_PATH.

    When "<VECTOR_DB_PATH>/index.faiss" exists the saved index is loaded with
    the module's embedding model. Otherwise a new store is built via
    build_vector_store(), whose result (possibly None) is returned.
    """
    faiss_index = os.path.join(VECTOR_DB_PATH, "index.faiss")

    if os.path.exists(faiss_index):
        # NOTE(review): allow_dangerous_deserialization=True permits loading
        # pickled data saved next to the index; only safe while the
        # VECTOR_DB_PATH directory is written exclusively by trusted code.
        return FAISS.load_local(
            VECTOR_DB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True
        )

    print("FAISS index not found.")
    print("Creating new vector database...")
    return build_vector_store()