import os
from langchain_core.documents import Document
import pdfplumber
from ingestion.loaders.normalization import normalize_text

def load_pdf(file_path: str):
    """Read a PDF with pdfplumber and build one ``Document`` per page.

    For every page the extracted text is normalized and stripped, then each
    table found on that page is appended after the text as tab-separated
    rows under a ``=== Table N (Page P) ===`` banner. Note that tables are
    appended after the page text, not interleaved at their original position.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of ``Document`` objects whose metadata carries the file's base
        name (``source``) and the 1-based ``page_number``. A page whose
        extraction raises is reported on stdout and skipped. If opening or
        iterating the PDF itself fails, the error is reported on stdout and
        an empty list is returned.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Guard first: a missing file is the only condition surfaced to callers.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    collected = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                try:
                    body = normalize_text(page.extract_text() or "")
                    page_tables = page.extract_tables() or []

                    # Page text first, then one banner-headed section per table.
                    sections = [body.strip()]
                    for t_idx, table in enumerate(page_tables, start=1):
                        rows = [
                            "\t".join(cell if cell else "" for cell in row)
                            for row in table
                        ]
                        rendered = normalize_text("\n".join(rows))
                        sections.append(
                            f"=== Table {t_idx} (Page {page_num}) ===\n{rendered}"
                        )

                    page_metadata = {
                        "source": os.path.basename(file_path),
                        "page_number": page_num,
                    }
                    collected.append(
                        Document(
                            page_content="\n\n".join(sections),
                            metadata=page_metadata,
                        )
                    )
                except Exception as e:
                    # Best effort: one bad page must not abort the whole file.
                    print(f"Error extracting page {page_num}: {e}")
    except Exception as e:
        # Failure at the file level: report it and hand back nothing.
        print(f"Failed to open or read PDF file: {file_path}")
        print(f"Error: {e}")
        return []

    return collected


def load_pdf_with_pages(file_path: str) -> list:
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, in document order, each of the form
        ``{"page": <1-based page number>, "text": <page.get_text() result>}``.

    Any exception raised by ``fitz.open`` or ``page.get_text`` propagates to
    the caller; the document is closed either way.
    """
    # PyMuPDF; kept function-local so importing this module does not require it.
    import fitz

    # Use the document as a context manager so the underlying file is closed
    # even if text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return [
            {"page": page_no, "text": page.get_text()}
            for page_no, page in enumerate(doc, start=1)
        ]