import os

from langchain_core.documents import Document
import pdfplumber

from ingestion.loaders.normalization import normalize_text


def load_pdf(file_path: str):
    """Load a PDF into one LangChain ``Document`` per page.

    Each page's extracted text is passed through ``normalize_text`` and
    stripped; every table found on the page is then appended as a
    tab-separated block under a ``=== Table N (Page P) ===`` header.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        A list of ``Document`` objects whose metadata holds ``source``
        (the file's base name) and ``page_number`` (1-based). Pages that
        fail to extract are skipped. An empty list is returned when the
        file cannot be opened or read as a PDF.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    documents = []

    # Fail loudly on a missing file; only parsing problems are best-effort.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    source_name = os.path.basename(file_path)

    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                try:
                    text = normalize_text(page.extract_text() or "")
                    tables = page.extract_tables() or []

                    # Page text first, then each table in extraction order.
                    # The (possibly empty) text is always kept as the first
                    # section so the joined result matches plain
                    # concatenation with a "\n\n" separator.
                    sections = [text.strip()]
                    for t_idx, table in enumerate(tables, start=1):
                        table_text = "\n".join(
                            "\t".join(cell if cell else "" for cell in row)
                            for row in table
                        )
                        table_text = normalize_text(table_text)
                        sections.append(
                            f"=== Table {t_idx} (Page {page_num}) ===\n{table_text}"
                        )

                    documents.append(
                        Document(
                            page_content="\n\n".join(sections),
                            metadata={
                                "source": source_name,
                                "page_number": page_num,
                            },
                        )
                    )
                except Exception as e:
                    # Skip corrupted pages, process others.
                    print(f"Error extracting page {page_num}: {e}")
    except Exception as e:
        print(f"Failed to open or read PDF file: {file_path}")
        print(f"Error: {e}")
        return []  # Return empty list instead of crashing

    return documents


def load_pdf_with_pages(file_path: str):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        A list of dicts, one per page in document order, each with the
        keys ``"page"`` (1-based page number) and ``"text"`` (the result
        of ``page.get_text()``).
    """
    # Imported lazily so PyMuPDF is only required when this loader is used.
    import fitz

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return [
            {"page": page_number, "text": page.get_text()}
            for page_number, page in enumerate(doc, start=1)
        ]