EXAM_RAG_API / ingestion /loaders /pdf_loader.py
MinaNasser's picture
1st
1bc3f18
import os
from langchain_core.documents import Document
import pdfplumber
from ingestion.loaders.normalization import normalize_text
def load_pdf(file_path: str):
    """Load a PDF with pdfplumber and return one LangChain Document per page.

    Each page's content is its extracted text (passed through
    ``normalize_text`` and stripped), followed by every table found on the
    page. A table is rendered as tab-separated cells, one row per line, under
    a ``=== Table <i> (Page <n>) ===`` header, and is also passed through
    ``normalize_text``.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A list of ``Document`` objects whose metadata holds ``source`` (the
        file's base name) and ``page_number`` (1-based). Pages that fail to
        extract are skipped. If the file cannot be opened or read at all, an
        empty list is returned.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Guard clause: a missing file is the only condition reported by raising.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    collected = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                try:
                    raw_text = page.extract_text() or ""
                    cleaned_text = normalize_text(raw_text)
                    page_tables = page.extract_tables() or []

                    # Page text first, then each table in extraction order.
                    sections = [cleaned_text.strip()]
                    for t_idx, table in enumerate(page_tables, start=1):
                        rendered_rows = [
                            "\t".join(cell or "" for cell in row)
                            for row in table
                        ]
                        table_text = normalize_text("\n".join(rendered_rows))
                        sections.append(
                            f"\n\n=== Table {t_idx} (Page {page_num}) ===\n{table_text}"
                        )

                    page_document = Document(
                        page_content="".join(sections),
                        metadata={
                            "source": os.path.basename(file_path),
                            "page_number": page_num,
                        },
                    )
                    collected.append(page_document)
                except Exception as e:
                    # Best effort: report the failing page and keep going.
                    print(f"Error extracting page {page_num}: {e}")
                    continue
    except Exception as e:
        # Any failure outside the per-page handler discards partial results.
        print(f"Failed to open or read PDF file: {file_path}")
        print(f"Error: {e}")
        return []

    return collected
def load_pdf_with_pages(file_path: str):
    """Extract the text of every page of a PDF using PyMuPDF (``fitz``).

    Args:
        file_path: Path to the PDF file, passed directly to ``fitz.open``.

    Returns:
        A list of dicts, one per page in document order, each with the keys
        ``"page"`` (1-based page number) and ``"text"`` (the result of
        ``page.get_text()``).

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; no errors are
        caught here.
    """
    # Imported at call time, so fitz is only required when this loader is used.
    import fitz

    # The context manager closes the document (and its file handle) on both
    # the normal path and when a page raises; previously it was never closed.
    with fitz.open(file_path) as doc:
        return [
            {"page": page_number, "text": page.get_text()}
            for page_number, page in enumerate(doc, start=1)
        ]