Spaces:

MinaNasser
/

EXAM_RAG_API

Paused

1st

1bc3f18 about 2 months ago

2.2 kB

	import os
	from langchain_core.documents import Document
	import pdfplumber
	from ingestion.loaders.normalization import normalize_text

	def _render_page_content(page, page_num: int) -> str:
	    """Return one page's normalized text followed by its tables as text.

	    Args:
	        page: A pdfplumber page object; only ``extract_text()`` and
	            ``extract_tables()`` are called on it.
	        page_num: 1-based page number, used in each table's header line.

	    Returns:
	        The stripped, normalized page text, followed by one section per
	        table. Sections are separated by a blank line and each table is
	        introduced by ``=== Table <n> (Page <page_num>) ===``.
	    """
	    # The page text is always the first section, even when empty, so the
	    # joined output keeps the same leading separator as before.
	    sections = [normalize_text(page.extract_text() or "").strip()]

	    for t_idx, table in enumerate(page.extract_tables() or [], start=1):
	        # Cells become tab-separated columns, rows become lines; missing
	        # (falsy) cells are rendered as empty strings.
	        table_text = "\n".join(
	            "\t".join(cell if cell else "" for cell in row) for row in table
	        )
	        table_text = normalize_text(table_text)
	        sections.append(f"=== Table {t_idx} (Page {page_num}) ===\n{table_text}")

	    return "\n\n".join(sections)


	def load_pdf(file_path: str) -> "list[Document]":
	    """Load a PDF into one LangChain ``Document`` per page.

	    Each document's ``page_content`` is the page's normalized text with the
	    page's tables appended as tab-separated text. Its metadata holds the
	    file's base name under ``"source"`` and the 1-based page index under
	    ``"page_number"``.

	    Args:
	        file_path: Path to the PDF file.

	    Returns:
	        The documents for every page that could be processed. Pages that
	        raise during extraction are reported with ``print`` and skipped.
	        If opening or iterating the PDF itself fails, the error is printed
	        and an empty list is returned (documents collected so far are
	        discarded).

	    Raises:
	        FileNotFoundError: If ``file_path`` does not exist.
	    """
	    if not os.path.exists(file_path):
	        raise FileNotFoundError(f"File not found: {file_path}")

	    # Loop-invariant: every page of the file shares the same source name.
	    source_name = os.path.basename(file_path)
	    documents = []

	    try:
	        with pdfplumber.open(file_path) as pdf:
	            for page_num, page in enumerate(pdf.pages, start=1):
	                try:
	                    documents.append(
	                        Document(
	                            page_content=_render_page_content(page, page_num),
	                            metadata={
	                                "source": source_name,
	                                "page_number": page_num,
	                            },
	                        )
	                    )
	                except Exception as e:
	                    # Best effort: one bad page must not abort the others.
	                    print(f"Error extracting page {page_num}: {e}")
	    except Exception as e:
	        print(f"Failed to open or read PDF file: {file_path}")
	        print(f"Error: {e}")
	        return []  # Return empty list instead of crashing

	    return documents





	def load_pdf_with_pages(file_path: str) -> "list[dict]":
	    """Extract the text of every page of a document using PyMuPDF.

	    Args:
	        file_path: Path passed straight to ``fitz.open``.

	    Returns:
	        One dict per page, in document order, with the keys ``"page"``
	        (1-based page index) and ``"text"`` (the result of
	        ``page.get_text()``).

	    Any exception raised by ``fitz.open`` or ``get_text`` propagates to the
	    caller unchanged.
	    """
	    # Kept as a function-local import, as in the original block.
	    import fitz

	    # The context manager closes the document even if a page fails to
	    # extract; previously the handle was never closed.
	    with fitz.open(file_path) as doc:
	        return [
	            {"page": page_no, "text": page.get_text()}
	            for page_no, page in enumerate(doc, start=1)
	        ]