DocQAgen / components /process_pdf.py
Ankit Singh
Added application with all its file
085eaee
raw
history blame contribute delete
696 Bytes
from typing import List, Tuple
import PyPDF2
import re
def extract_text_from_pdf(pdf_file) -> List[Tuple[int, str]]:
    """Extract whitespace-normalized text from each page of a PDF.

    Args:
        pdf_file: The PDF source, handed directly to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against the callers).

    Returns:
        A list of ``(page_number, text)`` tuples in document order, with
        page numbers starting at 1. Pages whose text is empty after
        normalization are omitted, so page numbers may have gaps.
        An empty list is returned if any error occurs while reading.

    Note:
        Errors are not raised to the caller; the message is printed and
        ``[]`` is returned, so an empty result can mean either "no text"
        or "extraction failed".
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = []
        # enumerate(start=1) yields human-facing page numbers directly.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Guard against a missing text result so that one text-less
            # page cannot trip the regex call and, through the broad
            # handler below, discard every page already collected.
            raw_text = page.extract_text() or ""
            # Collapse every whitespace run (including newlines) into a
            # single space, then trim the ends.
            page_text = re.sub(r'\s+', ' ', raw_text).strip()
            if page_text:
                pages.append((page_number, page_text))
        return pages
    except Exception as e:
        # Best-effort contract: report the problem and return no pages.
        print(f"Error extracting PDF text: {e}")
        return []