Spaces:
Sleeping
Sleeping
File size: 696 Bytes
085eaee | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | from typing import List, Tuple
import PyPDF2
import re
def extract_text_from_pdf(pdf_file) -> List[Tuple[int, str]]:
    """Extract whitespace-normalized text from each page of a PDF.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A list of ``(page_number, text)`` tuples in document order, with
        page numbers starting at 1. Every run of whitespace in a page's
        text is collapsed to a single space and the result is stripped;
        pages that end up with no text are omitted. If opening the PDF or
        extracting any page raises, the error is printed and an empty
        list is returned instead.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = []
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Coerce a missing extraction result to an empty string so one
            # text-less page is skipped rather than raising inside re.sub
            # and discarding every page collected so far.
            raw_text = page.extract_text() or ""
            page_text = re.sub(r'\s+', ' ', raw_text).strip()
            if page_text:
                pages.append((page_number, page_text))
        return pages
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and hand the
        # caller an empty result instead of propagating the exception.
        print(f"Error extracting PDF text: {e}")
        return []