File size: 696 Bytes
085eaee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from typing import List, Tuple
import PyPDF2
import re 

def extract_text_from_pdf(pdf_file) -> List[Tuple[int, str]]:
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            pages = []
            for i in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[i]
                page_text = page.extract_text()
                page_text = re.sub(r'\s+', ' ', page_text)
                page_text = page_text.strip()
                if page_text:
                    pages.append((i + 1, page_text))  # Page numbers start from 1
            return pages
        except Exception as e:
            print(f"Error extracting PDF text: {e}")
            return []