Spaces:
Sleeping
Sleeping
| from typing import List, Tuple | |
| import PyPDF2 | |
| import re | |
def extract_text_from_pdf(pdf_file) -> List[Tuple[int, str]]:
    """Extract whitespace-normalized text from every page of a PDF.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A list of ``(page_number, text)`` tuples in page order. Page numbers
        start at 1. In each text, every run of whitespace is collapsed to a
        single space and leading/trailing whitespace is removed. Pages that
        yield no text are left out, so page numbers may have gaps.

        If the PDF cannot be read or extraction fails, the error is printed
        and an empty list is returned instead of raising.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = []
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # A page without extractable text may give back nothing at all;
            # treat that as empty text so one such page does not raise inside
            # re.sub and throw away the text of every other page.
            raw_text = page.extract_text() or ""
            page_text = re.sub(r'\s+', ' ', raw_text).strip()
            if page_text:
                pages.append((page_number, page_text))
        return pages
    except Exception as e:
        # Best-effort contract: report the problem and return no pages.
        print(f"Error extracting PDF text: {e}")
        return []