Spaces:
Sleeping
Sleeping
File size: 1,804 Bytes
import PyPDF2
import pdfplumber
import docx
import os
class DocumentParser:
    """Extract plain text from PDF and DOCX documents.

    Every method is a static method, so no instance is required. On failure
    the methods print a diagnostic and return ``None`` instead of raising.
    """

    @staticmethod
    def _join_pages(page_texts):
        """Join per-page text with newlines, skipping pages that have no text.

        Args:
            page_texts: Iterable of per-page extraction results; entries may
                be ``None`` or empty strings.

        Returns:
            A single string with one newline between non-empty pages.
        """
        # A separator is required: without it the last word of one page is
        # fused with the first word of the next.
        return "\n".join(chunk for chunk in page_texts if chunk)

    @staticmethod
    def parse_pdf(file_path):
        """Extract text from a PDF, preferring pdfplumber and falling back to PyPDF2.

        PyPDF2 is tried both when pdfplumber raises and when it returns no
        text, so one extractor failing does not hide a readable document.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The stripped text of all pages joined by newlines (possibly an
            empty string), or ``None`` if the PyPDF2 fallback was needed and
            raised as well.
        """
        text = ""
        try:
            with pdfplumber.open(file_path) as pdf:
                text = DocumentParser._join_pages(
                    page.extract_text() for page in pdf.pages
                )
        except Exception as e:
            # Not fatal yet: the fallback below gets a chance to read the file.
            print(f"pdfplumber could not parse PDF, trying PyPDF2: {e}")
            text = ""

        # If pdfplumber fails to extract text, try PyPDF2 as fallback
        if not text.strip():
            try:
                with open(file_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    text = DocumentParser._join_pages(
                        page.extract_text() for page in reader.pages
                    )
            except Exception as e:
                print(f"Error parsing PDF: {e}")
                return None
        return text.strip()

    @staticmethod
    def parse_docx(file_path):
        """Extract text from a DOCX file.

        Only the document's paragraphs are read; their text is joined with
        newlines.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            The stripped paragraph text, or ``None`` if the file could not be
            parsed.
        """
        try:
            doc = docx.Document(file_path)
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
            return text.strip()
        except Exception as e:
            print(f"Error parsing DOCX: {e}")
            return None

    @staticmethod
    def parse_document(file_path):
        """Parse a document, choosing the parser from the file extension.

        The extension comparison is case-insensitive.

        Args:
            file_path: Path of the document; ``.pdf`` and ``.docx`` are
                supported.

        Returns:
            The extracted text, or ``None`` when parsing fails or the
            extension is not supported.
        """
        _, file_extension = os.path.splitext(file_path)
        # Normalise once; the original spelling is kept for the message below.
        normalized = file_extension.lower()
        if normalized == '.pdf':
            return DocumentParser.parse_pdf(file_path)
        if normalized == '.docx':
            return DocumentParser.parse_docx(file_path)
        print(f"Unsupported file format: {file_extension}")
        return None