Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import pdfplumber | |
| import docx | |
| import os | |
| class DocumentParser: | |
| """Parse PDF and DOCX documents to extract text content.""" | |
| def parse_pdf(file_path): | |
| """Extract text from PDF files using pdfplumber for better text extraction.""" | |
| text = "" | |
| try: | |
| with pdfplumber.open(file_path) as pdf: | |
| for page in pdf.pages: | |
| text += page.extract_text() or "" | |
| # If pdfplumber fails to extract text, try PyPDF2 as fallback | |
| if not text.strip(): | |
| with open(file_path, 'rb') as file: | |
| reader = PyPDF2.PdfReader(file) | |
| for page_num in range(len(reader.pages)): | |
| text += reader.pages[page_num].extract_text() or "" | |
| except Exception as e: | |
| print(f"Error parsing PDF: {e}") | |
| return None | |
| return text.strip() | |
| def parse_docx(file_path): | |
| """Extract text from DOCX files.""" | |
| try: | |
| doc = docx.Document(file_path) | |
| text = "\n".join([paragraph.text for paragraph in doc.paragraphs]) | |
| return text.strip() | |
| except Exception as e: | |
| print(f"Error parsing DOCX: {e}") | |
| return None | |
| def parse_document(file_path): | |
| """Parse document based on file extension.""" | |
| _, file_extension = os.path.splitext(file_path) | |
| if file_extension.lower() == '.pdf': | |
| return DocumentParser.parse_pdf(file_path) | |
| elif file_extension.lower() == '.docx': | |
| return DocumentParser.parse_docx(file_path) | |
| else: | |
| print(f"Unsupported file format: {file_extension}") | |
| return None |