| import os |
| import sys |
| from pathlib import Path |
| from typing import Optional, Union |
| import logging |
|
|
| |
# Third-party parsing libraries used by DocumentParser: PyPDF2 for PDF,
# python-docx (Document) for DOCX, and ebooklib + BeautifulSoup for EPUB.
# They are imported in a single guarded group so that a missing package is
# reported with an installation hint instead of a bare traceback.
try:
    import PyPDF2
    from docx import Document
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
except ImportError as e:
    # Any missing dependency stops the process at import time with status 1.
    print(f"Missing required dependency: {e}")
    print("Please install dependencies with: pip install -r requirements.txt")
    sys.exit(1)


# Configure logging at INFO level when the module is imported, then create
# the module-level logger used by every method below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
class DocumentParser:
    """
    Parse and extract plain text from several document formats.

    Supports PDF, TXT, DOC, DOCX, and EPUB files. The format is chosen from
    the file extension; when the extension is not recognised, the beginning
    of the file is inspected instead.
    """

    # Number of leading bytes inspected when sniffing a file's content.
    _SNIFF_BYTES = 1024

    # MIME type reported for DOCX (Office Open XML word-processing) files.
    _DOCX_MIME = (
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    )

    def __init__(self):
        # Dispatch table: MIME type -> bound method that extracts its text.
        self.supported_formats = {
            'application/pdf': self._parse_pdf,
            'text/plain': self._parse_txt,
            self._DOCX_MIME: self._parse_docx,
            'application/msword': self._parse_doc,
            'application/epub+zip': self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file.

        The extension is consulted first; files with an unrecognised
        extension are identified from their content.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' when the type cannot be determined
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from file extension.

        The comparison is case-insensitive. When the extension is not one of
        the known ones, detection falls back to inspecting the file content.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' when the type cannot be determined
        """
        extension = Path(file_path).suffix.lower()
        extension_map = {
            '.pdf': 'application/pdf',
            '.txt': 'text/plain',
            '.docx': self._DOCX_MIME,
            '.doc': 'application/msword',
            '.epub': 'application/epub+zip',
        }

        mime_type = extension_map.get(extension)
        if mime_type is None:
            mime_type = self._detect_mime_by_content(file_path)

        return mime_type

    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
        """
        Detect MIME type by reading file content.

        Recognises PDF (``%PDF`` signature), EPUB and DOCX (ZIP containers
        identified by their members) and UTF-8 text. Errors while reading the
        file are logged and reported as 'unknown' rather than raised.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' when the type cannot be determined
        """
        import codecs
        import zipfile

        try:
            with open(file_path, 'rb') as f:
                header = f.read(self._SNIFF_BYTES)
                # True when the file continues past the sniffed header.
                truncated = bool(f.read(1))

            if header.startswith(b'%PDF'):
                return 'application/pdf'

            if header.startswith(b'PK\x03\x04'):
                try:
                    with zipfile.ZipFile(file_path, 'r') as zf:
                        names = zf.namelist()
                        if 'mimetype' in names:
                            declared = zf.read('mimetype').decode('utf-8').strip()
                            if declared == 'application/epub+zip':
                                return 'application/epub+zip'
                        # Word documents keep their parts under "word/";
                        # other ZIP-based files (xlsx, pptx, jar, plain
                        # archives) must not be mistaken for DOCX.
                        if any(name.startswith('word/') for name in names):
                            return self._DOCX_MIME
                    # A readable ZIP container of some other kind.
                    return 'unknown'
                except (zipfile.BadZipFile, zipfile.LargeZipFile,
                        UnicodeDecodeError, OSError) as e:
                    # Not a usable ZIP archive after all; fall through to the
                    # plain-text check below.
                    logger.debug(f"Could not inspect ZIP container {file_path}: {e}")

            # Decode incrementally so that a multi-byte character cut in half
            # at the end of the sniffed header is not treated as invalid.
            decoder = codecs.getincrementaldecoder('utf-8')()
            try:
                decoder.decode(header, final=not truncated)
            except UnicodeDecodeError:
                return 'unknown'
            return 'text/plain'

        except Exception as e:
            logger.warning(f"Error detecting MIME type by content: {e}")

        return 'unknown'

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Failures are logged and reported through the return value; this
        method does not raise for parsing errors.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string (possibly empty), or None if the file
            does not exist, its type is unsupported, or extraction fails
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return None

        try:
            mime_type = self.get_file_type(file_path)
            logger.info(f"Detected file type: {mime_type}")

            handler = self.supported_formats.get(mime_type)
            if handler is None:
                logger.error(f"Unsupported file type: {mime_type}")
                return None

            return handler(file_path)

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Pages that yield no text are skipped; the remaining pages are joined
        with newlines.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the file is logged
                and re-raised.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Extraction must happen while the file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error(f"Error parsing PDF {file_path}: {e}")
            raise

        return "\n".join(text for text in page_texts if text).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is read as UTF-8; if that fails it is read again as Latin-1.

        Args:
            file_path: Path to text file

        Returns:
            File content, unmodified

        Raises:
            Exception: Any error from reading the file is logged and
                re-raised.
        """
        try:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with the Latin-1 fallback encoding.
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
        except Exception as e:
            logger.error(f"Error reading text file {file_path}: {e}")
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Only the paragraphs exposed by ``Document.paragraphs`` are read, one
        per line.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the file is logged
                and re-raised.
        """
        try:
            # python-docx documents a string path or a file-like object.
            doc = Document(str(file_path))
            return "\n".join(
                paragraph.text for paragraph in doc.paragraphs
            ).strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX {file_path}: {e}")
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).

        Note: This requires an external converter, antiword or catdoc, on the
        PATH. antiword is tried first; catdoc is used when antiword is not
        installed or fails.

        Args:
            file_path: Path to DOC file

        Returns:
            Converter output with surrounding whitespace stripped

        Raises:
            Exception: If neither converter is installed, or every installed
                converter exits with a non-zero status.
        """
        import subprocess

        failures = []
        for tool in ('antiword', 'catdoc'):
            try:
                result = subprocess.run([tool, str(file_path)],
                                        capture_output=True, text=True)
            except FileNotFoundError:
                # This converter is not installed; try the next one.
                continue

            if result.returncode == 0:
                return result.stdout.strip()

            failures.append(
                f"{tool} exited with status {result.returncode}: "
                f"{result.stderr.strip()}"
            )

        if failures:
            message = "Could not convert DOC file. " + "; ".join(failures)
        else:
            message = ("antiword or catdoc not found. "
                       "Please install one of them for DOC file support.")

        logger.error(f"Error parsing DOC {file_path}: {message}")
        raise RuntimeError(message)

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every document item in the book is decoded as UTF-8, stripped of its
        markup with BeautifulSoup, and the results are joined with newlines.

        Args:
            file_path: Path to EPUB file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error from opening or parsing the file is logged
                and re-raised.
        """
        try:
            book = epub.read_epub(str(file_path))
            sections = []

            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:
                    content = item.get_content().decode('utf-8')
                    soup = BeautifulSoup(content, 'html.parser')
                    sections.append(soup.get_text())

            return "\n".join(sections).strip()

        except Exception as e:
            logger.error(f"Error parsing EPUB {file_path}: {e}")
            raise
|
|
|
|
def main():
    """
    Command-line entry point demonstrating usage of the DocumentParser.

    Expects exactly one argument, the path of the file to read. Prints the
    extracted text followed by its character count, or exits with status 1
    when the arguments are wrong or extraction fails.
    """
    if len(sys.argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    file_path = sys.argv[1]
    parser = DocumentParser()

    print(f"Extracting text from: {file_path}")
    print("-" * 50)

    extracted_text = parser.extract_text(file_path)

    # extract_text returns None on failure; an empty string is a successful
    # extraction of a document that contains no text.
    if extracted_text is None:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(extracted_text)
    print(f"\nTotal characters: {len(extracted_text)}")


if __name__ == "__main__":
    main()
|
|