| """
|
| File Parser Utilities.
|
|
|
| This module provides utilities for extracting text content from
|
| various document formats (Word, Excel, PDF).
|
|
|
| Design Note:
|
| These parsers extract raw text content only. All interpretation
|
| and structuring is handled by the Input Normalizer.
|
| """
|
|
|
| from typing import Optional
|
| from pathlib import Path
|
|
|
|
|
class FileParser:
    """
    Multi-format file parser for document content extraction.

    Supports:
    - Word documents (.docx, .doc)
    - Excel spreadsheets (.xlsx, .xls)
    - PDF documents (.pdf)

    Every ``parse_*`` method is best-effort: the backing third-party
    library is imported lazily inside the method, and any failure
    (library missing, unreadable file, parse error) is reported with
    ``print`` and turned into a ``None`` return instead of an exception.
    """

    def parse_word(self, file_path: str) -> Optional[str]:
        """
        Extract text content from a Word document.

        Non-blank paragraphs are emitted first, one per line, followed by
        one line per non-empty table row with the cell texts joined by
        " | ". Tables therefore always come after all paragraphs,
        regardless of where they sit in the document.

        NOTE(review): python-docx targets the .docx format; a legacy .doc
        file routed here by ``detect_file_type`` is expected to end up in
        the generic error path and yield None -- confirm.

        Args:
            file_path: Path to the Word document

        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            # Imported lazily so the module stays importable without python-docx.
            from docx import Document

            doc = Document(file_path)

            # Keep the original paragraph text; only whitespace-only ones are dropped.
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

            tables_text = []
            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text.strip() for cell in row.cells]
                    # Skip rows whose cells are all empty after stripping.
                    if any(row_text):
                        tables_text.append(" | ".join(row_text))

            return "\n".join(paragraphs + tables_text)

        except ImportError:
            print("Warning: python-docx not installed. Word parsing disabled.")
            return None
        except Exception as e:
            # Deliberate catch-all: callers rely on None instead of an exception.
            print(f"Error parsing Word document: {e}")
            return None

    def parse_excel(self, file_path: str) -> Optional[str]:
        """
        Extract text content from an Excel spreadsheet.

        Each sheet contributes a "=== Sheet: <name> ===" header line
        followed by one line per row that has at least one non-None cell;
        the non-None cell values are stringified and joined by " | ".
        The workbook is opened with ``data_only=True``, so formula cells
        yield their stored values rather than the formula text.

        NOTE(review): openpyxl targets the .xlsx family; a legacy .xls
        file routed here by ``detect_file_type`` is expected to end up in
        the generic error path and yield None -- confirm.

        Args:
            file_path: Path to the Excel file

        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            # Imported lazily so the module stays importable without openpyxl.
            from openpyxl import load_workbook

            wb = load_workbook(file_path, read_only=True, data_only=True)
            try:
                all_text = []

                for sheet_name in wb.sheetnames:
                    sheet = wb[sheet_name]
                    all_text.append(f"=== Sheet: {sheet_name} ===")

                    for row in sheet.iter_rows():
                        row_values = [
                            str(cell.value) for cell in row if cell.value is not None
                        ]
                        if row_values:
                            all_text.append(" | ".join(row_values))

                return "\n".join(all_text)
            finally:
                # A read-only workbook keeps the underlying file open until
                # close() is called, so release it even when iteration fails.
                wb.close()

        except ImportError:
            print("Warning: openpyxl not installed. Excel parsing disabled.")
            return None
        except Exception as e:
            # Deliberate catch-all: callers rely on None instead of an exception.
            print(f"Error parsing Excel file: {e}")
            return None

    def parse_pdf(self, file_path: str) -> Optional[str]:
        """
        Extract text content from a PDF document.

        For every page that yields text, a "--- Page <n> ---" header
        (1-based) and the page text are emitted. Rows of every table
        detected on the page are then appended, cells joined by " | ",
        with missing cells rendered as empty strings. Table rows are
        emitted even when the page itself produced no text, in which case
        they appear without a page header.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            # Imported lazily so the module stays importable without pdfplumber.
            import pdfplumber

            all_text = []

            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        all_text.append(f"--- Page {page_num} ---")
                        all_text.append(page_text)

                    for table in page.extract_tables():
                        for row in table:
                            if row:
                                row_text = [
                                    "" if cell is None else str(cell) for cell in row
                                ]
                                all_text.append(" | ".join(row_text))

            return "\n".join(all_text)

        except ImportError:
            print("Warning: pdfplumber not installed. PDF parsing disabled.")
            return None
        except Exception as e:
            # Deliberate catch-all: callers rely on None instead of an exception.
            print(f"Error parsing PDF: {e}")
            return None

    def detect_file_type(self, file_path: str) -> Optional[str]:
        """
        Detect the type of file based on extension.

        Only the final suffix of the path is inspected, case-insensitively;
        the file is never opened and does not need to exist.

        Args:
            file_path: Path to the file

        Returns:
            File type string ('word', 'excel', 'pdf') or None
        """
        suffix = Path(file_path).suffix.lower()

        if suffix in ('.docx', '.doc'):
            return 'word'
        if suffix in ('.xlsx', '.xls'):
            return 'excel'
        if suffix == '.pdf':
            return 'pdf'
        return None

    def parse_auto(self, file_path: str) -> Optional[str]:
        """
        Automatically detect file type and parse accordingly.

        The type is chosen by ``detect_file_type`` (extension only). An
        unrecognised extension prints a notice and returns None without
        touching the file.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content, or None if unsupported/failed
        """
        file_type = self.detect_file_type(file_path)

        if file_type == 'word':
            return self.parse_word(file_path)
        if file_type == 'excel':
            return self.parse_excel(file_path)
        if file_type == 'pdf':
            return self.parse_pdf(file_path)

        print(f"Unsupported file type: {file_path}")
        return None
|
|
|
|
|
|
|
|
|
|
|
def parse_file(file_path: str) -> Optional[str]:
    """
    Parse a file through a throwaway FileParser instance.

    Functional shortcut kept for backward compatibility with callers
    that expect a plain function rather than the class interface. It
    delegates to ``FileParser.parse_auto`` and hands back its result
    unchanged.

    Args:
        file_path: Path to the file to parse

    Returns:
        Whatever ``FileParser.parse_auto`` returns for the path
    """
    return FileParser().parse_auto(file_path)
|
|
|