# Preformu/utils/file_parsers.py
# Kevinshh's picture
# Upload file_parsers.py
# 39ed0fb verified
"""
File Parser Utilities.
This module provides utilities for extracting text content from
various document formats (Word, Excel, PDF).
Design Note:
These parsers extract raw text content only. All interpretation
and structuring is handled by the Input Normalizer.
"""
from typing import Optional
from pathlib import Path
class FileParser:
    """
    Multi-format file parser for document content extraction.

    Supports:
    - Word documents (.docx, .doc)
    - Excel spreadsheets (.xlsx, .xls)
    - PDF documents (.pdf)

    Each ``parse_*`` method imports its third-party backend lazily inside
    the method, so a missing library only disables that one format.
    Failures are reported with ``print`` and signalled by returning
    ``None``; any ``Exception`` raised while parsing is caught.
    """

    # Lower-cased file extension -> type label returned by detect_file_type.
    _EXTENSION_TYPES = {
        '.docx': 'word',
        '.doc': 'word',
        '.xlsx': 'excel',
        '.xls': 'excel',
        '.pdf': 'pdf',
    }

    def parse_word(self, file_path: str) -> Optional[str]:
        """
        Extract text content from a Word document.

        Non-blank paragraphs come first, followed by one line per table
        row (cells joined with " | "). Rows whose cells are all empty
        after stripping are skipped.

        NOTE(review): python-docx is documented for .docx files; a legacy
        .doc file routed here is expected to fail and yield None — confirm.

        Args:
            file_path: Path to the Word document
        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            from docx import Document

            doc = Document(file_path)

            # Extract text from paragraphs, dropping whitespace-only ones
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

            # Extract text from tables, one output line per non-empty row
            tables_text = []
            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text.strip() for cell in row.cells]
                    if any(row_text):
                        tables_text.append(" | ".join(row_text))

            # Combine all text: paragraphs first, then table rows
            return "\n".join(paragraphs + tables_text)
        except ImportError:
            print("Warning: python-docx not installed. Word parsing disabled.")
            return None
        except Exception as e:
            print(f"Error parsing Word document: {e}")
            return None

    def parse_excel(self, file_path: str) -> Optional[str]:
        """
        Extract text content from an Excel spreadsheet.

        Every sheet contributes a "=== Sheet: <name> ===" header line
        followed by one line per row that has at least one non-None cell
        (values converted with str() and joined with " | ").

        NOTE(review): openpyxl targets .xlsx-family files; a legacy .xls
        file routed here is expected to fail and yield None — confirm.

        Args:
            file_path: Path to the Excel file
        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            from openpyxl import load_workbook

            wb = load_workbook(file_path, read_only=True, data_only=True)
            try:
                all_text = []
                for sheet_name in wb.sheetnames:
                    sheet = wb[sheet_name]
                    all_text.append(f"=== Sheet: {sheet_name} ===")
                    for row in sheet.iter_rows():
                        row_values = [
                            str(cell.value) for cell in row if cell.value is not None
                        ]
                        if row_values:
                            all_text.append(" | ".join(row_values))
            finally:
                # Close the workbook even when reading a sheet fails, so the
                # handle opened by load_workbook is not leaked.
                wb.close()
            return "\n".join(all_text)
        except ImportError:
            print("Warning: openpyxl not installed. Excel parsing disabled.")
            return None
        except Exception as e:
            print(f"Error parsing Excel file: {e}")
            return None

    def parse_pdf(self, file_path: str) -> Optional[str]:
        """
        Extract text content from a PDF document.

        For each page (numbered from 1), a "--- Page N ---" header and
        the page text are emitted when the page has text; the page's
        table rows are then appended, cells joined with " | " and falsy
        cells rendered as empty strings.

        Args:
            file_path: Path to the PDF file
        Returns:
            Extracted text content, or None if extraction fails
        """
        try:
            import pdfplumber

            all_text = []
            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text; the page header is only written when
                    # the page actually yielded text
                    page_text = page.extract_text()
                    if page_text:
                        all_text.append(f"--- Page {page_num} ---")
                        all_text.append(page_text)

                    # Extract tables, skipping empty rows
                    for table in page.extract_tables():
                        for row in table:
                            if row:
                                row_text = [str(cell) if cell else "" for cell in row]
                                all_text.append(" | ".join(row_text))
            return "\n".join(all_text)
        except ImportError:
            print("Warning: pdfplumber not installed. PDF parsing disabled.")
            return None
        except Exception as e:
            print(f"Error parsing PDF: {e}")
            return None

    def detect_file_type(self, file_path: str) -> Optional[str]:
        """
        Detect the type of file based on extension.

        Only the final suffix of the path is inspected, compared
        case-insensitively; the file itself is never opened.

        Args:
            file_path: Path (or bare file name) to classify
        Returns:
            File type string ('word', 'excel', 'pdf') or None
        """
        suffix = Path(file_path).suffix.lower()
        return self._EXTENSION_TYPES.get(suffix)

    def parse_auto(self, file_path: str) -> Optional[str]:
        """
        Automatically detect file type and parse accordingly.

        Args:
            file_path: Path to the file
        Returns:
            Extracted text content, or None if unsupported/failed
        """
        handlers = {
            'word': self.parse_word,
            'excel': self.parse_excel,
            'pdf': self.parse_pdf,
        }
        handler = handlers.get(self.detect_file_type(file_path))
        if handler is None:
            print(f"Unsupported file type: {file_path}")
            return None
        return handler(file_path)
# =============================================================================
# Helper function wrapper for functional usage
# =============================================================================
def parse_file(file_path: str) -> Optional[str]:
    """
    Parse a file through a freshly created FileParser.

    Functional convenience wrapper, kept for backward compatibility with
    callers that expect a plain function rather than the class interface.

    Args:
        file_path: Path to the file to parse
    Returns:
        Whatever FileParser.parse_auto returns for file_path
    """
    return FileParser().parse_auto(file_path)