Spaces:

Kevinshh
/

Preformu

Sleeping

App Files Files Community

Preformu / utils /file_parsers.py

Kevinshh

Upload file_parsers.py

39ed0fb verified 4 months ago

raw

history blame contribute delete

6.63 kB

	"""
	File Parser Utilities.

	This module provides utilities for extracting text content from
	various document formats (Word, Excel, PDF).

	Design Note:
	These parsers extract raw text content only. All interpretation
	and structuring is handled by the Input Normalizer.
	"""

	from typing import Optional
	from pathlib import Path


	class FileParser:
	    """
	    Multi-format file parser for document content extraction.

	    Supports:
	    - Word documents (.docx, .doc)
	    - Excel spreadsheets (.xlsx, .xls)
	    - PDF documents (.pdf)

	    Each ``parse_*`` method imports its third-party backend lazily, so a
	    missing library only disables that one format. Problems are reported
	    with ``print`` and signalled to the caller by returning ``None``;
	    no exception escapes the ``parse_*`` methods for ordinary failures.

	    NOTE(review): ``.doc`` and ``.xls`` are routed to the same backends
	    as ``.docx`` and ``.xlsx`` (python-docx / openpyxl). Those libraries
	    document support for the Open XML formats only, so legacy files are
	    expected to fail inside the backend and come back as ``None`` --
	    confirm against the library versions in use.
	    """

	    # Separator placed between cells when a table or spreadsheet row is
	    # flattened into one line of text. Kept as a plain pipe: the previous
	    # literal used the invalid escape sequence backslash-pipe, which
	    # triggers a SyntaxWarning on recent Python versions and leaked a
	    # stray backslash into the extracted text.
	    _CELL_SEPARATOR = " | "

	    def parse_word(self, file_path: str) -> Optional[str]:
	        """
	        Extract text content from a Word document.

	        Non-blank paragraphs come first (in document order), followed by
	        one line per table row that has at least one non-empty cell.

	        Args:
	            file_path: Path to the Word document

	        Returns:
	            Extracted text content, or None if extraction fails
	        """
	        try:
	            from docx import Document

	            doc = Document(file_path)

	            # Body paragraphs, skipping whitespace-only ones.
	            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

	            # Table rows, flattened to "cell | cell | ..." lines.
	            tables_text = []
	            for table in doc.tables:
	                for row in table.rows:
	                    row_text = [cell.text.strip() for cell in row.cells]
	                    if any(row_text):
	                        tables_text.append(self._CELL_SEPARATOR.join(row_text))

	            return "\n".join(paragraphs + tables_text)

	        except ImportError:
	            print("Warning: python-docx not installed. Word parsing disabled.")
	            return None
	        except Exception as e:
	            print(f"Error parsing Word document: {e}")
	            return None

	    def parse_excel(self, file_path: str) -> Optional[str]:
	        """
	        Extract text content from an Excel spreadsheet.

	        Every sheet contributes a ``=== Sheet: <name> ===`` header line
	        followed by one line per row that has at least one non-None cell.
	        None cells are dropped, so column positions are not preserved.

	        Args:
	            file_path: Path to the Excel file

	        Returns:
	            Extracted text content, or None if extraction fails
	        """
	        try:
	            from openpyxl import load_workbook

	            wb = load_workbook(file_path, read_only=True, data_only=True)
	            try:
	                all_text = []

	                for sheet_name in wb.sheetnames:
	                    sheet = wb[sheet_name]
	                    all_text.append(f"=== Sheet: {sheet_name} ===")

	                    for row in sheet.iter_rows():
	                        row_values = [
	                            str(cell.value) for cell in row if cell.value is not None
	                        ]
	                        if row_values:
	                            all_text.append(self._CELL_SEPARATOR.join(row_values))
	            finally:
	                # A read-only workbook keeps the underlying file open until
	                # closed; release it even when a sheet fails mid-iteration.
	                wb.close()

	            return "\n".join(all_text)

	        except ImportError:
	            print("Warning: openpyxl not installed. Excel parsing disabled.")
	            return None
	        except Exception as e:
	            print(f"Error parsing Excel file: {e}")
	            return None

	    def parse_pdf(self, file_path: str) -> Optional[str]:
	        """
	        Extract text content from a PDF document.

	        For each page with extractable text, a ``--- Page <n> ---`` header
	        (1-based) and the page text are emitted. Rows of every table
	        detected on the page follow, one line per non-empty row; falsy
	        cells are rendered as empty strings so columns stay aligned.

	        Args:
	            file_path: Path to the PDF file

	        Returns:
	            Extracted text content, or None if extraction fails
	        """
	        try:
	            import pdfplumber

	            all_text = []

	            with pdfplumber.open(file_path) as pdf:
	                for page_num, page in enumerate(pdf.pages, 1):
	                    # Plain text of the page, if any.
	                    page_text = page.extract_text()
	                    if page_text:
	                        all_text.append(f"--- Page {page_num} ---")
	                        all_text.append(page_text)

	                    # Tables are emitted in addition to the page text.
	                    for table in page.extract_tables():
	                        for row in table:
	                            if row:
	                                row_text = [str(cell) if cell else "" for cell in row]
	                                all_text.append(self._CELL_SEPARATOR.join(row_text))

	            return "\n".join(all_text)

	        except ImportError:
	            print("Warning: pdfplumber not installed. PDF parsing disabled.")
	            return None
	        except Exception as e:
	            print(f"Error parsing PDF: {e}")
	            return None

	    def detect_file_type(self, file_path: str) -> Optional[str]:
	        """
	        Detect the type of file based on extension.

	        Only the final suffix is examined, case-insensitively; the file
	        is never opened.

	        Args:
	            file_path: Path to the file

	        Returns:
	            File type string ('word', 'excel', 'pdf') or None
	        """
	        suffix = Path(file_path).suffix.lower()

	        if suffix in ('.docx', '.doc'):
	            return 'word'
	        if suffix in ('.xlsx', '.xls'):
	            return 'excel'
	        if suffix == '.pdf':
	            return 'pdf'
	        return None

	    def parse_auto(self, file_path: str) -> Optional[str]:
	        """
	        Automatically detect file type and parse accordingly.

	        Args:
	            file_path: Path to the file

	        Returns:
	            Extracted text content, or None if unsupported/failed
	        """
	        file_type = self.detect_file_type(file_path)

	        if file_type == 'word':
	            return self.parse_word(file_path)
	        if file_type == 'excel':
	            return self.parse_excel(file_path)
	        if file_type == 'pdf':
	            return self.parse_pdf(file_path)

	        print(f"Unsupported file type: {file_path}")
	        return None

	# =============================================================================
	# Helper function wrapper for functional usage
	# =============================================================================

	def parse_file(file_path: str) -> Optional[str]:
	    """
	    Parse a file through a freshly created FileParser.

	    Functional convenience wrapper kept for callers that expect a plain
	    function rather than the class-based interface. The work is delegated
	    entirely to ``FileParser.parse_auto``.

	    Args:
	        file_path: Path to the file to parse

	    Returns:
	        Whatever ``FileParser.parse_auto`` returns for ``file_path``
	        (annotated as ``Optional[str]``)
	    """
	    return FileParser().parse_auto(file_path)