ai-sl-api

Build error

App Files Files Community

ai-sl-api / document_parsing.py

deenasun

fix for catching Gradio DataFile objects when they are passed from API calls as strings

f37f939 11 months ago

raw

history blame contribute delete

10.2 kB

	import os
	import sys
	from pathlib import Path
	from typing import Optional, Union
	import logging

	# Import document parsing libraries
	try:
	import PyPDF2
	from docx import Document
	import ebooklib
	from ebooklib import epub
	from bs4 import BeautifulSoup
	except ImportError as e:
	print(f"Missing required dependency: {e}")
	print("Please install dependencies with: pip install -r requirements.txt")
	sys.exit(1)

	# Configure logging
	# NOTE: this runs at import time, so importing this module sets up root
	# logging at INFO level (basicConfig does nothing if the root logger already
	# has handlers configured).
	logging.basicConfig(level=logging.INFO)
	# Module-level logger used by DocumentParser for status and error messages.
	logger = logging.getLogger(__name__)


	class DocumentParser:
	    """
	    Parse and extract plain text from several document formats.

	    Supports PDF, TXT, DOC, DOCX, and EPUB files. The format is chosen from
	    the file extension; when the extension is missing or unrecognized the
	    first bytes of the file are inspected instead.
	    """

	    # File extension (lower-case, including the dot) -> MIME type.
	    _EXTENSION_MAP = {
	        '.pdf': 'application/pdf',
	        '.txt': 'text/plain',
	        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
	        '.doc': 'application/msword',
	        '.epub': 'application/epub+zip',
	    }

	    # Number of leading bytes inspected when sniffing a file's content.
	    _SNIFF_BYTES = 1024

	    def __init__(self):
	        """Build the MIME type -> parser method dispatch table."""
	        self.supported_formats = {
	            'application/pdf': self._parse_pdf,
	            'text/plain': self._parse_txt,
	            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._parse_docx,
	            'application/msword': self._parse_doc,
	            'application/epub+zip': self._parse_epub,
	        }

	    def get_file_type(self, file_path: Union[str, Path]) -> str:
	        """
	        Detect the MIME type of a file using file extension.

	        Args:
	            file_path: Path to the file

	        Returns:
	            MIME type string, or 'unknown' when it cannot be determined
	        """
	        return self._get_mime_from_extension(file_path)

	    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
	        """
	        Determine MIME type from file extension.

	        Falls back to content sniffing when the extension is absent or not
	        one of the known ones.

	        Args:
	            file_path: Path to the file

	        Returns:
	            MIME type string, or 'unknown'
	        """
	        extension = Path(file_path).suffix.lower()
	        mime_type = self._EXTENSION_MAP.get(extension)
	        if mime_type is None:
	            # If no extension or unknown extension, try to detect by content
	            mime_type = self._detect_mime_by_content(file_path)
	        return mime_type

	    def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
	        """
	        Detect MIME type by reading file content.

	        Best effort: any failure to read the file is logged as a warning and
	        reported as 'unknown' rather than raised.

	        Args:
	            file_path: Path to the file

	        Returns:
	            MIME type string, or 'unknown'
	        """
	        try:
	            with open(file_path, 'rb') as f:
	                header = f.read(self._SNIFF_BYTES)
	        except Exception as e:
	            logger.warning(f"Error detecting MIME type by content: {e}")
	            return 'unknown'

	        # PDF detection
	        if header.startswith(b'%PDF'):
	            return 'application/pdf'

	        # ZIP-based formats (DOCX, EPUB)
	        if header.startswith(b'PK\x03\x04'):
	            container_type = self._sniff_zip_container(file_path)
	            if container_type is not None:
	                return container_type
	            # Unreadable archive: fall through to the plain-text check.

	        # Plain text detection. The header may have been cut in the middle of
	        # a multi-byte character, so a dangling partial sequence at the very
	        # end is tolerated when the read filled the whole sniff window.
	        truncated = len(header) >= self._SNIFF_BYTES
	        if self._is_utf8_prefix(header, truncated):
	            return 'text/plain'

	        return 'unknown'

	    def _sniff_zip_container(self, file_path: Union[str, Path]) -> Optional[str]:
	        """
	        Classify a ZIP archive as EPUB or DOCX.

	        An archive whose 'mimetype' entry reads 'application/epub+zip' is an
	        EPUB; every other readable archive is assumed to be DOCX.

	        Args:
	            file_path: Path to a file that starts with the ZIP signature

	        Returns:
	            MIME type string, or None if the archive could not be inspected
	        """
	        import zipfile

	        try:
	            with zipfile.ZipFile(file_path, 'r') as zf:
	                if 'mimetype' in zf.namelist():
	                    declared = zf.read('mimetype').decode('utf-8').strip()
	                    if declared == 'application/epub+zip':
	                        return 'application/epub+zip'
	        except Exception as e:
	            # Sniffing is best effort, but unlike a bare ``except`` this does
	            # not swallow KeyboardInterrupt / SystemExit.
	            logger.debug(f"Could not inspect ZIP container {file_path}: {e}")
	            return None

	        # If not EPUB, assume DOCX
	        return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

	    @staticmethod
	    def _is_utf8_prefix(data: bytes, truncated: bool) -> bool:
	        """
	        Check whether ``data`` is valid UTF-8.

	        Args:
	            data: Bytes to validate
	            truncated: True if ``data`` is only the beginning of a longer
	                stream, in which case an incomplete multi-byte sequence at
	                the end is accepted

	        Returns:
	            True if the bytes decode as UTF-8
	        """
	        import codecs

	        decoder = codecs.getincrementaldecoder('utf-8')()
	        try:
	            decoder.decode(data, final=not truncated)
	        except UnicodeDecodeError:
	            return False
	        return True

	    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
	        """
	        Extract text from a document file.

	        Errors are logged, never raised.

	        Args:
	            file_path: Path to the document file

	        Returns:
	            Extracted text as string, or None if the file is missing, its
	            type is unsupported, or extraction fails
	        """
	        file_path = Path(file_path)

	        if not file_path.exists():
	            logger.error(f"File not found: {file_path}")
	            return None

	        try:
	            mime_type = self.get_file_type(file_path)
	            logger.info(f"Detected file type: {mime_type}")

	            parse = self.supported_formats.get(mime_type)
	            if parse is None:
	                logger.error(f"Unsupported file type: {mime_type}")
	                return None
	            return parse(file_path)

	        except Exception as e:
	            logger.error(f"Error extracting text from {file_path}: {e}")
	            return None

	    def _parse_pdf(self, file_path: Path) -> str:
	        """
	        Extract text from PDF file.

	        Pages that yield no text are skipped; the remaining page texts are
	        joined with newlines.

	        Args:
	            file_path: Path to PDF file

	        Returns:
	            Extracted text

	        Raises:
	            Exception: Whatever opening or reading the PDF raised (logged
	                first, then re-raised)
	        """
	        try:
	            with open(file_path, 'rb') as file:
	                pdf_reader = PyPDF2.PdfReader(file)
	                # Extract while the file is still open.
	                page_texts = [page.extract_text() for page in pdf_reader.pages]
	        except Exception as e:
	            logger.error(f"Error parsing PDF {file_path}: {e}")
	            raise

	        return "\n".join(chunk for chunk in page_texts if chunk).strip()

	    def _parse_txt(self, file_path: Path) -> str:
	        """
	        Extract text from plain text file.

	        The file is read as UTF-8; if that fails to decode it is re-read as
	        latin-1.

	        Args:
	            file_path: Path to text file

	        Returns:
	            Extracted text

	        Raises:
	            Exception: Whatever opening or reading the file raised (logged
	                first, then re-raised)
	        """
	        try:
	            try:
	                with open(file_path, 'r', encoding='utf-8') as file:
	                    return file.read()
	            except UnicodeDecodeError:
	                # Try with different encoding
	                with open(file_path, 'r', encoding='latin-1') as file:
	                    return file.read()
	        except Exception as e:
	            logger.error(f"Error reading text file {file_path}: {e}")
	            raise

	    def _parse_docx(self, file_path: Path) -> str:
	        """
	        Extract text from DOCX file.

	        Only the document's paragraphs are read, one per output line.

	        Args:
	            file_path: Path to DOCX file

	        Returns:
	            Extracted text

	        Raises:
	            Exception: Whatever loading the document raised (logged first,
	                then re-raised)
	        """
	        try:
	            doc = Document(str(file_path))
	            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
	        except Exception as e:
	            logger.error(f"Error parsing DOCX {file_path}: {e}")
	            raise

	    def _parse_doc(self, file_path: Path) -> str:
	        """
	        Extract text from DOC file (legacy Word format).
	        Note: This requires additional dependencies like antiword or catdoc.

	        ``antiword`` is tried first and ``catdoc`` second. A tool that is not
	        installed is skipped so the other one still gets a chance.

	        Args:
	            file_path: Path to DOC file

	        Returns:
	            Extracted text

	        Raises:
	            RuntimeError: If neither tool is installed, or every installed
	                tool exited with a non-zero status
	        """
	        import subprocess

	        failures = []
	        for tool in ('antiword', 'catdoc'):
	            try:
	                result = subprocess.run([tool, str(file_path)],
	                                        capture_output=True, text=True)
	            except FileNotFoundError:
	                # Executable is not installed; try the next converter.
	                continue
	            if result.returncode == 0:
	                return result.stdout.strip()
	            failures.append(
	                f"{tool} exited with status {result.returncode}: {result.stderr.strip()}"
	            )

	        if failures:
	            message = "Could not convert DOC file. " + "; ".join(failures)
	        else:
	            message = "antiword or catdoc not found. Please install one of them for DOC file support."
	        logger.error(f"Error parsing DOC {file_path}: {message}")
	        raise RuntimeError(message)

	    def _parse_epub(self, file_path: Path) -> str:
	        """
	        Extract text from EPUB file.

	        Every document item in the book is decoded as UTF-8, stripped of
	        markup, and joined with newlines.

	        Args:
	            file_path: Path to EPUB file

	        Returns:
	            Extracted text

	        Raises:
	            Exception: Whatever reading or decoding the book raised (logged
	                first, then re-raised)
	        """
	        try:
	            book = epub.read_epub(str(file_path))
	            sections = []

	            for item in book.get_items():
	                if item.get_type() == ebooklib.ITEM_DOCUMENT:
	                    content = item.get_content().decode('utf-8')
	                    soup = BeautifulSoup(content, 'html.parser')
	                    sections.append(soup.get_text())

	            return "\n".join(sections).strip()

	        except Exception as e:
	            logger.error(f"Error parsing EPUB {file_path}: {e}")
	            raise


	def main():
	    """
	    Command-line demo: extract and print the text of a single document.

	    Expects exactly one command-line argument, the path of the file to
	    parse. Exits with status 1 on wrong usage or when no text was extracted.
	    """
	    argv = sys.argv
	    if len(argv) != 2:
	        usage_lines = (
	            "Usage: python document_parsing.py <file_path>",
	            "Supported formats: PDF, TXT, DOC, DOCX, EPUB",
	        )
	        for line in usage_lines:
	            print(line)
	        sys.exit(1)

	    target = argv[1]
	    document_parser = DocumentParser()

	    print(f"Extracting text from: {target}")
	    print("-" * 50)

	    content = document_parser.extract_text(target)

	    # Guard clause: both None (failure) and empty text count as "nothing extracted".
	    if not content:
	        print("Failed to extract text from the file.")
	        sys.exit(1)

	    print("Extracted text:")
	    print(content)
	    print(f"\nTotal characters: {len(content)}")


	# Script entry point: run the command-line demo only when this file is
	# executed directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()