Spaces:

gilzero
/

editor-app-v10

Paused

App Files Files Community

editor-app-v10 / document_processor.py

gilzero

Upload folder using huggingface_hub

cb1a5c9 verified over 1 year ago

raw

history blame contribute delete

2.74 kB

	# filename: document_processor.py

	"""
	Module for extracting text from various document formats.
	"""

	import io
	import docx
	from PyPDF2 import PdfReader
	from log_config import get_logger

	logger = get_logger('DocumentProcessor')

	def extract_text_from_document(file_path: str) -> str:
	    """
	    Reads a document from disk and returns its text content.

	    The extractor is selected from the lower-cased text that follows the
	    last "." in ``file_path``. The file is opened in binary mode before
	    the extension is checked.

	    Args:
	        file_path (str): The path to the file.

	    Returns:
	        str: The extracted text from the document.

	    Raises:
	        ValueError: If the file format is not supported.
	        Exception: Any other error raised while opening or parsing the
	            file is logged and re-raised unchanged.
	    """
	    # Same value as split(".")[-1]: everything after the final dot, or the
	    # whole path when it contains no dot at all.
	    file_extension = file_path.rpartition(".")[2].lower()
	    extractors = {
	        "txt": extract_text_from_txt,
	        "pdf": extract_text_from_pdf,
	        "docx": extract_text_from_docx,
	    }
	    try:
	        with open(file_path, 'rb') as handle:
	            extractor = extractors.get(file_extension)
	            if extractor is None:
	                raise ValueError(f"Unsupported file format: {file_extension}")
	            return extractor(handle)
	    except Exception as e:
	        logger.error(f"Failed to extract text from {file_path}: {str(e)}")
	        raise

	def extract_text_from_txt(file_obj: io.BufferedReader) -> str:
	    """
	    Extracts text from a text file.

	    The content is decoded as UTF-8. A leading UTF-8 byte order mark, if
	    present, is dropped so it does not show up as a stray U+FEFF character
	    at the start of the returned text.

	    Args:
	        file_obj (io.BufferedReader): The file object opened in binary mode.

	    Returns:
	        str: The decoded text.

	    Raises:
	        UnicodeDecodeError: If the content is not valid UTF-8. The error is
	            logged and re-raised.
	    """
	    try:
	        content = file_obj.read()
	        # 'utf-8-sig' behaves like 'utf-8' but skips an optional leading BOM.
	        return content.decode('utf-8-sig')
	    except UnicodeDecodeError as e:
	        logger.error(f"Unicode decode error: {str(e)}")
	        raise

	def extract_text_from_pdf(file_obj: io.BufferedReader) -> str:
	    """
	    Extracts text from a PDF file.

	    The text of each page is separated from the next by a newline so the
	    last word of one page does not run into the first word of the
	    following page. Pages that yield no text are skipped.

	    Args:
	        file_obj (io.BufferedReader): The file object opened in binary mode.

	    Returns:
	        str: The newline-joined text of all pages that produced text, with
	            leading and trailing whitespace stripped.

	    Raises:
	        Exception: Any error raised while reading or parsing the PDF is
	            logged and re-raised unchanged.
	    """
	    try:
	        reader = PdfReader(file_obj)
	        # `or ''` guards against a page returning None or an empty result.
	        page_texts = (page.extract_text() or '' for page in reader.pages)
	        text = '\n'.join(page_text for page_text in page_texts if page_text)
	        return text.strip()
	    except Exception as e:
	        logger.error(f"Failed to extract text from PDF: {str(e)}")
	        raise

	def extract_text_from_docx(file_obj: io.BufferedReader) -> str:
	    """
	    Pulls the paragraph text out of a DOCX file.

	    The whole file is read into an in-memory buffer, which is then handed
	    to ``docx.Document``.

	    Args:
	        file_obj (io.BufferedReader): The file object opened in binary mode.

	    Returns:
	        str: The text of every non-empty paragraph, joined by newlines,
	            with leading and trailing whitespace stripped.

	    Raises:
	        Exception: Any error raised while reading or parsing the document
	            is logged and re-raised unchanged.
	    """
	    try:
	        buffer = io.BytesIO(file_obj.read())
	        document = docx.Document(buffer)
	        lines = []
	        for para in document.paragraphs:
	            para_text = para.text
	            # Skip paragraphs with no text so they add no blank lines.
	            if para_text:
	                lines.append(para_text)
	        joined = '\n'.join(lines)
	        return joined.strip()
	    except Exception as e:
	        logger.error(f"Failed to extract text from DOCX: {str(e)}")
	        raise

	# file: document_processor.py (end)