Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

Upload parsers/epub_parser.py with huggingface_hub

8cef66a verified about 1 month ago

1.48 kB

	"""EPUB file parser - extracts text from ebook chapters."""

	import logging
	from pathlib import Path

	from bs4 import BeautifulSoup
	from ebooklib import epub, ITEM_DOCUMENT

	logger = logging.getLogger(__name__)


	def parse_epub(filepath: Path) -> str:
	    """Extract all text content from an EPUB file.

	    Every document item in the book is parsed as HTML, its ``script``,
	    ``style`` and ``nav`` elements are discarded, and the remaining text is
	    reduced to its non-blank lines with surrounding whitespace stripped.

	    Failures are logged rather than raised: if the book cannot be read, or
	    if no document item yields any text, an empty string is returned.

	    Args:
	        filepath: Path to the .epub file. A plain string path is accepted
	            as well; it is converted to ``Path`` before use.

	    Returns:
	        The text of all non-empty chapters, one line per text fragment,
	        with a blank line between chapters; ``""`` on failure or when the
	        book contains no text.
	    """
	    # Normalise once: the log calls below use ``filepath.name``, which would
	    # raise AttributeError if a caller passed a plain string path.
	    filepath = Path(filepath)

	    try:
	        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
	    except Exception as e:
	        # Deliberately broad: any read failure is reported and turned into
	        # an empty result instead of propagating to the caller.
	        logger.error("Failed to read EPUB %s: %s", filepath.name, e)
	        return ""

	    chapters = []
	    for item in book.get_items_of_type(ITEM_DOCUMENT):
	        # Undecodable bytes are replaced rather than aborting the chapter.
	        html = item.get_content().decode("utf-8", errors="replace")
	        soup = BeautifulSoup(html, "lxml")

	        # Remove script, style and navigation elements before extracting text
	        for tag in soup(["script", "style", "nav"]):
	            tag.decompose()

	        text = soup.get_text(separator="\n")

	        # Clean up whitespace within the chapter: strip each line and drop
	        # the ones that end up empty.
	        lines = [
	            stripped
	            for stripped in (line.strip() for line in text.splitlines())
	            if stripped
	        ]

	        # Items without any text do not count as chapters.
	        if lines:
	            chapters.append("\n".join(lines))

	    if not chapters:
	        logger.warning("No text content found in EPUB: %s", filepath.name)
	        return ""

	    logger.info("Extracted %d chapters from %s", len(chapters), filepath.name)
	    return "\n\n".join(chapters)