LisaMegaWatts's picture
Upload parsers/epub_parser.py with huggingface_hub
8cef66a verified
"""EPUB file parser - extracts text from ebook chapters."""
import logging
from pathlib import Path
from bs4 import BeautifulSoup
from ebooklib import epub, ITEM_DOCUMENT
# Module-level logger named after this module; parse_epub reports read
# failures, empty books and extraction counts through it.
logger = logging.getLogger(__name__)
def parse_epub(filepath: Path) -> str:
    """Extract all text content from an EPUB file.

    Each document item of the book is parsed as HTML; ``<script>``,
    ``<style>`` and ``<nav>`` elements are removed, and the remaining text is
    reduced to its stripped, non-blank lines.

    Args:
        filepath: Path to the .epub file. A plain string path is accepted as
            well and is converted to a ``Path``.

    Returns:
        The combined text from all chapters: lines within a chapter are
        joined by a single newline, chapters are separated by a blank line.
        An empty string is returned when the file cannot be read or when no
        chapter yields any text.
    """
    # Normalise once so ``filepath.name`` in the log calls below also works
    # when a caller hands in a ``str`` instead of a ``Path``.
    filepath = Path(filepath)

    try:
        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
    except Exception as e:
        # Best-effort boundary: an unreadable book is logged and reported to
        # the caller as "no text" rather than raised.
        logger.error("Failed to read EPUB %s: %s", filepath.name, e)
        return ""

    chapters = []
    for item in book.get_items_of_type(ITEM_DOCUMENT):
        raw = item.get_content()
        if not raw:
            # Nothing to extract from an empty document.
            continue
        # NOTE(review): get_content() is expected to return bytes, but
        # ebooklib appears to fall back to a str when it cannot parse a
        # document -- confirm against the installed version. Only decode
        # when we really hold bytes so one odd item cannot abort the book.
        html = raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else raw

        soup = BeautifulSoup(html, "lxml")

        # Remove script, style and nav elements so their text does not leak
        # into the extracted chapter.
        for tag in soup(["script", "style", "nav"]):
            tag.decompose()

        text = soup.get_text(separator="\n")

        # Clean up whitespace within the chapter: trim every line and drop
        # the ones that end up empty.
        lines = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]
        if lines:
            chapters.append("\n".join(lines))

    if not chapters:
        logger.warning("No text content found in EPUB: %s", filepath.name)
        return ""

    logger.info("Extracted %d chapters from %s", len(chapters), filepath.name)
    return "\n\n".join(chapters)