Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

File size: 1,478 Bytes

8cef66a

"""EPUB file parser - extracts text from ebook chapters."""

import logging
from pathlib import Path

from bs4 import BeautifulSoup
from ebooklib import epub, ITEM_DOCUMENT

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def parse_epub(filepath: Path) -> str:
    """Read an EPUB and return the plain text of its document items.

    Every document item is parsed as HTML, has its ``script``, ``style`` and
    ``nav`` elements removed, and is reduced to its non-blank lines with
    surrounding whitespace trimmed. Items that yield no text are skipped.

    Args:
        filepath: Path to the .epub file.

    Returns:
        The text of all non-empty document items. Lines inside one item are
        joined with a single newline; items are separated by a blank line.
        An empty string is returned when the file cannot be read or when no
        item contains any text (both cases are logged).
    """
    try:
        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
    except Exception as exc:
        # Best-effort: an unreadable book is reported and yields no text.
        logger.error("Failed to read EPUB %s: %s", filepath.name, exc)
        return ""

    sections = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        markup = document.get_content().decode("utf-8", errors="replace")
        tree = BeautifulSoup(markup, "lxml")

        # Drop elements whose contents are not reading text:
        # scripts, stylesheets and navigation blocks.
        for element in tree.find_all(["script", "style", "nav"]):
            element.decompose()

        # Keep only non-blank lines, each trimmed of surrounding whitespace.
        raw_lines = tree.get_text(separator="\n").splitlines()
        kept = [trimmed for trimmed in map(str.strip, raw_lines) if trimmed]
        if not kept:
            continue
        sections.append("\n".join(kept))

    if sections:
        logger.info("Extracted %d chapters from %s", len(sections), filepath.name)
        return "\n\n".join(sections)

    logger.warning("No text content found in EPUB: %s", filepath.name)
    return ""