"""EPUB file parser - extracts text from ebook chapters.""" import logging from pathlib import Path from bs4 import BeautifulSoup from ebooklib import epub, ITEM_DOCUMENT logger = logging.getLogger(__name__) def parse_epub(filepath: Path) -> str: """Extract all text content from an EPUB file. Args: filepath: Path to the .epub file. Returns: The combined text from all chapters. """ try: book = epub.read_epub(str(filepath), options={"ignore_ncx": True}) except Exception as e: logger.error("Failed to read EPUB %s: %s", filepath.name, e) return "" chapters = [] for item in book.get_items_of_type(ITEM_DOCUMENT): html = item.get_content().decode("utf-8", errors="replace") soup = BeautifulSoup(html, "lxml") # Remove script and style elements for tag in soup(["script", "style", "nav"]): tag.decompose() text = soup.get_text(separator="\n") # Clean up whitespace within the chapter lines = [] for line in text.splitlines(): stripped = line.strip() if stripped: lines.append(stripped) if lines: chapters.append("\n".join(lines)) if not chapters: logger.warning("No text content found in EPUB: %s", filepath.name) return "" logger.info("Extracted %d chapters from %s", len(chapters), filepath.name) return "\n\n".join(chapters)