| """EPUB file parser - extracts text from ebook chapters.""" |
|
|
| import logging |
| from pathlib import Path |
|
|
| from bs4 import BeautifulSoup |
| from ebooklib import epub, ITEM_DOCUMENT |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def parse_epub(filepath: Path) -> str:
    """Read an EPUB and return the plain text of its document items.

    Every document item is decoded as UTF-8 (undecodable bytes are
    replaced), parsed as HTML, stripped of ``script``, ``style`` and ``nav``
    elements, and reduced to its non-blank, whitespace-trimmed lines. Items
    that yield no text at all are skipped.

    Args:
        filepath: Location of the .epub file to read.

    Returns:
        The text of each non-empty document item, its lines joined by a
        single newline and the items separated by a blank line. An empty
        string is returned when the file cannot be read or when no item
        contains text; an error or warning is logged in those cases.
    """
    try:
        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
    except Exception as err:
        # Best-effort boundary: any read failure is logged and reported to
        # the caller as "no text" rather than propagated.
        logger.error("Failed to read EPUB %s: %s", filepath.name, err)
        return ""

    sections = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        markup = document.get_content().decode("utf-8", errors="replace")
        tree = BeautifulSoup(markup, "lxml")

        # Drop elements whose text is not wanted in the extracted output.
        for element in tree.find_all(["script", "style", "nav"]):
            element.decompose()

        raw_text = tree.get_text(separator="\n")

        # Trim each line and keep only the ones that still have content.
        kept = [
            trimmed
            for trimmed in (piece.strip() for piece in raw_text.splitlines())
            if trimmed
        ]
        if kept:
            sections.append("\n".join(kept))

    if sections:
        logger.info(
            "Extracted %d chapters from %s", len(sections), filepath.name
        )
        return "\n\n".join(sections)

    logger.warning("No text content found in EPUB: %s", filepath.name)
    return ""
|
|