# File size: 1,478 Bytes (revision 8cef66a)
"""EPUB file parser - extracts text from ebook chapters."""
import logging
from pathlib import Path
from bs4 import BeautifulSoup
from ebooklib import epub, ITEM_DOCUMENT
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
def parse_epub(filepath: Path) -> str:
    """Read an EPUB and return the plain text of its document items.

    Each document item is decoded as UTF-8 (undecodable bytes are replaced),
    parsed as HTML, stripped of ``script``, ``style`` and ``nav`` elements,
    and reduced to its non-blank lines with surrounding whitespace trimmed.

    Args:
        filepath: Path to the .epub file.

    Returns:
        The text of every non-empty document, with documents separated by a
        blank line. An empty string is returned when the file cannot be read
        or when no document yields any text.
    """
    try:
        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
    except Exception as exc:
        # Best effort: an unreadable file is logged and produces no text.
        logger.error("Failed to read EPUB %s: %s", filepath.name, exc)
        return ""

    sections = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        markup = document.get_content().decode("utf-8", errors="replace")
        tree = BeautifulSoup(markup, "lxml")

        # Drop script, style and nav elements before extracting text.
        for element in tree(["script", "style", "nav"]):
            element.decompose()

        # Keep only lines that still have content once trimmed.
        raw_lines = tree.get_text(separator="\n").splitlines()
        kept = [trimmed for trimmed in map(str.strip, raw_lines) if trimmed]
        if kept:
            sections.append("\n".join(kept))

    if sections:
        logger.info("Extracted %d chapters from %s", len(sections), filepath.name)
        return "\n\n".join(sections)

    logger.warning("No text content found in EPUB: %s", filepath.name)
    return ""
|