Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

App Files Community

LisaMegaWatts committed on Feb 20

Commit

8cef66a

verified ·

1 Parent(s): 4a3b497

Upload parsers/epub_parser.py with huggingface_hub

Browse files

Files changed (1) hide show

parsers/epub_parser.py +53 -0

parsers/epub_parser.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""EPUB file parser - extracts text from ebook chapters."""
+import logging
+from pathlib import Path
+from bs4 import BeautifulSoup
+from ebooklib import epub, ITEM_DOCUMENT
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+def parse_epub(filepath: Path) -> str:
+    """Extract all text content from an EPUB file.
+    Args:
+        filepath: Path to the .epub file.
+    Returns:
+        The combined text from all chapters.
+    """
+    try:
+        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
+    except Exception as e:
+        logger.error("Failed to read EPUB %s: %s", filepath.name, e)
+        return ""
+    chapters = []
+    for item in book.get_items_of_type(ITEM_DOCUMENT):
+        html = item.get_content().decode("utf-8", errors="replace")
+        soup = BeautifulSoup(html, "lxml")
+        # Remove script and style elements
+        for tag in soup(["script", "style", "nav"]):
+            tag.decompose()
+        text = soup.get_text(separator="\n")
+        # Clean up whitespace within the chapter
+        lines = []
+        for line in text.splitlines():
+            stripped = line.strip()
+            if stripped:
+                lines.append(stripped)
+        if lines:
+            chapters.append("\n".join(lines))
+    if not chapters:
+        logger.warning("No text content found in EPUB: %s", filepath.name)
+        return ""
+    logger.info("Extracted %d chapters from %s", len(chapters), filepath.name)
+    return "\n\n".join(chapters)