LisaMegaWatts committed on
Commit
8cef66a
·
verified ·
1 Parent(s): 4a3b497

Upload parsers/epub_parser.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. parsers/epub_parser.py +53 -0
parsers/epub_parser.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """EPUB file parser - extracts text from ebook chapters."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from bs4 import BeautifulSoup
7
+ from ebooklib import epub, ITEM_DOCUMENT
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def _visible_text(markup: str) -> str:
    """Return the non-blank, trimmed text lines of one HTML document.

    ``script``, ``style`` and ``nav`` elements are removed before the text
    is extracted. The remaining lines are joined with single newlines; an
    empty string is returned when nothing is left.
    """
    soup = BeautifulSoup(markup, "lxml")

    # Drop elements whose contents are not reading text: scripts,
    # stylesheets and navigation blocks.
    for element in soup(["script", "style", "nav"]):
        element.decompose()

    # Trim every line and keep only the ones that still contain something.
    trimmed = (piece.strip() for piece in soup.get_text(separator="\n").splitlines())
    return "\n".join(piece for piece in trimmed if piece)


def parse_epub(filepath: Path) -> str:
    """Read an EPUB file and return the plain text of its documents.

    Args:
        filepath: Path to the .epub file.

    Returns:
        The text of every document item that contains text. Lines within
        a document are separated by a single newline and documents by a
        blank line. An empty string is returned, and the problem logged,
        when the file cannot be read or holds no text.
    """
    try:
        book = epub.read_epub(str(filepath), options={"ignore_ncx": True})
    except Exception as exc:
        # Best-effort parser: report the failure and hand back no text
        # instead of propagating the error to the caller.
        logger.error("Failed to read EPUB %s: %s", filepath.name, exc)
        return ""

    chapters = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        markup = document.get_content().decode("utf-8", errors="replace")
        chapter_text = _visible_text(markup)
        # Documents with no text left after cleanup are not counted as chapters.
        if chapter_text:
            chapters.append(chapter_text)

    if chapters:
        logger.info("Extracted %d chapters from %s", len(chapters), filepath.name)
        return "\n\n".join(chapters)

    logger.warning("No text content found in EPUB: %s", filepath.name)
    return ""