LisaMegaWatts committed on
Commit
f4c61e6
·
verified ·
1 Parent(s): 8cef66a

Upload parsers/zip_parser.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. parsers/zip_parser.py +66 -0
parsers/zip_parser.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ZIP archive parser - extracts and delegates to txt/epub parsers."""
2
+
3
+ import logging
4
+ import tempfile
5
+ import zipfile
6
+ from pathlib import Path
7
+
8
+ from .txt_parser import parse_txt
9
+ from .epub_parser import parse_epub
10
+
11
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Dispatch table used by parse_zip: file extension (with leading dot) mapped
# to the parser callable that is invoked on each matching extracted file.
PARSERS = {".txt": parse_txt, ".epub": parse_epub}
17
+
18
+
19
def parse_zip(filepath: Path) -> str:
    """Extract a ZIP archive and parse all supported files inside.

    The archive is unpacked into a temporary directory, every file whose
    extension (compared case-insensitively) has an entry in ``PARSERS`` is
    handed to the matching parser, and the non-empty results are joined with
    blank lines. Files whose name starts with a dot are ignored.

    Args:
        filepath: Path to the .zip file.

    Returns:
        Combined text from all supported files in the archive, separated by
        two newlines. An empty string is returned when the file is not a
        valid ZIP, extraction fails, or no supported file is found.
    """
    # Accept plain string paths as well; ``.name`` is used for log messages.
    filepath = Path(filepath)

    if not zipfile.is_zipfile(str(filepath)):
        logger.error("Not a valid ZIP file: %s", filepath.name)
        return ""

    texts = []

    with tempfile.TemporaryDirectory() as tmpdir:
        tmppath = Path(tmpdir)
        try:
            with zipfile.ZipFile(str(filepath), "r") as zf:
                zf.extractall(tmppath)
        except Exception as e:
            # Best-effort boundary: a corrupt, encrypted or otherwise
            # unreadable archive yields no text rather than an exception.
            logger.error("Failed to extract ZIP %s: %s", filepath.name, e)
            return ""

        # Find all supported files recursively. Matching on the lower-cased
        # suffix (instead of a case-sensitive glob per extension) ensures
        # that e.g. "BOOK.TXT" or "Novel.EPUB" are not silently skipped.
        supported_files = []
        for f in tmppath.rglob("*"):
            parser = PARSERS.get(f.suffix.lower())
            if parser is None or f.name.startswith(".") or not f.is_file():
                continue
            supported_files.append((f, parser))

        if not supported_files:
            logger.warning("No supported files found in ZIP: %s", filepath.name)
            return ""

        # Order by file name first (as before); the archive-relative path
        # breaks ties between same-named files in different directories so
        # the output does not depend on filesystem traversal order.
        supported_files.sort(
            key=lambda item: (item[0].name, item[0].relative_to(tmppath).as_posix())
        )
        logger.info("Found %d supported files in ZIP %s", len(supported_files), filepath.name)

        for f, parser in supported_files:
            try:
                text = parser(f)
                if text.strip():
                    texts.append(text)
                    logger.info("  Parsed %s (%d chars)", f.name, len(text))
            except Exception as e:
                # One unreadable member must not abort the whole archive.
                logger.error("  Failed to parse %s: %s", f.name, e)

    return "\n\n".join(texts)