LisaMegaWatts committed on
Commit
4a3b497
·
verified ·
1 Parent(s): 656d99f

Upload parsers/txt_parser.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. parsers/txt_parser.py +40 -0
parsers/txt_parser.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Plain text file parser with encoding detection."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ import chardet
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def parse_txt(filepath: Path) -> str:
    """Load a plain text file as a string, guessing its character encoding.

    The file's raw bytes are passed to ``chardet.detect``; when no encoding
    is reported, UTF-8 is assumed. If decoding with the chosen encoding
    fails (bad bytes or an unknown codec name), the bytes are decoded as
    UTF-8 with undecodable sequences replaced instead of raising.

    Args:
        filepath: Path to the .txt file.

    Returns:
        The decoded text with any leading byte-order mark removed, or an
        empty string when the file contains no bytes.
    """
    payload = filepath.read_bytes()
    if not payload:
        logger.warning("Empty file: %s", filepath)
        return ""

    guess = chardet.detect(payload)
    codec = guess.get("encoding")
    if not codec:
        # Detection produced no encoding name; assume UTF-8.
        codec = "utf-8"
    logger.info(
        "Detected encoding %s (%.0f%%) for %s",
        codec,
        guess.get("confidence", 0) * 100,
        filepath.name,
    )

    try:
        content = payload.decode(codec)
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: bytes invalid for the codec.
        # LookupError: the codec name is unknown to Python.
        logger.warning(
            "Failed to decode %s with %s, falling back to utf-8 with replace",
            filepath.name,
            codec,
        )
        content = payload.decode("utf-8", errors="replace")

    # A BOM can survive decoding as U+FEFF; drop it if it leads the text.
    return content[1:] if content.startswith("\ufeff") else content