| """Plain text file parser with encoding detection.""" |
|
|
| import logging |
| from pathlib import Path |
|
|
| import chardet |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def parse_txt(filepath: Path) -> str:
    """Read a plain text file, auto-detecting its encoding.

    The raw bytes are handed to ``chardet.detect`` and decoded with the
    encoding it reports (UTF-8 when none is reported). If that codec rejects
    the bytes or is unknown to Python, the bytes are decoded as UTF-8 with
    undecodable sequences replaced. A single leading U+FEFF (byte-order
    mark) is removed from the decoded text.

    Args:
        filepath: Path to the .txt file.

    Returns:
        The full text content as a string; an empty string for an empty file.

    Note:
        Errors raised while reading the file (``filepath.read_bytes()``) are
        not caught here and propagate to the caller.
    """
    raw = filepath.read_bytes()
    if not raw:
        logger.warning("Empty file: %s", filepath)
        return ""

    detected = chardet.detect(raw)
    encoding = detected.get("encoding") or "utf-8"
    # Use ``or`` rather than a .get() default: the default only applies when
    # the key is missing, while a present-but-None confidence would make the
    # percentage computation below raise TypeError.
    confidence = detected.get("confidence") or 0.0
    logger.info(
        "Detected encoding %s (%.0f%%) for %s",
        encoding,
        confidence * 100,
        filepath.name,
    )

    try:
        text = raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: the detected codec cannot decode these bytes.
        # LookupError: the detected encoding name is not a codec Python knows.
        logger.warning(
            "Failed to decode %s with %s, falling back to utf-8 with replace",
            filepath.name,
            encoding,
        )
        text = raw.decode("utf-8", errors="replace")

    # Some decode paths (e.g. plain "utf-8" on BOM-prefixed data) keep the
    # byte-order mark as the first character; drop it so callers get clean text.
    if text.startswith("\ufeff"):
        text = text[1:]

    return text
|
|