Spaces:

LisaMegaWatts
/

pre-punctuation-processor

Sleeping

Upload parsers/txt_parser.py with huggingface_hub

4a3b497 verified about 2 months ago

1.08 kB

	"""Plain text file parser with encoding detection."""

	import logging
	from pathlib import Path

	import chardet

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	def parse_txt(filepath: Path) -> str:
	    """Return the decoded contents of a text file whose encoding is unknown.

	    The file is read as bytes and handed to ``chardet`` for an encoding
	    guess; ``utf-8`` is assumed when no encoding is reported. If decoding
	    with the guess fails (bad bytes or an unknown codec name), the bytes
	    are decoded as UTF-8 with undecodable sequences replaced instead of
	    raising.

	    Args:
	        filepath: Path to the .txt file.

	    Returns:
	        The decoded text with a single leading BOM character (U+FEFF)
	        removed if present, or an empty string for a zero-byte file.
	    """
	    payload = filepath.read_bytes()
	    if not payload:
	        # Nothing to detect or decode; report it and hand back an empty string.
	        logger.warning("Empty file: %s", filepath)
	        return ""

	    guess = chardet.detect(payload)
	    codec = guess.get("encoding") or "utf-8"
	    percent = guess.get("confidence", 0) * 100
	    logger.info("Detected encoding %s (%.0f%%) for %s", codec, percent, filepath.name)

	    try:
	        decoded = payload.decode(codec)
	    except (UnicodeDecodeError, LookupError):
	        # LookupError covers a codec name that Python does not recognise.
	        logger.warning(
	            "Failed to decode %s with %s, falling back to utf-8 with replace",
	            filepath.name,
	            codec,
	        )
	        decoded = payload.decode("utf-8", errors="replace")

	    # A decoded byte-order mark is not content; drop one if it leads the text.
	    return decoded[1:] if decoded.startswith("\ufeff") else decoded