Spaces:

XQ
/

Dokumentassistent

Sleeping

Dokumentassistent / src / ingestion / text_cleaner.py

init

31a2688 2 months ago

2.16 kB

	"""Text cleaning and normalization for parsed PDF content."""

	import logging
	import re

	logger = logging.getLogger(__name__)


	class TextCleaner:
	    """Clean and normalize raw text extracted from PDFs."""

	    # Standalone page-number lines such as "3", "- 12 -" or "–7–".
	    # Applied with fullmatch() to an already-stripped line, so the inner
	    # whitespace must be optional.
	    _PAGE_NUMBER_RE = re.compile(r"[-–—]?\s*\d{1,4}\s*[-–—]?")
	    # Labelled page-number lines such as "Side 3" or "Page 3" (Danish/English).
	    _PAGE_LABEL_RE = re.compile(r"(?:side|page)\s+\d{1,4}", re.IGNORECASE)

	    def clean(self, raw_text: str) -> str:
	        """Clean raw text by removing artifacts and normalizing whitespace.

	        Args:
	            raw_text: The raw text extracted from a PDF page.

	        Returns:
	            The text with control characters and soft hyphens removed,
	            non-breaking spaces turned into regular spaces, runs of
	            spaces/tabs collapsed, every line stripped, line endings
	            normalized to ``\\n`` and at most one blank line between
	            paragraphs.
	        """
	        text = raw_text
	        # Remove null bytes and control characters (keep \t, \n and \r).
	        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
	        # Replace non-breaking spaces with regular spaces.
	        text = text.replace("\u00a0", " ")
	        # Remove soft hyphens.
	        text = text.replace("\u00ad", "")
	        # Collapse runs of spaces and tabs into a single space.
	        text = re.sub(r"[ \t]+", " ", text)
	        # Strip every line; splitlines() + join also normalizes "\r\n" and
	        # other line separators to "\n".
	        text = "\n".join(line.strip() for line in text.splitlines())
	        # Collapse three or more newlines into two. This must run AFTER the
	        # per-line strip: whitespace-only lines and CRLF endings only become
	        # consecutive "\n" characters once the lines have been normalized.
	        text = re.sub(r"\n{3,}", "\n\n", text)
	        # Strip leading/trailing whitespace overall.
	        return text.strip()

	    def remove_headers_footers(self, text: str) -> str:
	        """Remove page-number lines from text.

	        Drops lines that consist solely of a page number (optionally wrapped
	        in dashes, e.g. ``3`` or ``- 12 -``) or of a "Side N" / "Page N"
	        label (case-insensitive). Numbers of up to four digits are treated as
	        page numbers.

	        Args:
	            text: Text that may contain headers/footers.

	        Returns:
	            The remaining lines joined with ``\\n``. Text with fewer than
	            three lines is returned unchanged.
	        """
	        lines = text.splitlines()
	        # Too short to contain body text plus a header/footer line.
	        if len(lines) < 3:
	            return text

	        kept: list[str] = []
	        for line in lines:
	            candidate = line.strip()
	            # Skip standalone page numbers.
	            if self._PAGE_NUMBER_RE.fullmatch(candidate):
	                continue
	            # Skip lines like "Side 3" or "Page 3".
	            if self._PAGE_LABEL_RE.fullmatch(candidate):
	                continue
	            # Keep the original (unstripped) line.
	            kept.append(line)
	        return "\n".join(kept)