Babu Pallam
Add document loading and text cleaning modules
c37cfba
Raw
History Blame Contribute Delete
978 Bytes
# ============================================================
# FILE: src/text_cleaner.py
# ============================================================
# PURPOSE:
# Clean extracted text before chunking.
#
# Cleaning should improve text quality without destroying meaning.
#
# Do NOT blindly remove:
# - punctuation
# - numbers
# - headings
# - table labels
# - legal references
#
# These can be important for retrieval.
# ============================================================
import re
def clean_text(text: str) -> str:
    """Clean extracted text while preserving its meaning.

    Only whitespace is touched; punctuation, digits, headings and every
    other non-whitespace character pass through unchanged.

    Steps:
        1. Normalize line endings ("\\r\\n" and "\\r" become "\\n").
        2. Replace tabs with spaces.
        3. Collapse runs of spaces into a single space.
        4. Remove trailing spaces at the end of each line.
        5. Reduce runs of three or more newlines to one blank line.
        6. Strip leading and trailing whitespace from the whole text.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\t", " ")
    text = re.sub(r" +", " ", text)
    # Lines holding only spaces/tabs are visually blank, but the leftover
    # space would hide them from the newline-run pattern below. Dropping
    # trailing spaces first lets those lines count as blank.
    text = re.sub(r" +\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()