Spaces:

babupallam
/

knowflow-ai-rag-document-chatbot

Sleeping

Babu Pallam

Add document loading and text cleaning modules

c37cfba about 2 months ago

978 Bytes

	# ============================================================
	# FILE: src/text_cleaner.py
	# ============================================================
	# PURPOSE:
	# Clean extracted text before chunking.
	#
	# Cleaning should improve text quality without destroying meaning.
	#
	# Do NOT blindly remove:
	# - punctuation
	# - numbers
	# - headings
	# - table labels
	# - legal references
	#
	# These can be important for retrieval.
	# ============================================================

	import re


	def clean_text(text: str) -> str:
	    """Clean extracted text while preserving its meaning.

	    Only whitespace is normalized; punctuation, digits and every other
	    character are left untouched.

	    Steps:
	    1. Normalize line endings (CRLF and lone CR become LF).
	    2. Replace tabs with spaces.
	    3. Collapse runs of spaces into a single space.
	    4. Remove spaces that directly precede a line break.
	    5. Reduce three or more consecutive line breaks to two.
	    6. Strip leading and trailing whitespace.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The cleaned text.
	    """
	    text = text.replace("\r\n", "\n").replace("\r", "\n")
	    text = text.replace("\t", " ")
	    text = re.sub(r" +", " ", text)

	    # Lines holding only spaces/tabs must count as blank; without this step
	    # they break up the newline run and the collapse below never matches.
	    text = re.sub(r" +\n", "\n", text)
	    text = re.sub(r"\n{3,}", "\n\n", text)

	    return text.strip()