# NOTE: "Spaces: Running" was stray UI status text captured with this export;
# kept here as a comment so the module remains valid Python.
| # DEPENDENCIES | |
| import re | |
| import html | |
| import unicodedata | |
| from typing import Optional, List | |
| from config.logging_config import get_logger | |
| # Setup Logger | |
| logger = get_logger(__name__) | |
class TextCleaner:
    """
    Comprehensive text cleaning and normalization: preserves semantic meaning
    while removing noise (HTML markup, noisy whitespace, typographic quotes,
    assorted bullet glyphs, boilerplate lines, ...).

    All methods are ``@classmethod``s so callers can use
    ``TextCleaner.clean(...)`` directly without creating an instance.
    """

    # Common patterns (compiled once at class-definition time)
    URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
    PHONE_PATTERN = re.compile(r'(\+\d{1,3}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}')
    MULTIPLE_SPACES = re.compile(r'\s+')
    MULTIPLE_NEWLINES = re.compile(r'\n\s*\n\s*\n+')

    # HTML/XML patterns
    HTML_TAG_PATTERN = re.compile(r'<[^>]+>')
    HTML_ENTITY_PATTERN = re.compile(r'&[a-zA-Z]+;|&#\d+;')

    # Special characters
    BULLET_POINTS = ['•', '◦', '▪', '▫', '⬩', '▹', '▸', '►', '▻', '→']
    # Typographic quotes, written as \u escapes so an encoding mishap cannot
    # silently corrupt them (the previous literal form had been mangled into
    # plain ASCII quotes, which made the list meaningless).
    # Left/right double quotes and double guillemets:
    DOUBLE_QUOTES = ['\u201c', '\u201d', '\u00ab', '\u00bb']
    # Left/right single quotes and single guillemets:
    SINGLE_QUOTES = ['\u2018', '\u2019', '\u2039', '\u203a']
    # Kept for backward compatibility with callers that read this attribute.
    QUOTATION_MARKS = DOUBLE_QUOTES + SINGLE_QUOTES

    @classmethod
    def clean(cls, text: str, remove_urls: bool = False, remove_emails: bool = False, remove_phone_numbers: bool = False, remove_html: bool = True,
              normalize_whitespace: bool = True, normalize_quotes: bool = True, normalize_bullets: bool = True, lowercase: bool = False,
              remove_extra_newlines: bool = True, preserve_structure: bool = True) -> str:
        """
        Clean text with configurable options
        Arguments:
        ----------
            text                  { str }  : Input text
            remove_urls           { bool } : Remove URLs
            remove_emails         { bool } : Remove email addresses
            remove_phone_numbers  { bool } : Remove phone numbers
            remove_html           { bool } : Remove HTML tags and decode entities
            normalize_whitespace  { bool } : Normalize spaces/tabs
            normalize_quotes      { bool } : Convert fancy quotes to standard
            normalize_bullets     { bool } : Convert bullet points to standard
            lowercase             { bool } : Convert to lowercase
            remove_extra_newlines { bool } : Remove excessive blank lines
            preserve_structure    { bool } : Try to maintain document structure
        Returns:
        --------
            { str } : Cleaned text
        """
        if not text or not text.strip():
            return ""
        # Original length for logging
        original_length = len(text)
        # Remove HTML if present
        if remove_html:
            text = cls.remove_html_tags(text)
            text = cls.decode_html_entities(text)
        # Remove specific patterns (replaced by a space so adjacent words
        # do not fuse together)
        if remove_urls:
            text = cls.URL_PATTERN.sub(' ', text)
        if remove_emails:
            text = cls.EMAIL_PATTERN.sub(' ', text)
        if remove_phone_numbers:
            text = cls.PHONE_PATTERN.sub(' ', text)
        # Normalize unicode to canonical composed form
        text = cls.normalize_unicode(text)
        # Normalize quotes
        if normalize_quotes:
            text = cls.normalize_quotation_marks(text)
        # Normalize bullets
        if normalize_bullets:
            text = cls.normalize_bullet_points(text)
        # Handle whitespace
        if normalize_whitespace:
            # Replace tabs with spaces
            text = text.replace('\t', ' ')
            # Normalize spaces (but not newlines if preserving structure)
            if preserve_structure:
                # Collapse runs of whitespace within each line only, keeping
                # the line boundaries intact for downstream chunking.
                lines = text.split('\n')
                lines = [cls.MULTIPLE_SPACES.sub(' ', line) for line in lines]
                text = '\n'.join(lines)
            else:
                text = cls.MULTIPLE_SPACES.sub(' ', text)
        # Remove extra newlines (3+ blank-ish lines collapse to one blank line)
        if remove_extra_newlines:
            text = cls.MULTIPLE_NEWLINES.sub('\n\n', text)
        # Lowercase if requested
        if lowercase:
            text = text.lower()
        # Final cleanup
        text = text.strip()
        # Log cleaning stats
        cleaned_length = len(text)
        reduction = ((original_length - cleaned_length) / original_length * 100) if (original_length > 0) else 0
        logger.debug(f"Text cleaned: {original_length} -> {cleaned_length} chars ({reduction:.1f}% reduction)")
        return text

    @classmethod
    def remove_html_tags(cls, text: str) -> str:
        """
        Remove HTML tags
        """
        return cls.HTML_TAG_PATTERN.sub('', text)

    @classmethod
    def decode_html_entities(cls, text: str) -> str:
        """
        Decode HTML entities (e.g. ``&amp;`` -> ``&``)
        """
        return html.unescape(text)

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """
        Normalize unicode characters : Converts to NFC form (canonical composition)
        """
        return unicodedata.normalize('NFC', text)

    @classmethod
    def normalize_quotation_marks(cls, text: str) -> str:
        """
        Convert fancy quotes to standard ASCII quotes
        """
        for fancy_quote in cls.DOUBLE_QUOTES:
            text = text.replace(fancy_quote, '"')
        for fancy_quote in cls.SINGLE_QUOTES:
            text = text.replace(fancy_quote, "'")
        return text

    @classmethod
    def normalize_bullet_points(cls, text: str) -> str:
        """
        Convert various bullet points to the standard bullet '•'
        """
        for bullet in cls.BULLET_POINTS:
            text = text.replace(bullet, '•')
        return text

    @classmethod
    def remove_boilerplate(cls, text: str, remove_headers: bool = True, remove_footers: bool = True, remove_page_numbers: bool = True) -> str:
        """
        Remove common boilerplate text
        Arguments:
        ----------
            text                { str }  : Input text
            remove_headers      { bool } : Remove common header patterns
            remove_footers      { bool } : Remove common footer patterns
            remove_page_numbers { bool } : Remove standalone page numbers
        Returns:
        --------
            { str } : Text without boilerplate
        """
        # Compile the match patterns once, outside the per-line loop
        # (previously the raw lists were rebuilt for every line).
        header_patterns = [re.compile(p, re.IGNORECASE) for p in
                           (r'^Page \d+ of \d+$', r'^\d+/\d+$', r'^Header:', r'^Draft', r'^Confidential')]
        footer_patterns = [re.compile(p, re.IGNORECASE) for p in
                           (r'^Copyright ©', r'^All rights reserved', r'^Footer:', r'^\d{4} .+ Inc\.')]
        lines = text.split('\n')
        cleaned_lines = list()
        for line in lines:
            line_stripped = line.strip()
            # Keep empty lines as-is (they carry structure)
            if not line_stripped:
                cleaned_lines.append(line)
                continue
            # Remove page numbers (lines that are just numbers)
            if remove_page_numbers and line_stripped.isdigit():
                continue
            # Remove common header patterns
            if remove_headers and any(p.match(line_stripped) for p in header_patterns):
                continue
            # Remove common footer patterns
            if remove_footers and any(p.match(line_stripped) for p in footer_patterns):
                continue
            cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    @classmethod
    def extract_sentences(cls, text: str) -> List[str]:
        """
        Split text into sentences : Handles common abbreviations and edge cases
        Arguments:
        ----------
            text { str } : Input text
        Returns:
        --------
            { list } : List of sentences
        """
        # Common abbreviations that shouldn't trigger sentence breaks
        abbreviations = {'Dr.', 'Mr.', 'Mrs.', 'Ms.', 'Jr.', 'Sr.', 'Prof.', 'Inc.', 'Ltd.', 'Corp.', 'Co.', 'vs.', 'etc.', 'e.g.', 'i.e.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'U.S.', 'U.K.'}
        # Protect abbreviations by masking their dots with a sentinel token
        protected_text = text
        for abbr in abbreviations:
            protected_text = protected_text.replace(abbr, abbr.replace('.', '<DOT>'))
        # Split on sentence boundaries: terminal punctuation followed by
        # whitespace and an uppercase letter
        sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z])'
        sentences = re.split(sentence_pattern, protected_text)
        # Restore abbreviations
        sentences = [s.replace('<DOT>', '.') for s in sentences]
        # Clean and filter out empty fragments
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences

    @classmethod
    def truncate(cls, text: str, max_length: int, suffix: str = "...", word_boundary: bool = True) -> str:
        """
        Truncate text to maximum length
        Arguments:
        ----------
            text          { str }  : Input text
            max_length    { int }  : Maximum length of the returned string
            suffix        { str }  : Suffix to append when truncated
            word_boundary { bool } : Truncate at word boundary
        Returns:
        --------
            { str } : Truncated text (ends with ``suffix`` when shortened)
        """
        if (len(text) <= max_length):
            return text
        # Account for suffix; clamp at 0 so a max_length shorter than the
        # suffix cannot produce a negative slice index (which would truncate
        # from the wrong end of the string).
        max_length = max(max_length - len(suffix), 0)
        if word_boundary:
            # Find last space before max_length
            truncated = text[:max_length]
            last_space = truncated.rfind(' ')
            if (last_space > 0):
                truncated = truncated[:last_space]
        else:
            truncated = text[:max_length]
        return truncated + suffix

    @classmethod
    def remove_special_characters(cls, text: str, keep_punctuation: bool = True, keep_numbers: bool = True) -> str:
        """
        Remove special characters
        Arguments:
        ----------
            text             { str }  : Input text
            keep_punctuation { bool } : Keep basic punctuation
            keep_numbers     { bool } : Keep numbers
        Returns:
        --------
            { str } : Text with special characters removed
        """
        if keep_punctuation and keep_numbers:
            # Keep alphanumeric and basic punctuation
            pattern = r'[^a-zA-Z0-9\s.,!?;:\'-]'
        elif keep_punctuation:
            # Keep letters and punctuation
            pattern = r'[^a-zA-Z\s.,!?;:\'-]'
        elif keep_numbers:
            # Keep letters and numbers
            pattern = r'[^a-zA-Z0-9\s]'
        else:
            # Keep only letters
            pattern = r'[^a-zA-Z\s]'
        return re.sub(pattern, '', text)

    @classmethod
    def deduplicate_lines(cls, text: str, preserve_order: bool = True) -> str:
        """
        Remove duplicate lines
        Arguments:
        ----------
            text           { str }  : Input text
            preserve_order { bool } : Maintain original order (first occurrence wins)
        Returns:
        --------
            { str } : Text with duplicate lines removed
        """
        lines = text.split('\n')
        if preserve_order:
            seen = set()
            unique_lines = list()
            for line in lines:
                if line not in seen:
                    seen.add(line)
                    unique_lines.append(line)
        else:
            # NOTE: set() does not preserve order — output order is arbitrary
            unique_lines = list(set(lines))
        return '\n'.join(unique_lines)

    @classmethod
    def count_tokens_estimate(cls, text: str) -> int:
        """
        Estimate token count: Rule of thumb is - ~4 characters per token for English.
        Arguments:
        ----------
            text { str } : Input text
        Returns:
        --------
            { int } : Estimated token count
        """
        words = text.split()
        chars = len(text)
        # Average of word-based and char-based estimates
        word_estimate = len(words) * 1.3  # ~1.3 tokens per word
        char_estimate = chars / 4         # ~4 chars per token
        return int((word_estimate + char_estimate) / 2)

    @classmethod
    def preserve_structure_markers(cls, text: str) -> str:
        """
        Identify and mark structural elements: Useful for semantic chunking
        Arguments:
        ----------
            text { str } : Input text
        Returns:
        --------
            { str } : Text with ``[HEADER]`` / ``[LIST]`` markers prepended
        """
        lines = text.split('\n')
        marked_lines = list()
        for line in lines:
            stripped = line.strip()
            # Mark headers (ALL CAPS, short lines)
            if (stripped.isupper() and (len(stripped) < 100)):
                marked_lines.append(f"[HEADER] {line}")
            # Mark list items (digit, bullet, dash or asterisk + a space)
            elif re.match(r'^[\d•\-\*]\s', stripped):
                marked_lines.append(f"[LIST] {line}")
            # Regular text
            else:
                marked_lines.append(line)
        return '\n'.join(marked_lines)
def clean_for_rag(text: str) -> str:
    """
    Convenience function: clean text optimally for RAG ingestion.

    Keeps URLs, emails, phone numbers and original casing (all potentially
    useful context), strips HTML, normalizes quotes/bullets/whitespace, and
    preserves line structure for downstream chunking.

    Arguments:
    ----------
        text { str } : Input text
    Returns:
    --------
        { str } : Cleaned text
    """
    rag_options = {
        "remove_urls": False,           # URLs might be useful context
        "remove_emails": False,         # Emails might be useful
        "remove_phone_numbers": False,  # Phone numbers might be useful
        "remove_html": True,
        "normalize_whitespace": True,
        "normalize_quotes": True,
        "normalize_bullets": True,
        "lowercase": False,             # Keep original casing for proper nouns
        "remove_extra_newlines": True,
        "preserve_structure": True,     # Important for chunking
    }
    return TextCleaner.clean(text, **rag_options)