# AI_Toolkit/src/core/TextPreprocessor.py
# Uploaded by NavyDevilDoc ("Upload 10 files", commit c0f31c1, verified)
import logging
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
class TextPreprocessor:
    """Clean and normalize raw document text (PDF/slide extracts) for NLP.

    The full pipeline (see :meth:`preprocess`) lowercases, strips
    punctuation, collapses whitespace, removes English stopwords, and
    lemmatizes.  It can optionally strip likely headers/footers and common
    PDF extraction artifacts first.
    """

    def __init__(self):
        """Initialize the logger and NLTK resources.

        Raises:
            Exception: re-raised if the NLTK corpora are unavailable
                (e.g. the ``stopwords``/``wordnet`` data is not downloaded).
        """
        # Create the logger BEFORE the try-block: the original code assigned
        # it after the NLTK calls, so a failed initialization raised
        # AttributeError inside the except handler and masked the real error.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK resources: {e}")
            raise

    def standardize_case(self, text):
        """Return *text* lowercased."""
        return text.lower()

    def remove_punctuation(self, text):
        """Strip every character that is not a word character or whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def normalize_whitespace(self, text):
        """Collapse whitespace runs to single spaces and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, words):
        """Return the list *words* with English stopwords filtered out."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Lemmatize each word with WordNet (default noun POS)."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """Remove likely header/footer lines from *text*.

        Args:
            text: Raw document text.
            aggressive: When True (and the text does not look like a slide),
                drop the first and last two lines outright.
            pattern: Optional regex; lines matching it are removed.

        Returns:
            The cleaned text.  The original text is returned unchanged on
            error, for very short inputs, or when cleaning would discard
            more than 30% of the lines.
        """
        try:
            if not text or not text.strip():
                return text
            lines = text.splitlines()
            if len(lines) <= 4:  # For very short text, don't remove anything
                return text
            # Store original lines for fallback
            original_lines = lines.copy()
            # Use different strategies based on document characteristics
            if self._appears_to_be_slide(lines):
                # Slide-friendly approach - only remove obvious headers/footers
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                # Traditional document approach - remove first/last few lines
                num_lines = 2
                cleaned_lines = lines[num_lines:-num_lines]
            else:
                # Conservative approach - only remove based on patterns
                cleaned_lines = self._pattern_based_removal(lines, pattern)
            # If we removed too much (over 30% of content), revert to original
            if len(cleaned_lines) < len(lines) * 0.7:
                self.logger.warning(
                    "Header/footer removal eliminated too much content, reverting")
                cleaned_lines = original_lines
            # Additional heuristic: drop standalone-number lines that are
            # most likely page numbers.
            cleaned_lines = [line for line in cleaned_lines
                             if not (len(line.strip().split()) == 1 and
                                     line.strip().isdigit())]
            # Join lines back into text
            return '\n'.join(cleaned_lines)
        except Exception as e:
            self.logger.error(f"Error removing headers/footers: {e}")
            return text  # Return original text on error

    def _appears_to_be_slide(self, lines):
        """Detect if the content appears to be from a slide/presentation.

        Heuristics: short overall text, a high proportion of bullet lines,
        or a short title line immediately followed by bullets.
        """
        if len(lines) < 15:  # Short content
            return True
        # Check for bullet point patterns
        bullet_pattern = r'^\s*[β€’\-\*\>\β—¦\β—‹\β—†\β—‡\β–ͺ\β–«\⚫\βšͺ\βœ“\βœ”\βœ•\βœ–\βœ—\✘]'
        bullet_lines = sum(1 for line in lines if re.match(bullet_pattern, line))
        # If more than 20% of lines are bullets, likely a slide
        if bullet_lines > len(lines) * 0.2:
            return True
        # If first non-empty line is short (likely a title) and followed by
        # bullet points, treat it as a slide as well.
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < 60:
            # Check for bullet points in the following lines
            for line in non_empty_lines[1:4]:  # Check next few lines
                if re.match(bullet_pattern, line):
                    return True
        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Clean headers/footers from slide-based content.

        Relies on pattern matching (caller-supplied *pattern* plus a fixed
        list of common footer patterns) rather than line position.
        """
        cleaned_lines = lines.copy()
        # For slides, we primarily rely on pattern matching rather than
        # line position.
        if pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(pattern, line)]
        # Common slide footer patterns to remove
        footer_patterns = [
            r'^\s*\d+\s*$',         # Standalone page number
            r'confidential',        # Confidentiality notices
            r'all rights reserved',
            r'proprietary',
            r'^\s*www\.',           # Website in footer
            r'^\s*https?://',       # URL in footer
            r'\bpage\s+\d+\b',      # "Page X" footer
            r'^\s*[Β©β’Έ]\s*\d{4}'    # Copyright notice
        ]
        # Combine all patterns into a single alternation
        combined_pattern = '|'.join(f'({p})' for p in footer_patterns)
        # Filter out footer lines
        if combined_pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(combined_pattern, line,
                                              re.IGNORECASE)]
        return cleaned_lines

    def _pattern_based_removal(self, lines, pattern=None):
        """Remove headers/footers based only on patterns, not position."""
        if not pattern:
            # Default patterns for headers/footers
            patterns = [
                r'^\s*\d+\s*$',                     # Standalone page numbers
                r'^\s*page\s+\d+\s+of\s+\d+\s*$',   # Page X of Y
                r'^\s*[Β©β’Έ]\s*\d{4}.*$',            # Copyright lines
                r'^\s*confidential\s*$',            # Confidentiality markers
                r'^\s*https?://.*$',                # URLs alone on a line
                r'^\s*www\..*$',                    # Website alone on a line
                r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$'  # Email addresses
            ]
            combined_pattern = '|'.join(f'({p})' for p in patterns)
        else:
            combined_pattern = pattern
        return [line for line in lines
                if not re.search(combined_pattern, line, re.IGNORECASE)]

    def remove_common_pdf_artifacts(self, text):
        """Strip form-field markers, ``<<...>>`` annotations, and PDF object
        references (``obj N N R``) left over from PDF text extraction.

        Returns the original text unchanged on error.
        """
        try:
            # Remove form field indicators
            text = re.sub(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)', '', text)
            # Remove common PDF annotations
            text = re.sub(r'<<[^>]*>>', '', text)
            # Remove artifact markers often found in PDFs
            text = re.sub(r'obj\s*\d+\s*\d+\s*R', '', text)
            return text
        except Exception as e:
            self.logger.error(f"Error removing PDF artifacts: {e}")
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """Run the full cleaning pipeline and return a normalized string.

        Args:
            text: Raw input text.
            remove_headers_footers: Strip headers/footers first when True.
            aggressive_removal: Passed through to
                :meth:`remove_headers_and_footers` as ``aggressive``.

        Returns:
            A single space-joined string of lowercased, lemmatized,
            stopword-free tokens.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)
            text = self.remove_common_pdf_artifacts(text)
            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)
            words = text.split()
            words = self.remove_stopwords(words)
            words = self.lemmatize_words(words)
            return ' '.join(words)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {e}")
            raise