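"""Text preprocessing utilities built on NLTK.

Provides cleaning, sentence/word tokenization, stopword removal, and
lemmatization for raw document text.
"""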
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with required NLTK data."""
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def download_nltk_data(self):
        """Download required NLTK data if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4')
        ]
        for path, name in required_data:
            try:
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)

    def clean_text(self, text):
        """
        Clean and preprocess the input text.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned text
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1")
        text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'Unit\s+\d+(\.\d+)?', '', text, flags=re.IGNORECASE)
        # Remove standalone numbers (often page numbers or list markers)
        text = re.sub(r'\b\d+\b', '', text)
        # Remove special characters but keep sentence structure:
        # periods, question marks, exclamation points, commas, and hyphens
        text = re.sub(r'[^\w\s\.\?\!,\-]', '', text)
        # Collapse repeated periods and whitespace
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize_sentences(self, text):
        """
        Tokenize text into sentences.

        Args:
            text (str): Input text

        Returns:
            list: List of sentences
        """
        sentences = sent_tokenize(text)
        # Drop very short sentences (fewer than 5 words)
        filtered_sentences = [s for s in sentences if len(word_tokenize(s)) >= 5]
        return filtered_sentences

    def tokenize_words(self, text):
        """
        Tokenize text into words and remove stopwords.

        Args:
            text (str): Input text

        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        # Remove punctuation and stopwords
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        return words

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline.

        Args:
            text (str): Raw input text

        Returns:
            dict: Processed text components
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)
        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences)
        }
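

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal demo of the full pipeline. The sample text below is an
# assumption, chosen to exercise the header/footer cleanup rules.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = (
        "Page 1 of 10 Unit 2.1 Natural language processing lets computers "
        "analyze human language. It powers search engines, chatbots, and "
        "modern translation systems used around the world today."
    )
    result = processor.preprocess_text(sample)
    print(result['cleaned_text'])
    print(f"Sentences ({result['sentence_count']}):", result['sentences'])
    print(f"Words ({result['word_count']}):", result['words'])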