# AutoExamGen / text_processor.py
# Text cleaning and tokenization utilities built on NLTK.
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
class TextProcessor:
    """Clean and tokenize raw text (e.g. extracted course material) using NLTK.

    Provides a small preprocessing pipeline: header/footer and noise removal,
    sentence tokenization with a minimum-length filter, and stopword-free,
    lemmatized word tokenization.
    """

    # Patterns used by clean_text, pre-compiled once at class definition time
    # instead of being re-looked-up on every call.
    _WHITESPACE_RE = re.compile(r'\s+')
    # Common header/footer noise such as "Page 1 of 10" or "Unit 1" / "Unit 2.3".
    _PAGE_RE = re.compile(r'Page\s+\d+\s+of\s+\d+', re.IGNORECASE)
    _UNIT_RE = re.compile(r'Unit\s+\d+(\.\d+)?', re.IGNORECASE)
    # Standalone numbers (often page numbers or list markers).
    _NUMBER_RE = re.compile(r'\b\d+\b')
    # Anything except word chars, whitespace, and sentence punctuation . ? ! , -
    _SPECIAL_RE = re.compile(r'[^\w\s\.\?\!,\-]')
    _MULTI_PERIOD_RE = re.compile(r'\.+')

    def __init__(self):
        """Initialize the text processor with required NLTK data.

        Downloads any missing NLTK corpora/tokenizers, then builds the
        English stopword set and a WordNet lemmatizer.
        """
        self.download_nltk_data()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def download_nltk_data(self):
        """Download required NLTK data packages if not already present."""
        required_data = [
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
            ('corpora/omw-1.4', 'omw-1.4'),
        ]
        for path, name in required_data:
            try:
                # EAFP: probing with nltk.data.find avoids a redundant download.
                nltk.data.find(path)
            except LookupError:
                print(f"Downloading NLTK {name}...")
                nltk.download(name)

    def clean_text(self, text: str) -> str:
        """Clean and preprocess the input text.

        Normalizes whitespace, strips header/footer patterns and standalone
        numbers, drops special characters while keeping sentence punctuation,
        and collapses repeated periods.

        Args:
            text (str): Raw input text

        Returns:
            str: Cleaned text
        """
        # Normalize all runs of whitespace to single spaces.
        text = self._WHITESPACE_RE.sub(' ', text.strip())
        # Remove common header/footer patterns (e.g., "Page 1 of 10", "Unit 1").
        text = self._PAGE_RE.sub('', text)
        text = self._UNIT_RE.sub('', text)
        # Remove standalone numbers (often page numbers or list markers).
        text = self._NUMBER_RE.sub('', text)
        # Remove special characters but keep sentence structure
        # (periods, question marks, exclamation points, commas, hyphens).
        text = self._SPECIAL_RE.sub('', text)
        # Collapse runs of periods and re-normalize whitespace.
        text = self._MULTI_PERIOD_RE.sub('.', text)
        text = self._WHITESPACE_RE.sub(' ', text).strip()
        return text

    def tokenize_sentences(self, text: str) -> list:
        """Tokenize text into sentences.

        Args:
            text (str): Input text

        Returns:
            list: Sentences with at least 5 word tokens (shorter fragments
                are usually headings or noise and are dropped).
        """
        sentences = sent_tokenize(text)
        return [s for s in sentences if len(word_tokenize(s)) >= 5]

    def tokenize_words(self, text: str) -> list:
        """Tokenize text into lowercased, lemmatized words without stopwords.

        Args:
            text (str): Input text

        Returns:
            list: List of processed words
        """
        words = word_tokenize(text.lower())
        # Drop punctuation tokens, then English stopwords, then lemmatize.
        words = [word for word in words if word not in string.punctuation]
        words = [word for word in words if word not in self.stop_words]
        return [self.lemmatizer.lemmatize(word) for word in words]

    def preprocess_text(self, text: str) -> dict:
        """Run the complete preprocessing pipeline.

        Args:
            text (str): Raw input text

        Returns:
            dict: Processed text components with keys 'cleaned_text',
                'sentences', 'words', 'word_count', 'sentence_count'.
        """
        cleaned_text = self.clean_text(text)
        sentences = self.tokenize_sentences(cleaned_text)
        words = self.tokenize_words(cleaned_text)
        return {
            'cleaned_text': cleaned_text,
            'sentences': sentences,
            'words': words,
            'word_count': len(words),
            'sentence_count': len(sentences),
        }