import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List, Optional


class TextPreprocessor:
    """Text-normalization pipeline: clean -> tokenize -> de-stopword -> lemmatize.

    Constructing an instance ensures the required NLTK resources
    (punkt, stopwords, wordnet) are available, downloading them if missing.
    """

    # Compiled once at class-creation time instead of on every clean_text()
    # call — re.sub with a string pattern pays a cache lookup per call.
    _NON_LETTER = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # quiet=True: nltk.download is effectively a no-op when the resource
        # is already present, but without it the downloader prints status
        # output on every instantiation.
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """Lowercase *text*, remove non-letter characters, collapse whitespace."""
        text = text.lower()
        # A-Z in the class is redundant after lower() but kept for clarity
        # and to match the original pattern exactly.
        text = self._NON_LETTER.sub('', text)
        return self._WHITESPACE.sub(' ', text).strip()

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into word tokens using NLTK's word tokenizer."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Return *tokens* with English stop words removed."""
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Reduce each token to its WordNet lemma (noun POS by default)."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def process(self, text: str) -> List[str]:
        """Run the complete preprocessing pipeline on *text*.

        Returns the list of lowercased, stop-word-free, lemmatized tokens.
        """
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        return self.lemmatize(tokens)