import re
from typing import List

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class TextPreprocessor:
    """NLTK-based preprocessing pipeline: clean, tokenize, remove stop words, lemmatize."""

    def __init__(self):
        # Fetch the required NLTK data; download() skips anything already installed.
        # Note: recent NLTK releases may also need the 'punkt_tab' resource for word_tokenize.
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """Lowercase text, strip non-letter characters, and collapse whitespace."""
        text = text.lower()
        # Drop everything except letters and whitespace (digits, punctuation).
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Collapse runs of whitespace into single spaces and trim the ends.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into words."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove stop words from a token list."""
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize tokens (WordNetLemmatizer defaults to the noun part of speech)."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def process(self, text: str) -> List[str]:
        """Full pipeline: clean, tokenize, remove stop words, lemmatize."""
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize(tokens)
        return tokens
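

# Minimal usage sketch (illustrative only: the sample sentence and the expected
# output below are assumptions, not part of the original module).
if __name__ == "__main__":
    preprocessor = TextPreprocessor()
    sample = "The 3 quick brown foxes are jumping over the lazy dogs!"
    # Digits and punctuation are stripped, stop words ("the", "are", "over")
    # removed, and plural nouns lemmatized, so this prints something like:
    # ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog']
    print(preprocessor.process(sample))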