File size: 1,600 Bytes
dec266f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List, Optional
class TextPreprocessor:
    """Text-normalisation pipeline built on NLTK.

    The stages are: lowercase and strip non-letters, tokenize, drop English
    stop words, lemmatize. Each stage is exposed as its own method and
    ``process`` chains them in that order.
    """

    # NLTK data packages fetched at construction time. 'punkt_tab' is listed
    # alongside 'punkt' because recent NLTK releases load the tokenizer model
    # used by word_tokenize from 'punkt_tab' instead of the pickled 'punkt'.
    _NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

    def __init__(self):
        """Fetch the required NLTK data and build the stop-word set and lemmatizer."""
        for resource in self._NLTK_RESOURCES:
            # quiet=True suppresses the progress chatter on every
            # instantiation; download errors are still reported by NLTK.
            nltk.download(resource, quiet=True)
        # A set gives O(1) membership tests in remove_stopwords.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Lowercases *text*, deletes every character that is not an ASCII
        letter or whitespace (digits and punctuation included), then
        collapses whitespace runs to single spaces and trims both ends.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into words using NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Return *tokens* without the entries found in ``self.stop_words``.

        Order is preserved. Matching is exact, so tokens are expected to be
        in the same case as the stop-word set.
        """
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize each token with the WordNet lemmatizer.

        No part-of-speech tag is passed, so the lemmatizer's default applies.
        """
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def process(self, text: str) -> List[str]:
        """Run the complete preprocessing pipeline on *text*.

        Applies ``clean_text``, ``tokenize``, ``remove_stopwords`` and
        ``lemmatize`` in that order and returns the resulting token list.
        """
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize(tokens)
        return tokens