| import nltk | |
| from nltk.stem import WordNetLemmatizer | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| import nltk | |
| import contractions | |
# Fetch the NLTK resources used by this module. These calls sit at module
# top level, so they run every time the module is imported.
for _resource in ('punkt_tab', 'wordnet'):
    nltk.download(_resource)
| # Custom transformer for preprocessing text | |
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that normalizes raw text documents.

    Each document goes through four steps: contractions are expanded, the
    text is lowercased, it is tokenized with ``nltk.word_tokenize``, and
    every token is passed through a ``WordNetLemmatizer``. The resulting
    tokens are joined back together with single spaces.
    """

    def __init__(self):
        # One lemmatizer instance is created up front and reused for
        # every document handled by ``transform``.
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not inspected."""
        return self

    def transform(self, X):
        """Normalize every document in ``X``.

        Args:
            X: Iterable of documents; each one is passed to
                ``contractions.fix``.

        Returns:
            A list with one normalized string per document, in input order.
        """
        return [
            " ".join(
                # ``lemmatize`` is called with the token only, so the
                # lemmatizer's default settings apply.
                [
                    self.lemmatizer.lemmatize(token)
                    for token in nltk.word_tokenize(contractions.fix(text).lower())
                ]
            )
            for text in X
        ]