|
|
import nltk |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import WordNetLemmatizer |
|
|
import string |
|
|
|
|
|
# Fetch the NLTK resources this script relies on: tokenizer data for
# nltk.word_tokenize, the stop-word lists, and WordNet for the lemmatizer.
nltk.download('punkt')
# NLTK 3.9+ loads 'punkt_tab' instead of the pickled 'punkt' models when
# tokenizing; request both so either NLTK generation finds its data.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
|
|
|
|
|
def lemmatize_and_clean(text):
    """Return a cleaned, lemmatized, space-joined version of *text*.

    The text is split with ``nltk.word_tokenize``. Tokens that are not purely
    alphabetic (per ``str.isalpha``) are discarded, the rest are lowercased,
    English stop words are removed, and each remaining word is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Args:
        text: The raw input string to clean.

    Returns:
        A single string of the surviving lemmas separated by one space; an
        empty string when no token survives the filters.
    """
    tokens = nltk.word_tokenize(text)

    # Build the lookup set once per call so membership tests are O(1).
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    lemmas = []
    for token in tokens:
        # Drop punctuation, numbers and mixed tokens such as "don't".
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Stop words are compared in lowercase form.
        if lowered in english_stops:
            continue
        lemmas.append(wnl.lemmatize(lowered))

    return ' '.join(lemmas)
|
|
|
|
|
|
|
|
# Sample text used to demonstrate the cleaning pipeline.
input_text = "kushir cover. kushir cover benson and hezes nih unique capsule of our janum benson and hesses breeze aprajanara kushiha benjay a capsule roche egg thorne refreshing taste and smell arapnajudiya trial kotachan tahal ajinita parnakti trial kit donnabat."
cleaned_text = lemmatize_and_clean(input_text)

# Show the original and cleaned text, each under its own heading; the leading
# newline in the second heading leaves a blank line between the two sections.
print(
    "Original Text:",
    input_text,
    "\nCleaned Text:",
    cleaned_text,
    sep="\n",
)
|
|
|