File size: 684 Bytes
46917c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class TextCleaner:
    """Lower-case, tokenize, filter and lemmatize a piece of English text.

    The cleaner is bound to one input string at construction time;
    call :meth:`clean_text` to obtain the normalized result.

    Relies on NLTK's English stopword list, ``word_tokenize`` and
    ``WordNetLemmatizer``; NLTK raises ``LookupError`` when the data
    packages backing them have not been downloaded.
    """

    def __init__(self, raw_text: str) -> None:
        """Store *raw_text* and prepare the stopword set and lemmatizer.

        Args:
            raw_text: The text to be cleaned by :meth:`clean_text`.
        """
        # English stopwords plus every single ASCII punctuation character,
        # kept in a set so the per-token membership test is O(1).
        self.stopwords_set = set(stopwords.words("english") + list(string.punctuation))
        self.lemmatizer = WordNetLemmatizer()
        self.raw_input_text = raw_text

    @staticmethod
    def _is_punctuation(token: str) -> bool:
        """Return True when *token* is non-empty and made only of punctuation."""
        return bool(token) and all(char in string.punctuation for char in token)

    def clean_text(self) -> str:
        """Return the stored text lower-cased, filtered and lemmatized.

        Tokens found in ``stopwords_set`` are dropped, as are tokens made up
        entirely of punctuation characters. The surviving tokens are
        lemmatized and joined with single spaces.

        Returns:
            The cleaned text; an empty string when no token survives.
        """
        tokens = word_tokenize(self.raw_input_text.lower())
        # word_tokenize emits multi-character punctuation tokens (e.g. "``"
        # and "''" for double quotes, "...", "--") that never match the
        # single-character entries in stopwords_set, so they are filtered
        # out explicitly here.
        kept_tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stopwords_set and not self._is_punctuation(token)
        ]
        return " ".join(kept_tokens)