# resumematcher/scripts/TextCleaner.py
# March
# first
# 46917c3
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
class TextCleaner:
    """Normalize a piece of raw text into a space-joined string of lemmas.

    The pipeline is: lowercase -> tokenize -> drop English stopwords and
    punctuation tokens -> lemmatize -> join with single spaces.

    NOTE(review): the NLTK tokenizer models, the stopwords corpus and the
    WordNet data presumably have to be downloaded (``nltk.download``) before
    this class can be used -- confirm for the deployment environment.
    """

    def __init__(self, raw_text):
        """Store the text to clean and build the filtering helpers.

        Args:
            raw_text: The text to clean. ``None`` or an empty string is
                tolerated; ``clean_text`` then returns ``""``.
        """
        # Tokens are dropped when they equal an English stopword or a single
        # punctuation character from ``string.punctuation``.
        self.stopwords_set = set(stopwords.words("english") + list(string.punctuation))
        self.lemmatizer = WordNetLemmatizer()
        self.raw_input_text = raw_text

    def clean_text(self) -> str:
        """Return the cleaned form of the stored text.

        Returns:
            The lowercase, stopword- and punctuation-free, lemmatized tokens
            joined by single spaces. An empty string is returned when the
            stored text is ``None`` or empty.
        """
        # Guard against missing input: calling ``.lower()`` on ``None`` would
        # raise AttributeError, and empty text yields no tokens anyway.
        if not self.raw_input_text:
            return ""

        tokens = word_tokenize(self.raw_input_text.lower())
        # Filter first, then lemmatize, so stopwords are matched on the
        # surface form of each token (same order as the two-pass original).
        kept = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stopwords_set
        ]
        return " ".join(kept)