| import re |
| from autocorrect import Speller |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk import word_tokenize |
| from nltk.stem import WordNetLemmatizer |
| from nltk import word_tokenize |
| import string |
| import nltk |
| import os |
| import nltk |
| nltk_data_dir = os.path.expanduser("~/app/nltk_data") |
| os.makedirs(nltk_data_dir, exist_ok=True) |
| nltk.data.path.append(nltk_data_dir) |
| nltk.download("stopwords", download_dir=nltk_data_dir) |
| nltk.download("punkt_tab", download_dir=nltk_data_dir) |
| nltk.download("punkt", download_dir=nltk_data_dir) |
| nltk.download("wordnet", download_dir=nltk_data_dir) |
|
|
|
|
| def remove_html_tags(text): |
| html_pattern = r'<.*?>' |
| without_html = re.sub(pattern=html_pattern, repl=' ', string=text) |
| return without_html |
|
|
| def convert_to_lower(text): |
| return text.lower() |
|
|
| def remove_urls(text): |
| url_pattern = r'https?://\S+|www\.\S+' |
| without_urls = re.sub(pattern=url_pattern, repl=' ', string=text) |
| return without_urls |
|
|
| def spell_checker(text): |
| spellChecker = Speller(lang="en") |
| correct_words = [] |
| for word in nltk.word_tokenize(text): |
| correct_word = spellChecker(word) |
| correct_words.append(correct_word) |
| correct_spell_text = " ".join(correct_words) |
| return correct_spell_text |
|
|
|
|
| def remove_punctuation(text): |
| return text.translate(str.maketrans('', '', string.punctuation)) |
|
|
|
|
| def remove_stopwords(text): |
| removed = [] |
| stop_words = list(stopwords.words("english")) |
| tokens = word_tokenize(text) |
| for i in range(len(tokens)): |
| if tokens[i] not in stop_words: |
| removed.append(tokens[i]) |
| return " ".join(removed) |
|
|
|
|
| def lemmatizing(text): |
| lemmatizer = WordNetLemmatizer() |
| tokens = word_tokenize(text) |
| for i in range(len(tokens)): |
| lemma_word = lemmatizer.lemmatize(tokens[i]) |
| tokens[i] = lemma_word |
| return " ".join(tokens) |
|
|
|
|
| def clean(text): |
| cleaned_text=convert_to_lower(text) |
| cleaned_text=remove_html_tags(cleaned_text) |
| cleaned_text=remove_urls(cleaned_text) |
| cleaned_text=remove_punctuation(cleaned_text) |
| cleaned_text=remove_stopwords(cleaned_text) |
| cleaned_text=lemmatizing(cleaned_text) |
| cleaned_text= spell_checker(cleaned_text) |
| return cleaned_text |
| |