File size: 2,222 Bytes
05e8fd6 2afdd48 223e278 2afdd48 05e8fd6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | import re
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string
import nltk
import os
import nltk
# Keep NLTK resources in a dedicated directory under the user's home and
# register it so NLTK searches there when loading corpora and models.
nltk_data_dir = os.path.expanduser("~/app/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

# Download each resource into that directory at import time.
# The order matches the original sequence of download calls.
for _nltk_resource in ("stopwords", "punkt_tab", "punkt", "wordnet"):
    nltk.download(_nltk_resource, download_dir=nltk_data_dir)
def remove_html_tags(text):
    """Replace every ``<...>`` span in *text* with a single space.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    that is split over several lines is left untouched.

    Args:
        text: The string to strip tags from.

    Returns:
        A new string with each matched tag replaced by one space.
    """
    tag_regex = re.compile(r'<.*?>')
    return tag_regex.sub(' ', text)
def convert_to_lower(text):
    """Return *text* converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
def remove_urls(text):
    """Replace URLs in *text* with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character, so trailing
    punctuation attached to the link is removed along with it.

    Args:
        text: The string to scan.

    Returns:
        A new string with each matched URL replaced by one space.
    """
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)
# Shared Speller instance, created on first use by _get_speller().
_SPELLER = None


def _get_speller():
    """Return the shared English ``Speller``, building it on first use."""
    global _SPELLER
    if _SPELLER is None:
        _SPELLER = Speller(lang="en")
    return _SPELLER


def spell_checker(text):
    """Spell-correct *text* token by token.

    The text is split with ``nltk.word_tokenize``, each token is passed
    through an English ``autocorrect.Speller``, and the corrected tokens
    are joined back together with single spaces.

    The Speller is created once and reused across calls instead of being
    rebuilt on every invocation, which avoids repeating its set-up work
    for each piece of text.

    Args:
        text: The string to correct.

    Returns:
        The corrected tokens joined by single spaces.
    """
    speller = _get_speller()
    return " ".join(speller(token) for token in nltk.word_tokenize(text))
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from *text*.

    Characters are removed outright, not replaced by a space, so words
    separated only by punctuation end up joined together.

    Args:
        text: The string to filter.

    Returns:
        A new string without ASCII punctuation characters.
    """
    # Mapping a code point to None tells str.translate to drop it.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
def remove_stopwords(text):
    """Drop English stop words from *text*.

    The text is tokenized with ``word_tokenize`` and every token that
    appears in NLTK's English stop-word list is discarded. The comparison
    is exact (case-sensitive), so callers that want case-insensitive
    filtering should lowercase the text first.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; the list form scanned
    # the whole stop-word list for every token.
    stop_words = set(stopwords.words("english"))
    kept = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept)
def lemmatizing(text):
    """Lemmatize each token of *text* with NLTK's ``WordNetLemmatizer``.

    The text is tokenized with ``word_tokenize``, every token is passed
    to ``lemmatize`` with the lemmatizer's default arguments, and the
    results are joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(text)]
    return " ".join(lemmas)
def clean(text):
    """Run the full text-cleaning pipeline on *text*.

    The steps are applied in this fixed order, each one receiving the
    output of the previous step: lowercase, strip HTML tags, strip URLs,
    strip punctuation, drop stop words, lemmatize, spell-correct.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string produced by the last step.
    """
    pipeline = (
        convert_to_lower,
        remove_html_tags,
        remove_urls,
        remove_punctuation,
        remove_stopwords,
        lemmatizing,
        spell_checker,
    )
    cleaned_text = text
    for step in pipeline:
        cleaned_text = step(cleaned_text)
    return cleaned_text
|