File size: 2,222 Bytes
05e8fd6
 
 
 
 
 
 
 
 
 
2afdd48
 
 
 
 
223e278
2afdd48
 
05e8fd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string
import nltk
import os
import nltk
# Store NLTK resources in a fixed directory under the user's home, register
# that directory on NLTK's search path, and fetch each required resource
# into it at import time.
nltk_data_dir = os.path.expanduser("~/app/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
for _resource in ("stopwords", "punkt_tab", "punkt", "wordnet"):
    nltk.download(_resource, download_dir=nltk_data_dir)


def remove_html_tags(text):
    """Replace every HTML-style tag in *text* with a single space.

    A tag is taken to be everything from a ``<`` up to the next ``>``.
    The character class ``[^>]`` also matches line breaks, so a tag whose
    attributes wrap onto several lines is removed as well (a ``.``-based
    pattern would stop at the newline and leave such a tag in place).

    Args:
        text: String that may contain markup.

    Returns:
        The string with each tag replaced by ``' '``; all other characters
        are left untouched.
    """
    return re.sub(r'<[^>]*>', ' ', text)

def convert_to_lower(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered

def remove_urls(text):
    """Blank out web addresses found in *text*.

    Any run that starts with ``http://``, ``https://`` or ``www.`` and
    continues up to the next whitespace character is replaced by a single
    space. Matching is case-sensitive.
    """
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)

# Shared Speller instance; created on the first call to spell_checker() so
# it is not rebuilt for every piece of text.
_SPELLER = None


def spell_checker(text):
    """Return *text* with every token run through the English speller.

    The text is split with ``nltk.word_tokenize``, each token is passed to
    an ``autocorrect.Speller(lang="en")`` instance, and the corrected tokens
    are joined back together with single spaces.

    Args:
        text: String to correct.

    Returns:
        The space-joined corrected tokens.
    """
    global _SPELLER
    if _SPELLER is None:
        # Build the speller once and reuse it on later calls.
        _SPELLER = Speller(lang="en")
    return " ".join(_SPELLER(word) for word in nltk.word_tokenize(text))


def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from *text*."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


def remove_stopwords(text):
    """Return *text* with English stop words removed.

    The text is tokenized with ``word_tokenize``; tokens that appear in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined with single spaces. The comparison is an exact string match.

    Args:
        text: String to filter.

    Returns:
        The space-joined tokens that are not stop words.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    stop_words = set(stopwords.words("english"))
    return " ".join(
        token for token in word_tokenize(text) if token not in stop_words
    )


def lemmatizing(text):
    """Lemmatize each token of *text* and return them joined by spaces.

    Tokens come from ``word_tokenize``; each one is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(lemmatize(token) for token in word_tokenize(text))


def clean(text):
    """Run *text* through the full preprocessing pipeline.

    The steps are applied in this fixed order, each one receiving the
    output of the previous step: lower-casing, HTML tag removal, URL
    removal, punctuation removal, stop-word removal, lemmatization and
    finally spell correction.

    Returns:
        The fully processed string.
    """
    pipeline = (
        convert_to_lower,
        remove_html_tags,
        remove_urls,
        remove_punctuation,
        remove_stopwords,
        lemmatizing,
        spell_checker,
    )
    cleaned_text = text
    for step in pipeline:
        cleaned_text = step(cleaned_text)
    return cleaned_text