cChat / dataset.py
Shymaa2611
update
223e278
import re
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string
import nltk
import os
import nltk
# Use a per-user directory for NLTK data and add it to NLTK's search path.
nltk_data_dir = os.path.expanduser("~/app/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

# Download each NLTK resource into that directory (runs at import time).
for _resource in ("stopwords", "punkt_tab", "punkt", "wordnet"):
    nltk.download(_resource, download_dir=nltk_data_dir)
def remove_html_tags(text):
    """Return *text* with every ``<...>`` span replaced by a single space.

    The match is non-greedy, so each tag is replaced separately. ``.`` does
    not match newlines here, so a tag split across lines is left untouched.
    """
    return re.sub(r'<.*?>', ' ', text)
def convert_to_lower(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered
def remove_urls(text):
    """Return *text* with URLs replaced by a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    matcher = re.compile(r'https?://\S+|www\.\S+')
    return matcher.sub(' ', text)
# Shared Speller instance, created on first use by _get_speller().
_SPELLER = None


def _get_speller():
    """Return the shared English ``Speller``, creating it on first use."""
    global _SPELLER
    if _SPELLER is None:
        _SPELLER = Speller(lang="en")
    return _SPELLER


def spell_checker(text):
    """Return *text* with each token passed through the spell checker.

    The text is split with ``nltk.word_tokenize``, every token is corrected
    individually, and the corrected tokens are re-joined with single spaces
    (so the original spacing is not preserved).

    Args:
        text: The string to correct.

    Returns:
        The space-joined corrected tokens; an empty string if there are none.
    """
    # Reuse one Speller rather than constructing a new one on every call.
    speller = _get_speller()
    return " ".join([speller(token) for token in nltk.word_tokenize(text)])
def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` deleted."""
    # The third argument of maketrans lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped
def remove_stopwords(text):
    """Return *text* with English stopwords removed.

    The text is split with ``word_tokenize``; tokens that appear in NLTK's
    English stopword list are dropped and the rest are re-joined with single
    spaces. The comparison is exact, so the caller is responsible for any
    case normalisation.

    Args:
        text: The string to filter.

    Returns:
        The space-joined remaining tokens; an empty string if none remain.
    """
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(stopwords.words("english"))
    kept = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept)
def lemmatizing(text):
    """Return *text* with each token replaced by its WordNet lemma.

    The text is split with ``word_tokenize``, each token is passed to
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are re-joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(word) for word in word_tokenize(text)]
    return " ".join(lemmas)
def clean(text):
    """Run *text* through the full cleaning pipeline and return the result.

    Steps are applied in this order, each receiving the previous step's
    output: lower-casing, HTML tag removal, URL removal, punctuation removal,
    stopword removal, lemmatization, spell correction.
    """
    pipeline = (
        convert_to_lower,
        remove_html_tags,
        remove_urls,
        remove_punctuation,
        remove_stopwords,
        lemmatizing,
        spell_checker,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result