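"""Text preprocessing utilities built on NLTK: whitespace-normalizing
tokenization, POS-aware WordNet lemmatization, configurable stopword and
punctuation filtering, and number removal."""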

import re
import unicodedata

import nltk
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
# One-time downloads of the NLTK resources used below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


def tokenize(sentence):
    # Collapse runs of whitespace, then tokenize and POS-tag the words.
    sentence = re.sub(r'\s+', ' ', sentence)
    token_words = word_tokenize(sentence)
    token_words = pos_tag(token_words)
    return token_words


wordnet_lemmatizer = WordNetLemmatizer()


def stem(token_words):
    # Despite the name, this lemmatizes: map each Penn Treebank tag to the
    # matching WordNet part of speech so the lemmatizer picks the right form.
    words_lemmatized = []
    for word, tag in token_words:
        if tag.startswith('NN'):
            lemma = wordnet_lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            lemma = wordnet_lemmatizer.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            lemma = wordnet_lemmatizer.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            lemma = wordnet_lemmatizer.lemmatize(word, pos='r')
        else:
            lemma = wordnet_lemmatizer.lemmatize(word)
        words_lemmatized.append(lemma)
    return words_lemmatized
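
# For illustration (an assumed example; actual output depends on the tagger):
# stem(tokenize("The cats are running")) should yield
# ['The', 'cat', 'be', 'run'].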


def delete_invalid_word(token_words):
    # Keep only words that WordNet knows (i.e. have at least one synset).
    valid_words = [word for word in token_words if len(wordnet.synsets(word)) > 0]
    return valid_words


# English stopword list adjusted for this task: add a few domain-specific
# words and keep some default stopwords that carry signal here.
sr = stopwords.words('english')
sr.append("limited")
sr.append("additionnaly")
sr.append("e.g")
sr.remove("other")
sr.remove("than")
sr.remove("not")
sr.remove("you")
sr.remove("and")
# Unmodified default list (stopwords.words returns a fresh copy each call).
sr2 = stopwords.words('english')


def delete_stopwords(token_words):
    # Filter with the adjusted stopword list.
    cleaned_words = [word for word in token_words if word not in sr]
    return cleaned_words


def delete_stopwords2(token_words):
    # Filter with the unmodified default stopword list.
    cleaned_words = [word for word in token_words if word not in sr2]
    return cleaned_words


def delete_adjwords(token_words):
    # Currently identical to delete_stopwords (filters the same list `sr`).
    cleaned_words = [word for word in token_words if word not in sr]
    return cleaned_words


def is_number(s):
    # True if s parses as a float or is a Unicode numeric character.
    try:
        float(s)
        return True
    except ValueError:
        pass
    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
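
# For illustration: is_number("3.14") and is_number("七") should be True
# (the latter via unicodedata.numeric); is_number("v3") is False.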


# Punctuation tokens to strip. The title list also drops '.' and ',';
# the proposal list keeps ':' and ';'.
characters_title = [' ', '.', ',', '|', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']
characters = [' ', '|', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']
characters_proposal = [' ', '|', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '...', '^', '{', '}']


def delete_characters(token_words):
    # Drop punctuation tokens.
    words_list = [word for word in token_words if word not in characters]
    return words_list


def delete_characters_proposal(token_words):
    # Drop numbers and punctuation, keeping ':' and ';'.
    words_list = [word for word in token_words if word not in characters_proposal and not is_number(word)]
    return words_list


def delete_characters_title(token_words):
    # Drop numbers and punctuation, using the title-specific list
    # (which also removes '.' and ',').
    words_list = [word for word in token_words if word not in characters_title and not is_number(word)]
    return words_list


def to_lower(token_words):
    # Lowercase every token.
    words_list = [x.lower() for x in token_words]
    return words_list


def pre_process_title(text):
    # Title pipeline: tag, lemmatize, keep only WordNet words, strip
    # punctuation and numbers, lowercase.
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_invalid_word(token_words)
    token_words = delete_characters_title(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)


def pre_process(text):
    # Default pipeline: tag, lemmatize, drop adjusted stopwords, strip
    # punctuation, lowercase.
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)


def pre_process_type(text):
    # Same as pre_process, but with the unmodified stopword list.
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords2(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)


def pre_process_proposal(text):
    # Same as pre_process, but keeps ':' and ';' and drops numbers.
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters_proposal(token_words)
    token_words = to_lower(token_words)
    return ' '.join(token_words)


def pre_process_list(text):
    # Same as pre_process, but returns the token list instead of a string.
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_stopwords(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    return token_words


def pre_process_stop(text):
    # Like pre_process but keeps stopwords, then splits the cleaned text
    # into rough sentences on '.' (periods survive delete_characters).
    token_words = tokenize(text)
    token_words = stem(token_words)
    token_words = delete_characters(token_words)
    token_words = to_lower(token_words)
    text = ' '.join(token_words)
    final_text = text.split(".")
    return final_text
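

# Minimal usage sketch (an assumed entry point, not part of the module):
if __name__ == "__main__":
    sample = "Additionally, you can not use the other  APIs, e.g. the limited ones."
    print(pre_process(sample))        # adjusted-stopword pipeline
    print(pre_process_title(sample))  # title pipeline (WordNet words only)
    print(pre_process_stop(sample))   # sentence-split variant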