nicolasV441 committed on
Commit
c818394
·
verified ·
1 Parent(s): 9aeac75

Create Stopwords.py

Browse files
Files changed (1) hide show
  1. Stopwords.py +37 -0
Stopwords.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.corpus import stopwords
3
+ from nltk.tokenize import word_tokenize
4
+ from nltk.stem import WordNetLemmatizer
5
+
6
+ # downloading the stopwords from nltk
7
+ nltk.download('stopwords')
8
+ # donwloading the resource word_tokenize
9
+ nltk.download('punkt_tab')
10
+ # donwloading the resource wordnet
11
+ nltk.download('wordnet')
12
+
13
+ # setting the stopwords dictionary to english language
14
+ stop_words_en = set(stopwords.words('english'))
15
+ # Define additional stopwords
16
+ additional_exclusions = {'persona', 'team', 'minecraft', 'arma', 'bean', 'boss','ps4','wild','moon','2d','wasteland','ultra','military','editor','allows','solid','journey','expert','bungie','season','terraria','minor','hunting','kojima','deck','destiny','remaster','complain','fall','slay','gta','atlus','spire','terrarium','rockstar','biome','mafia','castle',
17
+ 'access','instead','idea','sorry','ca','dlc','dayz','until','valve','tutorial','concept','swing','horde','regionlockchina','unless','sims','ark','was','company','customer','state','software','advertised','region','survivor','2016','fighter','headshot','zombie','alpha','trailer','planet','pubg','microtransactions','payday','too','camping','ubisoft','galaxy','rome','skyrim','vr','h1z1','vegas','warband','flight','creation','forum','netcode','sky','elite','china','chinese','hero','policy','pile','untill','zombies','xcom','rust','capcom'}
18
+
19
+ stop_words_en = stop_words_en.union(additional_exclusions)
20
+
# Single shared lemmatizer: WordNetLemmatizer is stateless, so there is no
# reason to construct a new instance on every filter_review() call.
_lemmatizer = WordNetLemmatizer()

def filter_review(reviews):
    """Normalize one review string for text analysis.

    Lowercases and tokenizes the text, drops English and domain-specific
    stopwords (``stop_words_en``), lemmatizes the remaining tokens, and
    filters stopwords once more (lemmatization can map a token onto a
    stopword, e.g. 'zombies' -> 'zombie').

    Parameters
    ----------
    reviews : str
        Raw review text.

    Returns
    -------
    str
        Space-joined string of the processed tokens.
    """
    # Tokenize the lowercased text
    tokens = word_tokenize(reviews.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stop_words_en]

    # Lemmatize the tokens (to maintain the context)
    lemmatized_tokens = [_lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Remove extra words after lemmatization (lemmas may be stopwords)
    final_tokens = [token for token in lemmatized_tokens if token not in stop_words_en]

    # Join the tokens back into a string
    return ' '.join(final_tokens)