Spaces:
Build error
Build error
Create Stopwords.py
Browse files- Stopwords.py +37 -0
Stopwords.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
from nltk.corpus import stopwords
|
| 3 |
+
from nltk.tokenize import word_tokenize
|
| 4 |
+
from nltk.stem import WordNetLemmatizer
|
| 5 |
+
|
| 6 |
+
# Download the NLTK resources used below:
#   'stopwords' -> per-language stopword lists (nltk.corpus.stopwords)
#   'punkt_tab' -> tokenizer models required by word_tokenize
#   'wordnet'   -> lexical database required by WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Base stopword set: NLTK's English stopwords.
stop_words_en = set(stopwords.words('english'))

# Domain-specific exclusions (game/studio/platform terms and other noise
# observed in the review corpus) to be treated as stopwords as well.
additional_exclusions = {'persona', 'team', 'minecraft', 'arma', 'bean', 'boss','ps4','wild','moon','2d','wasteland','ultra','military','editor','allows','solid','journey','expert','bungie','season','terraria','minor','hunting','kojima','deck','destiny','remaster','complain','fall','slay','gta','atlus','spire','terrarium','rockstar','biome','mafia','castle',
'access','instead','idea','sorry','ca','dlc','dayz','until','valve','tutorial','concept','swing','horde','regionlockchina','unless','sims','ark','was','company','customer','state','software','advertised','region','survivor','2016','fighter','headshot','zombie','alpha','trailer','planet','pubg','microtransactions','payday','too','camping','ubisoft','galaxy','rome','skyrim','vr','h1z1','vegas','warband','flight','creation','forum','netcode','sky','elite','china','chinese','hero','policy','pile','untill','zombies','xcom','rust','capcom'}

# Final stopword set = NLTK English stopwords + domain exclusions.
stop_words_en = stop_words_en.union(additional_exclusions)
|
| 20 |
+
|
| 21 |
+
def filter_review(reviews):
    """Normalize one review string for downstream text analysis.

    Pipeline: lowercase -> tokenize -> drop stopwords -> lemmatize ->
    drop stopwords again -> re-join into a single space-separated string.

    Parameters
    ----------
    reviews : str
        Raw review text.

    Returns
    -------
    str
        The processed review as a space-joined token string.
    """
    # Tokenize the lowercased text.
    tokens = word_tokenize(reviews.lower())

    # Remove stop words (NLTK English set plus the domain exclusions
    # merged into stop_words_en at module level).
    filtered_tokens = [token for token in tokens if token not in stop_words_en]

    # Lemmatize the tokens (to maintain the context).
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Second stopword pass: lemmatization can map a token onto a
    # stopword (e.g. a plural reduced to its singular form), so the
    # filter is applied again after lemmatizing.
    final_tokens = [token for token in lemmatized_tokens if token not in stop_words_en]

    # Join the surviving tokens back into a single string.
    processed_text = ' '.join(final_tokens)
    return processed_text
|