Commit af6c457
Parent(s): 08e4fd7

Update functions.py
functions.py  CHANGED  (+16 -1)

@@ -1,3 +1,17 @@
+import pandas as pd
+import numpy as np
+# preprocess
+import string
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from nltk.corpus import wordnet
+import joblib
+
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('averaged_perceptron_tagger')
+
 def get_wordnet_pos(treebank_tag):
     if treebank_tag.startswith('J'):
         return wordnet.ADJ
@@ -13,9 +27,10 @@ def get_wordnet_pos(treebank_tag):
 lemmatizer = WordNetLemmatizer()

 def preprocess_text(text):
+    stopword_list = joblib.load('stopword_list.joblib')
     text = text.lower()  # lowercase text
     tokens = word_tokenize(text)  # tokenize
-    filtered_words = [word for word in tokens if word.lower() not in stopword_list]
+    filtered_words = [word for word in tokens if word.lower() not in stopword_list]
     lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in filtered_words]
     lemmatized_clean = [word.translate(str.maketrans('', '', string.punctuation)) for word in lemmatized_words]
     return ' '.join(lemmatized_clean)
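
The updated preprocess_text now pulls its stopwords from a stopword_list.joblib artifact that must be present in the app's working directory. The commit only shows the load, not how that file was produced; a minimal sketch, assuming the list is NLTK's English stopwords dumped with joblib:

# Hypothetical helper; the commit shows only that 'stopword_list.joblib'
# is loaded, not how it was created.
import joblib
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# A set gives O(1) membership tests for the `not in stopword_list` filter.
stopword_list = set(stopwords.words('english'))
joblib.dump(stopword_list, 'stopword_list.joblib')

One design note: loading the artifact inside preprocess_text re-reads the file on every call; hoisting the joblib.load to module level, next to lemmatizer, would pay the disk cost once.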
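
One wrinkle worth flagging: get_wordnet_pos expects a Penn Treebank tag, but the committed comprehension passes the raw token w, so a word is treated as an adjective merely because it starts with 'J'. The nltk.download('averaged_perceptron_tagger') line suggests nltk.pos_tag was the intended source of tags. A hedged sketch of that variant; the VERB/ADV branches and the NOUN fallback are assumptions, since the diff shows only the ADJ branch:

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(treebank_tag):
    # The ADJ branch is verbatim from the commit; the rest is the usual recipe.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # assumed default for tags not shown in the diff

lemmatizer = WordNetLemmatizer()

filtered_words = ['cats', 'running', 'quickly']
tagged = pos_tag(filtered_words)  # [('cats', 'NNS'), ('running', 'VBG'), ...]
lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged]
print(lemmatized_words)  # expected: ['cat', 'run', 'quickly']

Dropped into preprocess_text, the tagged pairs would replace the per-word get_wordnet_pos(w) call in the lemmatized_words comprehension.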