achmaddhani committed on
Commit
af6c457
·
1 Parent(s): 08e4fd7

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +16 -1
functions.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def get_wordnet_pos(treebank_tag):
2
  if treebank_tag.startswith('J'):
3
  return wordnet.ADJ
@@ -13,9 +27,10 @@ def get_wordnet_pos(treebank_tag):
13
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize *text* for modeling: lowercase, tokenize, drop stopwords,
    lemmatize each token with its WordNet POS, strip punctuation, and
    return the cleaned tokens joined by single spaces."""
    # Lowercase first so tokenization and stopword matching are case-insensitive.
    tokens = word_tokenize(text.lower())
    # NOTE(review): assumes a module-level `stopword_list` is in scope — verify.
    kept = [tok for tok in tokens if tok.lower() not in stopword_list]
    lemmas = [lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in kept]
    # One translation table, applied per token, removes all punctuation chars.
    table = str.maketrans('', '', string.punctuation)
    cleaned = [lemma.translate(table) for lemma in lemmas]
    return ' '.join(cleaned)
 
1
import pandas as pd
import numpy as np

# preprocess
import string
import joblib
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this module depends on: the Punkt tokenizer
# models, the WordNet lexical database, and the averaged-perceptron POS
# tagger. `nltk.download` is a no-op when the data is already present.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
15
  def get_wordnet_pos(treebank_tag):
16
  if treebank_tag.startswith('J'):
17
  return wordnet.ADJ
 
27
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean *text* for modeling.

    Lowercases the input, tokenizes it, removes stopwords, lemmatizes each
    remaining token with its WordNet part of speech, strips punctuation
    characters from the lemmas, and returns them joined by single spaces.

    Parameters:
        text (str): raw input text.

    Returns:
        str: space-joined cleaned tokens.
    """
    # NOTE(review): loading on every call is wasteful but preserves this
    # commit's behavior; requires 'stopword_list.joblib' on disk (CWD).
    stopword_list = joblib.load('stopword_list.joblib')
    text = text.lower()  # lowercase text
    tokens = word_tokenize(text)  # tokenize
    # BUG FIX: the committed line was syntactically invalid
    # ("... not in stopword_list= load.]"); restore the membership test.
    filtered_words = [word for word in tokens if word.lower() not in stopword_list]
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in filtered_words]
    # Build the punctuation-stripping table once instead of per token.
    punct_table = str.maketrans('', '', string.punctuation)
    lemmatized_clean = [word.translate(punct_table) for word in lemmatized_words]
    return ' '.join(lemmatized_clean)