Spaces:
Sleeping
Sleeping
add tokenize function
Browse files
app.py
CHANGED
|
@@ -16,98 +16,56 @@ from underthesea import word_tokenize
|
|
| 16 |
|
| 17 |
from phoBERT import BERT_predict
|
| 18 |
|
| 19 |
-
# Load tokenizer
|
| 20 |
-
# fp = Path(__file__).with_name('tokenizer.pkl')
|
| 21 |
-
# with open(fp,mode="rb") as f:
|
| 22 |
-
# tokenizer = pickle.load(f)
|
| 23 |
|
| 24 |
-
#Load LSTM
|
| 25 |
-
#fp = Path(__file__).with_name('lstm_model.h5')
|
| 26 |
LSTM_model = tf.keras.models.load_model('lstm_model.tf')
|
| 27 |
|
| 28 |
-
#Load GRU
|
| 29 |
-
#fp = Path(__file__).with_name('gru_model.h5')
|
| 30 |
GRU_model = tf.keras.models.load_model('gru_model.tf')
|
| 31 |
|
| 32 |
-
|
| 33 |
-
def tokenizer_pad(tokenizer,comment_text,max_length=200):
|
| 34 |
-
|
| 35 |
-
comment_text = word_tokenize(comment_text, format="text")
|
| 36 |
-
comment_text = [comment_text]
|
| 37 |
-
tokenized_text = tokenizer.texts_to_sequences(comment_text)
|
| 38 |
-
|
| 39 |
-
padded_sequences = pad_sequences(sequences=tokenized_text,maxlen=max_length,padding="post",truncating="post")
|
| 40 |
-
|
| 41 |
-
return padded_sequences
|
| 42 |
-
|
| 43 |
def LSTM_predict(x):
|
| 44 |
-
# x = tokenizer_pad(tokenizer=tokenizer,comment_text=x)
|
| 45 |
-
|
| 46 |
|
| 47 |
pred_proba = LSTM_model.predict([x])[0]
|
| 48 |
|
| 49 |
pred_proba = [round(i,2) for i in pred_proba]
|
| 50 |
|
| 51 |
-
#print(pred_proba)
|
| 52 |
-
|
| 53 |
return pred_proba
|
| 54 |
|
| 55 |
def GRU_predict(x):
|
| 56 |
-
# x = tokenizer_pad(tokenizer=tokenizer,comment_text=x)
|
| 57 |
-
|
| 58 |
|
| 59 |
pred_proba = GRU_model.predict([x])[0]
|
| 60 |
|
| 61 |
pred_proba = [round(i,2) for i in pred_proba]
|
| 62 |
|
| 63 |
-
#print(pred_proba)
|
| 64 |
-
|
| 65 |
return pred_proba
|
| 66 |
|
| 67 |
-
def
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
data['Điểm'] = result
|
| 72 |
-
|
| 73 |
-
#print(data)
|
| 74 |
|
| 75 |
-
p = px.bar(data, x='Nhãn', y='Điểm', color='Nhãn', range_y=[0, 1] )
|
| 76 |
-
return p
|
| 77 |
-
pass
|
| 78 |
|
| 79 |
def judge(x):
|
| 80 |
|
| 81 |
-
label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
|
| 82 |
result = []
|
| 83 |
-
judge_result = []
|
| 84 |
|
| 85 |
-
x =
|
| 86 |
-
x = word_tokenize(x, format="text")
|
| 87 |
|
| 88 |
lstm_pred = LSTM_predict(x)
|
| 89 |
gru_pred = GRU_predict(x)
|
| 90 |
-
#bert_pred = BERT_predict(x)
|
| 91 |
-
#print(result)
|
| 92 |
|
| 93 |
-
return_result = 'Result'
|
| 94 |
result_lstm = np.round(lstm_pred, 2)
|
| 95 |
result_gru = np.round(gru_pred, 2)
|
| 96 |
-
#result_bert = np.round(bert_pred, 2)
|
| 97 |
|
| 98 |
for i in range(6):
|
| 99 |
result.append((result_lstm[i]+result_gru[i])/2)
|
| 100 |
|
| 101 |
return (result)
|
| 102 |
|
|
|
|
| 103 |
def judgePlus(x):
|
| 104 |
|
| 105 |
-
label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
|
| 106 |
result = []
|
| 107 |
-
judge_result = []
|
| 108 |
|
| 109 |
-
x =
|
| 110 |
-
x = word_tokenize(x, format="text")
|
| 111 |
|
| 112 |
lstm_pred = LSTM_predict(x)
|
| 113 |
gru_pred = GRU_predict(x)
|
|
@@ -117,11 +75,10 @@ def judgePlus(x):
|
|
| 117 |
bert_pred = np.average([lstm_pred, gru_pred], axis=0)
|
| 118 |
|
| 119 |
|
| 120 |
-
return_result = 'Result'
|
| 121 |
result_lstm = np.round(lstm_pred, 2)
|
| 122 |
result_gru = np.round(gru_pred, 2)
|
| 123 |
result_bert = np.round(bert_pred, 2)
|
| 124 |
-
|
| 125 |
if((result_lstm[0]+result_gru[0])<(result_bert[0]*2)):
|
| 126 |
for i in range(6):
|
| 127 |
result.append((result_bert[i])/1)
|
|
@@ -131,26 +88,19 @@ def judgePlus(x):
|
|
| 131 |
|
| 132 |
return (result)
|
| 133 |
|
|
|
|
| 134 |
def judgeBert(x):
|
| 135 |
|
| 136 |
-
label = ['độc hại', 'cực kì độc hại', 'tục tĩu', 'đe dọa', 'xúc phạm', 'thù ghét cá nhân']
|
| 137 |
result = []
|
| 138 |
-
judge_result = []
|
| 139 |
|
| 140 |
-
x =
|
| 141 |
-
x = word_tokenize(x, format="text")
|
| 142 |
|
| 143 |
-
|
| 144 |
try:
|
| 145 |
bert_pred = BERT_predict(x)
|
| 146 |
except:
|
| 147 |
bert_pred = np.zeros(6, dtype=float)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
return_result = 'Result'
|
| 151 |
-
|
| 152 |
result_bert = np.round(bert_pred, 2)
|
| 153 |
-
#result_bert = np.round(bert_pred, 2)
|
| 154 |
|
| 155 |
for i in range(6):
|
| 156 |
result.append((result_bert[i])/1)
|
|
|
|
| 16 |
|
| 17 |
from phoBERT import BERT_predict
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
|
|
|
|
|
|
| 20 |
LSTM_model = tf.keras.models.load_model('lstm_model.tf')
|
| 21 |
|
|
|
|
|
|
|
| 22 |
GRU_model = tf.keras.models.load_model('gru_model.tf')
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def LSTM_predict(x):
    """Run the LSTM model on one input and return its rounded output row.

    ``x`` is wrapped in a single-item list before being handed to
    ``LSTM_model.predict``; only the first row of the prediction is kept.

    Returns:
        A list holding each value of that row rounded to two decimals.
    """
    first_row = LSTM_model.predict([x])[0]
    rounded = []
    for value in first_row:
        rounded.append(round(value, 2))
    return rounded
|
| 31 |
|
| 32 |
def GRU_predict(x):
    """Run the GRU model on one input and return its rounded output row.

    ``x`` is wrapped in a single-item list before being handed to
    ``GRU_model.predict``; only the first row of the prediction is kept.

    Returns:
        A list holding each value of that row rounded to two decimals.
    """
    batch = [x]
    first_row = GRU_model.predict(batch)[0]
    rounded = []
    for value in first_row:
        rounded.append(round(value, 2))
    return rounded
|
| 39 |
|
| 40 |
+
def tokenize(x):
    """Normalize ``x`` to Unicode NFKC form, then word-tokenize it.

    The normalized text is passed to ``word_tokenize`` with
    ``format="text"`` and that call's result is returned unchanged.
    """
    normalized = ud.normalize('NFKC', x)
    return word_tokenize(normalized, format="text")
|
|
|
|
|
|
|
|
|
|
| 44 |
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def judge(x):
    """Average the LSTM and GRU predictions for a piece of text.

    The input is passed through ``tokenize`` first, then scored by both
    ``LSTM_predict`` and ``GRU_predict``. Each prediction is rounded to two
    decimals and the first six positions are averaged pairwise.

    Returns:
        A list of six values, one averaged value per position.
    """
    prepared = tokenize(x)

    lstm_pred = LSTM_predict(prepared)
    gru_pred = GRU_predict(prepared)

    lstm_scores = np.round(lstm_pred, 2)
    gru_scores = np.round(gru_pred, 2)

    # Exactly six positions are read from each model's output.
    return [(lstm_scores[i] + gru_scores[i]) / 2 for i in range(6)]
|
| 62 |
|
| 63 |
+
|
| 64 |
def judgePlus(x):
|
| 65 |
|
|
|
|
| 66 |
result = []
|
|
|
|
| 67 |
|
| 68 |
+
x = tokenize(x)
|
|
|
|
| 69 |
|
| 70 |
lstm_pred = LSTM_predict(x)
|
| 71 |
gru_pred = GRU_predict(x)
|
|
|
|
| 75 |
bert_pred = np.average([lstm_pred, gru_pred], axis=0)
|
| 76 |
|
| 77 |
|
|
|
|
| 78 |
result_lstm = np.round(lstm_pred, 2)
|
| 79 |
result_gru = np.round(gru_pred, 2)
|
| 80 |
result_bert = np.round(bert_pred, 2)
|
| 81 |
+
|
| 82 |
if((result_lstm[0]+result_gru[0])<(result_bert[0]*2)):
|
| 83 |
for i in range(6):
|
| 84 |
result.append((result_bert[i])/1)
|
|
|
|
| 88 |
|
| 89 |
return (result)
|
| 90 |
|
| 91 |
+
|
| 92 |
def judgeBert(x):
|
| 93 |
|
|
|
|
| 94 |
result = []
|
|
|
|
| 95 |
|
| 96 |
+
x = tokenize(x)
|
|
|
|
| 97 |
|
|
|
|
| 98 |
try:
|
| 99 |
bert_pred = BERT_predict(x)
|
| 100 |
except:
|
| 101 |
bert_pred = np.zeros(6, dtype=float)
|
| 102 |
|
|
|
|
|
|
|
|
|
|
| 103 |
result_bert = np.round(bert_pred, 2)
|
|
|
|
| 104 |
|
| 105 |
for i in range(6):
|
| 106 |
result.append((result_bert[i])/1)
|