Spaces:
Runtime error
Runtime error
documented scripts
Browse files
app.py
CHANGED
|
@@ -6,40 +6,55 @@ import tensorflow as tf
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from huggingface_hub import from_pretrained_keras
|
| 8 |
|
|
|
|
| 9 |
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
|
| 10 |
EMB_DIM = 512
|
| 11 |
|
| 12 |
-
|
|
|
|
| 13 |
feedback_labels = ['Request', 'Thanks', 'Question', 'Opinion', 'Concern']
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
area_labels = ['cross-cutting', 'education', 'food security', 'governance', 'health', 'protection', 'shelter', 'wash']
|
| 17 |
|
| 18 |
|
| 19 |
def process(text_1, text_2):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
tokenized_text = emb_model.encode([text_1])
|
| 21 |
tokenized_org = emb_model.encode([text_2])
|
| 22 |
return tokenized_text, tokenized_org
|
| 23 |
|
| 24 |
|
| 25 |
def feedback_classification(input_text, input_org):
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
| 28 |
X_text = tf.reshape(train_text, [-1, 1, EMB_DIM])
|
| 29 |
X_org = tf.reshape(train_org, [-1, 1, EMB_DIM])
|
| 30 |
|
|
|
|
| 31 |
feedback_scores = feedback_class_model.predict([X_text, X_org])
|
| 32 |
area_scores = area_class_model.predict([X_text, X_org])
|
| 33 |
|
|
|
|
| 34 |
feedback_scores = {feedback_labels[num]: feedback_scores[0][0][num] for num in range(len(feedback_labels))}
|
| 35 |
area_scores = {area_labels[num]: area_scores[0][0][num] for num in range(len(area_labels))}
|
| 36 |
|
| 37 |
return feedback_scores, area_scores
|
| 38 |
|
| 39 |
demo = gr.Interface(
|
| 40 |
-
fn=feedback_classification,
|
| 41 |
-
inputs=[gr.Textbox(placeholder='Enter a feedback text'),
|
| 42 |
-
|
| 43 |
-
|
|
|
|
| 44 |
|
| 45 |
demo.launch()
|
|
|
|
from sentence_transformers import SentenceTransformer
from huggingface_hub import from_pretrained_keras

# Load the pre-trained multilingual sentence-embedding model.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
EMB_DIM = 512  # embedding dimensionality of distiluse-base-multilingual-cased-v2

# Load the model for the first classification task (feedback type) and define its classes.
feedback_class_model = from_pretrained_keras('vitiugin/loop_feedback')
feedback_labels = ['Request', 'Thanks', 'Question', 'Opinion', 'Concern']

# Load the model for the second classification task (thematic area) and define its classes.
area_class_model = from_pretrained_keras('vitiugin/loop_area')
area_labels = ['cross-cutting', 'education', 'food security', 'governance', 'health', 'protection', 'shelter', 'wash']
| 22 |
def process(text_1, text_2):
    '''
    process(str, str) -> Union[List[torch.Tensor], numpy.ndarray, torch.Tensor]

    Encode the two input texts (feedback text and organization title)
    into sentence embeddings with the shared embedding model.
    '''
    text_vector = emb_model.encode([text_1])
    org_vector = emb_model.encode([text_2])
    return text_vector, org_vector
|
| 30 |
|
| 31 |
|
| 32 |
def feedback_classification(input_text, input_org):
    '''
    feedback_classification(str, str) -> Dict, Dict

    Classify a feedback text together with its organization title into the
    predefined feedback-type and thematic-area classes, returning one
    label -> probability dict per task.
    '''
    # Encode both texts to sentence embeddings (inference input, not training data).
    text_emb, org_emb = process(input_text, input_org)

    # Reshape embeddings to the (batch, timesteps, features) layout the models expect.
    X_text = tf.reshape(text_emb, [-1, 1, EMB_DIM])
    X_org = tf.reshape(org_emb, [-1, 1, EMB_DIM])

    # Get scores from both classification models.
    feedback_scores = feedback_class_model.predict([X_text, X_org])
    area_scores = area_class_model.predict([X_text, X_org])

    # Map each label to its predicted probability.
    feedback_scores = {label: feedback_scores[0][0][num] for num, label in enumerate(feedback_labels)}
    area_scores = {label: area_scores[0][0][num] for num, label in enumerate(area_labels)}

    return feedback_scores, area_scores
|
| 52 |
|
| 53 |
# Gradio UI: two text inputs -> two label outputs, with one worked example.
input_fields = [
    gr.Textbox(placeholder='Enter a feedback text'),
    gr.Textbox(placeholder='Enter a title of organization'),
]
demo = gr.Interface(
    fn=feedback_classification,
    inputs=input_fields,
    outputs=['label', 'label'],
    examples=[['Thank you, but next time just send us more fresh vegetables!', 'Red Cross']])

demo.launch()
|
test.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np
import gradio as gr
import tensorflow as tf

from sentence_transformers import SentenceTransformer, util

# Pre-trained multilingual sentence-embedding model.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
EMB_DIM = 512  # embedding dimensionality of distiluse-base-multilingual-cased-v2

# Locally saved classification models (produced by train.py — NOTE(review):
# train.py saves to 'models/<mode>', not 'models/*.keras'; verify the paths match).
feedback_class_model = tf.keras.models.load_model('models/feedback_class.keras')
area_class_model = tf.keras.models.load_model('models/area_class.keras')
| 13 |
+
def process(text_1, text_2):
    '''
    process(str, str) -> Union[List[torch.Tensor], numpy.ndarray, torch.Tensor]

    Encode each of the two input texts into a sentence embedding.
    '''
    encoded = [emb_model.encode([text]) for text in (text_1, text_2)]
    return encoded[0], encoded[1]
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def feedback_classification(input_text, input_org):
    '''
    feedback_classification(str, str) -> numpy.ndarray

    Smoke-test helper: encodes the two texts, runs the feedback-type
    classifier, prints the raw per-class scores and returns them
    (the original version printed but returned None, which made the
    result impossible to assert on).
    '''
    # Encode both inputs (inference input, not training data).
    text_emb, org_emb = process(input_text, input_org)

    # Reshape to the (batch, timesteps, features) layout the model expects.
    X_text = tf.reshape(text_emb, [-1, 1, EMB_DIM])
    X_org = tf.reshape(org_emb, [-1, 1, EMB_DIM])

    feedback_scores = feedback_class_model.predict([X_text, X_org])
    #area_scores = area_class_model.predict([X_text, X_org])

    print(feedback_scores[0][0])
    return feedback_scores  # returned so callers can inspect/assert the scores
|
| 33 |
+
|
| 34 |
+
# Guarded so importing this module does not trigger model inference.
if __name__ == '__main__':
    feedback_classification('Хотіли би подякувати усім співробітникам Gdynia Community Center!!!',
                            'Community Center Gdynia | Danish Refugee Council (DRC) | Stowarzyszenie OVUM')
train.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from sentence_transformers import SentenceTransformer

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization


# Command-line arguments.
# example: python train.py data.csv feedback_type
# Validate argc first so a missing argument gives a usage message
# instead of a cryptic IndexError.
if len(sys.argv) != 3:
    sys.exit('Usage: python train.py <data_file.csv> <feedback_type|thematic_area>')
df_path = sys.argv[1]  # path to data file
train_mode = sys.argv[2]  # type of model: feedback_type or thematic_area only
+
def process(examples):
    '''
    process(dataset split with "text" and "org" columns) ->
        Union[List[torch.Tensor], numpy.ndarray, torch.Tensor]

    Encode the "text" and "org" columns of a dataset split into embeddings
    using the module-level sentence-embedding model.
    '''
    text_vectors = model.encode(examples["text"])
    org_vectors = model.encode(examples["org"])
    return text_vectors, org_vectors
| 27 |
+
|
| 28 |
+
def get_feedback_data(f):
    '''
    get_feedback_data(pd.DataFrame) -> pd.DataFrame, List[str]

    One-hot encode the multi-label "feedback_type" column.  Rows with a
    missing feedback_type are dropped; one 0/1 column named
    "feedback_<label>" is added per label observed in the data.
    '''
    # Collect every label that actually occurs (cells are ' | '-separated).
    f = f.dropna(subset=['feedback_type'])
    observed = []
    for cell in f.feedback_type:
        observed += cell.split(' | ')

    for label in set(observed):
        # 1 when the label occurs in the row's cell, 0 otherwise.
        f['feedback_' + str(label)] = [1 if label in cell else 0 for cell in f.feedback_type]

    # Fixed label order expected by the training script.
    label_list = ['feedback_Request', 'feedback_Thanks', 'feedback_Question', 'feedback_Opinion', 'feedback_Concern']

    return f, label_list
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def get_area_data(f):
    '''
    get_area_data(pd.DataFrame) -> pd.DataFrame, List[str]

    One-hot encode the multi-label "thematic_area" column (the original
    docstring wrongly referred to get_feedback_data).  Rows with a missing
    thematic_area are dropped; one 0/1 column named "area_<label>" is added
    per label observed in the data.
    '''
    # Collect every label that actually occurs (cells are ' | '-separated).
    f = f.dropna(subset=['thematic_area'])
    observed = []
    for cell in f.thematic_area:
        observed += cell.split(' | ')

    for label in set(observed):
        # 1 when the label occurs in the row's cell, 0 otherwise.
        f['area_' + str(label)] = [1 if label in cell else 0 for cell in f.thematic_area]

    # Fixed label order expected by the training script.
    label_list = ['area_cross-cutting', 'area_education', 'area_food security', 'area_governance',
                  'area_health', 'area_protection', 'area_shelter', 'area_wash']
    return f, label_list
| 78 |
+
|
| 79 |
+
df = pd.read_csv(df_path)  # read the training data

# Select the label-encoding function for the chosen training mode.
if train_mode == 'feedback_type':
    df, label_list = get_feedback_data(df)
elif train_mode == 'thematic_area':
    df, label_list = get_area_data(df)  # BUG FIX: previously called get_feedback_data here
else:
    # Exit instead of printing and falling through to a NameError on label_list.
    sys.exit('The script supports "feedback_type" or "thematic_area" modes only.')

# Parameters for reproducibility.
SEED_VALUE = 13
tf.random.set_seed(SEED_VALUE)
tf.keras.utils.set_random_seed(SEED_VALUE)
tf.config.experimental.enable_op_determinism()
LABELS_NUM = len(label_list)  # number of classes for this mode

model = SentenceTransformer('distiluse-base-multilingual-cased-v2')  # embedding model, read by process()
EMB_DIM = 512  # vector dimensionality

df = df.dropna(subset=['feedback_content', 'organisation'])  # drop rows missing the text or the organization

# Dataset processing: a label-only frame plus the raw text/org columns.
new_df = pd.DataFrame(data=df, columns=label_list)
dataset = Dataset.from_dict({"text": df['feedback_content'], "org": df['organisation'], "label": new_df.to_numpy()})

# Train-validation splitting.
dataset = dataset.shuffle(seed=SEED_VALUE)
train, val = dataset.train_test_split(test_size=0.2, seed=SEED_VALUE).values()

# Tokenization (embedding) of both splits.
train_text, train_org = process(train)
val_text, val_org = process(val)

# Training data format preparation.
X1_text = tf.reshape(train_text, [-1, 1, EMB_DIM])
X1_org = tf.reshape(train_org, [-1, 1, EMB_DIM])
Y1 = tf.reshape(np.array(train['label']), [-1, 1, LABELS_NUM])

# Validation data format preparation.
X2_text = tf.reshape(val_text, [-1, 1, EMB_DIM])
X2_org = tf.reshape(val_org, [-1, 1, EMB_DIM])
Y2 = tf.reshape(np.array(val['label']), [-1, 1, LABELS_NUM])

# Model inputs: one branch per embedded text.
inputA = Input(shape=(1, EMB_DIM, ))
inputB = Input(shape=(1, EMB_DIM, ))

# The first branch operates on the feedback-text embeddings.
x = LSTM(EMB_DIM, input_shape=(1, EMB_DIM), return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(inputA)
x = Dense(EMB_DIM, activation='relu')(x)
x = Dense(256, activation="sigmoid")(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='sigmoid')(x)
x_model = Model(inputs=inputA, outputs=x)

# The second branch operates on the organization-title embeddings.
y = LSTM(EMB_DIM, input_shape=(1, EMB_DIM), return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(inputB)
y = Dense(EMB_DIM, activation='relu')(y)
y = Dense(256, activation="sigmoid")(y)
y = Dropout(0.5)(y)
y = Dense(128, activation='sigmoid')(y)
y_model = Model(inputs=inputB, outputs=y)

# Combine the output of the two branches.
combined = Concatenate()([x_model.output, y_model.output])

# Normalization plus a sigmoid head for multi-label classification.
combo = BatchNormalization()(combined)
combo1 = Dense(LABELS_NUM, activation="sigmoid")(combo)

# The classifier accepts both branch inputs and outputs the class scores.
# Named clf_model so it does not shadow the sentence-embedding `model` above.
clf_model = Model(inputs=[x_model.inputs, y_model.inputs], outputs=combo1)

clf_model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.BinaryAccuracy(name='Acc'), tf.keras.metrics.Precision(name='Prec'),
                           tf.keras.metrics.Recall(name='Rec'), tf.keras.metrics.AUC(name='AUC')])

# Model training.
clf_model.fit([X1_text, X1_org], Y1, validation_data=([X2_text, X2_org], Y2), epochs=20)

# Save the trained model under the mode name.
clf_model.save('models/' + str(train_mode))