vitiugin committed on
Commit
182b923
·
verified ·
1 Parent(s): 37cfae7

documented scripts

Browse files
Files changed (3) hide show
  1. app.py +22 -7
  2. test.py +35 -0
  3. train.py +162 -0
app.py CHANGED
@@ -6,40 +6,55 @@ import tensorflow as tf
6
  from sentence_transformers import SentenceTransformer
7
  from huggingface_hub import from_pretrained_keras
8
 
 
9
  emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
10
  EMB_DIM = 512
11
 
12
- feedback_class_model = from_pretrained_keras('vitiugin/feedback_class')
 
13
  feedback_labels = ['Request', 'Thanks', 'Question', 'Opinion', 'Concern']
14
 
15
- area_class_model = from_pretrained_keras('vitiugin/area_class')
 
16
  area_labels = ['cross-cutting', 'education', 'food security', 'governance', 'health', 'protection', 'shelter', 'wash']
17
 
18
 
19
  def process(text_1, text_2):
 
 
 
 
20
  tokenized_text = emb_model.encode([text_1])
21
  tokenized_org = emb_model.encode([text_2])
22
  return tokenized_text, tokenized_org
23
 
24
 
25
  def feedback_classification(input_text, input_org):
26
- train_text, train_org = process(input_text, input_org)
 
 
 
 
27
 
 
28
  X_text = tf.reshape(train_text, [-1, 1, EMB_DIM])
29
  X_org = tf.reshape(train_org, [-1, 1, EMB_DIM])
30
 
 
31
  feedback_scores = feedback_class_model.predict([X_text, X_org])
32
  area_scores = area_class_model.predict([X_text, X_org])
33
 
 
34
  feedback_scores = {feedback_labels[num]: feedback_scores[0][0][num] for num in range(len(feedback_labels))}
35
  area_scores = {area_labels[num]: area_scores[0][0][num] for num in range(len(area_labels))}
36
 
37
  return feedback_scores, area_scores
38
 
39
  demo = gr.Interface(
40
- fn=feedback_classification,
41
- inputs=[gr.Textbox(placeholder='Enter a feedback text'), gr.Textbox(placeholder='Enter a title of organization')],
42
- outputs=['label', 'label'],
43
- examples=[['Thank you, but next time just send us more fresh vegetables!', 'Red Cross']])
 
44
 
45
  demo.launch()
 
6
from sentence_transformers import SentenceTransformer
from huggingface_hub import from_pretrained_keras

# Multilingual sentence encoder used to embed both input texts.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
EMB_DIM = 512  # output dimensionality of the encoder above

# Classifier for feedback types, with the classes it predicts.
feedback_class_model = from_pretrained_keras('vitiugin/loop_feedback')
feedback_labels = ['Request', 'Thanks', 'Question', 'Opinion', 'Concern']

# Classifier for thematic areas, with the classes it predicts.
area_class_model = from_pretrained_keras('vitiugin/loop_area')
area_labels = ['cross-cutting', 'education', 'food security', 'governance', 'health', 'protection', 'shelter', 'wash']
20
 
21
 
22
def process(text_1, text_2):
    """Encode two input strings into sentence embeddings.

    process(str, str) -> pair of embedding arrays, one per input, each
    produced by emb_model.encode on a single-element batch.
    """
    return emb_model.encode([text_1]), emb_model.encode([text_2])
30
 
31
 
32
def feedback_classification(input_text, input_org):
    """Classify a feedback text plus organisation title into predefined classes.

    feedback_classification(str, str) -> (dict, dict): label -> probability
    for feedback types and for thematic areas, respectively.
    """
    emb_text, emb_org = process(input_text, input_org)  # embed both inputs

    # Reshape flat embeddings to the (batch, timesteps=1, EMB_DIM) layout
    # the classification models expect.
    x_text = tf.reshape(emb_text, [-1, 1, EMB_DIM])
    x_org = tf.reshape(emb_org, [-1, 1, EMB_DIM])

    # Raw per-class scores from each model.
    feedback_probs = feedback_class_model.predict([x_text, x_org])
    area_probs = area_class_model.predict([x_text, x_org])

    # Pair each label with its probability.
    return (dict(zip(feedback_labels, feedback_probs[0][0])),
            dict(zip(area_labels, area_probs[0][0])))
52
 
53
# Gradio app: two text inputs (feedback, organisation) -> two label outputs.
demo = gr.Interface(
    fn=feedback_classification,
    inputs=[
        gr.Textbox(placeholder='Enter a feedback text'),
        gr.Textbox(placeholder='Enter a title of organization'),
    ],
    outputs=['label', 'label'],
    examples=[['Thank you, but next time just send us more fresh vegetables!', 'Red Cross']])

demo.launch()
test.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import gradio as gr
import tensorflow as tf

from sentence_transformers import SentenceTransformer, util

# Sentence encoder shared with the training/serving scripts.
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
EMB_DIM = 512  # encoder output dimensionality

# Locally saved Keras classifiers (produced by train.py).
feedback_class_model = tf.keras.models.load_model('models/feedback_class.keras')
area_class_model = tf.keras.models.load_model('models/area_class.keras')
12
+
13
def process(text_1, text_2):
    """Encode two input strings into sentence embeddings.

    process(str, str) -> pair of embedding arrays, one per input.
    """
    return emb_model.encode([text_1]), emb_model.encode([text_2])
21
+
22
+
23
def feedback_classification(input_text, input_org):
    """Smoke-test helper: embed the inputs, run the feedback classifier and
    print its raw score vector (no return value).

    NOTE: area_class_model is loaded at module level but not exercised here.
    """
    emb_text, emb_org = process(input_text, input_org)

    # (batch, timesteps=1, EMB_DIM) layout expected by the model.
    x_text = tf.reshape(emb_text, [-1, 1, EMB_DIM])
    x_org = tf.reshape(emb_org, [-1, 1, EMB_DIM])

    scores = feedback_class_model.predict([x_text, x_org])

    print(scores[0][0])
33
+
34
# Manual smoke test with a multilingual (Ukrainian) example.
feedback_classification(
    'Хотіли би подякувати усім співробітникам Gdynia Community Center!!!',
    'Community Center Gdynia | Danish Refugee Council (DRC) | Stowarzyszenie OVUM')
train.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from sentence_transformers import SentenceTransformer

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input, Concatenate, BatchNormalization


# Command-line arguments.
# Example: python train.py data.csv feedback_type
df_path = sys.argv[1]     # path to the training CSV
train_mode = sys.argv[2]  # model type: 'feedback_type' or 'thematic_area' only
16
+
17
+
18
+
19
def process(examples):
    """Encode the "text" and "org" fields of a dataset split into embeddings.

    process(dataset with "text"/"org" columns) -> pair of embedding arrays.
    Relies on the module-level sentence-transformer bound to `model`.
    """
    return model.encode(examples["text"]), model.encode(examples["org"])
27
+
28
def get_feedback_data(f):
    """One-hot encode the multi-label 'feedback_type' column.

    get_feedback_data(pd.DataFrame) -> (pd.DataFrame, list[str])

    Rows with a missing feedback type are dropped.  For every type observed
    in the data (entries are ' | '-separated) a 0/1 column
    'feedback_<type>' is added; membership is tested by substring, matching
    the original behaviour.  The returned label list is the fixed set of
    columns the training script consumes.
    """
    f = f.dropna(subset=['feedback_type'])

    # Collect every type mentioned anywhere in the column.
    observed = set()
    for entry in f.feedback_type:
        observed.update(entry.split(' | '))

    # One indicator column per observed type.
    for label in observed:
        f['feedback_' + str(label)] = [1 if label in entry else 0 for entry in f.feedback_type]

    fixed_labels = ['feedback_Request', 'feedback_Thanks', 'feedback_Question', 'feedback_Opinion', 'feedback_Concern']
    return f, fixed_labels
52
+
53
+
54
def get_area_data(f):
    """One-hot encode the multi-label 'thematic_area' column.

    get_area_data(pd.DataFrame) -> (pd.DataFrame, list[str])

    Rows with a missing thematic area are dropped.  For every area observed
    in the data (entries are ' | '-separated) a 0/1 column 'area_<area>' is
    added; membership is tested by substring, matching the original
    behaviour.  The returned label list is the fixed set of columns the
    training script consumes.
    """
    f = f.dropna(subset=['thematic_area'])

    # Collect every area mentioned anywhere in the column.
    observed = set()
    for entry in f.thematic_area:
        observed.update(entry.split(' | '))

    # One indicator column per observed area.
    for label in observed:
        f['area_' + str(label)] = [1 if label in entry else 0 for entry in f.thematic_area]

    fixed_labels = ['area_cross-cutting', 'area_education', 'area_food security', 'area_governance',
                    'area_health', 'area_protection', 'area_shelter', 'area_wash']
    return f, fixed_labels
78
+
79
# ---------------------------------------------------------------------------
# Training script body: load data, encode labels, build and train the model.
# ---------------------------------------------------------------------------

df = pd.read_csv(df_path)  # read training data

# Select the label-encoding routine for the requested training mode.
if train_mode == 'feedback_type':
    df, label_list = get_feedback_data(df)
elif train_mode == 'thematic_area':
    # BUG FIX: this branch previously called get_feedback_data(), so the
    # thematic-area model was trained on feedback-type labels.
    df, label_list = get_area_data(df)
else:
    # Fail fast: continuing without label_list would crash later with a NameError.
    raise SystemExit('The script supports "feedback_type" or "thematic_area" modes only.')

# Parameters for reproducibility.
SEED_VALUE = 13
tf.random.set_seed(SEED_VALUE)
tf.keras.utils.set_random_seed(SEED_VALUE)
tf.config.experimental.enable_op_determinism()
LABELS_NUM = len(label_list)  # number of output classes

# Sentence encoder; process() reads this module-level name.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
EMB_DIM = 512  # output dimensionality of the encoder

# Drop rows with a missing text or organisation (NOTE: dropna removes NaNs,
# not duplicates — the original comment said "duplicated" but behaviour is dropna).
df = df.dropna(subset=['feedback_content', 'organisation'])

# Dataset with the raw texts plus the one-hot label matrix.
new_df = pd.DataFrame(data=df, columns=label_list)  # label columns only
dataset = Dataset.from_dict({"text": df['feedback_content'], "org": df['organisation'], "label": new_df.to_numpy()})

# Shuffled train/validation split.
dataset = dataset.shuffle(seed=SEED_VALUE)
train, val = dataset.train_test_split(test_size=0.2, seed=SEED_VALUE).values()

# Text -> embeddings.
train_text, train_org = process(train)
val_text, val_org = process(val)

# Training tensors: inputs (samples, 1, EMB_DIM), targets (samples, 1, LABELS_NUM).
X1_text = tf.reshape(train_text, [-1, 1, EMB_DIM])
X1_org = tf.reshape(train_org, [-1, 1, EMB_DIM])
Y1 = tf.reshape(np.array(train['label']), [-1, 1, LABELS_NUM])

# Validation tensors in the same layout.
X2_text = tf.reshape(val_text, [-1, 1, EMB_DIM])
X2_org = tf.reshape(val_org, [-1, 1, EMB_DIM])
Y2 = tf.reshape(np.array(val['label']), [-1, 1, LABELS_NUM])

# Two-branch network: one identical LSTM tower per input embedding.
inputA = Input(shape=(1, EMB_DIM, ))
inputB = Input(shape=(1, EMB_DIM, ))


def _build_branch(inp):
    """One LSTM + dense tower over a (1, EMB_DIM) embedding input."""
    h = LSTM(EMB_DIM, input_shape=(1, EMB_DIM), return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(inp)
    h = Dense(EMB_DIM, activation='relu')(h)
    h = Dense(256, activation="sigmoid")(h)
    h = Dropout(0.5)(h)
    return Dense(128, activation='sigmoid')(h)


x = _build_branch(inputA)  # branch over the feedback-text embedding
y = _build_branch(inputB)  # branch over the organisation embedding
x_model = Model(inputs=inputA, outputs=x)
y_model = Model(inputs=inputB, outputs=y)

# Merge the two branches and normalise before the classification head.
combined = Concatenate()([x_model.output, y_model.output])
combo = BatchNormalization()(combined)
combo1 = Dense(LABELS_NUM, activation="sigmoid")(combo)  # multi-label sigmoid head

# The final model accepts both branch inputs and outputs class probabilities.
# NOTE: this rebinds the module-level name `model` (previously the sentence
# encoder); process() has already run, so the encoder is no longer needed.
model = Model(inputs=[x_model.inputs, y_model.inputs], outputs=combo1)

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='Acc'), tf.keras.metrics.Precision(name='Prec'),
                       tf.keras.metrics.Recall(name='Rec'), tf.keras.metrics.AUC(name='AUC')])

# Train, then persist the model for the chosen mode.
model.fit([X1_text, X1_org], Y1, validation_data=([X2_text, X2_org], Y2), epochs=20)
model.save('models/' + str(train_mode))