Spaces:
Runtime error
Runtime error
shakhovak commited on
Commit ·
8e44dd8
1
Parent(s): 69f2a1b
bienc+intent_added
Browse files- data/scripts.pkl +2 -2
- data/scripts_vectors.pkl +2 -2
- requirements.txt +2 -1
- retrieve_bot.py +77 -25
- utils.py +134 -11
data/scripts.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2df9355dd53669d082cecbdcfabee2cedba4527b0dfafcc086d7da479f78be48
|
| 3 |
+
size 3031195
|
data/scripts_vectors.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3452fc927c68cb4cfd2a3eacd1b86158cb64696ef41b419ef45d50f9946196b
|
| 3 |
+
size 100818899
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ pandas==1.3.5
|
|
| 4 |
gunicorn==20.1.0
|
| 5 |
requests==2.27.
|
| 6 |
datasets==2.13.2
|
| 7 |
-
transformers==4.37.2
|
|
|
|
|
|
| 4 |
gunicorn==20.1.0
|
| 5 |
requests==2.27.
|
| 6 |
datasets==2.13.2
|
| 7 |
+
transformers==4.37.2
|
| 8 |
+
DialogTag==1.1.3
|
retrieve_bot.py
CHANGED
|
@@ -1,20 +1,46 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import pickle
|
|
|
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
-
from utils import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from collections import deque
|
| 6 |
from transformers import pipeline
|
| 7 |
import torch
|
| 8 |
from transformers import AutoTokenizer
|
|
|
|
| 9 |
|
| 10 |
# this class represents the main functions of the retrieval bot
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
class ChatBot:
|
| 14 |
def __init__(self):
|
| 15 |
self.vect_data = []
|
| 16 |
self.scripts = []
|
| 17 |
self.conversation_history = deque([], maxlen=5)
|
|
|
|
| 18 |
self.ranking_model = None
|
| 19 |
self.reranking_model = None
|
| 20 |
self.device = None
|
|
@@ -27,46 +53,72 @@ class ChatBot:
|
|
| 27 |
|
| 28 |
with open("data/scripts_vectors.pkl", "rb") as fp:
|
| 29 |
self.vect_data = pickle.load(fp)
|
| 30 |
-
|
| 31 |
-
self.
|
| 32 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 33 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
self.reranking_model = pipeline(
|
| 35 |
model="Shakhovak/RerankerModel_chat_bot",
|
| 36 |
device=self.device,
|
| 37 |
-
tokenizer=self.
|
| 38 |
)
|
| 39 |
|
| 40 |
def generate_response(self, utterance: str) -> str:
|
| 41 |
"""this functions identifies potential
|
| 42 |
candidates for answer and ranks them"""
|
|
|
|
|
|
|
| 43 |
query_encoding = encode(
|
| 44 |
-
utterance,
|
|
|
|
|
|
|
|
|
|
| 45 |
)
|
| 46 |
-
bot_cosine_scores = cosine_sim(
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
top_indexes,
|
| 53 |
-
self.conversation_history,
|
| 54 |
-
utterance,
|
| 55 |
-
self.scripts,
|
| 56 |
-
self.reranking_model,
|
| 57 |
)
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
self.conversation_history.append(utterance)
|
| 70 |
self.conversation_history.append(answer)
|
| 71 |
|
| 72 |
return answer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import pickle
|
| 3 |
+
import random
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from utils import (
|
| 6 |
+
encode,
|
| 7 |
+
cosine_sim,
|
| 8 |
+
top_candidates,
|
| 9 |
+
candidates_reranking,
|
| 10 |
+
intent_classification,
|
| 11 |
+
)
|
| 12 |
from collections import deque
|
| 13 |
from transformers import pipeline
|
| 14 |
import torch
|
| 15 |
from transformers import AutoTokenizer
|
| 16 |
+
from dialog_tag import DialogTag
|
| 17 |
|
| 18 |
# this class represents the main functions of the retrieval bot
|
| 19 |
|
| 20 |
+
low_scoring_list = [
|
| 21 |
+
"What does it mean?",
|
| 22 |
+
"You have two strikes. Three strikes and you’ re out. It’ s a sports metaphor. Explain again!",
|
| 23 |
+
"Again, urban slang. In which, I believe I’ m gaining remarkable fluency. So, could you repeat?",
|
| 24 |
+
"I’m confused.",
|
| 25 |
+
"I can’t comment without violating our agreement that I don’ t criticize you.",
|
| 26 |
+
"Oh!",
|
| 27 |
+
"I need to use the restroom.",
|
| 28 |
+
"Move. Move. Move!",
|
| 29 |
+
"I was going to mention it at the time, but then I thought, some day maybe...",
|
| 30 |
+
"Well...",
|
| 31 |
+
"Apparently... I have no idea!?",
|
| 32 |
+
"I’m not sure...",
|
| 33 |
+
"Nothing. I say nothing.",
|
| 34 |
+
"Well, my friend. Focus and repeat!",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
|
| 38 |
class ChatBot:
|
| 39 |
def __init__(self):
|
| 40 |
self.vect_data = []
|
| 41 |
self.scripts = []
|
| 42 |
self.conversation_history = deque([], maxlen=5)
|
| 43 |
+
self.tag_model = None
|
| 44 |
self.ranking_model = None
|
| 45 |
self.reranking_model = None
|
| 46 |
self.device = None
|
|
|
|
| 53 |
|
| 54 |
with open("data/scripts_vectors.pkl", "rb") as fp:
|
| 55 |
self.vect_data = pickle.load(fp)
|
| 56 |
+
self.scripts = pd.read_pickle("data/scripts.pkl")
|
| 57 |
+
self.tag_model = DialogTag("distilbert-base-uncased")
|
| 58 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 59 |
+
self.ranking_model = SentenceTransformer(
|
| 60 |
+
"Shakhovak/chatbot_sentence-transformer"
|
| 61 |
+
) # # sentence-transformers/LaBSE or sentence-transformers/all-mpnet-base-v2 or Shakhovak/chatbot_sentence-transformer
|
| 62 |
+
|
| 63 |
+
self.tokenizer_reranking = AutoTokenizer.from_pretrained("bert-base-uncased")
|
| 64 |
self.reranking_model = pipeline(
|
| 65 |
model="Shakhovak/RerankerModel_chat_bot",
|
| 66 |
device=self.device,
|
| 67 |
+
tokenizer=self.tokenizer_reranking,
|
| 68 |
)
|
| 69 |
|
| 70 |
def generate_response(self, utterance: str) -> str:
    """Pick an answer for *utterance* from the stored scripts.

    Steps:
      1. classify the intent of the utterance;
      2. encode utterance + intent + conversation history and score the
         encoding against the stored vectors with cosine similarity;
      3. keep the best (up to 10) candidates that share the intent;
      4. if there is no candidate at all, or the best score is below 0.9,
         answer with a random fallback line and reset the history;
      5. otherwise let the reranking model filter the candidates and
         take the one with the lowest reranking value, falling back to
         the top cosine candidate when the reranker keeps nothing.

    The utterance and the chosen answer are appended to
    ``self.conversation_history`` before returning.

    Args:
        utterance: the user's message.

    Returns:
        The answer text.
    """
    intent = intent_classification(utterance, utterance, self.tag_model)
    query_encoding = encode(
        texts=utterance,
        intent=intent,
        model=self.ranking_model,
        contexts=self.conversation_history,
    )
    bot_cosine_scores = cosine_sim(
        self.vect_data,
        query_encoding,
    )
    top_scores, top_indexes = top_candidates(
        bot_cosine_scores, intent=intent, initial_data=self.scripts, top=10
    )
    print(top_scores)

    # top_candidates returns empty lists when no stored row has this
    # intent; indexing top_scores[0] would then raise IndexError, so the
    # empty case is treated like a low-confidence match.
    if not top_scores or top_scores[0] < 0.9:
        answer = random.choice(low_scoring_list)
        self.conversation_history.clear()
    else:
        # test candidates and collect the accepted ones in a dictionary
        reranked_dict = candidates_reranking(
            top_indexes,
            self.conversation_history,
            utterance,
            self.scripts,
            self.reranking_model,
        )
        # if any candidates were selected, pick the one with the lowest
        # value; else keep the initial top 1
        if reranked_dict:
            best_idx = min(reranked_dict, key=reranked_dict.get)
            answer = self.scripts.iloc[best_idx]["answer"]
            print(self.scripts.iloc[top_indexes[0]]["answer"])
        else:
            answer = self.scripts.iloc[top_indexes[0]]["answer"]

    self.conversation_history.append(utterance)
    self.conversation_history.append(answer)

    return answer
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# katya = ChatBot()
|
| 123 |
+
# katya.load()
|
| 124 |
+
# katya.generate_response("hi man!")
|
utils.py
CHANGED
|
@@ -4,31 +4,47 @@ from scipy import sparse
|
|
| 4 |
import pandas as pd
|
| 5 |
import pickle
|
| 6 |
import random
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
-
def encode(texts, model, contexts=None, do_norm=True):
|
| 10 |
"""function to encode texts for cosine similarity search"""
|
| 11 |
|
| 12 |
question_vectors = model.encode(texts)
|
| 13 |
context_vectors = model.encode("".join(contexts))
|
|
|
|
| 14 |
|
| 15 |
return np.concatenate(
|
| 16 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
def cosine_sim(data_vectors, query_vectors) -> list:
|
| 21 |
"""returns list of tuples with similarity score and
|
| 22 |
script index in initial dataframe"""
|
|
|
|
| 23 |
data_emb = sparse.csr_matrix(data_vectors)
|
| 24 |
query_emb = sparse.csr_matrix(query_vectors)
|
| 25 |
similarity = cosine_similarity(query_emb, data_emb).flatten()
|
| 26 |
ind = np.argwhere(similarity)
|
| 27 |
match = sorted(zip(similarity, ind.tolist()), reverse=True)
|
|
|
|
| 28 |
return match
|
| 29 |
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
"""this functions split scripts for queation, answer, context,
|
| 33 |
picks up the cahracter and saves data in pickle format"""
|
| 34 |
|
|
@@ -66,18 +82,30 @@ def scripts_rework(path, character):
|
|
| 66 |
"context": context,
|
| 67 |
}
|
| 68 |
|
| 69 |
-
scripts =
|
| 70 |
|
| 71 |
elif (row["person_scene"] == character) & (
|
| 72 |
df.iloc[index - 1]["person_scene"] == "Scene"
|
| 73 |
):
|
| 74 |
context = []
|
| 75 |
new_row = {"answer": row["dialogue"], "question": "", "context": context}
|
| 76 |
-
scripts =
|
| 77 |
# load reworked data to pkl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
scripts.to_pickle("data/scripts.pkl")
|
| 79 |
|
| 80 |
|
|
|
|
|
|
|
|
|
|
| 81 |
def encode_df_save(model):
|
| 82 |
"""this functions vectorizes reworked scripts and loads them to
|
| 83 |
pickle file to be used as retrieval base for ranking script"""
|
|
@@ -85,21 +113,33 @@ def encode_df_save(model):
|
|
| 85 |
scripts_reopened = pd.read_pickle("data/scripts.pkl")
|
| 86 |
vect_data = []
|
| 87 |
for index, row in scripts_reopened.iterrows():
|
| 88 |
-
vect = encode(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
vect_data.append(vect)
|
| 90 |
with open("data/scripts_vectors.pkl", "wb") as f:
|
| 91 |
pickle.dump(vect_data, f)
|
| 92 |
|
| 93 |
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
| 95 |
"""this functions receives results of the cousine similarity ranking and
|
| 96 |
returns top items' scores and their indices"""
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
| 100 |
return scores[0:top], candidates_indexes[0:top]
|
| 101 |
|
| 102 |
|
|
|
|
|
|
|
|
|
|
| 103 |
def candidates_reranking(
|
| 104 |
top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
|
| 105 |
):
|
|
@@ -123,6 +163,9 @@ def candidates_reranking(
|
|
| 123 |
return reranked_idx
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
def read_files_negative(path1, path2):
|
| 127 |
"""this functions creates training dataset for classifier incl negative
|
| 128 |
examples and saves it to the pickle file"""
|
|
@@ -155,12 +198,92 @@ def read_files_negative(path1, path2):
|
|
| 155 |
fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
|
| 156 |
fin_scripts = fin_scripts[fin_scripts["question"] != ""]
|
| 157 |
fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
|
| 158 |
-
fin_scripts["
|
| 159 |
fin_scripts["context"]
|
| 160 |
+ "[SEP]"
|
| 161 |
+ fin_scripts["question"]
|
| 162 |
+ "[SEP]"
|
| 163 |
+ fin_scripts["answer"]
|
| 164 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
# fin_scripts = fin_scripts.dropna(how='any')
|
| 166 |
fin_scripts.to_pickle("data/scripts_for_reranker.pkl")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import pickle
|
| 6 |
import random
|
| 7 |
+
from nltk.tokenize import word_tokenize
|
| 8 |
+
import string
|
| 9 |
|
| 10 |
|
| 11 |
+
def encode(texts, model, intent, contexts=None, do_norm=True):
    """Encode a question, its context and its intent into one vector.

    Each of the three parts is encoded separately with ``model.encode``
    and the results are concatenated along the last axis in the order
    context, question, intent.

    Args:
        texts: the question text.
        model: object exposing ``encode(text)``.
        intent: intent tag of the question.
        contexts: iterable of previous utterances; they are joined
            without a separator. ``None`` (the default) or an empty
            iterable is encoded as the empty string.
        do_norm: accepted for backward compatibility; not used.

    Returns:
        The concatenated ``numpy`` array.
    """
    # "".join(None) raises TypeError, so the documented default must be
    # mapped to "no context" explicitly.
    context_text = "".join(contexts) if contexts else ""

    question_vectors = model.encode(texts)
    context_vectors = model.encode(context_text)
    intent_vectors = model.encode(intent)

    return np.concatenate(
        [
            np.asarray(context_vectors),
            np.asarray(question_vectors),
            np.asarray(intent_vectors),
        ],
        axis=-1,
    )
|
| 26 |
|
| 27 |
|
| 28 |
+
# ===================================================
|
| 29 |
+
|
| 30 |
+
|
| 31 |
def cosine_sim(data_vectors, query_vectors) -> list:
    """Score the query against every stored vector.

    Args:
        data_vectors: the stored script vectors, one per script row.
        query_vectors: the encoded query.

    Returns:
        A list of ``(similarity, [row_position])`` tuples sorted by
        similarity, highest first. ``row_position`` is the position of
        the vector in ``data_vectors``.
    """
    data_emb = sparse.csr_matrix(data_vectors)
    query_emb = sparse.csr_matrix(query_vectors)
    similarity = cosine_similarity(query_emb, data_emb).flatten()

    # Pair every score with its own position. np.argwhere(similarity)
    # lists only the positions of NON-ZERO scores, so a single exact-zero
    # similarity would attach later scores to the wrong rows.
    ind = [[position] for position in range(len(similarity))]
    match = sorted(zip(similarity, ind), reverse=True)

    return match
|
| 42 |
|
| 43 |
|
| 44 |
+
# ===================================================
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def scripts_rework(path, character, tag_model):
|
| 48 |
"""this functions split scripts for queation, answer, context,
|
| 49 |
picks up the cahracter and saves data in pickle format"""
|
| 50 |
|
|
|
|
| 82 |
"context": context,
|
| 83 |
}
|
| 84 |
|
| 85 |
+
scripts = pd.concat([scripts, pd.DataFrame([new_row])])
|
| 86 |
|
| 87 |
elif (row["person_scene"] == character) & (
|
| 88 |
df.iloc[index - 1]["person_scene"] == "Scene"
|
| 89 |
):
|
| 90 |
context = []
|
| 91 |
new_row = {"answer": row["dialogue"], "question": "", "context": context}
|
| 92 |
+
scripts = pd.concat([scripts, pd.DataFrame([new_row])])
|
| 93 |
# load reworked data to pkl
|
| 94 |
+
scripts = scripts[scripts["question"] != ""]
|
| 95 |
+
scripts["answer"] = scripts["answer"].apply(lambda x: change_names(x))
|
| 96 |
+
scripts["tag"] = scripts[["answer", "question"]].apply(
|
| 97 |
+
lambda test_scripts: intent_classification(
|
| 98 |
+
test_scripts["question"], test_scripts["answer"], tag_model
|
| 99 |
+
),
|
| 100 |
+
axis=1,
|
| 101 |
+
)
|
| 102 |
+
scripts = scripts.reset_index(drop=True)
|
| 103 |
scripts.to_pickle("data/scripts.pkl")
|
| 104 |
|
| 105 |
|
| 106 |
+
# ===================================================
|
| 107 |
+
|
| 108 |
+
|
| 109 |
def encode_df_save(model):
|
| 110 |
"""this functions vectorizes reworked scripts and loads them to
|
| 111 |
pickle file to be used as retrieval base for ranking script"""
|
|
|
|
| 113 |
scripts_reopened = pd.read_pickle("data/scripts.pkl")
|
| 114 |
vect_data = []
|
| 115 |
for index, row in scripts_reopened.iterrows():
|
| 116 |
+
vect = encode(
|
| 117 |
+
texts=row["question"],
|
| 118 |
+
model=model,
|
| 119 |
+
intent=row["tag"],
|
| 120 |
+
contexts=row["context"],
|
| 121 |
+
)
|
| 122 |
vect_data.append(vect)
|
| 123 |
with open("data/scripts_vectors.pkl", "wb") as f:
|
| 124 |
pickle.dump(vect_data, f)
|
| 125 |
|
| 126 |
|
| 127 |
+
# ===================================================
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def top_candidates(score_lst_sorted, intent, initial_data, top=1):
    """Keep the best-scoring candidates whose stored tag equals *intent*.

    Args:
        score_lst_sorted: sorted ``(score, [row_index])`` items from the
            cosine similarity ranking.
        intent: tag the candidates must carry in ``initial_data["tag"]``.
        initial_data: DataFrame with a ``"tag"`` column.
        top: how many candidates to keep.

    Returns:
        ``(scores, row_indexes)`` — two parallel lists cut to ``top``
        entries; both are empty when no row carries the intent.
    """
    rows_with_intent = initial_data.index[initial_data["tag"] == intent]

    scores = []
    candidates_indexes = []
    for item in score_lst_sorted:
        row_idx = item[1][0]
        if row_idx in rows_with_intent:
            scores.append(item[0])
            candidates_indexes.append(row_idx)

    return scores[:top], candidates_indexes[:top]
|
| 138 |
|
| 139 |
|
| 140 |
+
# ===================================================
|
| 141 |
+
|
| 142 |
+
|
| 143 |
def candidates_reranking(
|
| 144 |
top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
|
| 145 |
):
|
|
|
|
| 163 |
return reranked_idx
|
| 164 |
|
| 165 |
|
| 166 |
+
# ===================================================
|
| 167 |
+
|
| 168 |
+
|
| 169 |
def read_files_negative(path1, path2):
|
| 170 |
"""this functions creates training dataset for classifier incl negative
|
| 171 |
examples and saves it to the pickle file"""
|
|
|
|
| 198 |
fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
|
| 199 |
fin_scripts = fin_scripts[fin_scripts["question"] != ""]
|
| 200 |
fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
|
| 201 |
+
fin_scripts["combined_all"] = (
|
| 202 |
fin_scripts["context"]
|
| 203 |
+ "[SEP]"
|
| 204 |
+ fin_scripts["question"]
|
| 205 |
+ "[SEP]"
|
| 206 |
+ fin_scripts["answer"]
|
| 207 |
)
|
| 208 |
+
|
| 209 |
+
fin_scripts["combined_cq"] = (
|
| 210 |
+
fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
|
| 211 |
+
)
|
| 212 |
# fin_scripts = fin_scripts.dropna(how='any')
|
| 213 |
fin_scripts.to_pickle("data/scripts_for_reranker.pkl")
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# ===================================================
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def intent_classification(question, answer, tag_model):
    """Return the intent tag for a question/answer pair.

    ``answer`` is lower-cased and tokenized; if ANY token is a known
    greeting word the fixed tag ``"greetings"`` is returned. Otherwise
    the tag is whatever ``tag_model.predict_tag(question)`` returns,
    including when ``answer`` yields no tokens at all.

    Args:
        question: text handed to the tagging model.
        answer: text scanned for greeting words.
        tag_model: object exposing ``predict_tag(text)``.

    Returns:
        ``"greetings"`` or the tag predicted by ``tag_model``.
    """
    # NOTE(review): scripts_rework stores the result of this function in
    # the "tag" column of data/scripts.pkl; regenerate that file if the
    # stored tags were produced by a different version of this logic.
    greetings = {"hi", "hello", "greeting", "greetings", "hii", "helo", "hellow"}
    tokens = word_tokenize(answer.lower())

    # Look at every token, not just the first one, before falling back
    # to the model.
    if any(token in greetings for token in tokens):
        return "greetings"

    return tag_model.predict_tag(question)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
# ===================================================
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def change_names(sentences):
    """Replace the names of Sheldon's friends with "my friend".

    The text is tokenized with ``word_tokenize`` and glued back together:
    a token found in the punctuation string is attached directly to the
    preceding text, every other token (including the replacement) is
    preceded by one space. Leading/trailing whitespace is stripped.

    Args:
        sentences: the text to rewrite.

    Returns:
        The rewritten text.
    """
    lst_punct = string.punctuation + "’"
    # NOTE(review): "Dr. Stephanie" presumably never matches because the
    # tokenizer yields it as separate tokens — verify.
    sheldon_friends = {
        "Penny",
        "Amy",
        "Leonard",
        "Stephanie",
        "Dr. Stephanie",
        "Raj",
        "Rebecca",
    }

    pieces = []
    for token in word_tokenize(sentences):
        if token in sheldon_friends:
            # The replacement needs the same leading space as any other
            # word; without it "Hi Leonard" became "Himy friend".
            pieces.append(" my friend")
        elif token in lst_punct:
            pieces.append(token)
        else:
            pieces.append(f" {token}")

    return "".join(pieces).strip()
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
# ===================================================
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def data_prep_biencoder(path1, path2):
    """Create the bi-encoder training set and save it as a pickle.

    Every row of ``data/scripts.pkl`` (question, context, answer) gets
    label 1 and a ``neg_answer`` drawn from a shuffled pool made of the
    ``line`` column of *path2*, the ``dialogue`` column of the files in
    *path1* and up to 7062 randomly sampled questions from the scripts
    themselves. Rows with an empty question or answer are dropped, the
    context list is joined into one string, and a ``combined`` column
    ``context + "[SEP]" + question`` is added. The result is written to
    ``data/scripts_for_biencoder.pkl``.

    Args:
        path1: iterable of CSV paths read with ``sep='"'``; each file
            must provide a ``dialogue`` column.
        path2: CSV path providing a ``line`` column.
    """
    star_wars = [
        pd.read_csv(file, sep='"', on_bad_lines="warn") for file in path1
    ]
    total = pd.concat(star_wars, ignore_index=True)

    rick_and_morty = pd.read_csv(path2)
    negative_lines_to_add = list(rick_and_morty["line"])
    negative_lines_to_add.extend(list(total["dialogue"]))

    scripts_reopened = pd.read_pickle("data/scripts.pkl")
    questions = list(
        scripts_reopened[scripts_reopened["question"] != ""]["question"]
    )
    # random.sample raises ValueError when asked for more items than
    # exist, so cap the sample size at the number of available questions.
    source = random.sample(questions, min(7062, len(questions)))
    negative_lines_to_add.extend(source)
    random.shuffle(negative_lines_to_add)

    # Work on an explicit copy: assigning columns to a slice of another
    # DataFrame triggers pandas' SettingWithCopyWarning.
    scripts_negative = scripts_reopened[["question", "context", "answer"]].copy()
    scripts_negative["label"] = 1

    scripts_negative["neg_answer"] = negative_lines_to_add[0 : len(scripts_negative)]

    fin_scripts = scripts_negative.sample(frac=1).reset_index(drop=True)
    fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
    fin_scripts = fin_scripts[fin_scripts["question"] != ""]
    fin_scripts = fin_scripts[fin_scripts["answer"] != ""].copy()

    fin_scripts["combined"] = fin_scripts["context"] + "[SEP]" + fin_scripts["question"]
    # fin_scripts = fin_scripts.dropna(how='any')
    fin_scripts.to_pickle("data/scripts_for_biencoder.pkl")
|