Spaces:
Runtime error
Runtime error
shakhovak committed on
Commit ·
706771c
1
Parent(s): fe6f530
added files
Browse files- Dockerfile +25 -0
- app.py +36 -0
- data/scripts.pkl +3 -0
- data/scripts_reworked.pkl +3 -0
- data/scripts_vectors.pkl +3 -0
- generate_bot.py +84 -0
- requirements.txt +12 -0
- static/style.css +223 -0
- templates/chat.html +80 -0
- utils.py +256 -0
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pinned Python base image matching the app's tested runtime.
FROM python:3.9.13

WORKDIR /app

# Copy only the requirements first so dependency installation is cached
# as its own layer and not invalidated by source-code changes.
COPY requirements.txt /app

RUN pip install --no-cache-dir -r requirements.txt

COPY . /app

# Create a non-root user with UID 1000 (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user

USER user

# Make the user's local bin (pip --user installs) visible on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Re-copy the app owned by the non-root user so it is writable at runtime.
COPY --chown=user . $HOME/app

#CMD ["gunicorn", "--timeout", "1000", "app:app", "-b", "0.0.0.0:5000"]
#CMD ["python", "app.py"]
# Serve with gunicorn + gevent workers on port 7860 (HF Spaces default port).
CMD ["gunicorn", "--timeout", "1000", "--workers", "2", "--worker-class", "gevent", "--worker-connections" , "100", "app:app", "-b", "0.0.0.0:7860"]
|
app.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request
|
| 2 |
+
from generate_bot import ChatBot
|
| 3 |
+
import asyncio
|
| 4 |
+
|
| 5 |
+
app = Flask(__name__)

# Instantiate and load the chatbot once at import time so every gunicorn
# worker pays the model-loading cost a single time, not per request.
chatSheldon = ChatBot()
chatSheldon.load()

# This script runs the Flask application serving the chat UI and API.
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@app.route("/")
async def index():
    """Serve the single-page chat UI."""
    page = render_template("chat.html")
    return page
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
async def sleep():
    """Yield control to the event loop for a tenth of a second.

    Returns the delay that was awaited, so callers can observe it.
    """
    delay = 0.1
    await asyncio.sleep(delay)
    return delay
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@app.route("/get", methods=["GET", "POST"])
async def chat():
    """Handle a chat message posted by the UI and return the bot's reply.

    Expects a form field ``msg`` (sent by the jQuery AJAX call in
    templates/chat.html).  Briefly yields to the event loop so other
    requests can be served while generation is pending.

    Fix: the local variable was named ``input``, shadowing the builtin;
    renamed to ``user_message`` (no interface change — it was local only).
    """
    user_message = request.form["msg"]
    # Yield to the event loop before the (blocking) generation call.
    await asyncio.gather(sleep(), sleep())
    return get_Chat_response(user_message)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_Chat_response(text):
    """Delegate to the globally loaded ChatBot and return its reply string."""
    return chatSheldon.generate_response(text)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
if __name__ == "__main__":
    # Development entry point only; production serving is done by gunicorn
    # (see the Dockerfile CMD).
    app.run(debug=True, host="0.0.0.0")
|
data/scripts.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02be23abb73d94025637264be0813338fba80a81eb1a95074f3437d61392cc73
|
| 3 |
+
size 2099433
|
data/scripts_reworked.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c0329681a10e682750117eb86c9eace5ef79af5e1c113f0af383ef814bba405
|
| 3 |
+
size 7686225
|
data/scripts_vectors.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1865b58f9fc16255786cfee8be7f6c120e3986ae1dc7012e07d1cee9f77bdb77
|
| 3 |
+
size 67336895
|
generate_bot.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import deque
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 3 |
+
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from utils import generate_response
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import pickle
|
| 7 |
+
from utils import encode_rag, cosine_sim_rag, top_candidates
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ChatBot:
    """Retrieval-augmented chat bot.

    A SentenceTransformer ranking model retrieves the closest scripted
    answer from the pickled script database; when the match is strong
    enough, that answer is passed as extra context ("RAG answer") to a
    seq2seq generative model which produces the final reply.
    """

    def __init__(self):
        # Rolling window of the last 10 turns (user + bot), used as context.
        self.conversation_history = deque([], maxlen=10)
        self.generative_model = None
        self.generative_tokenizer = None
        self.vect_data = []      # precomputed script vectors (list of arrays)
        self.scripts = []        # DataFrame with answer/question/context rows
        self.ranking_model = None

    def load(self):
        """Load datasets and models; call this once before generating.

        Data is read from the local ``data`` folder; models are downloaded
        from the Hugging Face hub.
        """
        with open("data/scripts_vectors.pkl", "rb") as fp:
            self.vect_data = pickle.load(fp)
        self.scripts = pd.read_pickle("data/scripts.pkl")
        self.ranking_model = SentenceTransformer(
            "Shakhovak/chatbot_sentence-transformer"
        )
        self.generative_model = AutoModelForSeq2SeqLM.from_pretrained(
            "Shakhovak/flan-t5-base-sheldon-chat-v2"
        )
        self.generative_tokenizer = AutoTokenizer.from_pretrained(
            "Shakhovak/flan-t5-base-sheldon-chat-v2"
        )

    def generate_response(self, utterance):
        """Generate a reply to ``utterance`` and update the history.

        Returns the generated answer string.
        """
        query_encoding = encode_rag(
            texts=utterance,
            model=self.ranking_model,
            contexts=self.conversation_history,
        )
        bot_cosine_scores = cosine_sim_rag(
            self.vect_data,
            query_encoding,
        )
        top_scores, top_indexes = top_candidates(
            bot_cosine_scores, initial_data=self.scripts
        )

        # Fixes: guard against an empty candidate list (previously raised
        # IndexError) and take the final candidate's answer directly instead
        # of a loop that merely overwrote ``rag_answer`` on each iteration.
        if top_scores and top_scores[0] >= 0.89:  # 0.89 = retrieval threshold
            rag_answer = self.scripts.iloc[top_indexes[-1]]["answer"]
            answer = generate_response(
                model=self.generative_model,
                tokenizer=self.generative_tokenizer,
                question=utterance,
                context=self.conversation_history,
                top_p=0.9,
                temperature=0.95,
                rag_answer=rag_answer,
            )
        else:
            answer = generate_response(
                model=self.generative_model,
                tokenizer=self.generative_tokenizer,
                question=utterance,
                context=self.conversation_history,
                top_p=0.9,
                temperature=0.95,
            )

        self.conversation_history.append(utterance)
        self.conversation_history.append(answer)
        return answer
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# katya = ChatBot()
|
| 83 |
+
# katya.load()
|
| 84 |
+
# print(katya.generate_response("What is he doing there?"))
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.1
|
| 2 |
+
flask[async]==3.0.2
|
| 3 |
+
datasets==2.17.1
|
| 4 |
+
transformers==4.38.1
|
| 5 |
+
gunicorn==21.2.0
|
| 6 |
+
gevent>=1.4
|
| 7 |
+
requests==2.31.0
|
| 8 |
+
scikit-learn==1.4.1.post1
|
| 9 |
+
scipy==1.12.0
|
| 10 |
+
numpy==1.26.4
|
| 11 |
+
torch==2.2.1
|
| 12 |
+
sentence-transformers==2.3.1
|
static/style.css
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Chat UI stylesheet: dark gradient background, Bootstrap-card chat window. */

/* Page background: plain rgb fallback, then vendor-prefixed and standard
   gradients (later declarations win where supported). */
body,html{
	height: 100%;
	margin: 0;
	background: rgb(44, 47, 59);
	background: -webkit-linear-gradient(to right, rgb(40, 59, 34), rgb(54, 60, 70), rgb(32, 32, 43));
	background: linear-gradient(to right, rgb(38, 51, 61), rgb(50, 55, 65), rgb(33, 33, 78));
}

/* --- Chat window layout --- */
.chat{
	margin-top: auto;
	margin-bottom: auto;
}
.card{
	height: 500px;
	border-radius: 15px !important;
	background-color: rgba(0,0,0,0.4) !important;
}
.contacts_body{
	padding: 0.75rem 0 !important;
	overflow-y: auto;
	white-space: nowrap;
}
.msg_card_body{
	overflow-y: auto;
}
.card-header{
	border-radius: 15px 15px 0 0 !important;
	border-bottom: 0 !important;
}
.card-footer{
	border-radius: 0 0 15px 15px !important;
	border-top: 0 !important;
}
.container{
	align-content: center;
}

/* --- Search and message input controls --- */
.search{
	border-radius: 15px 0 0 15px !important;
	background-color: rgba(0,0,0,0.3) !important;
	border:0 !important;
	color:white !important;
}
.search:focus{
	box-shadow:none !important;
	outline:0px !important;
}
.type_msg{
	background-color: rgba(0,0,0,0.3) !important;
	border:0 !important;
	color:white !important;
	height: 60px !important;
	overflow-y: auto;
}
.type_msg:focus{
	box-shadow:none !important;
	outline:0px !important;
}
.attach_btn{
	border-radius: 15px 0 0 15px !important;
	background-color: rgba(0,0,0,0.3) !important;
	border:0 !important;
	color: white !important;
	cursor: pointer;
}
.send_btn{
	border-radius: 0 15px 15px 0 !important;
	background-color: rgba(0,0,0,0.3) !important;
	border:0 !important;
	color: white !important;
	cursor: pointer;
}
.search_btn{
	border-radius: 0 15px 15px 0 !important;
	background-color: rgba(0,0,0,0.3) !important;
	border:0 !important;
	color: white !important;
	cursor: pointer;
}

/* --- Contacts list (unused by chat.html but kept for the template) --- */
.contacts{
	list-style: none;
	padding: 0;
}
.contacts li{
	width: 100% !important;
	padding: 5px 10px;
	margin-bottom: 15px !important;
}
.active{
	background-color: rgba(0,0,0,0.3);
}

/* --- Avatars and presence indicators --- */
.user_img{
	height: 70px;
	width: 70px;
	border:1.5px solid #f5f6fa;

}
.user_img_msg{
	height: 40px;
	width: 40px;
	border:1.5px solid #f5f6fa;

}
.img_cont{
	position: relative;
	height: 70px;
	width: 70px;
}
.img_cont_msg{
	height: 40px;
	width: 40px;
}
.online_icon{
	position: absolute;
	height: 15px;
	width:15px;
	background-color: #4cd137;
	border-radius: 50%;
	bottom: 0.2em;
	right: 0.4em;
	border:1.5px solid white;
}
.offline{
	background-color: #c23616 !important;
}

/* --- Header user info --- */
.user_info{
	margin-top: auto;
	margin-bottom: auto;
	margin-left: 15px;
}
.user_info span{
	font-size: 20px;
	color: white;
}
.user_info p{
	font-size: 10px;
	color: rgba(255,255,255,0.6);
}
.video_cam{
	margin-left: 50px;
	margin-top: 5px;
}
.video_cam span{
	color: white;
	font-size: 20px;
	cursor: pointer;
	margin-right: 20px;
}

/* --- Message bubbles ("cotainer" spelling kept: referenced by chat.html) --- */
.msg_cotainer{
	margin-top: auto;
	margin-bottom: auto;
	margin-left: 10px;
	border-radius: 25px;
	background-color: rgb(82, 172, 255);
	padding: 10px;
	position: relative;
}
.msg_cotainer_send{
	margin-top: auto;
	margin-bottom: auto;
	margin-right: 10px;
	border-radius: 25px;
	background-color: #58cc71;
	padding: 10px;
	position: relative;
}
.msg_time{
	position: absolute;
	left: 0;
	bottom: -15px;
	color: rgba(255,255,255,0.5);
	font-size: 10px;
}
.msg_time_send{
	position: absolute;
	right:0;
	bottom: -15px;
	color: rgba(255,255,255,0.5);
	font-size: 10px;
}
.msg_head{
	position: relative;
}

/* --- Action (kebab) menu --- */
#action_menu_btn{
	position: absolute;
	right: 10px;
	top: 10px;
	color: white;
	cursor: pointer;
	font-size: 20px;
}
.action_menu{
	z-index: 1;
	position: absolute;
	padding: 15px 0;
	background-color: rgba(0,0,0,0.5);
	color: white;
	border-radius: 15px;
	top: 30px;
	right: 15px;
	display: none;
}
.action_menu ul{
	list-style: none;
	padding: 0;
	margin: 0;
}
.action_menu ul li{
	width: 100%;
	padding: 10px 15px;
	margin-bottom: 5px;
}
.action_menu ul li i{
	padding-right: 10px;
}
.action_menu ul li:hover{
	cursor: pointer;
	background-color: rgba(0,0,0,0.2);
}

/* --- Responsive tweaks for small screens --- */
@media(max-width: 576px){
	.contacts_card{
		margin-bottom: 15px !important;
	}
}
|
templates/chat.html
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html>
<head>
	<title>Chatbot</title>
	<!-- Fix: resources moved inside <head>; nothing may precede <!DOCTYPE html>.
	     Fix: a single jQuery (3.3.1) — the page previously loaded 3.2.1 and
	     3.3.1, and Bootstrap's JS before jQuery existed. -->
	<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
	<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.5.0/css/all.css" integrity="sha384-B4dIYHKNBt8Bc12p+WXckhzcICo0wtJAoU8YZTY5qE0Id1GSseTk6S+L3BlXeVIU" crossorigin="anonymous">
	<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
	<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"></script>
	<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css')}}"/>
</head>


<body>
	<div class="container-fluid h-100">
		<div class="row justify-content-center h-100">
			<div class="col-md-8 col-xl-6 chat">
				<div class="card">
					<div class="card-header msg_head">
						<div class="d-flex bd-highlight">
							<div class="img_cont">
								<img src="https://stickerpacks.ru/wp-content/uploads/2023/04/nabor-stikerov-teorija-bolshogo-vzryva-5-dlja-telegram-3.webp" class="rounded-circle user_img">
								<span class="online_icon"></span>
							</div>
							<div class="user_info">
								<span>ChatBot</span>
								<p>Ask me anything!</p>
							</div>
						</div>
					</div>
					<div id="messageFormeight" class="card-body msg_card_body">

					</div>
					<div class="card-footer">
						<form id="messageArea" class="input-group">
							<input type="text" id="text" name="msg" placeholder="Type your message..." autocomplete="off" class="form-control type_msg" required/>
							<div class="input-group-append">
								<button type="submit" id="send" class="input-group-text send_btn"><i class="fas fa-location-arrow"></i></button>
							</div>
						</form>
					</div>
				</div>
			</div>
		</div>
	</div>

	<script>
		$(document).ready(function() {
			$("#messageArea").on("submit", function(event) {
				event.preventDefault();
				const date = new Date();
				const hour = date.getHours();
				// Fix: zero-pad the minutes so 9:05 does not render as "9:5".
				const minute = String(date.getMinutes()).padStart(2, "0");
				const str_time = hour + ":" + minute;
				var rawText = $("#text").val();
				// Fix: escape user input before injecting it into the DOM (XSS).
				var safeText = $("<div>").text(rawText).html();

				var userHtml = '<div class="d-flex justify-content-end mb-4"><div class="msg_cotainer_send">' + safeText + '<span class="msg_time_send">' + str_time + '</span></div><div class="img_cont_msg"><img src="https://i.ibb.co/d5b84Xw/Untitled-design.png" class="rounded-circle user_img_msg"></div></div>';

				$("#text").val("");
				$("#messageFormeight").append(userHtml);

				$.ajax({
					data: {
						msg: rawText,
					},
					type: "POST",
					url: "/get",
				}).done(function(data) {
					// NOTE(review): the server reply is injected as HTML and is
					// assumed trusted; escape it too if the model can echo input.
					var botHtml = '<div class="d-flex justify-content-start mb-4"><div class="img_cont_msg"><img src="https://stickerpacks.ru/wp-content/uploads/2023/04/nabor-stikerov-teorija-bolshogo-vzryva-5-dlja-telegram-3.webp" class="rounded-circle user_img_msg"></div><div class="msg_cotainer">' + data + '<span class="msg_time">' + str_time + '</span></div></div>';
					$("#messageFormeight").append($.parseHTML(botHtml));
				});
			});
		});
	</script>

</body>
</html>
|
utils.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
+
from scipy import sparse
|
| 5 |
+
import pickle
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def scripts_rework(path, character, output_path="data/scripts_reworked.pkl"):
    """FOR GENERATIVE MODEL TRAINING.

    Split raw scripts into (question, answer, context) rows for *character*,
    augment each row with progressively shorter context suffixes, and save
    the result as a pickle.

    Fixes:
      * scene counting used chained indexing ``df.iloc[i][col] = v``, which
        assigns to a temporary copy and is silently lost — now ``df.loc``;
      * the context loop indexed ``df.iloc[index - i]`` with a
        non-short-circuiting ``&`` before the bound check, so a negative
        index wrapped to the end of the frame — the bound is checked first;
      * rows are collected in a list instead of O(n^2) ``pd.concat`` calls.

    Args:
        path: CSV file with ``person_scene`` and ``dialogue`` columns.
        character: name of the character whose lines become answers.
        output_path: where to write the pickle (new, defaulted parameter;
            the default preserves the previous behaviour).

    Returns:
        The reworked ``pandas.DataFrame`` (also written to ``output_path``).
    """
    df = pd.read_csv(path)

    # Number the scenes; rows whose person_scene == "Scene" mark boundaries.
    count = 0
    df["scene_count"] = ""
    for index, row in df.iterrows():
        if index != 0 and row["person_scene"] == "Scene":
            count += 1
        df.loc[index, "scene_count"] = count

    df = df.dropna().reset_index()

    # Keep only the character's utterances together with their context.
    rows = []
    for index, row in df.iterrows():
        if row["person_scene"] != character:
            continue
        if df.iloc[index - 1]["person_scene"] != "Scene":
            context = []
            for i in reversed(range(2, 6)):
                # Bound check BEFORE indexing (negative iloc wraps around).
                if index - i >= 0 and df.iloc[index - i]["person_scene"] != "Scene":
                    context.append(df.iloc[index - i]["dialogue"])
                else:
                    break
            # Augmentation: one row per context suffix ...
            for j in range(len(context)):
                rows.append(
                    {
                        "answer": row["dialogue"],
                        "question": df.iloc[index - 1]["dialogue"],
                        "context": context[j:],
                    }
                )
            # ... plus one row with no context at all.
            rows.append(
                {
                    "answer": row["dialogue"],
                    "question": df.iloc[index - 1]["dialogue"],
                    "context": [],
                }
            )
        else:
            # Line opens a scene: no preceding question; filtered out below.
            rows.append({"answer": row["dialogue"], "question": "", "context": []})

    scripts = pd.DataFrame(rows, columns=["answer", "question", "context"])
    scripts = scripts[scripts["question"] != ""]
    scripts["context"] = scripts["context"].apply(lambda x: "".join(x))
    scripts = scripts.reset_index(drop=True)
    scripts.to_pickle(output_path)
    return scripts
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ===================================================
|
| 72 |
+
def scripts_rework_ranking(path, character, output_path="data/scripts.pkl"):
    """FOR RAG RETRIEVAL.

    Split raw scripts into (question, answer, context) rows for *character*
    (context kept as a list, no augmentation) and save them as a pickle.

    Fixes (same as ``scripts_rework``): ``df.loc`` instead of lost chained
    assignment; bound check before negative ``iloc`` indexing; list-collect
    instead of O(n^2) ``pd.concat`` in a loop.

    Args:
        path: CSV file with ``person_scene`` and ``dialogue`` columns.
        character: name of the character whose lines become answers.
        output_path: where to write the pickle (new, defaulted parameter;
            the default preserves the previous behaviour).

    Returns:
        The reworked ``pandas.DataFrame`` (also written to ``output_path``).
    """
    df = pd.read_csv(path)

    # Number the scenes; rows whose person_scene == "Scene" mark boundaries.
    count = 0
    df["scene_count"] = ""
    for index, row in df.iterrows():
        if index != 0 and row["person_scene"] == "Scene":
            count += 1
        df.loc[index, "scene_count"] = count

    df = df.dropna().reset_index()

    # Keep only the character's utterances together with their context.
    rows = []
    for index, row in df.iterrows():
        if row["person_scene"] != character:
            continue
        if df.iloc[index - 1]["person_scene"] != "Scene":
            context = []
            for i in reversed(range(2, 5)):
                # Bound check BEFORE indexing (negative iloc wraps around).
                if index - i >= 0 and df.iloc[index - i]["person_scene"] != "Scene":
                    context.append(df.iloc[index - i]["dialogue"])
                else:
                    break
            rows.append(
                {
                    "answer": row["dialogue"],
                    "question": df.iloc[index - 1]["dialogue"],
                    "context": context,
                }
            )
        else:
            # Line opens a scene: no preceding question; filtered out below.
            rows.append({"answer": row["dialogue"], "question": "", "context": []})

    scripts = pd.DataFrame(rows, columns=["answer", "question", "context"])
    scripts = scripts[scripts["question"] != ""]
    scripts = scripts.reset_index(drop=True)
    scripts.to_pickle(output_path)
    return scripts
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ===================================================
|
| 126 |
+
def encode(texts, model, contexts=None, do_norm=True):
    """Encode a question plus its context for cosine-similarity search.

    A list context is joined into one string before encoding; anything else
    (string or None) is passed to the model as-is.  The context embedding is
    concatenated before the question embedding along the last axis.
    """
    question_vecs = model.encode(texts)
    joined = "".join(contexts) if type(contexts) is list else contexts
    context_vecs = model.encode(joined)

    parts = [np.asarray(context_vecs), np.asarray(question_vecs)]
    return np.concatenate(parts, axis=-1)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def encode_rag(texts, model, contexts=None, do_norm=True):
    """Encode a RAG query: context embedding ++ question embedding.

    ``contexts`` is always joined into a single string (so it must be an
    iterable of strings); the two embeddings are concatenated along the
    last axis.
    """
    question_vecs = model.encode(texts)
    context_vecs = model.encode("".join(contexts))

    parts = [np.asarray(context_vecs), np.asarray(question_vecs)]
    return np.concatenate(parts, axis=-1)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ===================================================
|
| 160 |
+
def encode_df_save(
    model,
    scripts_path="data/scripts.pkl",
    output_path="data/scripts_vectors.pkl",
):
    """FOR RAG RETRIEVAL DATABASE.

    Vectorize the reworked scripts and pickle the vectors to be used as the
    retrieval base by the ranking script.

    Improvements: the two branches of the original if/else were redundant —
    ``encode`` already joins list contexts itself — and the file paths are
    now parameters (defaults preserve the previous behaviour).

    Args:
        model: sentence encoder exposing ``encode``.
        scripts_path: pickle with the reworked scripts dataframe.
        output_path: where to write the pickled list of vectors.

    Returns:
        The list of encoded vectors (also written to ``output_path``).
    """
    scripts_reopened = pd.read_pickle(scripts_path)
    vect_data = []
    for _, row in scripts_reopened.iterrows():
        # ``encode`` joins a list context internally, so no branching needed.
        vect = encode(texts=row["question"], model=model, contexts=row["context"])
        vect_data.append(vect)
    with open(output_path, "wb") as f:
        pickle.dump(vect_data, f)
    return vect_data
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ===================================================
|
| 187 |
+
def cosine_sim(answer_true_vectros, answer_generated_vectors) -> list:
    """FOR MODEL EVALUATION.

    Return the cosine similarity between the generated answer vector and
    the reference answer vector (a single float).
    """
    reference_emb = sparse.csr_matrix(answer_true_vectros)
    generated_emb = sparse.csr_matrix(answer_generated_vectors)
    sims = cosine_similarity(generated_emb, reference_emb).flatten()
    return sims[0]
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
# ===================================================
|
| 198 |
+
def cosine_sim_rag(data_vectors, query_vectors) -> list:
    """FOR RAG RETRIEVAL RANKS.

    Return a list of ``(score, [row_index])`` tuples, sorted by descending
    similarity, pairing each script row with its cosine similarity to the
    query.

    Fix: the original used ``np.argwhere(similarity)``, which yields the
    indices of *non-zero* entries only — rows with exactly zero similarity
    were dropped AND, worse, the remaining scores were zipped against the
    wrong indices whenever a zero occurred.  Indices are now enumerated
    explicitly so score ``i`` is always paired with row ``i``.
    """
    data_emb = sparse.csr_matrix(data_vectors)
    query_emb = sparse.csr_matrix(query_vectors)
    similarity = cosine_similarity(query_emb, data_emb).flatten()
    # Keep the [i] list shape the old argwhere-based code produced, so
    # downstream consumers (top_candidates) are unaffected.
    indices = [[i] for i in range(len(similarity))]
    match = sorted(zip(similarity, indices), reverse=True)

    return match
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# ===================================================
|
| 213 |
+
def generate_response(
    model,
    tokenizer,
    question,
    context,
    top_p,
    temperature,
    rag_answer="",
):
    """Generate a reply with the seq2seq model.

    The prompt packs the retrieved answer (if any) and the running
    conversation context ahead of the question, separated by ``</s>``;
    sampling parameters are passed through to ``model.generate``.
    """
    prompt = "".join(
        ["context:", rag_answer, "".join(context), "</s>", "question: ", question]
    )
    encoded = tokenizer.encode(prompt, return_tensors="pt")
    sampled = model.generate(
        encoded,
        do_sample=True,
        max_length=1000,
        top_p=top_p,
        temperature=temperature,
        repetition_penalty=2.0,
        top_k=50,
        no_repeat_ngram_size=4,
    )

    # Drop the leading pad token, then cut anything after a stray </s>.
    text = tokenizer.decode(sampled[0][1:], skip_special_tokens=True)
    marker = text.find("</s>")
    if marker != -1:
        text = text[:marker].strip()

    return text
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# ===================================================
|
| 250 |
+
def top_candidates(score_lst_sorted, initial_data, top=1):
    """Return the ``top`` best scores and their dataframe row indices.

    ``score_lst_sorted`` is the descending ``(score, [index])`` list produced
    by ``cosine_sim_rag``.  ``initial_data`` is unused but kept for interface
    compatibility with existing callers.
    """
    best = score_lst_sorted[:top]
    top_scores = [score for score, _ in best]
    top_indexes = [index_box[0] for _, index_box in best]
    return top_scores, top_indexes
|