Update app.py
Browse files
app.py
CHANGED
|
@@ -32,6 +32,9 @@ nltk.download('stopwords')
|
|
| 32 |
# Take Rachel as main character
|
| 33 |
df = pd.read_csv("rachel_friends.csv") # read the database into a data frame
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
# Define function for text normalization
|
| 36 |
def text_normalization(text):
|
| 37 |
text = str(text).lower() # convert to all lower letters
|
|
@@ -126,7 +129,7 @@ def chat_tfidf_context(question, history):
|
|
| 126 |
answer = df['answer'].loc[index_value]
|
| 127 |
|
| 128 |
return answer
|
| 129 |
-
|
| 130 |
punkt = [p for p in punctuation] + ["`", "``" ,"''", "'"]
|
| 131 |
|
| 132 |
def tokenize(sent: str) -> str:
|
|
@@ -201,7 +204,7 @@ def chat_word2vec_context(question, history):
|
|
| 201 |
|
| 202 |
return answer
|
| 203 |
|
| 204 |
-
|
| 205 |
|
| 206 |
# Let's try bert model by elastic and with e5
|
| 207 |
model_name = "distilbert/distilbert-base-uncased"
|
|
@@ -296,23 +299,144 @@ def chat_bert_context(question, history):
|
|
| 296 |
answer = df['answer'].loc[relevant_indice]
|
| 297 |
|
| 298 |
return answer
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
# gradio part
|
| 301 |
def echo(message, history, model):
|
| 302 |
|
| 303 |
-
|
| 304 |
# answer = chat_tfidf(message)
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
# answer = chat_word2vec(message)
|
| 310 |
-
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
|
| 317 |
|
| 318 |
|
|
@@ -321,7 +445,7 @@ title = "Chatbot who speaks like Rachel from Friends"
|
|
| 321 |
description = "You have a good opportunity to have a dialog with friend's actor - Rachel Green"
|
| 322 |
|
| 323 |
# model = gr.CheckboxGroup(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Model", info="What model do you want to use?", value="TF-IDF")
|
| 324 |
-
model = gr.Dropdown(["TF-IDF", "W2V", "BERT", "
|
| 325 |
|
| 326 |
with gr.Blocks() as demo:
|
| 327 |
|
|
|
|
| 32 |
# Take Rachel as main character
|
| 33 |
df = pd.read_csv("rachel_friends.csv") # read the database into a data frame
|
| 34 |
|
| 35 |
+
|
| 36 |
+
#-------------------------------------TF-IDF------------------------------------------#
|
| 37 |
+
|
| 38 |
# Define function for text normalization
|
| 39 |
def text_normalization(text):
|
| 40 |
text = str(text).lower() # convert to all lower letters
|
|
|
|
| 129 |
answer = df['answer'].loc[index_value]
|
| 130 |
|
| 131 |
return answer
|
| 132 |
+
#-------------------------------------W2V------------------------------------------#
|
| 133 |
punkt = [p for p in punctuation] + ["`", "``" ,"''", "'"]
|
| 134 |
|
| 135 |
def tokenize(sent: str) -> str:
|
|
|
|
| 204 |
|
| 205 |
return answer
|
| 206 |
|
| 207 |
+
#-------------------------------------BERT------------------------------------------#
|
| 208 |
|
| 209 |
# Let's try bert model by elastic and with e5
|
| 210 |
model_name = "distilbert/distilbert-base-uncased"
|
|
|
|
| 299 |
answer = df['answer'].loc[relevant_indice]
|
| 300 |
|
| 301 |
return answer
|
| 302 |
+
|
| 303 |
+
#-------------------------------------Bi-BERT-Encoder------------------------------------------#
|
| 304 |
+
|
| 305 |
+
# Define function for mean-pooling
|
| 306 |
+
def mean_pool(token_embeds: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
|
| 307 |
+
in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
|
| 308 |
+
pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
|
| 309 |
+
return pool
|
| 310 |
+
|
| 311 |
+
# Define function for tokenization of the sentence and encoding it
|
| 312 |
+
def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
|
| 313 |
+
) -> torch.tensor:
|
| 314 |
+
|
| 315 |
+
model.eval()
|
| 316 |
+
tokenized_texts = tokenizer(input_texts, max_length=128,
|
| 317 |
+
padding='max_length', truncation=True, return_tensors="pt")
|
| 318 |
+
token_embeds = model(tokenized_texts["input_ids"].to(device),
|
| 319 |
+
tokenized_texts["attention_mask"].to(device)).last_hidden_state
|
| 320 |
+
pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
|
| 321 |
+
return pooled_embeds
|
| 322 |
+
|
| 323 |
+
# Define architecture for bi-bert-encoder
|
| 324 |
+
class Sbert(torch.nn.Module):
|
| 325 |
+
def __init__(self, max_length: int = 128):
|
| 326 |
+
super().__init__()
|
| 327 |
+
self.max_length = max_length
|
| 328 |
+
self.bert_model = AutoModel.from_pretrained('distilbert-base-uncased')
|
| 329 |
+
self.bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
| 330 |
+
self.linear = torch.nn.Linear(self.bert_model.config.hidden_size * 3, 1)
|
| 331 |
+
# self.sigmoid = torch.nn.Sigmoid()
|
| 332 |
+
|
| 333 |
+
def forward(self, data: datasets.arrow_dataset.Dataset) -> torch.tensor:
|
| 334 |
+
question_input_ids = data["question_input_ids"].to(device)
|
| 335 |
+
question_attention_mask = data["question_attention_mask"].to(device)
|
| 336 |
+
answer_input_ids = data["answer_input_ids"].to(device)
|
| 337 |
+
answer_attention_mask = data["answer_attention_mask"].to(device)
|
| 338 |
+
|
| 339 |
+
out_question = self.bert_model(question_input_ids, question_attention_mask)
|
| 340 |
+
out_answer = self.bert_model(answer_input_ids, answer_attention_mask)
|
| 341 |
+
question_embeds = out_question.last_hidden_state
|
| 342 |
+
answer_embeds = out_answer.last_hidden_state
|
| 343 |
+
|
| 344 |
+
pooled_question_embeds = mean_pool(question_embeds, question_attention_mask)
|
| 345 |
+
pooled_answer_embeds = mean_pool(answer_embeds, answer_attention_mask)
|
| 346 |
+
|
| 347 |
+
embeds = torch.cat([pooled_question_embeds, pooled_answer_embeds,
|
| 348 |
+
torch.abs(pooled_question_embeds - pooled_answer_embeds)],
|
| 349 |
+
dim=-1)
|
| 350 |
+
# return self.sigmoid(self.linear(embeds))
|
| 351 |
+
return self.linear(embeds)
|
| 352 |
+
|
| 353 |
+
# Initialize the model
|
| 354 |
+
model_bi_encoder = Sbert().to(device)
|
| 355 |
+
# Load weights from training step
|
| 356 |
+
model_bi_encoder.bert_model.from_pretrained("models/friends_bi_encoder")
|
| 357 |
+
|
| 358 |
+
# Load question embeds
|
| 359 |
+
question_embeds = np.load("bi_bert_question.npy")
|
| 360 |
+
|
| 361 |
+
def chat_bi_bert(question):
|
| 362 |
+
question = encode(question, model_bi_encoder.bert_tokenizer, model_bi_encoder.bert_model, device).squeeze().cpu().detach().numpy()
|
| 363 |
+
cosine_similarities = cosine_similarity([question], question_embeds).flatten()
|
| 364 |
+
top_indice = np.argmax(cosine_similarities, axis=0)
|
| 365 |
+
answer = df['answer'].iloc[top_indice]
|
| 366 |
+
return answer
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
#-------------------------------------Bi+Cross-BERT-Encoder------------------------------------------#
|
| 371 |
+
inverted_question = dict(enumerate(df.answer.tolist()))
|
| 372 |
+
|
| 373 |
+
#Define class for CrossEncoderBert
|
| 374 |
+
class CrossEncoderBert(torch.nn.Module):
|
| 375 |
+
def __init__(self, max_length: int = 128):
|
| 376 |
+
super().__init__()
|
| 377 |
+
self.max_length = max_length
|
| 378 |
+
self.bert_model = AutoModel.from_pretrained('distilbert-base-uncased')
|
| 379 |
+
self.bert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
|
| 380 |
+
self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, 1)
|
| 381 |
+
|
| 382 |
+
def forward(self, input_ids, attention_mask):
|
| 383 |
+
outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
|
| 384 |
+
pooled_output = outputs.last_hidden_state[:, 0] # Use the CLS token's output
|
| 385 |
+
return self.linear(pooled_output)
|
| 386 |
+
|
| 387 |
+
model_cross_encoder = CrossEncoderBert().to(device)
|
| 388 |
+
model_cross_encoder.bert_model.from_pretrained("models/friends_cross_encoder")
|
| 389 |
+
|
| 390 |
+
def chat_cross_bert(question):
|
| 391 |
+
|
| 392 |
+
question_encoded = encode(question, model_bi_encoder.bert_tokenizer, model_bi_encoder.bert_model, device).squeeze().cpu().detach().numpy()
|
| 393 |
+
cosine_similarities = cosine_similarity([question_encoded], question_embeds).flatten()
|
| 394 |
+
topk_indices = np.argsort(cosine_similarities, axis=0)[::-1][:5]
|
| 395 |
+
topk_indices=topk_indices.tolist()
|
| 396 |
+
corpus = [inverted_question[ind] for ind in topk_indices]
|
| 397 |
+
|
| 398 |
+
queries = [question] * len(corpus)
|
| 399 |
+
|
| 400 |
+
tokenized_texts = model_cross_encoder.bert_tokenizer(
|
| 401 |
+
queries, corpus, max_length=128, padding=True, truncation=True, return_tensors="pt"
|
| 402 |
+
).to(device)
|
| 403 |
+
|
| 404 |
+
# Finetuned CrossEncoder model scoring
|
| 405 |
+
with torch.no_grad():
|
| 406 |
+
ce_scores = model_cross_encoder(tokenized_texts['input_ids'], tokenized_texts['attention_mask']).squeeze(-1)
|
| 407 |
+
ce_scores = torch.sigmoid(ce_scores) # Apply sigmoid if needed
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
# Process scores for finetuned model
|
| 411 |
+
scores = ce_scores.cpu().numpy()
|
| 412 |
+
scores_ix = np.argmax(scores)
|
| 413 |
+
# print(f"{corpus[scores_ix]}")
|
| 414 |
+
return corpus[scores_ix]
|
| 415 |
+
|
| 416 |
# gradio part
|
| 417 |
def echo(message, history, model):
|
| 418 |
|
| 419 |
+
if model=="TF-IDF":
|
| 420 |
# answer = chat_tfidf(message)
|
| 421 |
+
answer = chat_tfidf_context(message, history)
|
| 422 |
+
return answer
|
| 423 |
+
|
| 424 |
+
elif model=="W2V":
|
| 425 |
# answer = chat_word2vec(message)
|
| 426 |
+
answer = chat_word2vec_context(message, history)
|
| 427 |
+
return answer
|
| 428 |
+
|
| 429 |
+
elif model=="BERT":
|
| 430 |
+
answer = chat_bert_context(message, history)
|
| 431 |
+
return answer
|
| 432 |
+
|
| 433 |
+
elif model=="Bi-BERT-Encoder":
|
| 434 |
+
answer = chat_bi_bert(message)
|
| 435 |
+
return answer
|
| 436 |
|
| 437 |
+
elif model=="Bi+Cross-BERT-Encoder":
|
| 438 |
+
answer = chat_cross_bert(message)
|
| 439 |
+
return answer
|
| 440 |
|
| 441 |
|
| 442 |
|
|
|
|
| 445 |
description = "You have a good opportunity to have a dialog with friend's actor - Rachel Green"
|
| 446 |
|
| 447 |
# model = gr.CheckboxGroup(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Model", info="What model do you want to use?", value="TF-IDF")
|
| 448 |
+
model = gr.Dropdown(["TF-IDF", "W2V", "BERT", "Bi-BERT-Encoder", "Bi+Cross-BERT-Encoder"], label="Retrieval model", info="What model do you want to use?", value="TF-IDF")
|
| 449 |
|
| 450 |
with gr.Blocks() as demo:
|
| 451 |
|