Spaces:

wb-droid
/

SentenceEmbedding

Sleeping

App Files Files Community

wb-droid committed on Apr 29, 2024

Commit

93ecd47

1 Parent(s): 794d1e1

first commit

Browse files

Files changed (4) hide show

app.py +80 -0
myTextEmbedding.py +98 -0
myTextEmbeddingStudent.pt +3 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from myTextEmbedding import *
+import gradio as gr
+def generate_chunk_emb(m, chunk_data):
+    with torch.no_grad():
+        emb = m(chunk_data, device = "cpu")
+    return emb
+def search_document(s, chunk_data, chunk_emb, m, topk=3):
+    question = [s]
+    with torch.no_grad():
+        result_score = m.cos_score(m(question, device = "cpu").expand(chunk_emb.shape),chunk_emb)
+        #result_score = m.cos_score(m(question, device = "cpu"),chunk_emb)
+    print(result_score)
+    _,idxs = torch.topk(result_score,topk)
+    print([result_score.flatten()[idx] for idx in idxs.flatten().tolist()])
+    print(idxs.flatten().tolist())
+    print(chunk_data)
+    print(len(chunk_data))
+    return [chunk_data[idx] for idx in idxs.flatten().tolist() if idx < len(chunk_data)]
+# create the student training model
+class TrainStudent(nn.Module):
+    def __init__(self, student_model):
+        super().__init__()
+        self.student_model = student_model
+    def forward(self, s1, teacher_model):
+        emb_student = self.student_model(s1)
+        emb_teacher = teacher_model(s1)
+        mse = (emb_student - emb_teacher).pow(2).mean()
+        return mse
+chunk_data = generate_chunk_data(["AI","moon","brain"])
+student_model=torch.load("myTextEmbeddingStudent.pt",map_location='cpu').student_model
+# create the embedding vector database
+chunk_emb = generate_chunk_emb(student_model, chunk_data)
+#new_chunk_data = []
+#new_chunk_emb = tensor([])
+def addNewConcepts(user_concepts):
+    return user_concepts
+def search(input, user_concepts):
+    if user_concepts:
+        new_chunk_data = generate_chunk_data(user_concepts.split(","))
+        new_chunk_emb = generate_chunk_emb(student_model, new_chunk_data)
+        result = search_document(input, new_chunk_data, new_chunk_emb, student_model)
+    else:
+        result = search_document(input, chunk_data, chunk_emb, student_model)
+    return " ".join(result)
+with gr.Blocks() as demo:
+    gr.HTML("""<h1 align="center">Sentence Embedding and Vector Database</h1>""")
+    search_result = gr.Textbox(show_label=False, placeholder="Search Result", lines=8)
+    with gr.Row():
+        with gr.Column(scale=1):
+            new_concept_box = gr.Textbox(show_label=False, placeholder="Add new concepts", lines=8)
+            #addConceptBtn = gr.Button("Add concepts")
+        with gr.Column(scale=4):
+            user_input = gr.Textbox(show_label=False, placeholder="Enter question on the concept...", lines=8)
+            searchBtn = gr.Button("Search", variant="primary")
+    searchBtn.click(
+        search,
+        [user_input],
+        [search_result],
+        show_progress=True,
+    )
+    #addConceptBtn.click(addNewConcepts, [user_concepts], [new_concept_box])
+    searchBtn.click(search, inputs=[user_input, new_concept_box], outputs=[search_result], show_progress=True)
+demo.queue().launch(share=False, inbrowser=True)

myTextEmbedding.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import torch
+import torch.nn as nn
+from torch import tensor
+from transformers import BertModel, BertTokenizer
+#import gzip
+import pandas as pd
+import requests
+class EmbeddingModel(nn.Module):
+    def __init__(self, bertName = "bert-base-uncased"): # other bert models can also be supported
+        super().__init__()
+        self.bertName = bertName
+        # use BERT model
+        self.tokenizer = BertTokenizer.from_pretrained(self.bertName)
+        self.model = BertModel.from_pretrained(self.bertName)
+    def forward(self, s, device = "cuda"):
+        # get tokens, which also include attention_mask
+        tokens = self.tokenizer(s, return_tensors='pt', padding = "max_length", truncation = True, max_length = 256).to(device)
+        # get token embeddings
+        output = self.model(**tokens)
+        tokens_embeddings = output.last_hidden_state
+        #print("tokens_embeddings:" + str(tokens_embeddings.shape))
+        # mean pooling to get text embedding
+        embeddings = tokens_embeddings * tokens.attention_mask[...,None] # [B, T, emb]
+        #print("embeddings:" + str(embeddings.shape))
+        embeddings = embeddings.sum(1) # [B, emb]
+        valid_tokens = tokens.attention_mask.sum(1) # [B]
+        embeddings = embeddings / valid_tokens[...,None] # [B, emb]
+        return embeddings
+    # from scratch: nn.CosineSimilarity(dim = 1)(q,a)
+    def cos_score(self, q, a):
+        q_norm = q / (q.pow(2).sum(dim=1, keepdim=True).pow(0.5))
+        r_norm = a / (a.pow(2).sum(dim=1, keepdim=True).pow(0.5))
+        return (q_norm @ r_norm.T).diagonal()
+# contrastive training
+class TrainModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.m = EmbeddingModel("bert-base-uncased")
+    def forward(self, s1, s2, score):
+        cos_score = self.m.cos_score(self.m(s1), self.m(s2))
+        loss = nn.MSELoss()(cos_score, score)
+        return loss, cos_score
+def searchWiki(s):
+    response = requests.get(
+            'https://en.wikipedia.org/w/api.php',
+            params={
+                'action': 'query',
+                'format': 'json',
+                'titles': s,
+                'prop': 'extracts',
+                'exintro': True,
+                'explaintext': True,
+            }
+        ).json()
+    page = next(iter(response['query']['pages'].values()))
+    return page['extract'].replace("\n","")
+# sentence chunking
+def chunk(w):
+    return w.split(".")
+def generate_chunk_data(concepts):
+    wiki_data = [searchWiki(c).replace("\n","") for c in concepts]
+    chunk_data = []
+    for w in wiki_data:
+        chunk_data = chunk_data + chunk(w)
+    chunk_data = [c.strip()+"." for c in chunk_data]
+    while '.' in chunk_data:
+        chunk_data.remove('.')
+    return chunk_data
+def generate_chunk_emb(m, chunk_data):
+    with torch.no_grad():
+        emb = m(chunk_data, device = "cpu")
+    return emb
+def search_document(s, chunk_data, chunk_emb, m, topk=3):
+    question = [s]
+    with torch.no_grad():
+        result_score = m.cos_score(m(question, device = "cpu").expand(chunk_emb.shape),chunk_emb)
+    print(result_score)
+    _,idxs = torch.topk(result_score,topk)
+    print([result_score.flatten()[idx] for idx in idxs.flatten().tolist()])
+    return [chunk_data[idx] for idx in idxs.flatten().tolist()]

myTextEmbeddingStudent.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84960ed3f791210853072638f665f07dc70e344688bf77a24c10e7d556a175bf
+size 268739587

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch
+transformers
+pandas