kitopang committed on
Commit
7fe9c2f
·
verified ·
1 Parent(s): 784b5c2

Upload 2 files

Browse files
Files changed (2) hide show
  1. embeddings.parquet +3 -0
  2. search.py +48 -0
embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5aa8ce739a4a72fe313857da6d8067ce5b1df7ccaacae0af260af95d79ac9409
3
+ size 76990154
search.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BertModel
2
+ import torch
3
+ import pandas as pd
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ import numpy as np
6
+
7
# Read the precomputed embeddings table and keep only its first five rows.
df = pd.read_parquet('embeddings.parquet').head(5)

# Pretrained BERT tokenizer and encoder used to embed incoming queries.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
14
+
15
def search_embeddings(query: str, top_k: int = 5) -> str:
    """Return the stored entries most similar to *query* as one string.

    The query is embedded with the module-level ``tokenizer`` and ``model``
    (mean of the last hidden state over the token axis) and compared by
    cosine similarity against every vector in ``df['embedding']``.

    Args:
        query: Text to search for.
        top_k: Maximum number of matches to include (defaults to 5).

    Returns:
        The matches, best first, each rendered as
        ``"Question: <text> Answer: <answer> "`` and concatenated.
    """
    # Tokenize the *parameter*; the previous code referenced an undefined
    # ``query_text`` name and failed with NameError unless a global of that
    # name happened to exist.
    inputs = tokenizer(
        query,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**inputs)
    query_vector = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Assumes the 'embedding' column holds equal-length lists/arrays whose
    # dimension matches the query vector -- TODO confirm against the parquet.
    embedding_matrix = np.stack(df['embedding'].values)
    similarities = cosine_similarity([query_vector], embedding_matrix)[0]

    # Indices of the highest-scoring rows, best match first.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    rows = (df.iloc[index] for index in top_indices)
    return "".join(
        f"Question: {row['text']} Answer: {row['answer']} " for row in rows
    )
43
+
44
+
45
+ # query_text = "Paul's First Epistle to the Corinthians"
46
+ # print(search_embeddings(query_text))
47
+
48
+