ayush2917 committed on
Commit
202c49b
·
verified ·
1 Parent(s): 75e8362

Update src/retrieval.py

Browse files
Files changed (1) hide show
  1. src/retrieval.py +27 -9
src/retrieval.py CHANGED
@@ -1,19 +1,37 @@
1
- from langchain_community.vectorstores import FAISS
2
- from langchain.docstore.document import Document
3
- from langchain_huggingface import HuggingFaceEmbeddings
4
  import json
 
 
5
 
6
  class RetrievalSystem:
7
  def __init__(self, document_path, embedder_model):
8
  self.embedder = HuggingFaceEmbeddings(model_name=embedder_model)
9
- self.vectorstore = self._build_vectorstore(document_path)
 
 
 
10
 
11
- def _build_vectorstore(self, document_path):
12
  with open(document_path, "r") as f:
13
  docs_data = json.load(f)
14
- documents = [Document(page_content=doc["content"], metadata={"category": doc["category"], "subkeyword": doc["subkeyword"]}) for doc in docs_data]
15
- return FAISS.from_documents(documents, embedding=self.embedder)
 
 
 
16
 
17
  def get_context(self, query, k=2):
18
- docs = self.vectorstore.similarity_search(query, k=k)
19
- return " ".join([doc.page_content for doc in docs])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import numpy as np
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
 
5
  class RetrievalSystem:
6
  def __init__(self, document_path, embedder_model):
7
  self.embedder = HuggingFaceEmbeddings(model_name=embedder_model)
8
+ self.documents = []
9
+ self.embeddings = None
10
+ self._load_documents(document_path)
11
+ self._build_index()
12
 
13
+ def _load_documents(self, document_path):
14
  with open(document_path, "r") as f:
15
  docs_data = json.load(f)
16
+ self.documents = [(doc["content"], doc["metadata"]) for doc in docs_data]
17
+
18
+ def _build_index(self):
19
+ texts = [doc[0] for doc in self.documents]
20
+ self.embeddings = self.embedder.embed_documents(texts)
21
 
22
  def get_context(self, query, k=2):
23
+ # Embed the query
24
+ query_embedding = self.embedder.embed_query(query)
25
+
26
+ # Compute cosine similarity
27
+ embeddings = np.array(self.embeddings)
28
+ query_embedding = np.array(query_embedding)
29
+ similarities = np.dot(embeddings, query_embedding) / (
30
+ np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)
31
+ )
32
+
33
+ # Get top k documents
34
+ top_k_indices = np.argsort(similarities)[-k:][::-1]
35
+ top_k_docs = [self.documents[i][0] for i in top_k_indices]
36
+
37
+ return " ".join(top_k_docs)