AashitaK committed on
Commit
1a6ff22
·
verified ·
1 Parent(s): 228f462

Create document_retrieval.py

Browse files
Files changed (1) hide show
  1. document_retrieval.py +25 -0
document_retrieval.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from openai_api import get_embedding
3
+
4
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the similarity between two vectors, computed as their dot product.

    For vectors normalized to length 1 (as OpenAI embeddings are), the dot
    product equals the cosine similarity. No normalization is performed here,
    so for non-unit vectors the result is the plain dot product.

    Args:
        x: First vector.
        y: Second vector, of the same length as ``x``.

    Returns:
        The dot product of ``x`` and ``y``.
    """
    # np.asarray reuses the buffer when the argument is already an ndarray,
    # whereas np.array would copy it on every call.
    return np.dot(np.asarray(x), np.asarray(y))
11
+
12
def select_document_section_by_query_similarity(
    query: str, contexts: dict[tuple[str, str], np.ndarray]
) -> tuple[float, tuple[str, str]]:
    """
    Find the pre-embedded document section most similar to the supplied query.

    The query is embedded with ``get_embedding`` and compared against every
    pre-calculated document embedding in ``contexts`` via ``vector_similarity``.

    Args:
        query: Text to embed and match against the stored sections.
        contexts: Mapping from a section key (annotated as a pair of strings)
            to that section's pre-calculated embedding.

    Returns:
        A ``(similarity, key)`` tuple for the single best-matching section.
        When several sections share the highest similarity, the one with the
        greatest key is returned.

    Raises:
        IndexError: If ``contexts`` is empty.
    """
    # Fail before requesting an embedding: with nothing to compare against,
    # the call to get_embedding would be wasted work.
    if not contexts:
        raise IndexError("contexts is empty: no document sections to select from")

    query_embedding = get_embedding(query)

    # max() over (similarity, key) tuples picks the same element a descending
    # sort would place first, without ordering every other section.
    return max(
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    )