Vitomir Jovanović committed on
Commit ·
15a5228
1
Parent(s): 9a58d44
Add full quotation data
Browse files
- app.py +1 -1
- fast_api.py +3 -3
- models/data_reader.py +2 -1
- models/quotes_search_engine.py +2 -2
app.py
CHANGED
|
@@ -40,7 +40,7 @@ if st.button("Search Quotes"):
|
|
| 40 |
# Format and display search results
|
| 41 |
st.write(f"Search Results: ")
|
| 42 |
for i, (prompt, distance) in enumerate(zip(similar_quotes, distances)):
|
| 43 |
-
st.write(f"{i+1}.
|
| 44 |
print(f'Those are: {prompt}, {distance}')
|
| 45 |
else:
|
| 46 |
st.error("Please enter a quote or phrase.")
|
|
|
|
| 40 |
# Format and display search results
|
| 41 |
st.write(f"Search Results: ")
|
| 42 |
for i, (prompt, distance) in enumerate(zip(similar_quotes, distances)):
|
| 43 |
+
st.write(f"{i+1}. Quote: {prompt}, Cosine similarity: {distance}")
|
| 44 |
print(f'Those are: {prompt}, {distance}')
|
| 45 |
else:
|
| 46 |
st.error("Please enter a quote or phrase.")
|
fast_api.py
CHANGED
|
@@ -32,8 +32,8 @@ def read_root():
|
|
| 32 |
async def search_prompts(query: Query, k: int = 3):
|
| 33 |
print(f'Prompt: {query}')
|
| 34 |
similar_quotes, distances = search_engine.most_similar(query.quote, top_k=k)
|
| 35 |
-
print(f'Similar Quotes {similar_quotes}')
|
| 36 |
-
print(f'
|
| 37 |
print(40*'****')
|
| 38 |
# Format the response
|
| 39 |
response = [
|
|
@@ -48,7 +48,7 @@ async def all_vectors(query: Query):
|
|
| 48 |
|
| 49 |
query_embedding = search_engine.model.encode([query.quote]) # Encode the prompt to a vector
|
| 50 |
all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
|
| 51 |
-
print(f'
|
| 52 |
print(f'All Vector Similarities: {all_similarities}')
|
| 53 |
print(40*'****')
|
| 54 |
response = [
|
|
|
|
| 32 |
async def search_prompts(query: Query, k: int = 3):
|
| 33 |
print(f'Prompt: {query}')
|
| 34 |
similar_quotes, distances = search_engine.most_similar(query.quote, top_k=k)
|
| 35 |
+
print(f'Similar Quotes: {similar_quotes}')
|
| 36 |
+
print(f'Cosine similarity: {distances}')
|
| 37 |
print(40*'****')
|
| 38 |
# Format the response
|
| 39 |
response = [
|
|
|
|
| 48 |
|
| 49 |
query_embedding = search_engine.model.encode([query.quote]) # Encode the prompt to a vector
|
| 50 |
all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
|
| 51 |
+
print(f'Quote or phrase: {query}')
|
| 52 |
print(f'All Vector Similarities: {all_similarities}')
|
| 53 |
print(40*'****')
|
| 54 |
response = [
|
models/data_reader.py
CHANGED
|
@@ -2,6 +2,7 @@ from datasets import load_dataset
|
|
| 2 |
import json
|
| 3 |
from tqdm import tqdm
|
| 4 |
import pandas as pd
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
|
|
@@ -59,7 +60,7 @@ def load_quotes_from_csv(file_path):
|
|
| 59 |
print("Quotes loaded:", len(quotes)) # should be 499709
|
| 60 |
print("First quote:", quotes[0][:100])
|
| 61 |
print("Data loaded successfully.")
|
| 62 |
-
return quotes
|
| 63 |
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
|
|
|
| 2 |
import json
|
| 3 |
from tqdm import tqdm
|
| 4 |
import pandas as pd
|
| 5 |
+
# import kagglehub
|
| 6 |
|
| 7 |
|
| 8 |
|
|
|
|
| 60 |
print("Quotes loaded:", len(quotes)) # should be 499709
|
| 61 |
print("First quote:", quotes[0][:100])
|
| 62 |
print("Data loaded successfully.")
|
| 63 |
+
return quotes
|
| 64 |
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|
models/quotes_search_engine.py
CHANGED
|
@@ -20,7 +20,7 @@ class QuoteSearchEngine:
|
|
| 20 |
print("Data encoding started...")
|
| 21 |
print("Number of quotes to encode:", len(quotes))
|
| 22 |
|
| 23 |
-
batch_size =
|
| 24 |
for i in range(0, len(quotes), batch_size):
|
| 25 |
batch = quotes[i:i+batch_size]
|
| 26 |
embeddings = self.model.encode(batch, batch_size=128, show_progress_bar=True)
|
|
@@ -44,7 +44,7 @@ class QuoteSearchEngine:
|
|
| 44 |
# Retrieve the corresponding prompts for the found indices
|
| 45 |
similar_prompts = [self.prompts_track[idx] for idx in indices[0]]
|
| 46 |
|
| 47 |
-
return similar_prompts, distances[0] # Return both the similar prompts and their distances
|
| 48 |
|
| 49 |
|
| 50 |
def cosine_similarity(self, query_vector, index):
|
|
|
|
| 20 |
print("Data encoding started...")
|
| 21 |
print("Number of quotes to encode:", len(quotes))
|
| 22 |
|
| 23 |
+
batch_size = 10000
|
| 24 |
for i in range(0, len(quotes), batch_size):
|
| 25 |
batch = quotes[i:i+batch_size]
|
| 26 |
embeddings = self.model.encode(batch, batch_size=128, show_progress_bar=True)
|
|
|
|
| 44 |
# Retrieve the corresponding prompts for the found indices
|
| 45 |
similar_prompts = [self.prompts_track[idx] for idx in indices[0]]
|
| 46 |
|
| 47 |
+
return similar_prompts, round(distances[0], 3) # Return both the similar prompts and their distances
|
| 48 |
|
| 49 |
|
| 50 |
def cosine_similarity(self, query_vector, index):
|