Vitomir Jovanović committed on
Commit
15a5228
·
1 Parent(s): 9a58d44

Add full quotation data

Browse files
app.py CHANGED
@@ -40,7 +40,7 @@ if st.button("Search Quotes"):
40
  # Format and display search results
41
  st.write(f"Search Results: ")
42
  for i, (prompt, distance) in enumerate(zip(similar_quotes, distances)):
43
- st.write(f"{i+1}. Prompt: {prompt}, Distance: {distance}")
44
  print(f'Those are: {prompt}, {distance}')
45
  else:
46
  st.error("Please enter a quote or phrase.")
 
40
  # Format and display search results
41
  st.write(f"Search Results: ")
42
  for i, (prompt, distance) in enumerate(zip(similar_quotes, distances)):
43
+ st.write(f"{i+1}. Quote: {prompt}, Cosine similarity: {distance}")
44
  print(f'Those are: {prompt}, {distance}')
45
  else:
46
  st.error("Please enter a quote or phrase.")
fast_api.py CHANGED
@@ -32,8 +32,8 @@ def read_root():
32
  async def search_prompts(query: Query, k: int = 3):
33
  print(f'Prompt: {query}')
34
  similar_quotes, distances = search_engine.most_similar(query.quote, top_k=k)
35
- print(f'Similar Quotes {similar_quotes}')
36
- print(f'Distances {distances}')
37
  print(40*'****')
38
  # Format the response
39
  response = [
@@ -48,7 +48,7 @@ async def all_vectors(query: Query):
48
 
49
  query_embedding = search_engine.model.encode([query.quote]) # Encode the prompt to a vector
50
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
51
- print(f'Prompt: {query}')
52
  print(f'All Vector Similarities: {all_similarities}')
53
  print(40*'****')
54
  response = [
 
32
  async def search_prompts(query: Query, k: int = 3):
33
  print(f'Prompt: {query}')
34
  similar_quotes, distances = search_engine.most_similar(query.quote, top_k=k)
35
+ print(f'Similar Quotes: {similar_quotes}')
36
+ print(f'Cosine similarity: {distances}')
37
  print(40*'****')
38
  # Format the response
39
  response = [
 
48
 
49
  query_embedding = search_engine.model.encode([query.quote]) # Encode the prompt to a vector
50
  all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
51
+ print(f'Quote or phrase: {query}')
52
  print(f'All Vector Similarities: {all_similarities}')
53
  print(40*'****')
54
  response = [
models/data_reader.py CHANGED
@@ -2,6 +2,7 @@ from datasets import load_dataset
2
  import json
3
  from tqdm import tqdm
4
  import pandas as pd
 
5
 
6
 
7
 
@@ -59,7 +60,7 @@ def load_quotes_from_csv(file_path):
59
  print("Quotes loaded:", len(quotes)) # should be 499709
60
  print("First quote:", quotes[0][:100])
61
  print("Data loaded successfully.")
62
- return quotes[:10000]
63
 
64
 
65
  if __name__ == "__main__":
 
2
  import json
3
  from tqdm import tqdm
4
  import pandas as pd
5
+ # import kagglehub
6
 
7
 
8
 
 
60
  print("Quotes loaded:", len(quotes)) # should be 499709
61
  print("First quote:", quotes[0][:100])
62
  print("Data loaded successfully.")
63
+ return quotes
64
 
65
 
66
  if __name__ == "__main__":
models/quotes_search_engine.py CHANGED
@@ -20,7 +20,7 @@ class QuoteSearchEngine:
20
  print("Data encoding started...")
21
  print("Number of quotes to encode:", len(quotes))
22
 
23
- batch_size = 1000
24
  for i in range(0, len(quotes), batch_size):
25
  batch = quotes[i:i+batch_size]
26
  embeddings = self.model.encode(batch, batch_size=128, show_progress_bar=True)
@@ -44,7 +44,7 @@ class QuoteSearchEngine:
44
  # Retrieve the corresponding prompts for the found indices
45
  similar_prompts = [self.prompts_track[idx] for idx in indices[0]]
46
 
47
- return similar_prompts, distances[0] # Return both the similar prompts and their distances
48
 
49
 
50
  def cosine_similarity(self, query_vector, index):
 
20
  print("Data encoding started...")
21
  print("Number of quotes to encode:", len(quotes))
22
 
23
+ batch_size = 10000
24
  for i in range(0, len(quotes), batch_size):
25
  batch = quotes[i:i+batch_size]
26
  embeddings = self.model.encode(batch, batch_size=128, show_progress_bar=True)
 
44
  # Retrieve the corresponding prompts for the found indices
45
  similar_prompts = [self.prompts_track[idx] for idx in indices[0]]
46
 
47
+ return similar_prompts, round(distances[0], 3) # Return both the similar prompts and their distances
48
 
49
 
50
  def cosine_similarity(self, query_vector, index):