fionasu committed on
Commit
7dca6a4
·
verified ·
1 Parent(s): 497e4af

commented out some of the extra print functions (just so it runs faster)

Browse files
Files changed (1) hide show
  1. app.py +6 -30
app.py CHANGED
@@ -14,31 +14,23 @@ with open("luggage.txt", "r", encoding="utf-8") as file:
14
  # Read the entire contents of the file and store it in a variable
15
  luggage_text = file.read()
16
 
17
-
18
  # STEP 3 FROM SEMANTIC SEARCH
19
  def preprocess_text(text):
20
  # Strip extra whitespace from the beginning and the end of the text
21
  cleaned_text = text.strip()
22
-
23
  # Split the cleaned_text on every "***" delimiter
24
  chunks = cleaned_text.split("***")
25
-
26
  # Create an empty list to store cleaned chunks
27
  cleaned_chunks = []
28
-
29
  # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
30
  for chunk in chunks:
31
  chunk.strip()
32
-
33
  if chunk != "":
34
  cleaned_chunks.append(chunk)
35
-
36
  # Print cleaned_chunks
37
- print(cleaned_chunks)
38
-
39
  # Print the length of cleaned_chunks
40
- print(len(cleaned_chunks))
41
-
42
  # Return the cleaned_chunks
43
  return cleaned_chunks
44
 
@@ -53,13 +45,10 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
53
  def create_embeddings(text_chunks):
54
  # Convert each text chunk into a vector embedding and store as a tensor
55
  chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
56
-
57
  # Print the chunk embeddings
58
- print(chunk_embeddings)
59
-
60
  # Print the shape of chunk_embeddings
61
- print(chunk_embeddings.shape)
62
-
63
  # Return the chunk_embeddings
64
  return chunk_embeddings
65
 
@@ -72,32 +61,23 @@ chunk_embeddings_luggage = create_embeddings(cleaned_chunks_luggage)
72
  def get_top_chunks(query, chunk_embeddings, text_chunks):
73
  # Convert the query text into a vector embedding
74
  query_embedding = model.encode(query, convert_to_tensor = True) # Complete this line
75
-
76
  # Normalize the query embedding to unit length for accurate similarity comparison
77
  query_embedding_normalized = query_embedding / query_embedding.norm()
78
-
79
  # Normalize all chunk embeddings to unit length for consistent comparison
80
  chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
81
-
82
  # Calculate cosine similarity between query and all chunks using matrix multiplication
83
  similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
84
-
85
  # Print the similarities
86
- print(similarities)
87
-
88
  # Find the indices of the 3 chunks with highest similarity scores
89
  top_indices = torch.topk(similarities, k=3).indices
90
-
91
  # Print the top indices
92
- print(top_indices)
93
-
94
  # Create an empty list to store the most relevant chunks
95
  top_chunks = []
96
-
97
  # Loop through the top indices and retrieve the corresponding text chunks
98
  for top_index in top_indices:
99
  top_chunks.append(text_chunks[top_index])
100
-
101
  # Return the list of most relevant chunks
102
  return top_chunks
103
 
@@ -108,7 +88,6 @@ def get_top_chunks(query, chunk_embeddings, text_chunks):
108
  # # Print the top results
109
  # print(top_weather)
110
 
111
-
112
  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
113
 
114
  def respond(message, history):
@@ -137,6 +116,3 @@ def respond(message, history):
137
 
138
  chatbot = gr.ChatInterface(respond, type = 'messages')
139
  chatbot.launch(debug = True)
140
-
141
-
142
-
 
14
  # Read the entire contents of the file and store it in a variable
15
  luggage_text = file.read()
16
 
 
17
  # STEP 3 FROM SEMANTIC SEARCH
18
  def preprocess_text(text):
19
  # Strip extra whitespace from the beginning and the end of the text
20
  cleaned_text = text.strip()
 
21
  # Split the cleaned_text by every newline character (\n)
22
  chunks = cleaned_text.split("***")
 
23
  # Create an empty list to store cleaned chunks
24
  cleaned_chunks = []
 
25
  # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
26
  for chunk in chunks:
27
  chunk.strip()
 
28
  if chunk != "":
29
  cleaned_chunks.append(chunk)
 
30
  # Print cleaned_chunks
31
+ #print(cleaned_chunks)
 
32
  # Print the length of cleaned_chunks
33
+ #print(len(cleaned_chunks))
 
34
  # Return the cleaned_chunks
35
  return cleaned_chunks
36
 
 
45
  def create_embeddings(text_chunks):
46
  # Convert each text chunk into a vector embedding and store as a tensor
47
  chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
 
48
  # Print the chunk embeddings
49
+ #print(chunk_embeddings)
 
50
  # Print the shape of chunk_embeddings
51
+ #print(chunk_embeddings.shape)
 
52
  # Return the chunk_embeddings
53
  return chunk_embeddings
54
 
 
61
  def get_top_chunks(query, chunk_embeddings, text_chunks):
62
  # Convert the query text into a vector embedding
63
  query_embedding = model.encode(query, convert_to_tensor = True) # Complete this line
 
64
  # Normalize the query embedding to unit length for accurate similarity comparison
65
  query_embedding_normalized = query_embedding / query_embedding.norm()
 
66
  # Normalize all chunk embeddings to unit length for consistent comparison
67
  chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
 
68
  # Calculate cosine similarity between query and all chunks using matrix multiplication
69
  similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
 
70
  # Print the similarities
71
+ #print(similarities)
 
72
  # Find the indices of the 3 chunks with highest similarity scores
73
  top_indices = torch.topk(similarities, k=3).indices
 
74
  # Print the top indices
75
+ #print(top_indices)
 
76
  # Create an empty list to store the most relevant chunks
77
  top_chunks = []
 
78
  # Loop through the top indices and retrieve the corresponding text chunks
79
  for top_index in top_indices:
80
  top_chunks.append(text_chunks[top_index])
 
81
  # Return the list of most relevant chunks
82
  return top_chunks
83
 
 
88
  # # Print the top results
89
  # print(top_weather)
90
 
 
91
  # Module-level inference client created for the "Qwen/Qwen2.5-72B-Instruct"
  # model identifier.
  # NOTE(review): the InferenceClient import is outside this view — confirm its source.
  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
92
 
93
  def respond(message, history):
 
116
 
117
  # Build the chat interface around respond(), using the 'messages' type.
  chatbot = gr.ChatInterface(respond, type = 'messages')
118
  # Start the chat interface with debug mode enabled.
  chatbot.launch(debug = True)