Saachi-S123 committed on
Commit
be80398
·
verified ·
1 Parent(s): d8d058e

added the code from semantic search lab (steps 1-6)

Browse files
Files changed (1) hide show
  1. app.py +113 -0
app.py CHANGED
@@ -1,6 +1,119 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
5
 
6
  def respond(message, history):
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
+ #STEP 1 FROM SEMANTIC SEARCH
5
+ from sentence_transformers import SentenceTransformer
6
+ import torch
7
+
8
#STEP 2 FROM SEMANTIC SEARCH
# Load the reference document that the semantic search will index.
# NOTE(review): "/water_cycle.txt" is an absolute path at the filesystem
# root — in most deployments the data file sits next to app.py, so this
# likely should be "water_cycle.txt"; confirm before changing.
with open("/water_cycle.txt", "r", encoding="utf-8") as file:
    # Read the entire contents of the file and store it in a variable
    water_cycle_text = file.read()

# Show the raw document text.
# Fixed: the original printed the same text twice in a row (an accidental
# copy-paste duplicate); the second print was removed.
print(water_cycle_text)
20
+
21
#STEP 3 FROM SEMANTIC SEARCH
def preprocess_text(text):
    """Split raw document text into cleaned, non-empty line chunks.

    Strips the whole text, splits it on newlines, trims each line, and
    drops blank ones. Prints the resulting chunk list and its length for
    lab inspection, then returns the list.
    """
    # Trim surrounding whitespace, then break the document into lines
    lines = text.strip().split("\n")

    # Keep every line that is still non-empty after trimming its own whitespace
    cleaned_chunks = [line.strip() for line in lines if line.strip()]

    # Lab output: the chunks themselves and how many survived cleaning
    print(cleaned_chunks)
    print(len(cleaned_chunks))

    return cleaned_chunks
46
+
47
# Step 3 output: run the preprocessor on the full document text and keep
# the cleaned line-chunks for embedding below.
cleaned_chunks = preprocess_text(water_cycle_text)
49
+
50
#STEP 4 FROM SEMANTIC SEARCH

# Load the pre-trained embedding model that converts text to vectors
model = SentenceTransformer('all-MiniLM-L6-v2')

def create_embeddings(text_chunks):
    """Encode each text chunk into a dense vector embedding.

    Returns the embeddings as a single tensor (one row per chunk); prints
    the tensor and its shape for lab inspection.
    """
    # Encode every chunk in one call; convert_to_tensor=True yields a tensor
    embeddings = model.encode(text_chunks, convert_to_tensor=True)

    # Lab output: inspect the raw embeddings and their (num_chunks, dim) shape
    print(embeddings)
    print(embeddings.shape)

    return embeddings

# Embed the cleaned chunks produced in Step 3
chunk_embeddings = create_embeddings(cleaned_chunks)
70
+
71
+
72
#STEP 5 FROM SEMANTIC SEARCH

def get_top_chunks(query, chunk_embeddings, text_chunks):
    """Return the 3 text chunks most similar to *query* by cosine similarity.

    Embeds the query with the same model used for the chunks, unit-normalizes
    both sides so a dot product equals cosine similarity, and maps the three
    highest-scoring indices back to their text chunks. Prints the similarity
    scores and winning indices for lab inspection.
    # NOTE(review): torch.topk(k=3) assumes at least 3 chunks exist — confirm.
    """
    # Embed the query text into the same vector space as the chunks
    query_vec = model.encode(query, convert_to_tensor=True)

    # Unit-normalize the query and every chunk embedding
    query_unit = query_vec / query_vec.norm()
    chunks_unit = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)

    # One matrix-vector product scores the query against every chunk at once
    similarities = torch.matmul(chunks_unit, query_unit)
    print(similarities)

    # Indices of the three best-scoring chunks
    top_indices = torch.topk(similarities, k=3).indices
    print(top_indices)

    # Map the winning indices back to their original text
    return [text_chunks[index] for index in top_indices]
108
+
109
#STEP 6 FROM SEMANTIC SEARCH
# Call the get_top_chunks function with the original query.
# Fixed: the original referenced the misspelled `chunk_embedings` and the
# syntactically invalid `cleaned chunks` (missing underscore), which made
# this line crash (SyntaxError) before the app could start.
top_results = get_top_chunks("How does water get into the sky?", chunk_embeddings, cleaned_chunks)

# Print the top results
print(top_results)
115
+
116
+
117
# Shared Hugging Face inference client used by the chat handler below.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
118
 
119
  def respond(message, history):