pragyarama committed on
Commit
e2db73d
·
verified ·
1 Parent(s): ae57727

added semantic search code

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py CHANGED
@@ -1,6 +1,71 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  client = InferenceClient("Qwen/Qwen2.5-72B-instruct")
5
 
6
  def respond(message, history):
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
+ #STEP 1 FROM SEMANTIC SEARCH
5
+ from sentence_transformers import SentenceTransformer
6
+ import torch
7
+
8
+ #STEP 2 FROM SEMANTIC SEARCH
9
+ with open("water_cycle.txt", "r", encoding="utf-8") as file: # Open the water_cycle.txt file in read mode with UTF-8 encoding
10
+ water_cycle_text = file.read() # Read file and store into variable
11
+
12
+ #STEP 3 FROM SEMANTIC SEARCH
13
def preprocess_text(text):
    """Split raw text into a list of non-empty, whitespace-trimmed lines.

    The text is stripped, split on newline characters, and each resulting
    line is stripped again; lines that are empty after stripping are dropped.

    Args:
        text: The raw document contents as a single string.

    Returns:
        A list of cleaned, non-empty line strings in their original order.
        An empty or whitespace-only input yields an empty list.
    """
    # Strip each line lazily so the filter below does not strip twice.
    stripped_lines = (line.strip() for line in text.strip().split("\n"))
    cleaned_chunks = [line for line in stripped_lines if line]

    # Echo the chunks and their count so the split can be checked in the logs.
    print(cleaned_chunks)
    print(len(cleaned_chunks))

    return cleaned_chunks
27
+
28
+ cleaned_chunks = preprocess_text(water_cycle_text) # Call preprocess_text and store result in cleaned_chunks
29
+
30
+ #STEP 4 FROM SEMANTIC SEARCH
31
+ model = SentenceTransformer('all-MiniLM-L6-v2') # Load pre-trained embedding model that converts text to vectors
32
+
33
def create_embeddings(text_chunks):
    """Encode text chunks into embeddings using the module-level ``model``.

    Args:
        text_chunks: The chunks of text to embed.

    Returns:
        The result of ``model.encode(..., convert_to_tensor=True)`` for the
        given chunks (presumably one row per chunk -- confirm against the
        model in use).
    """
    embeddings = model.encode(text_chunks, convert_to_tensor=True)

    # Echo the embeddings, then their shape, so the step can be checked in the logs.
    for debug_value in (embeddings, embeddings.shape):
        print(debug_value)

    return embeddings
40
+
41
+ chunk_embeddings = create_embeddings(cleaned_chunks) # Call create_embeddings and store result in chunk_embeddings
42
+
43
+ #STEP 5 FROM SEMANTIC SEARCH
44
def get_top_chunks(query, chunk_embeddings, text_chunks, top_k=3):
    """Return the text chunks most similar to ``query`` by cosine similarity.

    The query is embedded with the module-level ``model``; both the query
    embedding and the chunk embeddings are scaled to unit length so that a
    matrix product yields cosine similarities.

    Args:
        query: The search string to embed and compare against the chunks.
        chunk_embeddings: 2-D tensor of chunk embeddings (normalized along
            ``dim=1``); assumed to hold one row per entry of ``text_chunks``
            in the same order.
        text_chunks: The chunk strings the embeddings were computed from.
        top_k: Maximum number of chunks to return. Defaults to 3.

    Returns:
        A list of at most ``top_k`` chunks, ordered from most to least
        similar. Empty when there are no chunks or ``top_k`` is not positive.
    """
    # torch.topk raises when k exceeds the number of scores, so never ask
    # for more results than there are chunks (e.g. a very short document).
    k = min(top_k, len(text_chunks))
    if k <= 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)

    # Unit-length vectors make the dot product equal to cosine similarity.
    query_embedding_normalized = query_embedding / query_embedding.norm()
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)

    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    print(similarities)

    top_indices = torch.topk(similarities, k=k).indices
    print(top_indices)

    # Convert to plain ints before indexing the Python list.
    return [text_chunks[index] for index in top_indices.tolist()]
63
+
64
+ #STEP 6 FROM SEMANTIC SEARCH
65
+ top_results = get_top_chunks("How does the water cycle work?", chunk_embeddings, cleaned_chunks) # Call get_top_chunks with query
66
+ print(top_results)
67
+
68
+ #SAMPLE HUGGING FACE PROJECT
69
  client = InferenceClient("Qwen/Qwen2.5-72B-instruct")
70
 
71
  def respond(message, history):