mgetz committed on
Commit
acdfb09
·
verified ·
1 Parent(s): 31abba7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py CHANGED
@@ -2,6 +2,110 @@ import gradio as gr
2
  import random
3
  from huggingface_hub import InferenceClient
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
6
 
7
  def respond(message, history):
 
2
  import random
3
  from huggingface_hub import InferenceClient
4
 
5
#STEP 1 FROM SEMANTIC SEARCH
from sentence_transformers import SentenceTransformer
import torch

#STEP 2 FROM SEMANTIC SEARCH
# Load the reference document once at startup; the file handle is closed
# automatically by the context manager.
with open("water_cycle.txt", "r", encoding="utf-8") as source_file:
    water_cycle_text = source_file.read()

# Echo the raw document so the startup log shows what was loaded
print(water_cycle_text)
17
+
18
#STEP 3 FROM SEMANTIC SEARCH
def preprocess_text(text):
    """Split raw document text into a list of cleaned, non-empty line chunks.

    Each line of *text* becomes one chunk; surrounding whitespace is stripped
    and blank lines are discarded. Also prints the resulting chunks and their
    count (kept from the original debugging flow).

    Returns:
        list[str]: the cleaned, non-empty chunks, in document order.
    """
    # Strip extra whitespace from the beginning and the end of the text
    cleaned_text = text.strip()

    # Split the cleaned text into candidate chunks, one per line
    chunks = cleaned_text.split("\n")

    cleaned_chunks = []
    for chunk in chunks:
        stripped_chunk = chunk.strip()
        # BUG FIX: the original tested `len(stripped_chunk) >= 0`, which is
        # always true, so empty lines leaked into the chunk list (and later
        # into the embeddings). Keep only non-empty chunks.
        if stripped_chunk:
            cleaned_chunks.append(stripped_chunk)

    # Debug output: the chunks and how many survived cleaning
    print(cleaned_chunks)
    print(len(cleaned_chunks))

    return cleaned_chunks
43
# Chunk the document once so every downstream step shares the same list
cleaned_chunks = preprocess_text(water_cycle_text)

#STEP 4 FROM SEMANTIC SEARCH
# Pre-trained sentence-embedding model: maps text to fixed-size vectors
model = SentenceTransformer('all-MiniLM-L6-v2')
50
+
51
def create_embeddings(text_chunks):
    """Encode *text_chunks* into a single tensor of vector embeddings.

    Prints the embeddings and their shape (note: ``.shape`` is a property,
    not a method, hence no parentheses), then returns the tensor.
    """
    embeddings = model.encode(text_chunks, convert_to_tensor=True)

    # Debug output: the tensor itself, then its (num_chunks, dim) shape
    for detail in (embeddings, embeddings.shape):
        print(detail)

    return embeddings
63
# Embed every cleaned chunk up front so queries only embed the query text
chunk_embeddings = create_embeddings(cleaned_chunks)
66
+
67
+ #STEP 5 FROM SEMANTIC SEARCH
68
# Find the text chunks most relevant to a query via cosine similarity
def get_top_chunks(query, chunk_embeddings, text_chunks):
    """Return up to 3 chunks from *text_chunks* most similar to *query*.

    Args:
        query: free-text question to search for.
        chunk_embeddings: tensor of per-chunk embeddings, one row per chunk
            (as produced by ``create_embeddings``).
        text_chunks: the chunk strings, aligned row-for-row with
            ``chunk_embeddings``.

    Returns:
        list[str]: the most similar chunks, highest similarity first.
    """
    # Embed the query with the same model used for the chunks
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Normalize both sides to unit length so the dot product below is
    # exactly cosine similarity
    query_embedding_normalized = query_embedding / query_embedding.norm()
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)

    # One matmul gives the similarity of the query against every chunk
    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    print(similarities)

    # BUG FIX: topk raises if k exceeds the number of rows, so clamp k to
    # the number of available chunks (documents with < 3 chunks crashed).
    top_indices = torch.topk(similarities, k=min(3, len(text_chunks))).indices
    print(top_indices)

    # Map the winning indices back to their chunk strings, best first
    top_chunks = []
    for index in top_indices:
        top_chunks.append(text_chunks[index])

    return top_chunks
101
+
102
#STEP 6 FROM SEMANTIC SEARCH
# Sanity-check retrieval with a sample query and show what comes back
top_results = get_top_chunks("How does water get into the sky?", chunk_embeddings, cleaned_chunks)
print(top_results)
108
+
109
  client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
110
 
111
  def respond(message, history):