elinstallation committed on
Commit
932d832
·
verified ·
1 Parent(s): 71bf6cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -153
app.py CHANGED
@@ -7,12 +7,12 @@ from sentence_transformers import SentenceTransformer
7
  import torch
8
 
9
  with open("poverty_and_education.txt", "r", encoding="utf-8") as file:
10
- # Read the entire contents of the file and store it in a variable
11
- poverty_and_education = file.read()
12
 
13
  with open("academic_tips_text.txt", "r", encoding="utf-8") as file:
14
- # Read the entire contents of the file and store it in a variable
15
- acadenic_tips_text = file.read()
16
 
17
 
18
  # Print the text below
@@ -22,30 +22,31 @@ print(poverty_and_education)
22
 
23
  ### STEP 3
24
  def preprocess_text(text):
25
- # Strip extra whitespace from the beginning and the end of the text
26
- cleaned_text = text.strip()
27
 
28
- # Split the cleaned_text by every newline character (\n)
29
- chunks = cleaned_text.split("\n")
30
 
31
- # Create an empty list to store cleaned chunks
32
- cleaned_chunks = []
33
 
34
- # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
35
- for chunk in chunks:
36
- stripped_chunk = chunk.strip()
37
- if len(stripped_chunk) > 0:
38
- cleaned_chunks.append(stripped_chunk)
39
 
40
- # Print cleaned_chunks
41
- print(cleaned_chunks)
42
 
43
- # Print the length of cleaned_chunks
44
- num_of_chunks = print(len(cleaned_chunks))
 
45
 
46
- print(f"There are {num_of_chunks} amount of chunks")
47
- # Return the cleaned_chunks
48
- return cleaned_chunks
49
 
50
 
51
  # Load the pre-trained embedding model that converts text to vectors
@@ -53,17 +54,17 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
53
 
54
  ### STEP 4
55
  def create_embeddings(text_chunks):
56
- # Convert each text chunk into a vector embedding and store as a tensor
57
- chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
58
 
59
- # Print the chunk embeddings
60
- print(chunk_embeddings)
61
 
62
- # Print the shape of chunk_embeddings
63
- print(chunk_embeddings.shape)
64
 
65
- # Return the chunk_embeddings
66
- return chunk_embeddings
67
 
68
  # Call the create_embeddings function and store the result in a new chunk_embeddings variable
69
  #chunk_embeddings = create_embeddings(cleaned_chunks) # Complete this line
@@ -71,37 +72,37 @@ def create_embeddings(text_chunks):
71
  ###STEP 5
72
  # Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
73
  def get_top_chunks(query, chunk_embeddings, text_chunks):
74
- # Convert the query text into a vector embedding
75
- query_embedding = model.encode(query, convert_to_tensor = True) # Complete this line
76
 
77
- # Normalize the query embedding to unit length for accurate similarity comparison
78
- query_embedding_normalized = query_embedding / query_embedding.norm()
79
 
80
- # Normalize all chunk embeddings to unit length for consistent comparison
81
- chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
82
 
83
- # Calculate cosine similarity between query and all chunks using matrix multiplication
84
- similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
85
 
86
- # Print the similarities
87
- print(similarities)
88
 
89
- # Find the indices of the 3 chunks with highest similarity scores
90
- top_indices = torch.topk(similarities, k=3).indices
91
 
92
- # Print the top indices
93
- print(top_indices)
94
 
95
- # Create an empty list to store the most relevant chunks
96
- top_chunks = []
97
 
98
- # Loop through the top indices and retrieve the corresponding text chunks
99
- for i in top_indices:
100
- relevant_info = cleaned_chunks[i]
101
- top_chunks.append(relevant_info)
102
 
103
- # Return the list of most relevant chunks
104
- return top_chunks
105
 
106
  # Print the top results
107
  #print(top_results)
@@ -111,19 +112,18 @@ chunk_embeddings = create_embeddings(cleaned_chunks)
111
  client= InferenceClient("Qwen/Qwen2.5-7B-Instruct-1M")
112
 
113
  #defining role of AI and user
114
- # i moved it to the bottom
115
- #def respond(message,history):
116
 
117
- # messages = [{"role": "assistant", "content": "You are a friendly chatbot."}]
118
 
119
- # if history:
120
- # messages.extend(history) #keep adding history
121
 
122
- #messages.append({"role":"user", "content": message})
123
 
124
- #response=client.chat_completion(messages, max_tokens=100) #capping how many words the LLM is allowed to generate as a respond (100 words)
125
 
126
- # return response['choices'][0]['message']['content'].strip() #storing value of response in a readable format to display
127
 
128
  ### STEP 6
129
  # Call the preprocess_text function and store the result in a cleaned_chunks variable
@@ -134,100 +134,10 @@ print(top_results)
134
 
135
  #Defining chatbot giving user a UI to interact, see their conversation history, and see new messages using built in gr feature
136
  #ChatInterface requires at least one parameter(a function)
137
- #chatbot = gr.ChatInterface(respond,type="messages", title="AI Chatbot", theme="Taithrah/Minimal")
138
-
139
- # INTERFACE EDITS #
140
- custom_css = """
141
- #chatbox {background-color: #ffffff; border-radius: 10px; padding: 10px;}
142
- #chatbox .message.user {background-color: #EDE7F6; color: #4A148C; border-radius: 20px; padding: 10px; margin: 5px; max-width: 75%;}
143
- #chatbox .message.bot {background-color: #F3E5F5; color: #4A148C; border-radius: 20px; padding: 10px; margin: 5px; max-width: 75%;}
144
- #header {background-color: #8E24AA; color: white; padding: 12px; border-radius: 12px 12px 0 0; font-weight: bold;}
145
-
146
- /* Input bar test */
147
- .input-container {
148
- display: flex;
149
- align-items: center;
150
- background-color: white;
151
- border: 1px solid #ccc;
152
- border-radius: 25px;
153
- padding: 5px 10px;
154
- width: 100%;
155
- }
156
- .input-container input {
157
- border: none;
158
- outline: none;
159
- flex: 1;
160
- font-size: 14px;
161
- }
162
- .input-container button {
163
- background-color: #8E24AA;
164
- color: white;
165
- border: none;
166
- border-radius: 50%;
167
- width: 35px;
168
- height: 35px;
169
- cursor: pointer;
170
- }
171
- """
172
-
173
- def respond(message, history):
174
- # Prepare messages for the API
175
- messages = [{"role": "assistant", "content": "You are a friendly chatbot."}]
176
- if history:
177
- # Convert Gradio history into API format
178
- for user_msg, bot_msg in history:
179
- messages.append({"role": "user", "content": user_msg})
180
- messages.append({"role": "assistant", "content": bot_msg})
181
- messages.append({"role": "user", "content": message})
182
-
183
- # Call the API
184
- response = client.chat_completion(messages, max_tokens=100)
185
- assistant_reply = response['choices'][0]['message']['content'].strip()
186
-
187
- # Return for Gradio
188
- return history + [(message, assistant_reply)], ""
189
-
190
- with gr.Blocks(css=custom_css) as demo:
191
- gr.HTML("<div id='header'>DivaBot</div>")
192
- chatbot = gr.Chatbot(elem_id="chatbox", height=400)
193
-
194
- # Hidden textbox to store the message
195
- msg = gr.Textbox(visible=False)
196
-
197
- # Visible custom input bar with send button
198
- gr.HTML("""
199
- <div class="input-container">
200
- <input id="user-input" placeholder="Type your message..." />
201
- <button id="send-btn">➤</button>
202
- </div>
203
- <script>
204
- const sendBtn = document.getElementById('send-btn');
205
- const userInput = document.getElementById('user-input');
206
- sendBtn.onclick = () => {
207
- const value = userInput.value;
208
- if (value.trim() !== "") {
209
- // Set the hidden Gradio textbox value
210
- const textbox = document.querySelector('textarea');
211
- textbox.value = value;
212
- textbox.dispatchEvent(new Event('input', { bubbles: true }));
213
- // Trigger submit
214
- document.querySelector('textarea').closest('form').dispatchEvent(new Event('submit', { bubbles: true }));
215
- userInput.value = "";
216
- }
217
- };
218
- userInput.addEventListener("keypress", function(e) {
219
- if (e.key === "Enter") {
220
- sendBtn.click();
221
- e.preventDefault();
222
- }
223
- });
224
- </script>
225
- """)
226
-
227
- msg.submit(respond, [msg, chatbot], [chatbot, msg])
228
-
229
- demo.launch()
230
 
 
 
231
 
232
 
233
  #You may run into errors when you're trying different models. To see the error messages, set debug to True in launch()
 
7
  import torch
8
 
9
  with open("poverty_and_education.txt", "r", encoding="utf-8") as file:
10
+ # Read the entire contents of the file and store it in a variable
11
+ poverty_and_education = file.read()
12
 
13
  with open("academic_tips_text.txt", "r", encoding="utf-8") as file:
14
+ # Read the entire contents of the file and store it in a variable
15
+ acadenic_tips_text = file.read()
16
 
17
 
18
  # Print the text below
 
22
 
23
  ### STEP 3
24
  def preprocess_text(text):
25
+ # Strip extra whitespace from the beginning and the end of the text
26
+ cleaned_text = text.strip()
27
 
28
+ # Split the cleaned_text by every newline character (\n)
29
+ chunks = cleaned_text.split("\n")
30
 
31
+ # Create an empty list to store cleaned chunks
32
+ cleaned_chunks = []
33
 
34
+ # Write your for-in loop below to clean each chunk and add it to the cleaned_chunks list
35
+ for chunk in chunks:
36
+ stripped_chunk = chunk.strip()
37
+ if len(stripped_chunk) > 0:
38
+ cleaned_chunks.append(stripped_chunk)
39
 
40
+ # Print cleaned_chunks
41
+ print(cleaned_chunks)
42
 
43
+ # Print the length of cleaned_chunks
44
+ num_of_chunks = len(cleaned_chunks)
45
+ print(num_of_chunks)
46
 
47
+ print(f"There are {num_of_chunks} amount of chunks")
48
+ # Return the cleaned_chunks
49
+ return cleaned_chunks
50
 
51
 
52
  # Load the pre-trained embedding model that converts text to vectors
 
54
 
55
  ### STEP 4
56
  def create_embeddings(text_chunks):
57
+ # Convert each text chunk into a vector embedding and store as a tensor
58
+ chunk_embeddings = model.encode(text_chunks, convert_to_tensor=True) # Replace ... with the text_chunks list
59
 
60
+ # Print the chunk embeddings
61
+ print(chunk_embeddings)
62
 
63
+ # Print the shape of chunk_embeddings
64
+ print(chunk_embeddings.shape)
65
 
66
+ # Return the chunk_embeddings
67
+ return chunk_embeddings
68
 
69
  # Call the create_embeddings function and store the result in a new chunk_embeddings variable
70
  #chunk_embeddings = create_embeddings(cleaned_chunks) # Complete this line
 
72
  ###STEP 5
73
  # Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
74
  def get_top_chunks(query, chunk_embeddings, text_chunks):
75
+ # Convert the query text into a vector embedding
76
+ query_embedding = model.encode(query, convert_to_tensor = True) # Complete this line
77
 
78
+ # Normalize the query embedding to unit length for accurate similarity comparison
79
+ query_embedding_normalized = query_embedding / query_embedding.norm()
80
 
81
+ # Normalize all chunk embeddings to unit length for consistent comparison
82
+ chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
83
 
84
+ # Calculate cosine similarity between query and all chunks using matrix multiplication
85
+ similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized) # Complete this line
86
 
87
+ # Print the similarities
88
+ print(similarities)
89
 
90
+ # Find the indices of the 3 chunks with highest similarity scores
91
+ top_indices = torch.topk(similarities, k=3).indices
92
 
93
+ # Print the top indices
94
+ print(top_indices)
95
 
96
+ # Create an empty list to store the most relevant chunks
97
+ top_chunks = []
98
 
99
+ # Loop through the top indices and retrieve the corresponding text chunks
100
+ for i in top_indices:
101
+ relevant_info = cleaned_chunks[i]
102
+ top_chunks.append(relevant_info)
103
 
104
+ # Return the list of most relevant chunks
105
+ return top_chunks
106
 
107
  # Print the top results
108
  #print(top_results)
 
112
  client= InferenceClient("Qwen/Qwen2.5-7B-Instruct-1M")
113
 
114
  #defining role of AI and user
115
+ def respond(message,history):
 
116
 
117
+ messages = [{"role": "assistant", "content": "You are a friendly chatbot."}]
118
 
119
+ if history:
120
+ messages.extend(history) #keep adding history
121
 
122
+ messages.append({"role":"user", "content": message})
123
 
124
+ response=client.chat_completion(messages, max_tokens=100) #capping how many words the LLM is allowed to generate as a respond (100 words)
125
 
126
+ return response['choices'][0]['message']['content'].strip() #storing value of response in a readable format to display
127
 
128
  ### STEP 6
129
  # Call the preprocess_text function and store the result in a cleaned_chunks variable
 
134
 
135
  # Define the chatbot UI: Gradio's built-in ChatInterface lets the user send messages and see the conversation history and new replies
136
  #ChatInterface requires at least one parameter(a function)
137
+ chatbot = gr.ChatInterface(respond,type="messages", title="AI Chatbot", theme="Taithrah/Minimal")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ #launching chatbot
140
+ chatbot.launch()
141
 
142
 
143
  #You may run into errors when you're trying different models. To see the error messages, set debug to True in launch()