ritikaaA committed on
Commit
8b0167d
·
verified ·
1 Parent(s): 3410504

Added semantic search over the toxic-foods text file

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import random
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import torch

# Remote inference client for the Zephyr-7B chat model; all LLM calls go
# through this object (no model weights are downloaded locally).
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
9
def respond(message, history):
    """Send the prior chat turns plus the new user message to the LLM and return its reply.

    `history` is a list of {"role", "content"} dicts (Gradio `type="messages"`
    format) or falsy on the first turn.
    """
    # Seed the conversation with the persona prompt, then replay prior turns
    # and append the new user message.
    conversation = [
        {"role": "system", "content": "You are a book influencer that is nice and friendly."},
        *(history or []),
        {"role": "user", "content": message},
    ]

    # max_tokens caps the completion length so replies stay short.
    reply = client.chat_completion(conversation, max_tokens=100)
    return reply['choices'][0]['message']['content'].strip()
20
+
21
print("hello world")
# Gradio chat UI: type="messages" delivers history as a list of
# {"role", "content"} dicts, which is exactly what `respond` expects.
chatbot = gr.ChatInterface(respond, type="messages", title = "LLM Chatbox", theme = "gradio/soft")
# declaring chatbot so that user can interact and see their conversation history and send new messages
24
+
25
# ===== LOAD & PROCESS YOUR NEW CONTENT =====
# Knowledge base for the semantic search — presumably one fact per line.
# NOTE(review): the file must sit next to app.py, else this raises FileNotFoundError.
with open("toxic_foods_for_dogs.txt", "r", encoding="utf-8") as file:
    # Read the entire contents of the file and store it in a variable
    toxic_food_text = file.read()

# Print the text below
print(toxic_food_text)
32
+
33
# ===== APPLY THE COMPLETE WORKFLOW =====
def preprocess_text(text):
    """Split raw text into one chunk per line, trimming surrounding whitespace.

    Blank lines in the input are kept as empty-string chunks, preserving the
    one-chunk-per-line structure.
    """
    # Trim the document, cut it at every newline, then trim each line.
    cleaned_chunks = [line.strip() for line in text.strip().split("\n")]

    # Debug output: the chunks themselves and how many there are.
    print(cleaned_chunks)
    print(len(cleaned_chunks))

    return cleaned_chunks
56
+
57
# Chunk the knowledge-base text once at startup.
cleaned_chunks = preprocess_text(toxic_food_text)

# Load the pre-trained embedding model that converts text to vectors
# (all-MiniLM-L6-v2 is a small, fast sentence-embedding model).
model = SentenceTransformer('all-MiniLM-L6-v2')
61
+
62
def create_embeddings(text_chunks):
    """Embed every text chunk with the shared sentence-transformer model.

    Returns a 2-D torch tensor with one embedding row per chunk.
    """
    # convert_to_tensor=True returns a torch tensor (not a numpy array),
    # which the cosine-similarity math downstream relies on.
    embeddings = model.encode(text_chunks, convert_to_tensor=True)

    # Debug output: the raw tensor and its dimensions.
    print(embeddings)
    print(embeddings.shape)

    return embeddings
74
+
75
# Call the create_embeddings function and store the result in a new chunk_embeddings variable
# (one embedding row per entry in cleaned_chunks; computed once at startup).
chunk_embeddings = create_embeddings(cleaned_chunks)
77
+
78
# Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
def get_top_chunks(query, chunk_embeddings, text_chunks):
    """Return the text chunks most semantically similar to `query`.

    Args:
        query: the user's question as a plain string.
        chunk_embeddings: 2-D tensor of embeddings, one row per chunk.
        text_chunks: the chunk strings, aligned row-for-row with chunk_embeddings.

    Returns:
        Up to 5 chunks, ordered from most to least similar.
    """
    # Convert the query text into a vector embedding
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Normalize both sides to unit length so the dot product below equals
    # cosine similarity.
    query_embedding_normalized = query_embedding / query_embedding.norm()
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)

    # Cosine similarity between the query and every chunk in one matmul.
    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    print(similarities)

    # Indices of the highest-scoring chunks. Cap k at the chunk count so
    # torch.topk never asks for more results than exist (it raises otherwise).
    top_indices = torch.topk(similarities, k=min(5, len(text_chunks))).indices
    print(top_indices)

    # Bug fix: look chunks up in the `text_chunks` parameter, not the
    # module-level `cleaned_chunks`, so the function honors its arguments.
    top_chunks = [text_chunks[i] for i in top_indices]

    # Return the list of most relevant chunks
    return top_chunks
112
+
113
# ===== EXPERIMENT & VERIFY =====
# Bug fix: `message` only exists inside respond(), so referencing it here
# crashed the script with NameError at startup. Use a concrete sample query
# to exercise the retrieval pipeline instead.
test_query = "What foods are toxic to dogs?"
top_results = get_top_chunks(test_query, chunk_embeddings, cleaned_chunks)

# Print the top results
print(top_results)
118
+
119
+ chatbot.launch()