mingbaer committed on
Commit
06b71f4
·
verified ·
1 Parent(s): 66ea143

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -1
app.py CHANGED
@@ -1,7 +1,56 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
 
 
 
3
 
4
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def respond(message, history):
7
 
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
+ # SEMANTIC SEARCH STEP 1
4
+ from sentence_transformers import SentenceTransformer
5
+ import torch
6
 
7
+ # SEMANTIC SEARCH STEP 2 --> EDIT WITH YOUR OWN KNOWLEDGEBASE WHEN READY
8
# Read the whole knowledge base into memory once at startup.
# Point the path at your own knowledge-base file when ready.
with open("water_cycle.txt", mode="r", encoding="utf-8") as kb_file:
    water_cycle_text = kb_file.read()
# Echo the loaded text so it shows up in the application logs.
print(water_cycle_text)
11
+
12
+ # SEMANTIC SEARCH STEP 3
13
def preprocess_text(text):
    """Split raw knowledge-base text into clean, non-empty line chunks.

    Args:
        text: The full knowledge-base contents as a single string.

    Returns:
        A list with one whitespace-stripped string per non-blank line of
        ``text``, in the original order. An empty or all-whitespace input
        yields an empty list.
    """
    # Blank (or whitespace-only) lines are dropped: an empty chunk carries no
    # information, yet it would still be embedded and could be handed back as
    # a "relevant" result by the similarity search.
    stripped_lines = (line.strip() for line in text.strip().split("\n"))
    cleaned_chunks = [line for line in stripped_lines if line]
    print(cleaned_chunks)
    print(len(cleaned_chunks))
    return cleaned_chunks
23
+
24
+ cleaned_chunks = preprocess_text(water_cycle_text) # edit this with my knowledgebase when ready
25
+
26
+ # SEMANTIC SEARCH STEP 4
27
+ model = SentenceTransformer('all-MiniLM-L6-v2')
28
+
29
def create_embeddings(text_chunks):
    """Encode every text chunk with the module-level sentence-transformer.

    Args:
        text_chunks: List of chunk strings to embed.

    Returns:
        The tensor returned by ``model.encode(..., convert_to_tensor=True)``
        for ``text_chunks``.
    """
    embeddings = model.encode(text_chunks, convert_to_tensor=True)
    # Log the tensor and its shape to make it easy to sanity-check the result.
    print(embeddings)
    print(embeddings.shape)
    return embeddings
34
+
35
+ chunk_embeddings = create_embeddings(cleaned_chunks)
36
+
37
+ # SEMANTIC SEARCH STEP 5
38
def get_top_chunks(query, chunk_embeddings, text_chunks, top_k=3):
    """Return the text chunks most similar to ``query`` by cosine similarity.

    Args:
        query: The user's question as a string.
        chunk_embeddings: 2-D tensor of chunk embeddings; row ``i`` is
            expected to be the embedding of ``text_chunks[i]``.
        text_chunks: List of chunk strings the embeddings were built from.
        top_k: Maximum number of chunks to return. Defaults to 3.

    Returns:
        A list of at most ``top_k`` chunk strings, ordered from most to
        least similar. Empty when there are no chunks or ``top_k <= 0``.
    """
    # Nothing to search (or nothing requested): skip the encoding work.
    if not text_chunks or top_k <= 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)

    # Normalise both sides so the dot product below is a cosine similarity.
    query_embedding_normalized = query_embedding / query_embedding.norm()
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(
        dim=1, keepdim=True
    )
    similarities = torch.matmul(
        chunk_embeddings_normalized, query_embedding_normalized
    )
    print(similarities)

    # torch.topk raises if k exceeds the number of candidates, so clamp it
    # for knowledge bases that have fewer than ``top_k`` chunks.
    k = min(top_k, similarities.shape[0])
    top_indices = torch.topk(similarities, k=k).indices
    print(top_indices)

    # Convert to plain ints before indexing the Python list.
    return [text_chunks[i] for i in top_indices.tolist()]
52
+
53
+ client = InferenceClient("microsoft/phi-4")
54
 
55
  def respond(message, history):
56