Spaces:
Sleeping
Sleeping
File size: 11,052 Bytes
8b0167d 031abc6 ce7de46 8b0167d 6f6895e 51513ec 8b0167d 2dcffab 1f909e1 affd614 2dcffab 1f909e1 2dcffab e7d88fc 2dcffab 1f909e1 2dcffab 1f909e1 2dcffab 020aaf0 1f909e1 c4d80ba 8b0167d e22ba95 6f177dd e22ba95 99ab6cc f92d42b 9f9d628 99ab6cc d61dfa5 99ab6cc 8b0167d 2eea792 8b0167d bff159e 8b0167d 031abc6 8b0167d 031abc6 8b0167d 031abc6 61e4851 8b0167d 61e4851 fa444f7 61e4851 fa444f7 61e4851 fa444f7 61e4851 fa444f7 61e4851 f300e64 a5cf346 dd92fb1 f300e64 7a423cc ce7de46 7a423cc 8b0167d 2b09ab4 1d49779 61e4851 ce7de46 dcb004e 61e4851 dcb004e 8b0167d 9eb6176 8b0167d ee81309 512cd48 8b0167d 49214e8 2957e04 8b0167d 512cd48 8b0167d 512cd48 8b0167d 512cd48 8b0167d 512cd48 8b0167d 512cd48 8b0167d a813db8 49d839d 7a0d626 a813db8 0218a86 b021139 a813db8 49d839d a813db8 eeb640d cc32827 eeb640d f8641c9 a813db8 dee4d84 68451cb dee4d84 8b0167d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
import gradio as gr
import random
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import torch
import glob
import re
# Shared inference client for the chat model; respond() sends its
# chat_completion requests through this object.
client = InferenceClient("Qwen/Qwen2.5-72B-Instruct")
# Substrings that mark a message as a possible emergency. Matching is plain
# substring search on the lower-cased message, so inflected forms that do not
# contain the base word (e.g. "puking" vs. "puke") must be listed explicitly.
_URGENT_KEYWORDS = (
    "puke", "puking", "vomit", "throw up", "throwing up", "threw up",
    "seizure", "bleeding", "choking", "can't breathe", "emergency",
    "poison", "collapsed", "trauma", "injury", "injured",
)

_EMERGENCY_REPLY = (
    "This sounds like a possible medical emergency. "
    "Please contact your veterinarian or an emergency animal hospital immediately. "
    "Do not rely solely on online advice."
)


def _select_corpus(lower_msg):
    """Return the (chunks, embeddings) pair whose trigger words occur in *lower_msg*.

    The routes are tried in order and the first match wins; when nothing
    matches, the combined corpus is returned. The table is built at call time
    because the corpora are module-level names assigned later in the file.
    """
    routes = (
        (("unsafe", "toxic", "harmful", "not safe", "poison"),
         safe_chunks, safe_embeddings),
        (("nutrition", "diet", "nutrient", "protein", "calories", "feed"),
         nutrition_chunks, nutrition_embeddings),
        (("brand", "brands", "dog food brand"),
         brand_chunks, brand_embeddings),
        (("health risk", "disease", "illness"),
         health_chunks, health_embeddings),
    )
    for triggers, chunks, embeddings in routes:
        if any(word in lower_msg for word in triggers):
            return chunks, embeddings
    return all_chunks, all_embeddings


def respond(message, history):
    """Answer one chat turn about dog nutrition.

    Args:
        message: The user's latest message (a string).
        history: Earlier turns of the conversation; when truthy it is inserted
            between the system prompt and the new user message. The
            ChatInterface in this file is created with type="messages", so
            these are presumably role/content dicts.

    Returns:
        A fixed emergency notice when the message contains an urgent keyword;
        otherwise the stripped text of the model's chat completion.
    """
    print("DEBUG: respond() called with:", message)
    lower_msg = message.lower()

    # Emergencies short-circuit before any embedding search or model call, so
    # the safety reply is immediate and never depends on retrieval succeeding.
    if any(keyword in lower_msg for keyword in _URGENT_KEYWORDS):
        return _EMERGENCY_REPLY

    search_chunks, search_embeddings = _select_corpus(lower_msg)
    top_results = get_top_chunks(message, search_embeddings, search_chunks)
    print("These are top results", top_results)

    # Format the retrieved context for the LLM.
    if top_results:
        formatted_info = "\n".join(f"- {chunk}" for chunk in top_results)
        system_prompt = (
            f"You are a friendly chatbot that gives advice about nutrition for dogs.\n"
            f"Using the provided information from multiple sources \n{formatted_info}\n"
            f"Respond in 3-5 complete sentences and apply common sense based on the user's question. "
            f"If the user asks about something you were not trained on, "
            f"give a cautious answer and suggest checking with a vet."
        )
    else:
        system_prompt = (
            "You are a friendly chatbot that gives advice about what dogs can eat.\n"
            "If the user asks about a food not in your database, "
            "respond cautiously and suggest checking with a vet."
        )

    messages = [{"role": "system", "content": system_prompt}]
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = client.chat_completion(messages, max_tokens=500, temperature=0.2)
    return response['choices'][0]['message']['content'].strip()
# ===== LOAD & PROCESS YOUR NEW CONTENT =====
def _read_text(path):
    """Return the entire contents of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


# Raw knowledge-base documents, one string per topic. They are split into
# chunks and embedded further down the file.
brand_options = _read_text("food_brand_options.txt")
not_safe = _read_text("foods_not_safe.txt")
health_risks = _read_text("health_risks.txt")
nutrition = _read_text("nutrition.txt")
#def preprocess_text(text):
# cleaned_text = text.strip()
# chunks = cleaned_text.split("\n")
# cleaned_chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
# print(cleaned_chunks)
# print(len(cleaned_chunks))
# return cleaned_chunks
def preprocess_text(text, chunk_size=200, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last *overlap* words of the one before it. Chunking
    stops as soon as a chunk reaches the final word, which avoids emitting a
    trailing chunk that is wholly contained in the previous one.

    Args:
        text: The document to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings with words joined by single spaces; empty
        when *text* contains no words.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently yield
        # no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    cleaned_chunks = []
    for start in range(0, len(words), step):
        cleaned_chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already covers the tail of the document.
            break
    print(f"Total chunks created: {len(cleaned_chunks)}")
    return cleaned_chunks
def split_by_breed(text, breeds=None):
    """Split *text* into one chunk per breed-name occurrence.

    The text is cut at every occurrence of a breed name, optionally preceded
    by ``Breed:``. Each chunk is the breed name followed by the text up to the
    next occurrence. Text before the first breed name is discarded, and
    occurrences followed only by whitespace produce no chunk.

    Args:
        text: The document to split.
        breeds: Breed names to split on. Defaults to the built-in list of
            eight breeds. Names are matched literally (regex metacharacters
            are escaped).

    Returns:
        A list of strings of the form ``"Breed: <name>\\n<info>"``.

    NOTE(review): names are matched anywhere in the text, not only in
    headings, so a breed mentioned inside another breed's section also starts
    a new chunk. Confirm this suits the layout of the source document.
    """
    import re

    if breeds is None:
        breeds = [
            "Beagle", "Bulldog", "Rottweiler", "Siberian Husky",
            "French Bulldog", "Labrador Retriever", "German Shepherd", "Poodle",
        ]
    names = [name for name in breeds if name]
    if not names:
        # An empty alternation would match at every position in the text.
        print("Total chunks created: 0")
        return []

    # Longest names first so a name that is a prefix of another cannot
    # pre-empt the longer match.
    alternatives = "|".join(
        re.escape(name) for name in sorted(names, key=len, reverse=True)
    )
    pattern = r"(?:Breed:\s*)?(" + alternatives + r")"

    # With one capture group, re.split returns
    # [preamble, name1, body1, name2, body2, ...].
    sections = re.split(pattern, text)
    chunks = []
    for breed_name, breed_info in zip(sections[1::2], sections[2::2]):
        breed_info = breed_info.strip()
        if breed_info:
            chunks.append(f"Breed: {breed_name.strip()}\n{breed_info}")
    print(f"Total chunks created: {len(chunks)}")
    return chunks
#def preprocess_text(text):
# cleaned_text = text.strip()
# chunks = cleaned_text.split("\n")
# cleaned_chunks = []
# for chunk in chunks:
# stripped_chunk = chunk.strip()
# cleaned_chunks.append(stripped_chunk)
# print(len(cleaned_chunks))
# return cleaned_chunks
# Sentence-embedding model used both for the corpus chunks and, in
# get_top_chunks(), for the user's query.
model = SentenceTransformer('all-MiniLM-L6-v2')


def create_embeddings(text_chunks):
    """Encode *text_chunks* into a 2-D tensor with one row per chunk.

    Args:
        text_chunks: A list of strings to embed (a single string is also
            accepted and yields a one-row matrix).

    Returns:
        A tensor of shape (number_of_chunks, embedding_dim). An empty list
        yields an explicit zero-row matrix, so downstream ``size(0) == 0``
        checks see it as empty.
    """
    if isinstance(text_chunks, (list, tuple)) and not text_chunks:
        # Do not rely on what encode([]) returns; build the empty matrix here.
        dim = model.get_sentence_embedding_dimension() or 0
        return torch.empty((0, dim))

    embeddings = model.encode(text_chunks, convert_to_tensor=True)
    if embeddings.ndim == 1:
        # A single input string gives a flat vector; promote it to one row.
        embeddings = embeddings.unsqueeze(0)
    return embeddings
# Chunk every source document. The three free-text documents use the
# word-window splitter; the nutrition document is split per breed.
brand_chunks, safe_chunks, health_chunks = (
    preprocess_text(document)
    for document in (brand_options, not_safe, health_risks)
)
nutrition_chunks = split_by_breed(nutrition)

# Combined corpus, used when a question matches no specific topic.
all_chunks = [*brand_chunks, *safe_chunks, *health_chunks, *nutrition_chunks]

# One embedding matrix per corpus, in the same order as the chunk lists.
(
    brand_embeddings,
    safe_embeddings,
    health_embeddings,
    nutrition_embeddings,
    all_embeddings,
) = (
    create_embeddings(corpus)
    for corpus in (
        brand_chunks,
        safe_chunks,
        health_chunks,
        nutrition_chunks,
        all_chunks,
    )
)
# The embedding model and create_embeddings() are defined once, earlier in
# this file. Loading the model a second time and redefining the function here
# only repeated work and replaced the version that guarantees a 2-D result.
# The brand embeddings are reused rather than encoded again.
chunk_embeddings = brand_embeddings
def _keyword_overlap(chunk_text, query_text):
    """Count the distinct lower-cased, whitespace-separated words both texts share."""
    query_words = set(query_text.lower().split())
    chunk_words = set(chunk_text.lower().split())
    return len(query_words & chunk_words)


def get_top_chunks(query, chunk_embeddings, text_chunks, top_k=7, similarity_threshold=0.4):
    """Return the text chunks most relevant to *query*.

    The query is embedded with the module-level model and compared to every
    chunk by cosine similarity. The *top_k* most similar chunks are kept,
    those scoring below *similarity_threshold* are dropped, and the survivors
    are ordered by how many words they share with the query. Ties keep their
    similarity order because the sort is stable.

    Args:
        query: The user's question.
        chunk_embeddings: Tensor of chunk embeddings, one row per chunk (a
            single flat vector is treated as one row). May be None.
        text_chunks: The chunk strings, index-aligned with *chunk_embeddings*.
        top_k: Maximum number of candidates to consider.
        similarity_threshold: Minimum cosine similarity for a chunk to be kept.

    Returns:
        A list of chunk strings, possibly empty.
    """
    if not text_chunks or chunk_embeddings is None or chunk_embeddings.size(0) == 0:
        return []
    if top_k <= 0:
        return []

    # Embed the query and scale it to unit length so the dot product below
    # is a cosine similarity.
    query_embedding = model.encode(query, convert_to_tensor=True)
    query_embedding_normalized = query_embedding / query_embedding.norm()

    # Promote a lone vector to a one-row matrix so the similarities are always
    # a 1-D tensor that topk and iteration can handle.
    if chunk_embeddings.ndim == 1:
        chunk_embeddings = chunk_embeddings.unsqueeze(0)
    chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)

    similarities = torch.matmul(chunk_embeddings_normalized, query_embedding_normalized)
    print(similarities)

    # Honour top_k, but never ask for more rows than actually exist.
    k = min(top_k, len(text_chunks), similarities.size(0))
    top = torch.topk(similarities, k=k)
    print(top.indices)

    filtered_chunks = [
        (idx, score)
        for idx, score in zip(top.indices.tolist(), top.values.tolist())
        if score >= similarity_threshold
    ]
    reranked = sorted(
        filtered_chunks,
        key=lambda pair: _keyword_overlap(text_chunks[pair[0]], query),
        reverse=True,
    )
    return [text_chunks[idx] for idx, _ in reranked]
# Create an empty list to store the most relevant chunks
# top_chunks = []
# Loop through the top indices and retrieve the corresponding text chunks
# for i in top_indices:
# relevant_info = brand_chunks[i]
# top_chunks.append(relevant_info)
# Return the list of most relevant chunks
# return top_chunks
# theme
# Variant of Gradio's Soft theme with every hue set to purple and large
# spacing, corner radius and text. It is passed to gr.Blocks when the page
# is built.
# NOTE(review): the font list falls back to "serif" for a font whose name
# suggests monospace — confirm the font is available and the fallback is
# intended.
custom_theme = gr.themes.Soft(
    primary_hue="purple",
    secondary_hue="purple",
    neutral_hue="purple",
    spacing_size="lg",
    radius_size="lg",
    text_size="lg",
    font=[gr.themes.GoogleFont("Intel One Mono"), "serif"],
)
# Markdown blurb describing the bot. The heading sits on its own line so that
# only "About this bot" renders as a heading and the rest as body text.
# NOTE(review): this name is not referenced anywhere else in the visible file.
about_text = (
    "## About this bot\n\n"
    "Our bot will tell you how to care for your dog's nutrition. "
    "Use the chat box on the right to try it out!"
)
# Page layout: a banner image row on top, then a row holding a narrow side
# column (dog-size dropdown and mascot image) next to a wide column with the
# chat interface.
# NOTE(review): SOURCE carries no indentation, so the nesting below is
# reconstructed from the order of the components — confirm against the
# deployed app.
with gr.Blocks(theme=custom_theme) as chatbot:
    with gr.Row(scale=1):
        gr.Image(
            value="BarkBites.png",
            show_label=False,
            show_share_button = False,
            show_download_button = False
        )
    with gr.Row(scale=3):
        with gr.Column(scale=1):
            with gr.Row():
                # NOTE(review): `level` is not referenced anywhere else in the
                # visible file, so the selected size never reaches respond().
                level = gr.Dropdown(
                    choices = ["Small", "Medium", "Large"],
                    label="Dog Size",
                    info="What is your dog's size?",
                    interactive=True
                )
            gr.Image(
                value="BarkBot.png",
                show_label=False,
                show_share_button=False,
                show_download_button=False
            )
        with gr.Column(scale=4):
            # respond(message, history) handles each turn; type="messages"
            # selects the role/content dict format for the history.
            # NOTE(review): a theme is also set on the enclosing gr.Blocks —
            # confirm whether the theme given here has any effect when nested.
            gr.ChatInterface(
                fn=respond,
                type="messages",
                examples=["What should I feed my pet husky?", "Give me a meal plan for my labrador.", "Help! My dog is puking everywhere!"],
                title="BarkBites",
                theme="gradio/soft",
                description="Are you worried that something isn’t safe to eat for your dog? Or that they aren’t getting enough nutrition? Look no further, BarkBites is here to help!"
            )
# Start the web app as soon as the script runs.
chatbot.launch()