barkbites

Sleeping

ritikaaA committed on Aug 15, 2025

Commit

ce7de46

verified ·

1 Parent(s): dee4d84

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from huggingface_hub import InferenceClient
 from sentence_transformers import SentenceTransformer
 import torch
 import glob
 client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
@@ -114,6 +115,23 @@ def preprocess_text(text, chunk_size=200, overlap=50):
     print(f"Total chunks created: {len(cleaned_chunks)}")
     return cleaned_chunks
 #def preprocess_text(text):
 #  cleaned_text = text.strip()
 #  chunks = cleaned_text.split("\n")
@@ -135,7 +153,7 @@ def create_embeddings(text_chunks):
 brand_chunks = preprocess_text(brand_options)
 safe_chunks = preprocess_text(not_safe)
 health_chunks = preprocess_text(health_risks)
-nutrition_chunks = preprocess_text(nutrition)
 all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
 brand_embeddings = create_embeddings(brand_chunks)

 from sentence_transformers import SentenceTransformer
 import torch
 import glob
+import re
 client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
     print(f"Total chunks created: {len(cleaned_chunks)}")
     return cleaned_chunks
+def split_by_breed(text):  # Split `text` into one "Breed: <name>" + newline + <info> chunk per listed breed name found in it.
+    breeds = [  # the only breed names recognised as section markers
+        "Beagle", "Bulldog", "Rottweiler", "Siberian Husky",
+        "French Bulldog", "Labrador Retriever", "German Shepherd", "Poodle"
+    ]
+    pattern = r"(?:Breed:\s*)?(" + "|".join(breeds) + r")"  # optional "Breed:" prefix, then a capturing group of the names. NOTE(review): unanchored, so any mention of a breed name inside the text also starts a new section — confirm this is intended.
+    sections = re.split(pattern, text)  # the capturing group keeps the matched names: [preamble, name, body, name, body, ...]
+    chunks = []
+    for i in range(1, len(sections), 2):  # odd indices hold the captured breed names
+        breed_name = sections[i].strip()
+        breed_info = sections[i+1].strip() if i+1 < len(sections) else ""  # text between this match and the next one
+        if breed_info:  # a breed name followed by no text produces no chunk
+            chunks.append(f"Breed: {breed_name}\n{breed_info}")
+    print(f"Total chunks created: {len(chunks)}")  # reports the chunk count on stdout
+    return chunks  # list of str; text before the first breed match (sections[0]) is not included
 #def preprocess_text(text):
 #  cleaned_text = text.strip()
 #  chunks = cleaned_text.split("\n")
 brand_chunks = preprocess_text(brand_options)
 safe_chunks = preprocess_text(not_safe)
 health_chunks = preprocess_text(health_risks)
+nutrition_chunks = split_by_breed(nutrition)
 all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
 brand_embeddings = create_embeddings(brand_chunks)