Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from huggingface_hub import InferenceClient
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import torch
|
| 6 |
import glob
|
|
|
|
| 7 |
|
| 8 |
client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
|
| 9 |
|
|
@@ -114,6 +115,23 @@ def preprocess_text(text, chunk_size=200, overlap=50):
|
|
| 114 |
print(f"Total chunks created: {len(cleaned_chunks)}")
|
| 115 |
return cleaned_chunks
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
#def preprocess_text(text):
|
| 118 |
# cleaned_text = text.strip()
|
| 119 |
# chunks = cleaned_text.split("\n")
|
|
@@ -135,7 +153,7 @@ def create_embeddings(text_chunks):
|
|
| 135 |
brand_chunks = preprocess_text(brand_options)
|
| 136 |
safe_chunks = preprocess_text(not_safe)
|
| 137 |
health_chunks = preprocess_text(health_risks)
|
| 138 |
-
nutrition_chunks =
|
| 139 |
all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
|
| 140 |
|
| 141 |
brand_embeddings = create_embeddings(brand_chunks)
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import torch
|
| 6 |
import glob
|
| 7 |
+
import re
|
| 8 |
|
| 9 |
client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
|
| 10 |
|
|
|
|
| 115 |
print(f"Total chunks created: {len(cleaned_chunks)}")
|
| 116 |
return cleaned_chunks
|
| 117 |
|
| 118 |
+
def split_by_breed(text, breeds=None):
    """Split *text* into one chunk per breed section.

    The text is cut at every occurrence of a known breed name, optionally
    preceded by a ``Breed:`` label. Each breed name is paired with the text
    that follows it, up to the next breed name, and emitted as
    ``"Breed: <name>\\n<info>"``. Anything before the first breed name is
    discarded, as are breeds whose following text is empty after stripping.

    Args:
        text: Raw document containing the breed sections.
        breeds: Optional iterable of breed names to split on. Defaults to the
            built-in list of eight breeds.

    Returns:
        A list of chunk strings in the order the breeds appear in *text*.
        Empty when *text* is empty or no breed name is found.

    NOTE(review): a breed name that appears inside another breed's
    description also starts a new chunk; confirm the input only names breeds
    in its section headings.
    """
    if breeds is None:
        breeds = [
            "Beagle", "Bulldog", "Rottweiler", "Siberian Husky",
            "French Bulldog", "Labrador Retriever", "German Shepherd", "Poodle",
        ]

    # Longest names first: regex alternation takes the first alternative that
    # matches, so a short name must not shadow a longer one it is a prefix of.
    names = sorted((name for name in breeds if name), key=len, reverse=True)

    chunks = []
    # An empty alternation would match at every position, so skip the split
    # entirely when there is nothing to split on.
    if names and text:
        # Escape the names so characters such as "." are matched literally.
        alternatives = "|".join(re.escape(name) for name in names)
        pattern = r"(?:Breed:\s*)?(" + alternatives + r")"

        # With a single capture group re.split returns
        # [preamble, name, info, name, info, ...].
        sections = re.split(pattern, text)
        for breed_name, breed_info in zip(sections[1::2], sections[2::2]):
            breed_info = breed_info.strip()
            if breed_info:
                chunks.append(f"Breed: {breed_name.strip()}\n{breed_info}")

    print(f"Total chunks created: {len(chunks)}")
    return chunks
|
| 134 |
+
|
| 135 |
#def preprocess_text(text):
|
| 136 |
# cleaned_text = text.strip()
|
| 137 |
# chunks = cleaned_text.split("\n")
|
|
|
|
| 153 |
brand_chunks = preprocess_text(brand_options)
|
| 154 |
safe_chunks = preprocess_text(not_safe)
|
| 155 |
health_chunks = preprocess_text(health_risks)
|
| 156 |
+
nutrition_chunks = split_by_breed(nutrition)
|
| 157 |
all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
|
| 158 |
|
| 159 |
brand_embeddings = create_embeddings(brand_chunks)
|