ritikaaA committed on
Commit
ce7de46
·
verified ·
1 Parent(s): dee4d84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -1
app.py CHANGED
@@ -4,6 +4,7 @@ from huggingface_hub import InferenceClient
4
  from sentence_transformers import SentenceTransformer
5
  import torch
6
  import glob
 
7
 
8
  client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
9
 
@@ -114,6 +115,23 @@ def preprocess_text(text, chunk_size=200, overlap=50):
114
  print(f"Total chunks created: {len(cleaned_chunks)}")
115
  return cleaned_chunks
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  #def preprocess_text(text):
118
  # cleaned_text = text.strip()
119
  # chunks = cleaned_text.split("\n")
@@ -135,7 +153,7 @@ def create_embeddings(text_chunks):
135
  brand_chunks = preprocess_text(brand_options)
136
  safe_chunks = preprocess_text(not_safe)
137
  health_chunks = preprocess_text(health_risks)
138
- nutrition_chunks = preprocess_text(nutrition)
139
  all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
140
 
141
  brand_embeddings = create_embeddings(brand_chunks)
 
4
  from sentence_transformers import SentenceTransformer
5
  import torch
6
  import glob
7
+ import re
8
 
9
  client = InferenceClient("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
10
 
 
115
  print(f"Total chunks created: {len(cleaned_chunks)}")
116
  return cleaned_chunks
117
 
118
def split_by_breed(text, breeds=None):
    """Split a document into one chunk per breed section.

    The text is cut at every occurrence of a known breed name (optionally
    preceded by a ``Breed:`` label). Each breed name is paired with the text
    that follows it, up to the next breed name, and emitted as
    ``"Breed: <name>\\n<info>"``. Anything before the first breed name is
    discarded, as are breeds whose following text is empty after stripping.

    Args:
        text: The raw document to split. ``None`` or an empty string yields
            an empty list.
        breeds: Optional iterable of breed names to split on. Defaults to the
            built-in list of eight breeds, which keeps existing callers
            working unchanged.

    Returns:
        A list of chunk strings, in the order the breeds appear in ``text``.
    """
    if breeds is None:
        breeds = [
            "Beagle", "Bulldog", "Rottweiler", "Siberian Husky",
            "French Bulldog", "Labrador Retriever", "German Shepherd", "Poodle",
        ]

    # Without text or breed names there is nothing to split on; an empty
    # alternation would otherwise match at every position in the text.
    if not text or not breeds:
        print("Total chunks created: 0")
        return []

    # Try longer names first so a name that is a prefix of another
    # (e.g. "Pug" vs "Pug Mix") cannot shadow the longer one, and escape each
    # name so regex metacharacters in it are matched literally.
    alternatives = "|".join(
        re.escape(name) for name in sorted(breeds, key=len, reverse=True)
    )
    pattern = r"(?:Breed:\s*)?(" + alternatives + r")"

    # With a single capture group, re.split returns
    # [preamble, name, info, name, info, ...], so names sit at the odd
    # indices and each one is followed by its own info segment.
    sections = re.split(pattern, text)

    chunks = []
    for breed_name, breed_info in zip(sections[1::2], sections[2::2]):
        breed_info = breed_info.strip()
        if breed_info:
            chunks.append(f"Breed: {breed_name.strip()}\n{breed_info}")

    print(f"Total chunks created: {len(chunks)}")
    return chunks
134
+
135
  #def preprocess_text(text):
136
  # cleaned_text = text.strip()
137
  # chunks = cleaned_text.split("\n")
 
153
  brand_chunks = preprocess_text(brand_options)
154
  safe_chunks = preprocess_text(not_safe)
155
  health_chunks = preprocess_text(health_risks)
156
+ nutrition_chunks = split_by_breed(nutrition)
157
  all_chunks = brand_chunks + safe_chunks + health_chunks + nutrition_chunks
158
 
159
  brand_embeddings = create_embeddings(brand_chunks)