amiraghhh committed on
Commit
ab46c35
·
verified ·
1 Parent(s): b30f331

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -38
utils.py CHANGED
@@ -133,41 +133,3 @@ A: Information unavailable."""
133
  full_prompt = f"{instruction_text}{the_context_block}{query_footer}"
134
 
135
  return full_prompt
136
-
137
-
138
- def normalize_text(text):
139
- """Normalize text for duplicate detection by removing spaces and punctuation.
140
- Returns: str"""
141
- if not isinstance(text, str):
142
- return ""
143
-
144
- text = text.lower() # Lowercase
145
- text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
146
- text = text.strip() # Remove leading/trailing spaces
147
- text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
148
-
149
- return text
150
-
151
-
152
- def chunk_text(text, chunk_size=384, chunk_overlap=20):
153
- """Split text into chunks for embeddings.
154
- Returns: list(chunks)"""
155
- splitter = RecursiveCharacterTextSplitter(
156
- separators=["\n\n", "\n", " ", ""],
157
- chunk_size=chunk_size,
158
- chunk_overlap=chunk_overlap
159
- )
160
- return splitter.split_text(text)
161
-
162
-
163
- def create_embeddings(texts):
164
- """Create embeddings for a list of texts.
165
- Returns: list(embeddings)"""
166
- embed_model = get_embed_model()
167
- return embed_model.encode(
168
- texts,
169
- batch_size=64,
170
- show_progress_bar=False,
171
- convert_to_numpy=True,
172
- normalize_embeddings=True
173
- )
 
133
  full_prompt = f"{instruction_text}{the_context_block}{query_footer}"
134
 
135
  return full_prompt