Update utils.py
Browse files
utils.py
CHANGED
|
@@ -133,41 +133,3 @@ A: Information unavailable."""
|
|
| 133 |
full_prompt = f"{instruction_text}{the_context_block}{query_footer}"
|
| 134 |
|
| 135 |
return full_prompt
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
def normalize_text(text):
    """Normalize text for duplicate detection.

    Lowercases the input, removes punctuation, collapses every run of
    whitespace to a single space, and trims both ends. Underscores are
    kept because the ``\\w`` class in the punctuation pattern matches them.

    Args:
        text: Value to normalize. Anything that is not a ``str`` is
            treated as empty.

    Returns:
        str: The normalized text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Remove punctuation *before* tidying whitespace. Doing it afterwards
    # leaves dangling or doubled spaces ("hello !" -> "hello "), so texts
    # that differ only in punctuation spacing would not compare equal.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, then trim leading/trailing spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
def chunk_text(text, chunk_size=384, chunk_overlap=20):
    """Break text into overlapping chunks suitable for embedding.

    Args:
        text: The text handed to the splitter.
        chunk_size: Forwarded to the splitter as ``chunk_size``
            (default 384).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 20).

    Returns:
        list: The chunks produced by the splitter's ``split_text``.
    """
    # Separator list given to the splitter: paragraph breaks, line breaks,
    # single spaces, and finally the empty string.
    splitter_options = {
        "separators": ["\n\n", "\n", " ", ""],
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
def create_embeddings(texts):
    """Encode a collection of texts with the shared embedding model.

    Args:
        texts: The texts to embed; passed straight to the model's
            ``encode`` method.

    Returns:
        The result of ``encode``, requested as NumPy output with
        normalized embeddings (one embedding per input text, per the
        original contract).
    """
    model = get_embed_model()
    # Fixed encoding settings: batches of 64, no progress bar, NumPy
    # output, normalized embeddings.
    encode_options = dict(
        batch_size=64,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return model.encode(texts, **encode_options)
|
|
|
|
| 133 |
full_prompt = f"{instruction_text}{the_context_block}{query_footer}"
|
| 134 |
|
| 135 |
return full_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|