drewgenai committed on
Commit
9ad40ad
·
1 Parent(s): 8b32ebd

embedding flexibility and swap

Browse files
Files changed (1) hide show
  1. app.py +47 -7
app.py CHANGED
@@ -34,10 +34,16 @@ load_dotenv()
34
  UPLOAD_PATH = "./uploads"
35
  INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
36
  INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
37
- XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
38
- PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
39
  USER_EMBEDDINGS_NAME = "user_embeddings"
40
 
 
 
 
 
 
 
 
 
41
  # Make sure upload directory exists
42
  os.makedirs(UPLOAD_PATH, exist_ok=True)
43
 
@@ -61,6 +67,19 @@ qdrant_client = QdrantClient(":memory:")
61
  # Create a semantic splitter for PDF documents
62
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # Utility functions
65
  def load_and_chunk_excel_files():
66
  """Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
@@ -100,7 +119,16 @@ def embed_chunks_in_qdrant(chunks):
100
  print("No Excel files found to process or all files were empty.")
101
  return None
102
 
103
- xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
 
 
 
 
 
 
 
 
 
104
  print("Creating vector store...")
105
  vector_store = QdrantVectorStore.from_documents(
106
  documents=chunks,
@@ -151,6 +179,14 @@ async def load_and_chunk_pdf_files(files):
151
 
152
  return documents_with_metadata
153
 
 
 
 
 
 
 
 
 
154
  async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
155
  """Create a vector store and embed PDF chunks into Qdrant."""
156
  if not documents_with_metadata:
@@ -158,7 +194,8 @@ async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MOD
158
  return None
159
 
160
  # Create a new embeddings model
161
- pdf_model = HuggingFaceEmbeddings(model_name=model_name)
 
162
 
163
  try:
164
  # First, check if collection exists and delete it if it does
@@ -262,10 +299,11 @@ def search_excel_data(query: str, top_k: int = 3) -> str:
262
 
263
  # If we have a user collection, also search that
264
  try:
 
265
  user_vectorstore = QdrantVectorStore(
266
  client=qdrant_client,
267
  collection_name=USER_EMBEDDINGS_NAME,
268
- embedding=HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
269
  )
270
 
271
  # Create a retrieval chain for user documents
@@ -284,6 +322,7 @@ def search_excel_data(query: str, top_k: int = 3) -> str:
284
  # Combine results
285
  return f"From Excel files:\n{result}\n\nFrom your uploaded PDF:\n{user_result}"
286
  except Exception as e:
 
287
  # If no user collection exists yet, just return Excel results
288
  return result
289
 
@@ -308,7 +347,7 @@ def identify_heal_instruments(protocol_text: str = "") -> str:
308
  user_vectorstore = QdrantVectorStore(
309
  client=qdrant_client,
310
  collection_name=USER_EMBEDDINGS_NAME,
311
- embedding=HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
312
  )
313
  user_retriever = user_vectorstore.as_retriever(search_kwargs={"k": 10})
314
  except Exception as e:
@@ -521,4 +560,5 @@ async def on_message(msg: cl.Message):
521
  ):
522
  await final_answer.stream_token(msg_response.content)
523
 
524
- await final_answer.send()
 
 
34
  UPLOAD_PATH = "./uploads"
35
  INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
36
  INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
 
 
37
  USER_EMBEDDINGS_NAME = "user_embeddings"
38
 
39
+ #XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
40
+ #XLSX_MODEL_ID = "text-embedding-3-small"
41
+ XLSX_MODEL_ID = "pritamdeka/S-PubMedBert-MS-MARCO"
42
+ #PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
43
+ #PDF_MODEL_ID = "text-embedding-3-small"
44
+ PDF_MODEL_ID = "pritamdeka/S-PubMedBert-MS-MARCO"
45
+
46
+
47
  # Make sure upload directory exists
48
  os.makedirs(UPLOAD_PATH, exist_ok=True)
49
 
 
67
  # Create a semantic splitter for PDF documents
68
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
69
 
70
+
71
# Embedding model instances are cached per model ID. This factory is called
# from the search paths each time they build the user vector store, and
# constructing an embedding model on every call is wasted work.
_EMBEDDING_MODEL_CACHE = {}


def get_embedding_model(model_id):
    """Return the embedding model for ``model_id``, creating it on first use.

    IDs containing ``"text-embedding"`` are treated as OpenAI embedding
    models; every other ID is passed to ``HuggingFaceEmbeddings`` as the
    model name. Instances are cached per model ID, so repeated calls with
    the same ID return the same object instead of building a new one.

    Args:
        model_id: Embedding model identifier string.

    Returns:
        An ``OpenAIEmbeddings`` or ``HuggingFaceEmbeddings`` instance.
    """
    cached = _EMBEDDING_MODEL_CACHE.get(model_id)
    if cached is not None:
        return cached

    if "text-embedding" in model_id:
        # Imported lazily so langchain_openai is only required when an
        # OpenAI model is actually configured.
        from langchain_openai import OpenAIEmbeddings
        model = OpenAIEmbeddings(model=model_id)
    else:
        model = HuggingFaceEmbeddings(model_name=model_id)

    _EMBEDDING_MODEL_CACHE[model_id] = model
    return model
81
+
82
+
83
  # Utility functions
84
  def load_and_chunk_excel_files():
85
  """Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
 
119
  print("No Excel files found to process or all files were empty.")
120
  return None
121
 
122
+ # Create embeddings model based on the configured model ID
123
+ if "text-embedding" in XLSX_MODEL_ID:
124
+ # OpenAI embeddings
125
+ from langchain_openai import OpenAIEmbeddings
126
+ xlsx_model = OpenAIEmbeddings(model=XLSX_MODEL_ID)
127
+ else:
128
+ # HuggingFace embeddings
129
+ xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
130
+
131
+ print(f"Using embedding model: {XLSX_MODEL_ID}")
132
  print("Creating vector store...")
133
  vector_store = QdrantVectorStore.from_documents(
134
  documents=chunks,
 
179
 
180
  return documents_with_metadata
181
 
182
def get_embedding_dimensions(model_id):
    """Return the length of the embedding vectors produced for ``model_id``.

    The size is measured rather than looked up: a fixed probe sentence is
    embedded with the model returned by ``get_embedding_model`` and the
    length of the resulting vector is reported.

    Args:
        model_id: Embedding model identifier passed to ``get_embedding_model``.

    Returns:
        The number of elements in one query embedding.
    """
    probe = "Sample text to determine embedding dimension"
    vector = get_embedding_model(model_id).embed_query(probe)
    return len(vector)
189
+
190
  async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
191
  """Create a vector store and embed PDF chunks into Qdrant."""
192
  if not documents_with_metadata:
 
194
  return None
195
 
196
  # Create a new embeddings model
197
+ pdf_model = get_embedding_model(model_name)
198
+ print(f"Using embedding model: {model_name}")
199
 
200
  try:
201
  # First, check if collection exists and delete it if it does
 
299
 
300
  # If we have a user collection, also search that
301
  try:
302
+ # Use the same model that was used to create the collection
303
  user_vectorstore = QdrantVectorStore(
304
  client=qdrant_client,
305
  collection_name=USER_EMBEDDINGS_NAME,
306
+ embedding=get_embedding_model(PDF_MODEL_ID) # Use PDF_MODEL_ID here
307
  )
308
 
309
  # Create a retrieval chain for user documents
 
322
  # Combine results
323
  return f"From Excel files:\n{result}\n\nFrom your uploaded PDF:\n{user_result}"
324
  except Exception as e:
325
+ print(f"Error searching user vector store: {str(e)}")
326
  # If no user collection exists yet, just return Excel results
327
  return result
328
 
 
347
  user_vectorstore = QdrantVectorStore(
348
  client=qdrant_client,
349
  collection_name=USER_EMBEDDINGS_NAME,
350
+ embedding=get_embedding_model(PDF_MODEL_ID) # Use PDF_MODEL_ID here
351
  )
352
  user_retriever = user_vectorstore.as_retriever(search_kwargs={"k": 10})
353
  except Exception as e:
 
560
  ):
561
  await final_answer.stream_token(msg_response.content)
562
 
563
+ await final_answer.send()
564
+