embedding flexibility and swap
Browse files
app.py
CHANGED
|
@@ -34,10 +34,16 @@ load_dotenv()
|
|
| 34 |
UPLOAD_PATH = "./uploads"
|
| 35 |
INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
|
| 36 |
INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
|
| 37 |
-
XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 38 |
-
PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 39 |
USER_EMBEDDINGS_NAME = "user_embeddings"
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
# Make sure upload directory exists
|
| 42 |
os.makedirs(UPLOAD_PATH, exist_ok=True)
|
| 43 |
|
|
@@ -61,6 +67,19 @@ qdrant_client = QdrantClient(":memory:")
|
|
| 61 |
# Create a semantic splitter for PDF documents
|
| 62 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# Utility functions
|
| 65 |
def load_and_chunk_excel_files():
|
| 66 |
"""Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
|
|
@@ -100,7 +119,16 @@ def embed_chunks_in_qdrant(chunks):
|
|
| 100 |
print("No Excel files found to process or all files were empty.")
|
| 101 |
return None
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
print("Creating vector store...")
|
| 105 |
vector_store = QdrantVectorStore.from_documents(
|
| 106 |
documents=chunks,
|
|
@@ -151,6 +179,14 @@ async def load_and_chunk_pdf_files(files):
|
|
| 151 |
|
| 152 |
return documents_with_metadata
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
|
| 155 |
"""Create a vector store and embed PDF chunks into Qdrant."""
|
| 156 |
if not documents_with_metadata:
|
|
@@ -158,7 +194,8 @@ async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MOD
|
|
| 158 |
return None
|
| 159 |
|
| 160 |
# Create a new embeddings model
|
| 161 |
-
pdf_model =
|
|
|
|
| 162 |
|
| 163 |
try:
|
| 164 |
# First, check if collection exists and delete it if it does
|
|
@@ -262,10 +299,11 @@ def search_excel_data(query: str, top_k: int = 3) -> str:
|
|
| 262 |
|
| 263 |
# If we have a user collection, also search that
|
| 264 |
try:
|
|
|
|
| 265 |
user_vectorstore = QdrantVectorStore(
|
| 266 |
client=qdrant_client,
|
| 267 |
collection_name=USER_EMBEDDINGS_NAME,
|
| 268 |
-
embedding=
|
| 269 |
)
|
| 270 |
|
| 271 |
# Create a retrieval chain for user documents
|
|
@@ -284,6 +322,7 @@ def search_excel_data(query: str, top_k: int = 3) -> str:
|
|
| 284 |
# Combine results
|
| 285 |
return f"From Excel files:\n{result}\n\nFrom your uploaded PDF:\n{user_result}"
|
| 286 |
except Exception as e:
|
|
|
|
| 287 |
# If no user collection exists yet, just return Excel results
|
| 288 |
return result
|
| 289 |
|
|
@@ -308,7 +347,7 @@ def identify_heal_instruments(protocol_text: str = "") -> str:
|
|
| 308 |
user_vectorstore = QdrantVectorStore(
|
| 309 |
client=qdrant_client,
|
| 310 |
collection_name=USER_EMBEDDINGS_NAME,
|
| 311 |
-
embedding=
|
| 312 |
)
|
| 313 |
user_retriever = user_vectorstore.as_retriever(search_kwargs={"k": 10})
|
| 314 |
except Exception as e:
|
|
@@ -521,4 +560,5 @@ async def on_message(msg: cl.Message):
|
|
| 521 |
):
|
| 522 |
await final_answer.stream_token(msg_response.content)
|
| 523 |
|
| 524 |
-
await final_answer.send()
|
|
|
|
|
|
| 34 |
UPLOAD_PATH = "./uploads"
|
| 35 |
INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
|
| 36 |
INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
|
|
|
|
|
|
|
| 37 |
USER_EMBEDDINGS_NAME = "user_embeddings"
|
| 38 |
|
| 39 |
+
#XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 40 |
+
#XLSX_MODEL_ID = "text-embedding-3-small"
|
| 41 |
+
XLSX_MODEL_ID = "pritamdeka/S-PubMedBert-MS-MARCO"
|
| 42 |
+
#PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 43 |
+
#PDF_MODEL_ID = "text-embedding-3-small"
|
| 44 |
+
PDF_MODEL_ID = "pritamdeka/S-PubMedBert-MS-MARCO"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
# Make sure upload directory exists
|
| 48 |
os.makedirs(UPLOAD_PATH, exist_ok=True)
|
| 49 |
|
|
|
|
| 67 |
# Create a semantic splitter for PDF documents
|
| 68 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 69 |
|
| 70 |
+
|
| 71 |
+
# Add this utility function after the other utility functions
|
| 72 |
+
def get_embedding_model(model_id):
    """Return an embedding model instance appropriate for *model_id*.

    Model IDs containing "text-embedding" are assumed to be OpenAI models
    and get an ``OpenAIEmbeddings`` wrapper; any other ID is treated as a
    HuggingFace model name.
    """
    if "text-embedding" not in model_id:
        # HuggingFace embeddings
        return HuggingFaceEmbeddings(model_name=model_id)
    # OpenAI embeddings — imported lazily so the dependency is only
    # required when an OpenAI model ID is actually configured
    from langchain_openai import OpenAIEmbeddings
    return OpenAIEmbeddings(model=model_id)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
# Utility functions
|
| 84 |
def load_and_chunk_excel_files():
|
| 85 |
"""Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
|
|
|
|
| 119 |
print("No Excel files found to process or all files were empty.")
|
| 120 |
return None
|
| 121 |
|
| 122 |
+
# Create embeddings model based on the configured model ID
|
| 123 |
+
if "text-embedding" in XLSX_MODEL_ID:
|
| 124 |
+
# OpenAI embeddings
|
| 125 |
+
from langchain_openai import OpenAIEmbeddings
|
| 126 |
+
xlsx_model = OpenAIEmbeddings(model=XLSX_MODEL_ID)
|
| 127 |
+
else:
|
| 128 |
+
# HuggingFace embeddings
|
| 129 |
+
xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
|
| 130 |
+
|
| 131 |
+
print(f"Using embedding model: {XLSX_MODEL_ID}")
|
| 132 |
print("Creating vector store...")
|
| 133 |
vector_store = QdrantVectorStore.from_documents(
|
| 134 |
documents=chunks,
|
|
|
|
| 179 |
|
| 180 |
return documents_with_metadata
|
| 181 |
|
| 182 |
+
# Add this utility function to get vector dimensions
|
| 183 |
+
def get_embedding_dimensions(model_id):
    """Return the length of the embedding vectors produced by *model_id*.

    Determined empirically: instantiates the model and embeds a short
    probe string, then measures the resulting vector. Note this loads the
    model, so it is relatively expensive to call.
    """
    probe = "Sample text to determine embedding dimension"
    embedder = get_embedding_model(model_id)
    vector = embedder.embed_query(probe)
    return len(vector)
|
| 189 |
+
|
| 190 |
async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
|
| 191 |
"""Create a vector store and embed PDF chunks into Qdrant."""
|
| 192 |
if not documents_with_metadata:
|
|
|
|
| 194 |
return None
|
| 195 |
|
| 196 |
# Create a new embeddings model
|
| 197 |
+
pdf_model = get_embedding_model(model_name)
|
| 198 |
+
print(f"Using embedding model: {model_name}")
|
| 199 |
|
| 200 |
try:
|
| 201 |
# First, check if collection exists and delete it if it does
|
|
|
|
| 299 |
|
| 300 |
# If we have a user collection, also search that
|
| 301 |
try:
|
| 302 |
+
# Use the same model that was used to create the collection
|
| 303 |
user_vectorstore = QdrantVectorStore(
|
| 304 |
client=qdrant_client,
|
| 305 |
collection_name=USER_EMBEDDINGS_NAME,
|
| 306 |
+
embedding=get_embedding_model(PDF_MODEL_ID) # Use PDF_MODEL_ID here
|
| 307 |
)
|
| 308 |
|
| 309 |
# Create a retrieval chain for user documents
|
|
|
|
| 322 |
# Combine results
|
| 323 |
return f"From Excel files:\n{result}\n\nFrom your uploaded PDF:\n{user_result}"
|
| 324 |
except Exception as e:
|
| 325 |
+
print(f"Error searching user vector store: {str(e)}")
|
| 326 |
# If no user collection exists yet, just return Excel results
|
| 327 |
return result
|
| 328 |
|
|
|
|
| 347 |
user_vectorstore = QdrantVectorStore(
|
| 348 |
client=qdrant_client,
|
| 349 |
collection_name=USER_EMBEDDINGS_NAME,
|
| 350 |
+
embedding=get_embedding_model(PDF_MODEL_ID) # Use PDF_MODEL_ID here
|
| 351 |
)
|
| 352 |
user_retriever = user_vectorstore.as_retriever(search_kwargs={"k": 10})
|
| 353 |
except Exception as e:
|
|
|
|
| 560 |
):
|
| 561 |
await final_answer.stream_token(msg_response.content)
|
| 562 |
|
| 563 |
+
await final_answer.send()
|
| 564 |
+
|