drewgenai committed on
Commit
249bea4
·
1 Parent(s): 9e80670

Break up user upload functions

Browse files
Files changed (1) hide show
  1. app.py +54 -42
app.py CHANGED
@@ -31,12 +31,16 @@ from langchain_core.output_parsers import StrOutputParser
31
  load_dotenv()
32
 
33
  # Constants
 
34
  INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
35
  INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
36
  XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
37
- UPLOAD_PATH = "./uploads"
38
  USER_EMBEDDINGS_NAME = "user_embeddings"
39
 
 
 
 
40
  # NIH HEAL CDE core domains
41
  NIH_HEAL_DOMAINS = [
42
  "Pain intensity",
@@ -54,9 +58,6 @@ NIH_HEAL_DOMAINS = [
54
  # Initialize Qdrant (in-memory)
55
  qdrant_client = QdrantClient(":memory:")
56
 
57
- # Make sure upload directory exists
58
- os.makedirs(UPLOAD_PATH, exist_ok=True)
59
-
60
  # Create a semantic splitter for PDF documents
61
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
62
 
@@ -118,9 +119,9 @@ def process_initial_embeddings():
118
  def format_docs(docs):
119
  return "\n\n".join(doc.page_content for doc in docs)
120
 
121
- async def process_uploaded_files(files, model_name=XLSX_MODEL_ID):
122
- """Process uploaded PDF files and add them to a separate vector store collection"""
123
- print(f"Processing {len(files)} uploaded files")
124
  documents_with_metadata = []
125
 
126
  for file in files:
@@ -148,47 +149,58 @@ async def process_uploaded_files(files, model_name=XLSX_MODEL_ID):
148
  except Exception as e:
149
  print(f"Error processing {file.name}: {str(e)}")
150
 
151
- if documents_with_metadata:
152
- # Create a new embeddings model
153
- pdf_model = HuggingFaceEmbeddings(model_name=model_name)
 
 
 
 
154
 
155
- # Create a new vector store collection for user uploads
156
- try:
157
- # First, check if collection exists and delete it if it does
158
- if USER_EMBEDDINGS_NAME in [c.name for c in qdrant_client.get_collections().collections]:
159
- qdrant_client.delete_collection(USER_EMBEDDINGS_NAME)
160
-
161
- # Create the collection with proper parameters
162
- # Get the embedding dimension by creating a sample embedding
163
- sample_text = "Sample text to determine embedding dimension"
164
- sample_embedding = pdf_model.embed_query(sample_text)
165
- embedding_dimension = len(sample_embedding)
166
-
167
- qdrant_client.create_collection(
168
- collection_name=USER_EMBEDDINGS_NAME,
169
- vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
170
- )
171
-
172
- # Create the vector store
173
- user_vectorstore = QdrantVectorStore(
174
- client=qdrant_client,
175
- collection_name=USER_EMBEDDINGS_NAME,
176
- embedding=pdf_model
177
- )
178
-
179
- # Add documents to the vector store
180
- user_vectorstore.add_documents(documents_with_metadata)
181
 
182
- print(f"Added {len(documents_with_metadata)} chunks from uploaded files to collection '{USER_EMBEDDINGS_NAME}'")
183
- return user_vectorstore
184
- except Exception as e:
185
- print(f"Error creating vector store: {str(e)}")
186
- return None
187
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # Data processing and initialization
190
  vectorstore = process_initial_embeddings()
191
 
 
192
  # Create a retriever from the vector store
193
  if vectorstore:
194
  retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
 
31
  load_dotenv()
32
 
33
  # Constants
34
+ UPLOAD_PATH = "./uploads"
35
  INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
36
  INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
37
  XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
38
+ PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
39
  USER_EMBEDDINGS_NAME = "user_embeddings"
40
 
41
+ # Make sure upload directory exists
42
+ os.makedirs(UPLOAD_PATH, exist_ok=True)
43
+
44
  # NIH HEAL CDE core domains
45
  NIH_HEAL_DOMAINS = [
46
  "Pain intensity",
 
58
  # Initialize Qdrant (in-memory)
59
  qdrant_client = QdrantClient(":memory:")
60
 
 
 
 
61
  # Create a semantic splitter for PDF documents
62
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
63
 
 
119
  def format_docs(docs):
120
  return "\n\n".join(doc.page_content for doc in docs)
121
 
122
+ async def load_and_chunk_pdf_files(files):
123
+ """Load PDF files and split them into chunks with metadata."""
124
+ print(f"Loading {len(files)} uploaded PDF files")
125
  documents_with_metadata = []
126
 
127
  for file in files:
 
149
  except Exception as e:
150
  print(f"Error processing {file.name}: {str(e)}")
151
 
152
+ return documents_with_metadata
153
+
154
async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
    """Embed PDF chunks into a freshly created Qdrant collection.

    Any existing collection named USER_EMBEDDINGS_NAME is deleted, then a new
    one is created whose vector size equals the length of a sample embedding
    produced by the model, and the chunks are added to it.

    Args:
        documents_with_metadata: Chunks handed to the vector store's
            ``add_documents``; an empty/falsy value short-circuits.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The ``QdrantVectorStore`` bound to the new collection, or ``None``
        when there is nothing to embed or any step inside the ``try`` fails.
    """
    # Guard clause: nothing to do without chunks.
    if not documents_with_metadata:
        print("No documents to embed")
        return None

    # The model is built outside the try block, so a failure to construct it
    # propagates to the caller instead of being turned into None.
    embedder = HuggingFaceEmbeddings(model_name=model_name)

    try:
        # Start from a clean slate: drop a previous user collection if present.
        existing_names = [
            collection.name
            for collection in qdrant_client.get_collections().collections
        ]
        if USER_EMBEDDINGS_NAME in existing_names:
            qdrant_client.delete_collection(USER_EMBEDDINGS_NAME)

        # Embed a throwaway sentence once to learn the vector dimension.
        probe_vector = embedder.embed_query("Sample text to determine embedding dimension")
        vector_size = len(probe_vector)

        collection_params = VectorParams(size=vector_size, distance=Distance.COSINE)
        qdrant_client.create_collection(
            collection_name=USER_EMBEDDINGS_NAME,
            vectors_config=collection_params,
        )

        # Wrap the new collection in a vector store and load the chunks.
        store = QdrantVectorStore(
            client=qdrant_client,
            collection_name=USER_EMBEDDINGS_NAME,
            embedding=embedder,
        )
        store.add_documents(documents_with_metadata)

        print(f"Added {len(documents_with_metadata)} chunks from uploaded files to collection '{USER_EMBEDDINGS_NAME}'")
        return store
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Error creating vector store: {str(e)}")
        return None
194
+
195
async def process_uploaded_files(files, model_name=PDF_MODEL_ID):
    """Chunk the uploaded PDF files and embed the chunks into Qdrant.

    Args:
        files: Uploaded files, forwarded to ``load_and_chunk_pdf_files``.
        model_name: Embedding model name, forwarded to
            ``embed_pdf_chunks_in_qdrant``.

    Returns:
        Whatever ``embed_pdf_chunks_in_qdrant`` returns for the chunks.
    """
    chunks = await load_and_chunk_pdf_files(files)
    user_store = await embed_pdf_chunks_in_qdrant(chunks, model_name)
    return user_store
199
 
200
  # Data processing and initialization
201
  vectorstore = process_initial_embeddings()
202
 
203
+
204
  # Create a retriever from the vector store
205
  if vectorstore:
206
  retriever = vectorstore.as_retriever(search_kwargs={"k": 10})