Break up user upload functions
Browse files
app.py
CHANGED
|
@@ -31,12 +31,16 @@ from langchain_core.output_parsers import StrOutputParser
|
|
| 31 |
load_dotenv()
|
| 32 |
|
| 33 |
# Constants
|
|
|
|
| 34 |
INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
|
| 35 |
INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
|
| 36 |
XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 37 |
-
|
| 38 |
USER_EMBEDDINGS_NAME = "user_embeddings"
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
# NIH HEAL CDE core domains
|
| 41 |
NIH_HEAL_DOMAINS = [
|
| 42 |
"Pain intensity",
|
|
@@ -54,9 +58,6 @@ NIH_HEAL_DOMAINS = [
|
|
| 54 |
# Initialize Qdrant (in-memory)
|
| 55 |
qdrant_client = QdrantClient(":memory:")
|
| 56 |
|
| 57 |
-
# Make sure upload directory exists
|
| 58 |
-
os.makedirs(UPLOAD_PATH, exist_ok=True)
|
| 59 |
-
|
| 60 |
# Create a semantic splitter for PDF documents
|
| 61 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 62 |
|
|
@@ -118,9 +119,9 @@ def process_initial_embeddings():
|
|
| 118 |
def format_docs(docs):
    """Return the page content of every document, separated by blank lines."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)
|
| 120 |
|
| 121 |
-
async def process_uploaded_files(files, model_name=XLSX_MODEL_ID):
|
| 122 |
-
"""
|
| 123 |
-
print(f"
|
| 124 |
documents_with_metadata = []
|
| 125 |
|
| 126 |
for file in files:
|
|
@@ -148,47 +149,58 @@ async def process_uploaded_files(files, model_name=XLSX_MODEL_ID):
|
|
| 148 |
except Exception as e:
|
| 149 |
print(f"Error processing {file.name}: {str(e)}")
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
# Get the embedding dimension by creating a sample embedding
|
| 163 |
-
sample_text = "Sample text to determine embedding dimension"
|
| 164 |
-
sample_embedding = pdf_model.embed_query(sample_text)
|
| 165 |
-
embedding_dimension = len(sample_embedding)
|
| 166 |
-
|
| 167 |
-
qdrant_client.create_collection(
|
| 168 |
-
collection_name=USER_EMBEDDINGS_NAME,
|
| 169 |
-
vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
|
| 170 |
-
)
|
| 171 |
-
|
| 172 |
-
# Create the vector store
|
| 173 |
-
user_vectorstore = QdrantVectorStore(
|
| 174 |
-
client=qdrant_client,
|
| 175 |
-
collection_name=USER_EMBEDDINGS_NAME,
|
| 176 |
-
embedding=pdf_model
|
| 177 |
-
)
|
| 178 |
-
|
| 179 |
-
# Add documents to the vector store
|
| 180 |
-
user_vectorstore.add_documents(documents_with_metadata)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
# Data processing and initialization
|
| 190 |
vectorstore = process_initial_embeddings()
|
| 191 |
|
|
|
|
| 192 |
# Create a retriever from the vector store
|
| 193 |
if vectorstore:
|
| 194 |
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
|
|
|
|
| 31 |
load_dotenv()
|
| 32 |
|
| 33 |
# Constants
|
| 34 |
+
UPLOAD_PATH = "./uploads"
|
| 35 |
INITIAL_EMBEDDINGS_DIR = "./initial_embeddings"
|
| 36 |
INITIAL_EMBEDDINGS_NAME = "initial_embeddings"
|
| 37 |
XLSX_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 38 |
+
PDF_MODEL_ID = "Snowflake/snowflake-arctic-embed-m"
|
| 39 |
USER_EMBEDDINGS_NAME = "user_embeddings"
|
| 40 |
|
| 41 |
+
# Make sure upload directory exists
|
| 42 |
+
os.makedirs(UPLOAD_PATH, exist_ok=True)
|
| 43 |
+
|
| 44 |
# NIH HEAL CDE core domains
|
| 45 |
NIH_HEAL_DOMAINS = [
|
| 46 |
"Pain intensity",
|
|
|
|
| 58 |
# Initialize Qdrant (in-memory)
|
| 59 |
qdrant_client = QdrantClient(":memory:")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
# Create a semantic splitter for PDF documents
|
| 62 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 63 |
|
|
|
|
| 119 |
def format_docs(docs):
    """Return the page content of every document, separated by blank lines."""
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)
|
| 121 |
|
| 122 |
+
async def load_and_chunk_pdf_files(files):
|
| 123 |
+
"""Load PDF files and split them into chunks with metadata."""
|
| 124 |
+
print(f"Loading {len(files)} uploaded PDF files")
|
| 125 |
documents_with_metadata = []
|
| 126 |
|
| 127 |
for file in files:
|
|
|
|
| 149 |
except Exception as e:
|
| 150 |
print(f"Error processing {file.name}: {str(e)}")
|
| 151 |
|
| 152 |
+
return documents_with_metadata
|
| 153 |
+
|
| 154 |
+
async def embed_pdf_chunks_in_qdrant(documents_with_metadata, model_name=PDF_MODEL_ID):
    """Embed PDF chunks into a freshly created Qdrant collection.

    Any existing ``USER_EMBEDDINGS_NAME`` collection is deleted and
    recreated before the given chunks are added.

    Args:
        documents_with_metadata: Chunked documents to embed.
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.

    Returns:
        The populated ``QdrantVectorStore``, or ``None`` when there are no
        documents to embed or when building the store raises.
    """
    if not documents_with_metadata:
        print("No documents to embed")
        return None

    # Built outside the try block, so a model-loading failure propagates
    # to the caller instead of being reported as a vector store error.
    embedder = HuggingFaceEmbeddings(model_name=model_name)

    try:
        # Drop any collection left over from a previous upload.
        existing_names = {
            collection.name
            for collection in qdrant_client.get_collections().collections
        }
        if USER_EMBEDDINGS_NAME in existing_names:
            qdrant_client.delete_collection(USER_EMBEDDINGS_NAME)

        # Embed a throwaway sentence to learn the model's vector size,
        # which the collection needs at creation time.
        probe_vector = embedder.embed_query("Sample text to determine embedding dimension")
        vector_params = VectorParams(size=len(probe_vector), distance=Distance.COSINE)
        qdrant_client.create_collection(
            collection_name=USER_EMBEDDINGS_NAME,
            vectors_config=vector_params
        )

        # Wrap the new collection in a vector store and load the chunks.
        store = QdrantVectorStore(
            client=qdrant_client,
            collection_name=USER_EMBEDDINGS_NAME,
            embedding=embedder
        )
        store.add_documents(documents_with_metadata)

        print(f"Added {len(documents_with_metadata)} chunks from uploaded files to collection '{USER_EMBEDDINGS_NAME}'")
        return store
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Error creating vector store: {str(e)}")
        return None
|
| 194 |
+
|
| 195 |
+
async def process_uploaded_files(files, model_name=PDF_MODEL_ID):
    """Chunk uploaded PDF files and embed them into the user collection.

    Args:
        files: Uploaded files, passed to ``load_and_chunk_pdf_files``.
        model_name: Embedding model identifier, passed to
            ``embed_pdf_chunks_in_qdrant``.

    Returns:
        Whatever ``embed_pdf_chunks_in_qdrant`` returns for the chunks.
    """
    chunks = await load_and_chunk_pdf_files(files)
    user_store = await embed_pdf_chunks_in_qdrant(chunks, model_name)
    return user_store
|
| 199 |
|
| 200 |
# Data processing and initialization
|
| 201 |
vectorstore = process_initial_embeddings()
|
| 202 |
|
| 203 |
+
|
| 204 |
# Create a retriever from the vector store
|
| 205 |
if vectorstore:
|
| 206 |
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
|