drewgenai committed on
Commit
9e80670
·
1 Parent(s): 2a037e9

split chunk and embed functions

Browse files
Files changed (1) hide show
  1. app.py +14 -9
app.py CHANGED
@@ -61,12 +61,9 @@ os.makedirs(UPLOAD_PATH, exist_ok=True)
61
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
62
 
63
  # Utility functions
64
- def process_initial_embeddings():
65
- """Loads all .xlsx files, extracts text, embeds, and stores in Qdrant."""
66
-
67
- xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
68
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
69
-
70
  all_chunks = []
71
  file_count = 0
72
 
@@ -94,15 +91,18 @@ def process_initial_embeddings():
94
  print(f"Error processing {file}: {str(e)}")
95
 
96
  print(f"Processed {file_count} Excel files with a total of {len(all_chunks)} chunks.")
97
-
98
- # Create vector store with all documents at once
99
- if not all_chunks:
 
 
100
  print("No Excel files found to process or all files were empty.")
101
  return None
102
 
 
103
  print("Creating vector store...")
104
  vector_store = QdrantVectorStore.from_documents(
105
- documents=all_chunks,
106
  embedding=xlsx_model,
107
  location=":memory:",
108
  collection_name=INITIAL_EMBEDDINGS_NAME
@@ -110,6 +110,11 @@ def process_initial_embeddings():
110
  print(f"Successfully loaded all .xlsx files into Qdrant collection '{INITIAL_EMBEDDINGS_NAME}'.")
111
  return vector_store
112
 
 
 
 
 
 
113
  def format_docs(docs):
114
  return "\n\n".join(doc.page_content for doc in docs)
115
 
 
61
  semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
62
 
63
  # Utility functions
64
+ def load_and_chunk_excel_files():
65
+ """Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
 
 
66
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
 
67
  all_chunks = []
68
  file_count = 0
69
 
 
91
  print(f"Error processing {file}: {str(e)}")
92
 
93
  print(f"Processed {file_count} Excel files with a total of {len(all_chunks)} chunks.")
94
+ return all_chunks
95
+
96
+ def embed_chunks_in_qdrant(chunks):
97
+ """Embeds document chunks and stores them in Qdrant."""
98
+ if not chunks:
99
  print("No Excel files found to process or all files were empty.")
100
  return None
101
 
102
+ xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
103
  print("Creating vector store...")
104
  vector_store = QdrantVectorStore.from_documents(
105
+ documents=chunks,
106
  embedding=xlsx_model,
107
  location=":memory:",
108
  collection_name=INITIAL_EMBEDDINGS_NAME
 
110
  print(f"Successfully loaded all .xlsx files into Qdrant collection '{INITIAL_EMBEDDINGS_NAME}'.")
111
  return vector_store
112
 
113
+ def process_initial_embeddings():
114
+ """Loads all .xlsx files, extracts text, embeds, and stores in Qdrant."""
115
+ chunks = load_and_chunk_excel_files()
116
+ return embed_chunks_in_qdrant(chunks)
117
+
118
  def format_docs(docs):
119
  return "\n\n".join(doc.page_content for doc in docs)
120