split chunk and embed functions
Browse files
app.py
CHANGED
|
@@ -61,12 +61,9 @@ os.makedirs(UPLOAD_PATH, exist_ok=True)
|
|
| 61 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 62 |
|
| 63 |
# Utility functions
|
| 64 |
-
def process_initial_embeddings():
|
| 65 |
-
"""Loads all .xlsx files, extracts text, embeds, and stores in Qdrant."""
|
| 66 |
-
|
| 67 |
-
xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
|
| 68 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 69 |
-
|
| 70 |
all_chunks = []
|
| 71 |
file_count = 0
|
| 72 |
|
|
@@ -94,15 +91,18 @@ def process_initial_embeddings():
|
|
| 94 |
print(f"Error processing {file}: {str(e)}")
|
| 95 |
|
| 96 |
print(f"Processed {file_count} Excel files with a total of {len(all_chunks)} chunks.")
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
| 100 |
print("No Excel files found to process or all files were empty.")
|
| 101 |
return None
|
| 102 |
|
|
|
|
| 103 |
print("Creating vector store...")
|
| 104 |
vector_store = QdrantVectorStore.from_documents(
|
| 105 |
-
documents=all_chunks,
|
| 106 |
embedding=xlsx_model,
|
| 107 |
location=":memory:",
|
| 108 |
collection_name=INITIAL_EMBEDDINGS_NAME
|
|
@@ -110,6 +110,11 @@ def process_initial_embeddings():
|
|
| 110 |
print(f"Successfully loaded all .xlsx files into Qdrant collection '{INITIAL_EMBEDDINGS_NAME}'.")
|
| 111 |
return vector_store
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def format_docs(docs):
|
| 114 |
return "\n\n".join(doc.page_content for doc in docs)
|
| 115 |
|
|
|
|
| 61 |
semantic_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
| 62 |
|
| 63 |
# Utility functions
|
| 64 |
+
def load_and_chunk_excel_files():
|
| 65 |
+
"""Loads all .xlsx files from the initial embeddings directory and splits them into chunks."""
|
|
|
|
|
|
|
| 66 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
|
|
|
|
| 67 |
all_chunks = []
|
| 68 |
file_count = 0
|
| 69 |
|
|
|
|
| 91 |
print(f"Error processing {file}: {str(e)}")
|
| 92 |
|
| 93 |
print(f"Processed {file_count} Excel files with a total of {len(all_chunks)} chunks.")
|
| 94 |
+
return all_chunks
|
| 95 |
+
|
| 96 |
+
def embed_chunks_in_qdrant(chunks):
|
| 97 |
+
"""Embeds document chunks and stores them in Qdrant."""
|
| 98 |
+
if not chunks:
|
| 99 |
print("No Excel files found to process or all files were empty.")
|
| 100 |
return None
|
| 101 |
|
| 102 |
+
xlsx_model = HuggingFaceEmbeddings(model_name=XLSX_MODEL_ID)
|
| 103 |
print("Creating vector store...")
|
| 104 |
vector_store = QdrantVectorStore.from_documents(
|
| 105 |
+
documents=chunks,
|
| 106 |
embedding=xlsx_model,
|
| 107 |
location=":memory:",
|
| 108 |
collection_name=INITIAL_EMBEDDINGS_NAME
|
|
|
|
| 110 |
print(f"Successfully loaded all .xlsx files into Qdrant collection '{INITIAL_EMBEDDINGS_NAME}'.")
|
| 111 |
return vector_store
|
| 112 |
|
| 113 |
+
def process_initial_embeddings():
|
| 114 |
+
"""Loads all .xlsx files, extracts text, embeds, and stores in Qdrant."""
|
| 115 |
+
chunks = load_and_chunk_excel_files()
|
| 116 |
+
return embed_chunks_in_qdrant(chunks)
|
| 117 |
+
|
| 118 |
def format_docs(docs):
|
| 119 |
return "\n\n".join(doc.page_content for doc in docs)
|
| 120 |
|