Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -41,6 +41,7 @@ documents = load_docx_files_from_drive(docs_folder)
|
|
| 41 |
|
| 42 |
|
| 43 |
def split_extracted_text_into_chunks(documents):
|
|
|
|
| 44 |
# List to hold all chunks
|
| 45 |
chunks = []
|
| 46 |
|
|
@@ -73,6 +74,7 @@ chunks = split_extracted_text_into_chunks(documents)
|
|
| 73 |
|
| 74 |
|
| 75 |
def save_chunks_to_file(chunks, output_file_path):
|
|
|
|
| 76 |
# Open the file in write mode
|
| 77 |
with open(output_file_path, "w", encoding="utf-8") as file:
|
| 78 |
for i, chunk in enumerate(chunks, start=1):
|
|
@@ -100,6 +102,7 @@ embedding_model = HuggingFaceEmbeddings(
|
|
| 100 |
|
| 101 |
# Step 2: Embed the chunks (now simplified)
|
| 102 |
def embed_chunks(chunks):
|
|
|
|
| 103 |
return [
|
| 104 |
{"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
|
| 105 |
for chunk in chunks
|
|
@@ -111,6 +114,7 @@ embeddings = embed_chunks(chunks)
|
|
| 111 |
|
| 112 |
# Step 3: Prepare documents (unchanged)
|
| 113 |
def prepare_documents_for_chroma(embeddings):
|
|
|
|
| 114 |
return [
|
| 115 |
Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
|
| 116 |
for i, entry in enumerate(embeddings, start=1)
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
def split_extracted_text_into_chunks(documents):
|
| 44 |
+
print("Splitting text into chunks")
|
| 45 |
# List to hold all chunks
|
| 46 |
chunks = []
|
| 47 |
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
def save_chunks_to_file(chunks, output_file_path):
|
| 77 |
+
print("Saving chunks to file")
|
| 78 |
# Open the file in write mode
|
| 79 |
with open(output_file_path, "w", encoding="utf-8") as file:
|
| 80 |
for i, chunk in enumerate(chunks, start=1):
|
|
|
|
| 102 |
|
| 103 |
# Step 2: Embed the chunks (now simplified)
|
| 104 |
def embed_chunks(chunks):
|
| 105 |
+
print("Embedding the chunks")
|
| 106 |
return [
|
| 107 |
{"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
|
| 108 |
for chunk in chunks
|
|
|
|
| 114 |
|
| 115 |
# Step 3: Prepare documents (unchanged)
|
| 116 |
def prepare_documents_for_chroma(embeddings):
|
| 117 |
+
print("Preparing documents for chroma")
|
| 118 |
return [
|
| 119 |
Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
|
| 120 |
for i, entry in enumerate(embeddings, start=1)
|