Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files
- services/kb_creation.py +4 -4
services/kb_creation.py
CHANGED
|
@@ -76,9 +76,9 @@ def chunk_text(text, max_words=300):
|
|
| 76 |
|
| 77 |
def ingest_documents(folder_path):
|
| 78 |
"""Read .docx files, chunk text, generate embeddings, and store in ChromaDB."""
|
| 79 |
-
print(f"📂 Checking folder: {folder_path}")
|
| 80 |
files = [f for f in os.listdir(folder_path) if f.endswith('.docx')]
|
| 81 |
-
print(f"Found {len(files)} Word files: {files}")
|
| 82 |
|
| 83 |
if not files:
|
| 84 |
print("⚠️ No .docx files found. Please check the folder path.")
|
|
@@ -89,7 +89,7 @@ def ingest_documents(folder_path):
|
|
| 89 |
text = extract_text_from_docx(file_path)
|
| 90 |
chunks = chunk_text(text)
|
| 91 |
|
| 92 |
-
print(f"📄 Ingesting {file} with {len(chunks)} chunks")
|
| 93 |
|
| 94 |
for i, chunk in enumerate(chunks):
|
| 95 |
embedding = model.encode(chunk).tolist()
|
|
@@ -107,7 +107,7 @@ def search_knowledge_base(query, top_k=3):
|
|
| 107 |
"""Search ChromaDB using semantic similarity."""
|
| 108 |
query_embedding = model.encode(query).tolist()
|
| 109 |
results = collection.query(query_embeddings=[query_embedding], n_results=top_k,include=['embeddings','documents', 'metadatas', 'distances'])
|
| 110 |
-
print("results",results)
|
| 111 |
return results
|
| 112 |
|
| 113 |
# Example usage:
|
|
|
|
| 76 |
|
| 77 |
def ingest_documents(folder_path):
|
| 78 |
"""Read .docx files, chunk text, generate embeddings, and store in ChromaDB."""
|
| 79 |
+
#print(f"📂 Checking folder: {folder_path}")
|
| 80 |
files = [f for f in os.listdir(folder_path) if f.endswith('.docx')]
|
| 81 |
+
#print(f"Found {len(files)} Word files: {files}")
|
| 82 |
|
| 83 |
if not files:
|
| 84 |
print("⚠️ No .docx files found. Please check the folder path.")
|
|
|
|
| 89 |
text = extract_text_from_docx(file_path)
|
| 90 |
chunks = chunk_text(text)
|
| 91 |
|
| 92 |
+
#print(f"📄 Ingesting {file} with {len(chunks)} chunks")
|
| 93 |
|
| 94 |
for i, chunk in enumerate(chunks):
|
| 95 |
embedding = model.encode(chunk).tolist()
|
|
|
|
| 107 |
"""Search ChromaDB using semantic similarity."""
|
| 108 |
query_embedding = model.encode(query).tolist()
|
| 109 |
results = collection.query(query_embeddings=[query_embedding], n_results=top_k,include=['embeddings','documents', 'metadatas', 'distances'])
|
| 110 |
+
#print("results",results)
|
| 111 |
return results
|
| 112 |
|
| 113 |
# Example usage:
|