Update kb_embed.py

kb_embed.py  CHANGED  (+46 -29)
@@ -1,73 +1,91 @@
+
+# kb_embed.py
 from pathlib import Path
 import os
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb
 from chromadb.config import Settings
+import logging
+
+logging.basicConfig(level=logging.INFO)
 
-# --- Paths (relative to this file) ---
 BASE_DIR = Path(__file__).resolve().parent
 CHROMA_DIR = BASE_DIR / "chroma_db"
-MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2"
+MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2"  # optional local cache
 DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"
 
-# Ensure persistence folder exists
 CHROMA_DIR.mkdir(parents=True, exist_ok=True)
 
-# --- ChromaDB persistent client ---
 client = chromadb.PersistentClient(
     path=str(CHROMA_DIR),
     settings=Settings(anonymized_telemetry=False)
 )
 collection = client.get_or_create_collection(name="knowledge_base")
 
-#
-
-
-
-#
+# Use default HF cache (simpler on Spaces). If you must use local folder, keep cache_folder.
+try:
+    # Prefer auto-download and cache:
+    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    # If you want to use local cache dir: uncomment
+    # model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
+except Exception as e:
+    logging.exception(f"Failed to load embedding model: {e}")
+    raise
 
 def extract_text_from_docx(file_path: str) -> str:
-    """Extract text from a .docx file."""
     doc = Document(file_path)
-    return "\n".join(para.text for para in doc.paragraphs)
+    return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
 
 def chunk_text(text: str, max_words: int = 300):
-    """Split text into smaller chunks for better embedding quality."""
     words = text.split()
-
+    chunks = [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
+    return [c for c in chunks if c.strip()]
 
 def ingest_documents(folder_path: str):
-    "
-
+    logging.info(f"📁 Checking folder: {folder_path}")
+    if not os.path.isdir(folder_path):
+        logging.warning(f"❌ Invalid folder path: {folder_path}")
+        return
+
     files = [f for f in os.listdir(folder_path) if f.lower().endswith(".docx")]
-
+    logging.info(f"Found {len(files)} Word files: {files}")
 
     if not files:
-
+        logging.warning("⚠️ No .docx files found. Please check the folder path.")
         return
 
+    added = 0
     for file in files:
         file_path = os.path.join(folder_path, file)
         text = extract_text_from_docx(file_path)
         chunks = chunk_text(text)
 
-
+        if not chunks:
+            logging.warning(f"⚠️ No text chunks extracted from {file}")
+            continue
+
+        logging.info(f"📄 Ingesting {file} with {len(chunks)} chunks")
 
         for i, chunk in enumerate(chunks):
            embedding = model.encode(chunk).tolist()
            doc_id = f"{file}_{i}"
-            collection.add(
-                ids=[doc_id],
-                embeddings=[embedding],
-                documents=[chunk],
-                metadatas=[{"filename": file}]
-            )
 
-
+            # Avoid duplicate ids (if re-ingesting)
+            try:
+                collection.add(
+                    ids=[doc_id],
+                    embeddings=[embedding],
+                    documents=[chunk],
+                    metadatas=[{"filename": file, "chunk_index": i}]
+                )
+                added += 1
+            except Exception as e:
+                logging.warning(f"Skipping duplicate or failed add for {doc_id}: {e}")
+
+    logging.info(f"✅ Documents ingested. Added entries: {added}. Total entries: {collection.count()}")
 
 def search_knowledge_base(query: str, top_k: int = 3):
-    """Search ChromaDB using semantic similarity."""
     query_embedding = model.encode(query).tolist()
     results = collection.query(
         query_embeddings=[query_embedding],
@@ -77,8 +95,7 @@ def search_knowledge_base(query: str, top_k: int = 3):
     return results
 
 def main():
-
-    ingest_documents(str(DOCS_DIR)) if DOCS_DIR.exists() else print(f"❌ Invalid folder path: {DOCS_DIR}")
+    ingest_documents(str(DOCS_DIR)) if DOCS_DIR.exists() else logging.error(f"❌ Invalid folder path: {DOCS_DIR}")
 
 if __name__ == "__main__":
-    main()
+    main()
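A note on the rewritten chunk_text: it splits on fixed 300-word boundaries, so a sentence that straddles a boundary lands half in each chunk. A common variant is to overlap consecutive windows. A self-contained sketch; the function name and overlap parameter are illustrative, not part of this commit:

def chunk_text_overlap(text: str, max_words: int = 300, overlap: int = 50):
    """Like chunk_text, but consecutive windows share `overlap` words."""
    words = text.split()
    step = max(1, max_words - overlap)  # guard against overlap >= max_words
    chunks = [" ".join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return [c for c in chunks if c.strip()]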
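The ingest loop calls model.encode once per chunk. SentenceTransformer.encode also accepts a list and returns one vector per item, and Chroma's add accepts batched lists, so each file could be embedded in a single call. A minimal sketch, assuming the names from ingest_documents (file, chunks, model, collection) are in scope:

# Batch-embed all chunks of one file, then add them in one call.
ids = [f"{file}_{i}" for i in range(len(chunks))]
embeddings = model.encode(chunks).tolist()
metadatas = [{"filename": file, "chunk_index": i} for i in range(len(chunks))]
collection.add(ids=ids, embeddings=embeddings, documents=chunks, metadatas=metadatas)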
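The try/except around collection.add treats a duplicate id as "skip". If the installed chromadb exposes Collection.upsert, re-ingestion can instead overwrite existing entries, which keeps chunks current after a source document is edited. A sketch under that assumption, with the same in-scope names as above:

# Overwrite-on-duplicate instead of skip-on-duplicate.
for i, chunk in enumerate(chunks):
    collection.upsert(
        ids=[f"{file}_{i}"],
        embeddings=[model.encode(chunk).tolist()],
        documents=[chunk],
        metadatas=[{"filename": file, "chunk_index": i}],
    )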
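To sanity-check the ingested collection, search_knowledge_base can be called directly. A sketch only: the query string and result handling are illustrative, the body of search_knowledge_base is partly outside this diff, and importing kb_embed also constructs the client and loads the model as side effects:

from kb_embed import search_knowledge_base

results = search_knowledge_base("password reset procedure", top_k=3)
# Chroma's query returns parallel per-query lists of documents and metadatas.
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f'{meta["filename"]} (chunk {meta["chunk_index"]}): {doc[:80]}')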