Jaita committed on
Commit
740f68a
Β·
verified Β·
1 Parent(s): 531a3a1

Update kb_embed.py

Browse files
Files changed (1) hide show
  1. kb_embed.py +46 -29
kb_embed.py CHANGED
@@ -1,73 +1,91 @@
 
 
1
  from pathlib import Path
2
  import os
3
  from docx import Document
4
  from sentence_transformers import SentenceTransformer
5
  import chromadb
6
  from chromadb.config import Settings
 
 
 
7
 
8
- # --- Paths (relative to this file) ---
9
  BASE_DIR = Path(__file__).resolve().parent
10
  CHROMA_DIR = BASE_DIR / "chroma_db"
11
- MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2"
12
  DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"
13
 
14
- # Ensure persistence folder exists
15
  CHROMA_DIR.mkdir(parents=True, exist_ok=True)
16
 
17
- # --- ChromaDB persistent client ---
18
  client = chromadb.PersistentClient(
19
  path=str(CHROMA_DIR),
20
  settings=Settings(anonymized_telemetry=False)
21
  )
22
  collection = client.get_or_create_collection(name="knowledge_base")
23
 
24
- # --- Embedding model ---
25
- # If you keep the model folder locally in the repo:
26
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
27
- # Or, if you prefer auto-download & cache (no local folder required):
28
- # model = SentenceTransformer("all-MiniLM-L6-v2")
 
 
 
 
29
 
30
  def extract_text_from_docx(file_path: str) -> str:
31
- """Extract text from a .docx file."""
32
  doc = Document(file_path)
33
- return "\n".join(para.text for para in doc.paragraphs)
34
 
35
  def chunk_text(text: str, max_words: int = 300):
36
- """Split text into smaller chunks for better embedding quality."""
37
  words = text.split()
38
- return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
 
39
 
40
  def ingest_documents(folder_path: str):
41
- """Read .docx files, chunk text, generate embeddings, and store in ChromaDB."""
42
- print(f"πŸ“‚ Checking folder: {folder_path}")
 
 
 
43
  files = [f for f in os.listdir(folder_path) if f.lower().endswith(".docx")]
44
- print(f"Found {len(files)} Word files: {files}")
45
 
46
  if not files:
47
- print("⚠️ No .docx files found. Please check the folder path.")
48
  return
49
 
 
50
  for file in files:
51
  file_path = os.path.join(folder_path, file)
52
  text = extract_text_from_docx(file_path)
53
  chunks = chunk_text(text)
54
 
55
- print(f"πŸ“„ Ingesting {file} with {len(chunks)} chunks")
 
 
 
 
56
 
57
  for i, chunk in enumerate(chunks):
58
  embedding = model.encode(chunk).tolist()
59
  doc_id = f"{file}_{i}"
60
- collection.add(
61
- ids=[doc_id],
62
- embeddings=[embedding],
63
- documents=[chunk],
64
- metadatas=[{"filename": file}]
65
- )
66
 
67
- print(f"βœ… Documents ingested. Total entries: {collection.count()}")
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def search_knowledge_base(query: str, top_k: int = 3):
70
- """Search ChromaDB using semantic similarity."""
71
  query_embedding = model.encode(query).tolist()
72
  results = collection.query(
73
  query_embeddings=[query_embedding],
@@ -77,8 +95,7 @@ def search_knowledge_base(query: str, top_k: int = 3):
77
  return results
78
 
79
  def main():
80
- # One-liner, no hardcoding β€” relative to this file
81
- ingest_documents(str(DOCS_DIR)) if DOCS_DIR.exists() else print(f"❌ Invalid folder path: {DOCS_DIR}")
82
 
83
  if __name__ == "__main__":
84
- main()
 
1
+
2
+ # kb_embed.py
3
  from pathlib import Path
4
  import os
5
  from docx import Document
6
  from sentence_transformers import SentenceTransformer
7
  import chromadb
8
  from chromadb.config import Settings
9
+ import logging
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
 
 
13
  BASE_DIR = Path(__file__).resolve().parent
14
  CHROMA_DIR = BASE_DIR / "chroma_db"
15
+ MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2" # optional local cache
16
  DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"
17
 
 
18
  CHROMA_DIR.mkdir(parents=True, exist_ok=True)
19
 
 
20
  client = chromadb.PersistentClient(
21
  path=str(CHROMA_DIR),
22
  settings=Settings(anonymized_telemetry=False)
23
  )
24
  collection = client.get_or_create_collection(name="knowledge_base")
25
 
26
+ # Use default HF cache (simpler on Spaces). If you must use local folder, keep cache_folder.
27
+ try:
28
+ # Prefer auto-download and cache:
29
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
30
+ # If you want to use local cache dir: uncomment
31
+ # model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
32
+ except Exception as e:
33
+ logging.exception(f"Failed to load embedding model: {e}")
34
+ raise
35
 
36
def extract_text_from_docx(file_path: str) -> str:
    """Return the text of a .docx file, one non-blank paragraph per line."""
    document = Document(file_path)
    lines = [para.text for para in document.paragraphs if para.text.strip()]
    return "\n".join(lines)
39
 
40
def chunk_text(text: str, max_words: int = 300):
    """Split *text* into chunks of at most *max_words* words each.

    Blank chunks are dropped, so empty input yields an empty list.
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), max_words):
        piece = " ".join(words[start:start + max_words])
        if piece.strip():
            pieces.append(piece)
    return pieces
44
 
45
def ingest_documents(folder_path: str):
    """Embed every .docx file under *folder_path* and store the chunks in ChromaDB.

    Each chunk is added with id "<filename>_<chunk_index>"; adds that fail
    (e.g. duplicate ids on re-ingestion) are logged and skipped, and a summary
    of how many entries were stored is logged at the end.
    """
    logging.info(f"📂 Checking folder: {folder_path}")
    if not os.path.isdir(folder_path):
        logging.warning(f"❌ Invalid folder path: {folder_path}")
        return

    docx_files = [name for name in os.listdir(folder_path) if name.lower().endswith(".docx")]
    logging.info(f"Found {len(docx_files)} Word files: {docx_files}")

    if not docx_files:
        logging.warning("⚠️ No .docx files found. Please check the folder path.")
        return

    added = 0
    for filename in docx_files:
        full_path = os.path.join(folder_path, filename)
        chunks = chunk_text(extract_text_from_docx(full_path))

        if not chunks:
            logging.warning(f"⚠️ No text chunks extracted from {filename}")
            continue

        logging.info(f"📄 Ingesting {filename} with {len(chunks)} chunks")

        for index, chunk in enumerate(chunks):
            vector = model.encode(chunk).tolist()
            entry_id = f"{filename}_{index}"

            # Best-effort add: re-ingesting can collide on existing ids,
            # so treat a failed add as a skip rather than aborting.
            try:
                collection.add(
                    ids=[entry_id],
                    embeddings=[vector],
                    documents=[chunk],
                    metadatas=[{"filename": filename, "chunk_index": index}],
                )
                added += 1
            except Exception as e:
                logging.warning(f"Skipping duplicate or failed add for {entry_id}: {e}")

    logging.info(f"✅ Documents ingested. Added entries: {added}. Total entries: {collection.count()}")
87
 
88
  def search_knowledge_base(query: str, top_k: int = 3):
 
89
  query_embedding = model.encode(query).tolist()
90
  results = collection.query(
91
  query_embeddings=[query_embedding],
 
95
  return results
96
 
97
def main():
    """Ingest the bundled SOP documents if the folder exists."""
    # A conditional *expression* used purely for side effects is hard to read;
    # a plain if/else makes the control flow explicit (same behavior).
    if DOCS_DIR.exists():
        ingest_documents(str(DOCS_DIR))
    else:
        logging.error(f"❌ Invalid folder path: {DOCS_DIR}")


if __name__ == "__main__":
    main()