# NOTE: removed non-source residue scraped from the hosting page
# (Spaces status lines, file size, git blame hashes, line-number gutter).
# kb_embed.py
"""Embed .docx documents into a persistent ChromaDB collection.

Sets up a persistent Chroma client and a SentenceTransformer embedding
model at import time; ingestion and search helpers are defined below.
"""
from pathlib import Path
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import logging
logging.basicConfig(level=logging.INFO)
# All paths are resolved relative to this file so the script works
# regardless of the current working directory (important on Spaces).
BASE_DIR = Path(__file__).resolve().parent
CHROMA_DIR = BASE_DIR / "chroma_db"       # persistent vector store location
MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2" # optional local cache
DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"  # source .docx folder
CHROMA_DIR.mkdir(parents=True, exist_ok=True)
# Persistent (on-disk) Chroma client; telemetry disabled.
client = chromadb.PersistentClient(
path=str(CHROMA_DIR),
settings=Settings(anonymized_telemetry=False)
)
# get_or_create: safe to run repeatedly across restarts.
collection = client.get_or_create_collection(name="knowledge_base")
# Use default HF cache (simpler on Spaces). If you must use local folder, keep cache_folder.
try:
# Prefer auto-download and cache:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# If you want to use local cache dir: uncomment
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
except Exception as e:
# A missing model makes the whole module useless — log with traceback and fail fast.
logging.exception(f"Failed to load embedding model: {e}")
raise
def extract_text_from_docx(file_path: str) -> str:
    """Return the non-blank paragraph text of a .docx file, newline-joined."""
    paragraphs = Document(file_path).paragraphs
    lines = [p.text for p in paragraphs if p.text.strip()]
    return "\n".join(lines)
def chunk_text(text: str, max_words: int = 300):
    """Split *text* into consecutive chunks of at most *max_words* words.

    Returns a list of non-empty strings; an empty/whitespace-only input
    yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        piece = " ".join(tokens[start:start + max_words])
        if piece.strip():
            pieces.append(piece)
    return pieces
def ingest_documents(folder_path: str):
    """Read each .docx in *folder_path*, chunk it, embed it, and add it to Chroma.

    Logs progress; returns early (with a warning) when the folder is
    missing or contains no Word documents.
    """
    logging.info(f"📂 Checking folder: {folder_path}")
    if not os.path.isdir(folder_path):
        logging.warning(f"❌ Invalid folder path: {folder_path}")
        return
    files = [entry for entry in os.listdir(folder_path) if entry.lower().endswith(".docx")]
    logging.info(f"Found {len(files)} Word files: {files}")
    if not files:
        logging.warning("⚠️ No .docx files found. Please check the folder path.")
        return
    added = 0
    for file in files:
        chunks = chunk_text(extract_text_from_docx(os.path.join(folder_path, file)))
        if not chunks:
            logging.warning(f"⚠️ No text chunks extracted from {file}")
            continue
        logging.info(f"📄 Ingesting {file} with {len(chunks)} chunks")
        for i, piece in enumerate(chunks):
            doc_id = f"{file}_{i}"
            vector = model.encode(piece).tolist()
            try:
                # Chroma rejects duplicate ids on re-ingestion; skip those.
                collection.add(
                    ids=[doc_id],
                    embeddings=[vector],
                    documents=[piece],
                    metadatas=[{"filename": file, "chunk_index": i}],
                )
            except Exception as e:
                logging.warning(f"Skipping duplicate or failed add for {doc_id}: {e}")
            else:
                added += 1
    logging.info(f"✅ Documents ingested. Added entries: {added}. Total entries: {collection.count()}")
def search_knowledge_base(query: str, top_k: int = 3):
    """Return the *top_k* chunks nearest to *query* from the Chroma collection.

    The result is Chroma's query payload including documents, metadatas,
    and distances.
    """
    vector = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
def main():
    """Ingest the bundled documents folder if it exists; log an error otherwise."""
    # Fixed: the original abused a conditional *expression* purely for its
    # side effects; a plain if/else is the idiomatic statement form.
    if DOCS_DIR.exists():
        ingest_documents(str(DOCS_DIR))
    else:
        logging.error(f"❌ Invalid folder path: {DOCS_DIR}")


if __name__ == "__main__":
    main()