File size: 3,492 Bytes
740f68a
 
1fe13b4
 
 
 
 
 
740f68a
 
 
1fe13b4
 
 
740f68a
1fe13b4
 
ace2d6c
 
1fe13b4
 
 
 
 
 
740f68a
 
 
 
 
 
 
 
 
1fe13b4
 
 
740f68a
1fe13b4
 
 
740f68a
 
1fe13b4
 
740f68a
 
 
 
 
1fe13b4
740f68a
1fe13b4
 
740f68a
1fe13b4
 
740f68a
1fe13b4
 
 
 
 
740f68a
 
 
 
 
1fe13b4
 
 
 
 
740f68a
 
 
 
 
 
 
 
 
 
 
 
 
1fe13b4
 
 
 
 
 
 
 
 
 
 
740f68a
1fe13b4
 
740f68a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

# kb_embed.py
"""Embed .docx SOP documents into a persistent Chroma vector store.

Reads Word documents from a local folder, splits their text into
word-count chunks, embeds each chunk with a SentenceTransformer model,
and stores the embeddings in an on-disk ChromaDB collection so they can
be searched semantically.
"""
from pathlib import Path
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import logging

logging.basicConfig(level=logging.INFO)

# All paths are resolved relative to this file so the script behaves the
# same regardless of the current working directory.
BASE_DIR = Path(__file__).resolve().parent
CHROMA_DIR = BASE_DIR / "chroma_db"  # on-disk Chroma store
MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2"  # optional local cache
DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"  # input folder of .docx files

CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# Persistent (on-disk) Chroma client with anonymized telemetry disabled.
client = chromadb.PersistentClient(
    path=str(CHROMA_DIR),
    settings=Settings(anonymized_telemetry=False)
)
collection = client.get_or_create_collection(name="knowledge_base")

# Use default HF cache (simpler on Spaces). If you must use local folder, keep cache_folder.
try:
    # Prefer auto-download and cache:
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # If you want to use local cache dir: uncomment
    # model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
except Exception as e:
    # Model load failure is fatal: log the full traceback, then re-raise
    # so the process does not continue with an unusable embedder.
    logging.exception(f"Failed to load embedding model: {e}")
    raise

def extract_text_from_docx(file_path: str) -> str:
    """Return the non-blank paragraphs of a .docx file joined by newlines."""
    paragraphs = Document(file_path).paragraphs
    non_empty = (para.text for para in paragraphs if para.text.strip())
    return "\n".join(non_empty)

def chunk_text(text: str, max_words: int = 300):
    """Split *text* into whitespace-delimited chunks of at most *max_words* words.

    Returns a list of non-empty chunk strings; an empty or all-whitespace
    input yields an empty list.
    """
    words = text.split()
    pieces = []
    for start in range(0, len(words), max_words):
        piece = " ".join(words[start:start + max_words])
        if piece.strip():
            pieces.append(piece)
    return pieces

def ingest_documents(folder_path: str):
    """Embed every .docx file in *folder_path* into the Chroma collection.

    Each file's text is chunked, embedded, and stored under a deterministic
    id of "<filename>_<chunk_index>" with filename/chunk-index metadata.
    A failed (or duplicate-id) add is logged and skipped so one bad chunk
    does not abort the whole run. Invalid or empty folders log a warning
    and return without raising.
    """
    logging.info(f"📂 Checking folder: {folder_path}")
    if not os.path.isdir(folder_path):
        logging.warning(f"❌ Invalid folder path: {folder_path}")
        return

    files = [f for f in os.listdir(folder_path) if f.lower().endswith(".docx")]
    logging.info(f"Found {len(files)} Word files: {files}")

    if not files:
        logging.warning("⚠️ No .docx files found. Please check the folder path.")
        return

    added = 0
    for file in files:
        file_path = os.path.join(folder_path, file)
        text = extract_text_from_docx(file_path)
        chunks = chunk_text(text)

        if not chunks:
            logging.warning(f"⚠️ No text chunks extracted from {file}")
            continue

        logging.info(f"📄 Ingesting {file} with {len(chunks)} chunks")

        # Encode the whole file in one batched call — much faster than one
        # model.encode() call per chunk, and the embeddings are identical.
        embeddings = model.encode(chunks).tolist()

        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            doc_id = f"{file}_{i}"

            # Avoid duplicate ids (if re-ingesting)
            try:
                collection.add(
                    ids=[doc_id],
                    embeddings=[embedding],
                    documents=[chunk],
                    metadatas=[{"filename": file, "chunk_index": i}]
                )
                added += 1
            except Exception as e:
                logging.warning(f"Skipping duplicate or failed add for {doc_id}: {e}")

    logging.info(f"✅ Documents ingested. Added entries: {added}. Total entries: {collection.count()}")

def search_knowledge_base(query: str, top_k: int = 3):
    """Embed *query* and return the *top_k* nearest chunks from the collection."""
    embedded_query = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[embedded_query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

def main():
    """Entry point: ingest the bundled SOP folder if it exists."""
    # A plain if/else replaces the original side-effecting conditional
    # expression, which abused `x if cond else y` purely for control flow.
    if DOCS_DIR.exists():
        ingest_documents(str(DOCS_DIR))
    else:
        logging.error(f"❌ Invalid folder path: {DOCS_DIR}")

if __name__ == "__main__":
    main()