# NOTE: removed non-source residue scraped from the hosting page
# (Spaces status lines, file size, git blame hashes, line-number gutter).
# kb_embed.py
"""Embed .docx documents into a persistent ChromaDB collection.

Sets up a persistent Chroma client and a SentenceTransformer embedding
model at import time; ingestion and search helpers are defined below.
"""
from pathlib import Path
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import logging
logging.basicConfig(level=logging.INFO)
# All paths are resolved relative to this file so the script works
# regardless of the current working directory (important on Spaces).
BASE_DIR = Path(__file__).resolve().parent
CHROMA_DIR = BASE_DIR / "chroma_db"       # persistent vector store location
MODEL_DIR = BASE_DIR / "all-MiniLM-L6-v2" # optional local cache
DOCS_DIR = BASE_DIR / "GenericSOPsForTesting"  # source .docx folder
CHROMA_DIR.mkdir(parents=True, exist_ok=True)
# Persistent (on-disk) Chroma client; telemetry disabled.
client = chromadb.PersistentClient(
path=str(CHROMA_DIR),
settings=Settings(anonymized_telemetry=False)
)
# get_or_create: safe to run repeatedly across restarts.
collection = client.get_or_create_collection(name="knowledge_base")
# Use default HF cache (simpler on Spaces). If you must use local folder, keep cache_folder.
try:
# Prefer auto-download and cache:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# If you want to use local cache dir: uncomment
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=str(MODEL_DIR))
except Exception as e:
# A missing model makes the whole module useless — log with traceback and fail fast.
logging.exception(f"Failed to load embedding model: {e}")
raise
def extract_text_from_docx(file_path: str) -> str:
    """Return the non-blank paragraph text of a .docx file, newline-joined."""
    paragraphs = Document(file_path).paragraphs
    lines = [p.text for p in paragraphs if p.text.strip()]
    return "\n".join(lines)
def chunk_text(text: str, max_words: int = 300):
    """Split *text* into consecutive chunks of at most *max_words* words.

    Returns a list of non-empty strings; an empty/whitespace-only input
    yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        piece = " ".join(tokens[start:start + max_words])
        if piece.strip():
            pieces.append(piece)
    return pieces
def ingest_documents(folder_path: str):
    """Read each .docx in *folder_path*, chunk it, embed it, and add it to Chroma.

    Logs progress; returns early (with a warning) when the folder is
    missing or contains no Word documents.
    """
    logging.info(f"📂 Checking folder: {folder_path}")
    if not os.path.isdir(folder_path):
        logging.warning(f"❌ Invalid folder path: {folder_path}")
        return
    files = [entry for entry in os.listdir(folder_path) if entry.lower().endswith(".docx")]
    logging.info(f"Found {len(files)} Word files: {files}")
    if not files:
        logging.warning("⚠️ No .docx files found. Please check the folder path.")
        return
    added = 0
    for file in files:
        chunks = chunk_text(extract_text_from_docx(os.path.join(folder_path, file)))
        if not chunks:
            logging.warning(f"⚠️ No text chunks extracted from {file}")
            continue
        logging.info(f"📄 Ingesting {file} with {len(chunks)} chunks")
        for i, piece in enumerate(chunks):
            doc_id = f"{file}_{i}"
            vector = model.encode(piece).tolist()
            try:
                # Chroma rejects duplicate ids on re-ingestion; skip those.
                collection.add(
                    ids=[doc_id],
                    embeddings=[vector],
                    documents=[piece],
                    metadatas=[{"filename": file, "chunk_index": i}],
                )
            except Exception as e:
                logging.warning(f"Skipping duplicate or failed add for {doc_id}: {e}")
            else:
                added += 1
    logging.info(f"✅ Documents ingested. Added entries: {added}. Total entries: {collection.count()}")
def search_knowledge_base(query: str, top_k: int = 3):
    """Return the *top_k* chunks nearest to *query* from the Chroma collection.

    The result is Chroma's query payload including documents, metadatas,
    and distances.
    """
    vector = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
def main():
    """Ingest the bundled documents folder if it exists; log an error otherwise."""
    # Fixed: the original abused a conditional *expression* purely for its
    # side effects; a plain if/else is the idiomatic statement form.
    if DOCS_DIR.exists():
        ingest_documents(str(DOCS_DIR))
    else:
        logging.error(f"❌ Invalid folder path: {DOCS_DIR}")


if __name__ == "__main__":
    main()