File size: 2,508 Bytes
79166eb
 
 
 
 
e003aa9
8d4698b
e003aa9
 
ce7038c
 
e003aa9
79166eb
e003aa9
79166eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb


# Directory where ChromaDB persists its data. It can be overridden through the
# CHROMA_PATH environment variable and defaults to "/data/chroma_db".
CHROMA_PATH = os.getenv("CHROMA_PATH", "/data/chroma_db")
# Create the storage directory up front; exist_ok avoids an error on re-runs.
os.makedirs(CHROMA_PATH, exist_ok=True)
# Startup diagnostics printed at import time.
# NOTE(review): the two checks below probe the literal "/data" directory, not
# CHROMA_PATH, so they may be misleading when CHROMA_PATH points elsewhere —
# confirm this is intended.
print("Chroma path:", CHROMA_PATH)
print("Writable:",os.access("/data",os.W_OK))
print("Exists:",os.path.exists("/data"))

# Open the persistent ChromaDB client rooted at CHROMA_PATH and fetch (or
# create on first use) the "knowledge_base" collection used by the ingestion
# and search functions in this module.
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name="knowledge_base")

# Load the MiniLM sentence-embedding model once at import time so every
# function in this module shares the same instance.
# Alternative kept for reference (loads from a local directory instead):
#model = SentenceTransformer('./models/all-MiniLM-L6-v2')
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one line per paragraph.

    The file is opened with ``Document`` and the ``text`` of every entry in
    its ``paragraphs`` sequence is joined with newline characters.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def chunk_text(text, max_words=300):
    """Split text into consecutive chunks of at most ``max_words`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``, so
    runs of spaces and newlines collapse to single spaces inside each chunk.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. The last chunk may hold
        fewer than ``max_words`` words. Empty or whitespace-only text yields
        an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # Fail fast: a negative step would make range() yield nothing and silently
    # drop all of the text, and a zero step raises an unhelpful range() error.
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

def ingest_documents(folder_path):
    """Read .docx files, chunk text, generate embeddings, and store in ChromaDB.

    Every ``.docx`` file directly inside ``folder_path`` (extension matched
    case-insensitively, Word ``~$`` lock files skipped) is converted to text,
    split into chunks, embedded, and written to the module-level
    ``collection``. Each chunk is stored under the id ``"<filename>_<index>"``
    with ``{"filename": <filename>}`` as metadata.

    Chunks are written with ``upsert`` so that ingesting the same file again
    refreshes the stored text and embeddings for existing ids. Chunks left
    over from a previously longer version of a file are not removed.

    Args:
        folder_path: Directory to scan for ``.docx`` files (not recursive).

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot
            be listed (for example, when it does not exist).
    """
    print(f"📂 Checking folder: {folder_path}")
    # Word creates "~$<name>.docx" lock files while a document is open; they
    # are not real documents, so skip them. Sorting keeps the processing
    # order deterministic across platforms.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith('.docx') and not f.startswith('~$')
    )
    print(f"Found {len(files)} Word files: {files}")

    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        text = extract_text_from_docx(file_path)
        chunks = chunk_text(text)

        print(f"📄 Ingesting {file} with {len(chunks)} chunks")

        # A document with no text produces no chunks; there is nothing to
        # embed or store for it.
        if not chunks:
            continue

        # Encode all chunks of the file in one batch and write them with a
        # single call instead of one model/database round trip per chunk.
        embeddings = model.encode(chunks).tolist()
        collection.upsert(
            ids=[f"{file}_{i}" for i in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")

def search_knowledge_base(query, top_k=3):
    """Query the ChromaDB collection for entries similar to ``query``.

    The query is embedded with the module-level ``model`` and passed to
    ``collection.query`` with ``top_k`` as the requested number of results.
    The raw result object is printed for debugging and then returned
    unchanged.
    """
    encoded_query = model.encode(query)
    matches = collection.query(
        n_results=top_k,
        query_embeddings=[encoded_query.tolist()],
    )
    print("results", matches)
    return matches

# Example usage:
# ingest_documents("path/to/docs")
# results = search_knowledge_base("inventory mismatch in warehouse")
# print(results)