Spaces:
Sleeping
Sleeping
| import os | |
| from docx import Document | |
| from sentence_transformers import SentenceTransformer | |
| import chromadb | |
# Directory where ChromaDB persists its data. An unset or empty CHROMA_PATH
# environment variable falls back to the default location.
CHROMA_PATH = os.getenv("CHROMA_PATH") or "/data/chroma_db"
os.makedirs(CHROMA_PATH, exist_ok=True)

# Startup diagnostics for the directory that contains CHROMA_PATH ("/data"
# with the default setting). It is derived from CHROMA_PATH so the report
# stays meaningful when the storage location is overridden.
_storage_root = os.path.dirname(os.path.abspath(CHROMA_PATH))
print("Chroma path:", CHROMA_PATH)
print("Writable:", os.access(_storage_root, os.W_OK))
print("Exists:", os.path.exists(_storage_root))

# Initialize ChromaDB client (persistent storage)
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name="knowledge_base")

# Load MiniLM model for embeddings. To load a local copy instead, pass its
# path, e.g. SentenceTransformer('./models/all-MiniLM-L6-v2').
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def extract_text_from_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Opens ``file_path`` with ``Document`` and joins the ``text`` of each
    entry in ``doc.paragraphs`` with newline characters.

    NOTE(review): only ``paragraphs`` is read; text held in tables, headers
    or footers is presumably not included -- confirm against python-docx.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def chunk_text(text, max_words=300):
    """Split text into smaller chunks for better embedding quality.

    The text is split on whitespace and regrouped into consecutive runs of
    at most ``max_words`` words, each joined with single spaces, so the
    original whitespace and line breaks are not preserved.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # Without this check a negative size silently produces no chunks at all
    # (dropping the whole text) and zero fails with an unrelated range() error.
    if max_words < 1:
        raise ValueError(f"max_words must be at least 1, got {max_words}")
    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]
def ingest_documents(folder_path):
    """Read .docx files, chunk text, generate embeddings, and store in ChromaDB.

    Every chunk is stored under the id ``"<filename>_<chunk index>"`` with
    the source filename as its metadata.

    Args:
        folder_path: Directory scanned (non-recursively) for Word documents.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` does not
            exist or is not a directory.
    """
    print(f"📂 Checking folder: {folder_path}")
    # Sorted for a deterministic ingestion order. The extension match is
    # case-insensitive, and "~$name.docx" entries are skipped: those are the
    # lock files Word leaves next to an open document, not real documents.
    files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(".docx") and not name.startswith("~$")
    )
    print(f"Found {len(files)} Word files: {files}")
    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        chunks = chunk_text(extract_text_from_docx(file_path))
        print(f"📄 Ingesting {file} with {len(chunks)} chunks")
        if not chunks:
            # Document without text: skip it instead of sending an empty batch.
            continue
        # One encode call and one write per file instead of one per chunk.
        embeddings = model.encode(chunks).tolist()
        # upsert instead of add, so re-running ingestion refreshes chunks that
        # already exist under the same id.
        # NOTE: if a file now yields fewer chunks than on an earlier run, the
        # surplus chunks from that run are not removed here.
        collection.upsert(
            ids=[f"{file}_{index}" for index in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )
    print(f"✅ Documents ingested. Total entries: {collection.count()}")
def search_knowledge_base(query, top_k=3):
    """Query the ChromaDB collection for entries similar to ``query``.

    The query is embedded with the module-level model and passed to
    ``collection.query`` with ``n_results=top_k``. The raw result object is
    printed to stdout and then returned unchanged.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the collection.

    Returns:
        Whatever ``collection.query`` returns for the embedded query.
    """
    query_vector = model.encode(query).tolist()
    results = collection.query(
        n_results=top_k,
        query_embeddings=[query_vector],
    )
    print("results", results)
    return results
| # Example usage: | |
| # ingest_documents("path/to/docs") | |
| # results = search_knowledge_base("inventory mismatch in warehouse") | |
| # print(results) |