Jaita commited on
Commit
79166eb
·
verified ·
1 Parent(s): 431968d

Create embed.py

Browse files
Files changed (1) hide show
  1. embed.py +63 -0
embed.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import chromadb

# Initialize ChromaDB client (persistent storage).
# Data lives on local disk under ./chroma_db relative to the working directory.
client = chromadb.PersistentClient(path="chroma_db")
# get_or_create avoids an error when the collection already exists from a prior run.
collection = client.get_or_create_collection(name="knowledge_base")

# Load MiniLM model for embeddings.
# NOTE(review): loading happens at import time and downloads the checkpoint
# from the Hugging Face hub on first run — requires network access then.
# Local-checkpoint alternative kept for offline use:
#model = SentenceTransformer('./models/all-MiniLM-L6-v2')
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
14
def extract_text_from_docx(file_path):
    """Return the plain text of a .docx file, one paragraph per line."""
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
19
def chunk_text(text, max_words=300):
    """Split *text* into chunks of at most *max_words* whitespace-separated words.

    Returns a list of strings; an empty/whitespace-only input yields [].
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), max_words):
        chunks.append(' '.join(words[start:start + max_words]))
    return chunks
24
def ingest_documents(folder_path):
    """Read .docx files from *folder_path*, chunk their text, generate
    embeddings, and store everything in the ChromaDB collection.

    Safe to re-run on the same folder: entries are keyed by
    "<filename>_<chunk-index>" and upserted, so repeated ingestion
    updates rather than duplicates.
    """
    print(f"📂 Checking folder: {folder_path}")
    # Skip Word lock/temp files ("~$foo.docx") — Word leaves them behind while
    # a document is open and python-docx cannot parse them (would crash here).
    files = [
        f for f in os.listdir(folder_path)
        if f.endswith('.docx') and not f.startswith('~$')
    ]
    print(f"Found {len(files)} Word files: {files}")

    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        text = extract_text_from_docx(file_path)
        chunks = chunk_text(text)

        # An empty document produces no chunks; adding zero-length batches
        # to Chroma is an error, so skip it explicitly.
        if not chunks:
            print(f"⚠️ Skipping {file}: no text extracted")
            continue

        print(f"📄 Ingesting {file} with {len(chunks)} chunks")

        # Batch-encode all chunks in one call — far faster than encoding
        # each chunk individually inside the loop.
        embeddings = model.encode(chunks).tolist()
        ids = [f"{file}_{i}" for i in range(len(chunks))]
        # upsert (not add) makes re-ingestion idempotent: existing IDs are
        # updated instead of raising/duplicating.
        collection.upsert(
            ids=ids,
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")
53
def search_knowledge_base(query, top_k=3):
    """Search ChromaDB for the *top_k* entries most similar to *query*.

    Embeds the query with the module-level MiniLM model and returns
    Chroma's query result dict (keys such as 'ids', 'documents',
    'metadatas', 'distances', each a list-per-query).
    """
    query_embedding = model.encode(query).tolist()
    # Leftover debug print of the full result dict removed — it polluted
    # stdout on every search; callers receive the results anyway.
    return collection.query(query_embeddings=[query_embedding], n_results=top_k)
60
+ # Example usage:
61
+ # ingest_documents("path/to/docs")
62
+ # results = search_knowledge_base("inventory mismatch in warehouse")
63
+ # print(results)