# Build script for the role-aware knowledge-base vector store.
import os
import shutil
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List
# === Path configuration ===
# FIX: this header comment was previously split across two lines by a
# garbled (mojibake) character, leaving a bare statement that was a
# SyntaxError. Re-joined into a single valid comment.
# All paths are anchored on the project root, one level above /scripts.
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(SCRIPTS_DIR)  # Go one level up from /scripts to /ai-service
KNOWLEDGE_BASE_DIR = os.path.join(ROOT_DIR, "knowledge_base")
EMBEDDING_MODEL_PATH = os.path.join(ROOT_DIR, "embedding_model")
# The vector DB goes under a writable directory (container-friendly);
# WRITABLE_DIR overrides the default /tmp location.
DB_PATH = os.path.join(os.environ.get("WRITABLE_DIR", "/tmp"), "vector_db_persistent")
def load_documents_with_metadata(directory: str, role: str) -> List[Document]:
    """Load every Markdown file under *directory* and tag it with *role*.

    Each returned Document carries ``metadata["role"] = role`` so the
    vector store can later be filtered by audience. A missing directory
    is not an error: a warning is printed and an empty list is returned.
    """
    # Guard clause — optional role folders may simply not exist.
    if not os.path.isdir(directory):
        print(f"β οΈ Warning: Directory '{directory}' not found. Skipping role '{role}'.")
        return []

    md_loader = DirectoryLoader(directory, glob="**/*.md", show_progress=True, loader_cls=TextLoader)
    docs = md_loader.load()
    for document in docs:
        # Defensive: guarantee a metadata dict exists before tagging it.
        if document.metadata is None:
            document.metadata = {}
        document.metadata["role"] = role
    print(f" > Loaded {len(docs)} documents for role '{role}'")
    return docs
def load_knowledge_base():
    """Build the persistent Chroma vector store from the knowledge base.

    Loads Markdown documents from the role-specific sub-folders
    (``brand``, ``influencer``, ``common``), tags each document with
    ``role`` metadata, splits them into overlapping chunks, embeds them
    with the local HuggingFace model at EMBEDDING_MODEL_PATH, and
    persists the resulting store at DB_PATH. Any existing store is
    deleted first so the DB is always rebuilt from scratch.

    Returns early (no store created) when no documents are found.
    """
    # Rebuild from scratch: a stale DB would mix old and new chunks.
    if os.path.exists(DB_PATH):
        print(f"π§Ή Found existing DB at '{DB_PATH}'. Deleting it to rebuild.")
        shutil.rmtree(DB_PATH)

    print("π Loading documents with role metadata...")
    brand_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "brand"), "brand")
    influencer_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "influencer"), "influencer")
    common_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "common"), "common")
    all_documents = brand_docs + influencer_docs + common_docs

    # Nothing to index -> abort rather than persist an empty store.
    if not all_documents:
        print("β οΈ No documents were found in any sub-folder. Aborting.")
        return
    print(f"π Found {len(all_documents)} documents in total.")

    print("πͺ Splitting documents into smaller chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(all_documents)
    print(f"Split documents into {len(chunks)} chunks.")

    print(f"π§ Loading embedding model from: {EMBEDDING_MODEL_PATH}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)

    print(f"πΎ Creating and persisting vector store at: {DB_PATH}")
    # from_documents both embeds and writes to disk when persist_directory
    # is given; the returned store object is not needed afterwards, so the
    # previously-unused local assignment was dropped.
    Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_PATH)

    print("\n" + "=" * 50)
    # FIX: this success message was previously a string literal split
    # across two physical lines by a garbled character — a SyntaxError.
    print("✅ Knowledge base loaded with role metadata successfully!")
    print(f"Database is stored at: {DB_PATH}")
    print("=" * 50 + "\n")
if __name__ == "__main__":
    # Allow running this module directly as a one-shot indexing script.
    # FIX: removed a stray trailing " |" artifact that made this line a
    # SyntaxError.
    load_knowledge_base()