import os
import shutil
from typing import List

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# === ✅ THE FIX: Missing path variables are re-added here ===
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(SCRIPTS_DIR)  # Go one level up from /scripts to /ai-service
KNOWLEDGE_BASE_DIR = os.path.join(ROOT_DIR, "knowledge_base")
EMBEDDING_MODEL_PATH = os.path.join(ROOT_DIR, "embedding_model")
# The DB lives under a writable location (container-friendly); defaults to /tmp.
DB_PATH = os.path.join(os.environ.get("WRITABLE_DIR", "/tmp"), "vector_db_persistent")


def load_documents_with_metadata(directory: str, role: str) -> List[Document]:
    """
    Load all Markdown documents from *directory* and tag each with a 'role'.

    Args:
        directory: Path to a role-specific knowledge-base subfolder.
        role: Value stored under the 'role' metadata key on every document.

    Returns:
        The loaded documents (possibly empty if the directory is missing).
    """
    # Guard against a missing subfolder so one absent role doesn't crash the build.
    if not os.path.isdir(directory):
        print(f"⚠️ Warning: Directory '{directory}' not found. Skipping role '{role}'.")
        return []

    loader = DirectoryLoader(directory, glob="**/*.md", show_progress=True, loader_cls=TextLoader)
    documents = loader.load()

    for doc in documents:
        # Ensure metadata exists before tagging — some loaders may leave it None.
        if doc.metadata is None:
            doc.metadata = {}
        doc.metadata["role"] = role

    print(f"  > Loaded {len(documents)} documents for role '{role}'")
    return documents


def load_knowledge_base() -> None:
    """
    Build the persistent ChromaDB vector store from the role-based knowledge base.

    Loads documents from the 'brand', 'influencer', and 'common' subfolders,
    tags each with role metadata, splits them into chunks, embeds them with the
    local HuggingFace model, and persists the result at DB_PATH. Any existing
    database at DB_PATH is deleted first so the store is rebuilt from scratch.
    """
    # Rebuild from scratch: a stale DB would mix old and new chunks.
    if os.path.exists(DB_PATH):
        print(f"🧹 Found existing DB at '{DB_PATH}'. Deleting it to rebuild.")
        shutil.rmtree(DB_PATH)

    print("📚 Loading documents with role metadata...")
    brand_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "brand"), "brand")
    influencer_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "influencer"), "influencer")
    common_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "common"), "common")

    all_documents = brand_docs + influencer_docs + common_docs
    if not all_documents:
        print("⚠️ No documents were found in any sub-folder. Aborting.")
        return

    print(f"📄 Found {len(all_documents)} documents in total.")

    print("🔪 Splitting documents into smaller chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(all_documents)
    print(f"Split documents into {len(chunks)} chunks.")

    print(f"🧠 Loading embedding model from: {EMBEDDING_MODEL_PATH}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)

    print(f"💾 Creating and persisting vector store at: {DB_PATH}")
    vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_PATH)

    print("\n" + "=" * 50)
    print("✅ Knowledge base loaded with role metadata successfully!")
    print(f"Database is stored at: {DB_PATH}")
    print("=" * 50 + "\n")


if __name__ == "__main__":
    load_knowledge_base()