import os import shutil from langchain_community.document_loaders import PyPDFLoader from langchain_openai import AzureOpenAIEmbeddings from langchain_community.vectorstores import Chroma from langchain_text_splitters import RecursiveCharacterTextSplitter from dotenv import load_dotenv load_dotenv() DATA_FOLDER = "fons_knowledge_base" DB_FOLDER = "chroma_db_fons" EMBEDDING_MODEL = "text-embedding-ada-002" def ingest(): print("🧠 Building Knowledge Graph...") if os.path.exists(DB_FOLDER): shutil.rmtree(DB_FOLDER) embeddings = AzureOpenAIEmbeddings( azure_deployment=EMBEDDING_MODEL, openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"), azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), api_key=os.getenv("AZURE_OPENAI_API_KEY"), ) vector_store = Chroma( persist_directory=DB_FOLDER, embedding_function=embeddings ) text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200 ) files = [ f for f in os.listdir(DATA_FOLDER) if f.endswith('.pdf') ] print(f" Processing {len(files)} PDFs...") for i, f in enumerate(files): try: loader = PyPDFLoader(os.path.join(DATA_FOLDER, f)) pages = loader.load() if pages: chunks = text_splitter.split_documents(pages) vector_store.add_documents(chunks) if (i + 1) % 10 == 0: print( f" [{i+1}/{len(files)}] Indexed...", end="\r" ) except Exception: pass print("\nšŸŽ‰ Database Rebuilt Successfully!") if __name__ == "__main__": ingest()