NurseCitizenDeveloper's picture
Deploy Open Nursing Validator (Docker)
6d12932 verified
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
# Load settings from a local .env file (python-dotenv) so that the
# os.getenv() lookups for the Azure OpenAI credentials in ingest() can
# find them.
load_dotenv()
# Folder that ingest() scans for source PDF files.
DATA_FOLDER = "fons_knowledge_base"
# Folder where the Chroma vector store is persisted; ingest() deletes it
# and rebuilds it from scratch on every run.
DB_FOLDER = "chroma_db_fons"
# Passed to AzureOpenAIEmbeddings as the Azure deployment name.
EMBEDDING_MODEL = "text-embedding-ada-002"
def ingest():
    """Rebuild the Chroma vector store from the PDFs in ``DATA_FOLDER``.

    The existing store at ``DB_FOLDER`` is deleted, then every PDF in
    ``DATA_FOLDER`` is loaded, split into overlapping text chunks, embedded
    with the Azure OpenAI deployment named by ``EMBEDDING_MODEL``, and added
    to a fresh store persisted at ``DB_FOLDER``.

    Ingestion is best-effort per file: a PDF that fails to load or embed is
    reported and skipped so that one bad document does not abort the whole
    rebuild. A summary of skipped files is printed at the end.

    Raises:
        FileNotFoundError: If ``DATA_FOLDER`` is not an existing directory.
            This is raised before the old database is deleted.
    """
    print("🧠 Building Knowledge Graph...")

    # Validate the input BEFORE wiping the old database; otherwise a missing
    # data folder would destroy the existing index and then crash.
    if not os.path.isdir(DATA_FOLDER):
        raise FileNotFoundError(
            f"Data folder not found: {DATA_FOLDER!r}"
        )

    # Case-insensitive match so "REPORT.PDF" is not silently ignored;
    # sorted so progress output and insertion order are reproducible.
    files = sorted(
        name for name in os.listdir(DATA_FOLDER)
        if name.lower().endswith(".pdf")
    )

    if os.path.exists(DB_FOLDER):
        shutil.rmtree(DB_FOLDER)

    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=EMBEDDING_MODEL,
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )
    vector_store = Chroma(
        persist_directory=DB_FOLDER,
        embedding_function=embeddings,
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )

    total = len(files)
    print(f" Processing {total} PDFs...")

    indexed = 0
    empty = []    # PDFs that yielded no pages or no text chunks
    failed = []   # (filename, exception) for PDFs that raised

    for i, name in enumerate(files):
        try:
            pages = PyPDFLoader(os.path.join(DATA_FOLDER, name)).load()
            chunks = text_splitter.split_documents(pages) if pages else []
            if chunks:
                vector_store.add_documents(chunks)
                indexed += 1
            else:
                # Nothing to embed (e.g. no extractable text); do not hand
                # an empty batch to the vector store.
                empty.append(name)
        except Exception as exc:
            # Deliberately broad: this is the per-file boundary of a
            # best-effort batch job. Record and report instead of hiding it.
            failed.append((name, exc))
            print(f"\n⚠️ Skipped {name}: {type(exc).__name__}: {exc}")

        if (i + 1) % 10 == 0:
            print(
                f" [{i+1}/{total}] Indexed...",
                end="\r"
            )

    if empty:
        print(f"\nℹ️ {len(empty)} PDF(s) contained no extractable text.")

    if failed:
        # Do not claim success when some documents never made it in.
        print(
            f"\n⚠️ Database rebuilt with errors: "
            f"{indexed}/{total} PDFs indexed, {len(failed)} failed."
        )
    else:
        print("\n🎉 Database Rebuilt Successfully!")
# Script entry point: rebuild the vector store only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    ingest()