agentic-rag-nik / src /bin /embed_generator.py
unikill066's picture
Upload 3 files
ae08b77 verified
"""
Author: Nikhil Nageshwar Inturi (GitHub: @unikill066)
Date: 2025-06-22
Generate embeddings for PDF documents
"""
# imports
import os, warnings
from dotenv import load_dotenv
warnings.filterwarnings("ignore")
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
# Load environment variables from a local .env file before the OpenAI
# clients below are constructed.
load_dotenv()

# Module-level state and configuration.
docs = []                      # accumulates the documents loaded from every PDF
DOCS_DIR = "./docs"            # input directory scanned for PDF files
PERSIST_DIR = "./chroma_db"    # directory handed to Chroma as persist_directory
COLLECTION_NAME = "rag-nik"    # Chroma collection that receives the embeddings

# Make sure both directories exist so the listing and the store do not fail
# on a fresh checkout.
for _dir in [DOCS_DIR, PERSIST_DIR]:
    os.makedirs(_dir, exist_ok=True)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Collect the PDF files found directly inside DOCS_DIR. The listing is sorted
# so documents are ingested in a reproducible order, and the extension check
# is case-insensitive so files named "*.PDF" are picked up as well.
pdf_paths = [
    os.path.join(DOCS_DIR, name)
    for name in sorted(os.listdir(DOCS_DIR))
    if name.lower().endswith(".pdf")
]

# Each loader returns a list of documents; extend() keeps `docs` flat.
for path in pdf_paths:
    loader = PyPDFLoader(path)
    docs.extend(loader.load())

# Split the loaded documents into overlapping, token-sized chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=200, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs)

# Embed the chunks and store them in the persistent collection.
if doc_splits:
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name=COLLECTION_NAME,
        embedding=embedding_model,
        persist_directory=PERSIST_DIR,
    )
else:
    # DOCS_DIR is created above when missing, so a first run can legitimately
    # find nothing to embed. Skip the embedding call in that case and just
    # open the collection so `vectorstore` is still defined.
    print(f"No PDF content found in {DOCS_DIR}; nothing to embed.")
    vectorstore = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embedding_model,
        persist_directory=PERSIST_DIR,
    )

# NOTE(review): newer Chroma releases are reported to persist automatically;
# confirm whether this explicit call is still required for the pinned version.
vectorstore.persist()