Spaces:
Build error
Build error
| """ | |
| Author: Nikhil Nageshwar Inturi (GitHub: @unikill066) | |
| Date: 2025-06-22 | |
| Generate embeddings for PDF documents | |
| """ | |
| # imports | |
| import os, warnings | |
| from dotenv import load_dotenv | |
| warnings.filterwarnings("ignore") | |
| from langchain_openai import ChatOpenAI | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
| # load environment variables | |
| load_dotenv() | |
| # logging - CHECK [x] | |
| docs = list() | |
| DOCS_DIR = "./docs" | |
| PERSIST_DIR = "./chroma_db" | |
| for _dir in [DOCS_DIR, PERSIST_DIR]: | |
| os.makedirs(_dir, exist_ok=True) | |
| llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5) | |
| embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") | |
| pdf_paths = [os.path.join(DOCS_DIR, url) for url in os.listdir(DOCS_DIR) if url.endswith(".pdf")] # loading the pdf files | |
| # docs = [PyPDFLoader(url).load() for url in pdf_paths] | |
| # docs_list = [item for sublist in docs for item in sublist] | |
| for path in pdf_paths: | |
| loader = PyPDFLoader(path) | |
| docs.extend(loader.load()) # flatten the doc list | |
| text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap=50) | |
| doc_splits = text_splitter.split_documents(docs) | |
| # load and save the embeddings | |
| vectorstore = Chroma.from_documents(documents=doc_splits, collection_name="rag-nik", embedding=embedding_model, persist_directory=PERSIST_DIR) | |
| vectorstore.persist() |