Doc_rag / ingest.py
Abdul-Haseeb's picture
Upload 2 files
08a44a0 verified
raw
history blame contribute delete
859 Bytes
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os
# Ingestion script: load every PDF in DATA_DIR, split the text into
# overlapping chunks, embed the chunks, and persist a FAISS index to the
# local "vectorstore" directory.

# Folder scanned (non-recursively) for source PDFs.
DATA_DIR = "data/nasa_docs"

documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# documents (and therefore of the vectors in the index) stable between runs.
for filename in sorted(os.listdir(DATA_DIR)):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if filename.lower().endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(DATA_DIR, filename))
        documents.extend(loader.load())

# Stop early with a clear message instead of letting the vector store fail
# later on an empty input.
if not documents:
    raise SystemExit(f"No PDF documents found in {DATA_DIR!r}; nothing to ingest")

# Overlapping chunks so text that straddles a chunk boundary is still present
# in full in at least one chunk.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100
)
chunks = splitter.split_documents(documents)

# PDFs without extractable text (e.g. scanned images) can yield zero chunks;
# an index cannot be built from nothing.
if not chunks:
    raise SystemExit(f"No text could be extracted from the PDFs in {DATA_DIR!r}")

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Embed every chunk and write the index to ./vectorstore for later loading.
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("vectorstore")
print(f"Ingested {len(chunks)} chunks")