Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| import os | |
# Ingestion script: build a FAISS vector index from the PDF files found
# directly inside DATA_DIR and save it to the local "vectorstore" directory.
#
# Steps: load PDFs -> split into overlapping chunks -> embed -> index -> save.
DATA_DIR = "data/nasa_docs"

documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the loaded documents (and therefore of the chunks fed to the index)
# reproducible from one run to the next.
for filename in sorted(os.listdir(DATA_DIR)):
    # Compare the extension case-insensitively so files such as "REPORT.PDF"
    # are not silently skipped.
    if filename.lower().endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(DATA_DIR, filename))
        documents.extend(loader.load())

splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100,
)
chunks = splitter.split_documents(documents)

# Stop early, before the embedding model is loaded, when there is nothing to
# index. NOTE(review): FAISS.from_documents is believed to fail with an opaque
# IndexError on an empty list -- confirm against the installed version.
if not chunks:
    raise SystemExit(
        f"No text chunks were produced from the PDFs in {DATA_DIR!r}; "
        "nothing to index."
    )

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("vectorstore")

print(f"Ingested {len(chunks)} chunks")