Spaces:
Sleeping
Sleeping
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.document_loaders import PyPDFLoader | |
| import glob | |
| from dotenv import load_dotenv | |
# Build a FAISS vector index from the rate-list PDF and save it to disk.
#
# Pipeline: load .env -> find input PDFs -> load them -> split into
# overlapping chunks -> embed with OpenAI -> store in FAISS -> save locally.

# Load environment variables from .env file. OpenAIEmbeddings() below is
# constructed without arguments, so whatever configuration it needs has to
# come from the environment (presumably OPENAI_API_KEY -- confirm).
load_dotenv()

# 1. Load all files
# glob order depends on the file system; sort so repeated runs feed the
# documents to the index in the same order if the pattern is ever widened.
filepaths = sorted(glob.glob("ratelist_offers.pdf"))  # Adjust pattern if needed
if not filepaths:
    # Fail early with a clear message instead of letting an empty document
    # list reach the FAISS build step at the end of the script.
    raise FileNotFoundError(
        "No input files matched the pattern 'ratelist_offers.pdf'"
    )

all_documents = []
for path in filepaths:
    loader = PyPDFLoader(path)
    docs = loader.load()
    all_documents.extend(docs)

# 2. Chunk all documents
# chunk_overlap makes consecutive chunks share text, so content that straddles
# a chunk boundary is still present in full in at least one chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(all_documents)
if not chunks:
    # E.g. a PDF with no extractable text. Stop before creating the embedding
    # client, since there is nothing to embed or index.
    raise ValueError(
        "No text chunks were produced from the input files; "
        "nothing to embed or index"
    )

# 3. Create embeddings
embeddings = OpenAIEmbeddings()

# 4. Store vectors in FAISS
faiss_index = FAISS.from_documents(chunks, embeddings)
# Written to a path relative to the current working directory.
faiss_index.save_local("faiss_index_store")