"""Build a FAISS vector index from the PDF files in the ``KB`` folder.

Pipeline: load every PDF page, split the pages into overlapping text
chunks, embed the chunks with a HuggingFace model, and save the resulting
FAISS index to the ``faiss_index`` directory.

Both paths are relative to the current working directory. The script
exits with an explanatory message when there is nothing to index.
"""

import os

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

kb_directory = "KB"
persist_directory = "faiss_index"

# ── Load all PDFs from KB folder ─────────────────────────────────────────────
print("Loading PDFs from KB folder...")
# Fail early with a clear message instead of continuing with no input.
if not os.path.isdir(kb_directory):
    raise SystemExit(
        f"KB folder '{kb_directory}' not found in '{os.getcwd()}'. "
        "Create it and add PDF files before building the index."
    )
loader = PyPDFDirectoryLoader(kb_directory)
docs = loader.load()
print(f"Loaded {len(docs)} pages from KB folder.")
if not docs:
    raise SystemExit(
        f"No PDF pages were loaded from '{kb_directory}'; nothing to index."
    )

# ── Split into chunks ────────────────────────────────────────────────────────
print("Splitting into chunks...")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
all_chunks = splitter.split_documents(docs)
print(f"Created {len(all_chunks)} chunks.")
# Pages without extractable text (e.g. scanned images) can yield no chunks;
# stop here rather than hand an empty list to the vector store.
if not all_chunks:
    raise SystemExit(
        "The loaded PDFs produced no text chunks; nothing to index."
    )

# ── Load embeddings ──────────────────────────────────────────────────────────
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# ── Build and save FAISS vector store ────────────────────────────────────────
print("Building vector store...")
vector_store = FAISS.from_documents(all_chunks, embeddings)
vector_store.save_local(persist_directory)
print(f"Done! Database saved to '{persist_directory}'")