# setuproject / update_kb.py (initial commit 1f14da1 by Thisaraa13, 1.79 kB)
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Rebuilds the FAISS vector store from the PDFs in the KB folder:
# load pages -> split into overlapping chunks -> embed -> index -> save to disk.
# Each stage exits early with a readable message when it has nothing to work
# on, instead of failing later inside the index build.

# ── Load all PDFs from KB folder ─────────────────────────────────────────────
kb_directory = "KB"
if not os.path.isdir(kb_directory):
    # Without this check a missing folder is only noticed much later, once
    # there turns out to be nothing to index.
    raise SystemExit(f"KB folder '{kb_directory}' not found; nothing to index.")

print("Loading PDFs from KB folder...")
loader = PyPDFDirectoryLoader(kb_directory)
docs = loader.load()
print(f"Loaded {len(docs)} pages from KB folder.")
if not docs:
    raise SystemExit(f"No PDF pages found in '{kb_directory}'; index not rebuilt.")

# ── Split into chunks ─────────────────────────────────────────────────────────
print("Splitting into chunks...")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,  # overlap so text cut at a chunk boundary appears in both chunks
)
all_chunks = splitter.split_documents(docs)
print(f"Created {len(all_chunks)} chunks.")
if not all_chunks:
    # Pages were loaded but produced no text to split (presumably scanned or
    # image-only PDFs); an empty chunk list cannot be indexed.
    raise SystemExit("No text chunks were produced from the PDFs; index not rebuilt.")

# ── Load embeddings ───────────────────────────────────────────────────────────
# Done only after the input checks above so an empty KB does not pay the cost
# of loading the embedding model.
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# ── Build and save FAISS vector store ────────────────────────────────────────
print("Building vector store...")
persist_directory = "faiss_index"
vector_store = FAISS.from_documents(all_chunks, embeddings)
vector_store.save_local(persist_directory)
print(f"Done! Database saved to '{persist_directory}'")