import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# ── Load all PDFs from KB folder ─────────────────────────────────────────────
# Reads the PDFs found under the "KB" directory (resolved against the current
# working directory) into `docs`; the summary line reports the count as pages.
print("Loading PDFs from KB folder...")
# Stop early with a readable message instead of carrying an empty document
# list into the later splitting and indexing steps.
if not os.path.isdir("KB"):
    raise SystemExit("KB folder not found: create a 'KB' directory containing the PDFs to index.")
loader = PyPDFDirectoryLoader("KB")
docs = loader.load()
print(f"Loaded {len(docs)} pages from KB folder.")
if not docs:
    raise SystemExit("No PDF pages were loaded from the KB folder; nothing to index.")
# ── Split into chunks ────────────────────────────────────────────────────────
# Cut the loaded documents into pieces configured as chunk_size=1000 with
# chunk_overlap=200; the result is kept in `all_chunks` for indexing.
print("Splitting into chunks...")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_chunks = splitter.split_documents(docs)
print(f"Created {len(all_chunks)} chunks.")
# ── Load embeddings ──────────────────────────────────────────────────────────
# Instantiate the "BAAI/bge-base-en" embedding model on the CPU and request
# normalized embeddings from the encoder via encode_kwargs.
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)
# ── Build and save FAISS vector store ────────────────────────────────────────
# Embeds every chunk, builds a FAISS index from the result, and writes it to
# the `persist_directory` folder (relative to the current working directory).
print("Building vector store...")
persist_directory = "faiss_index"
# With zero chunks there is nothing to embed or to size the index from, so
# exit with a clear message rather than failing inside the index builder.
# NOTE(review): LangChain's FAISS.from_documents is expected to raise an
# opaque IndexError on an empty list — confirm against the installed version.
if not all_chunks:
    raise SystemExit("No text chunks to index; check that the PDFs contain extractable text.")
vector_store = FAISS.from_documents(all_chunks, embeddings)
vector_store.save_local(persist_directory)
print(f"Done! Database saved to '{persist_directory}'")