# Spaces: Sleeping
"""Build a FAISS vector index from the PDFs in the ``KB`` folder.

Run as a script. It loads every PDF found by ``PyPDFDirectoryLoader`` in
``KB``, splits the loaded pages into overlapping chunks, embeds the chunks
with the ``BAAI/bge-base-en`` model on CPU, and saves the resulting FAISS
index to the ``faiss_index`` directory.

The script exits early with an explanatory message when there is nothing to
index, instead of failing later inside the vector-store construction.
"""

import os

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Tunables, gathered in one place so they are easy to find and change.
KB_DIR = "KB"                          # folder scanned for PDF files
CHUNK_SIZE = 1000                      # passed as chunk_size to the splitter
CHUNK_OVERLAP = 200                    # passed as chunk_overlap to the splitter
EMBEDDING_MODEL = "BAAI/bge-base-en"   # Hugging Face model identifier
persist_directory = "faiss_index"      # where the finished index is written

# --- Load all PDFs from KB folder --------------------------------------------
print(f"Loading PDFs from {KB_DIR} folder...")
if not os.path.isdir(KB_DIR):
    # Fail fast: without the folder there is nothing to load, and the problem
    # would otherwise only show up after the (slow) embedding model load.
    raise SystemExit(f"Error: folder '{KB_DIR}' does not exist.")
loader = PyPDFDirectoryLoader(KB_DIR)
docs = loader.load()
print(f"Loaded {len(docs)} pages from {KB_DIR} folder.")
if not docs:
    raise SystemExit(f"Error: no PDF pages were loaded from '{KB_DIR}'.")

# --- Split into chunks --------------------------------------------------------
print("Splitting into chunks...")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
all_chunks = splitter.split_documents(docs)
print(f"Created {len(all_chunks)} chunks.")
if not all_chunks:
    # Pages can load yet contain no extractable text (e.g. scanned images);
    # an empty chunk list cannot be turned into a vector index.
    raise SystemExit(
        "Error: the loaded pages produced no text chunks; nothing to index."
    )

# --- Load embeddings ----------------------------------------------------------
print("Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={"device": "cpu"},
    # Requests normalized embedding vectors from the encoder.
    encode_kwargs={"normalize_embeddings": True},
)

# --- Build and save FAISS vector store ----------------------------------------
print("Building vector store...")
vector_store = FAISS.from_documents(all_chunks, embeddings)
vector_store.save_local(persist_directory)
print(f"Done! Database saved to '{persist_directory}'")