# Medical-Chatbot / utils/create_faiss_from_dataset.py
# Author: deepak-cse-jha
# Build FAISS at runtime from HF dataset (commit 617291c)
import os
from huggingface_hub import hf_hub_download
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Hugging Face dataset repo that hosts the source PDF.
DATASET_REPO = "deepak-cse-jha/medibot-data"
# Filename of the PDF inside the dataset repo.
PDF_NAME = "The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf"
# Runtime-only locations under /tmp — rebuilt on each fresh container start,
# never persisted with the app.
FAISS_DIR = "/tmp/faiss_index"
PDF_PATH = "/tmp/medical.pdf"
def get_or_create_faiss():
    """Return a FAISS vectorstore, loading it from disk or building it.

    If an index already exists at FAISS_DIR (a previous call in this runtime
    built it), it is loaded and returned. Otherwise the source PDF is
    downloaded from the HF dataset, split into overlapping chunks, embedded,
    indexed, and the index is saved to FAISS_DIR for subsequent calls.

    Returns:
        FAISS: the loaded or freshly built vectorstore.
    """
    # The same embedding model is needed on both paths — build it once.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 1️⃣ If FAISS already exists, load it
    if os.path.exists(FAISS_DIR):
        return FAISS.load_local(
            FAISS_DIR,
            embeddings,
            # Required by recent langchain versions to unpickle a local index;
            # safe here because this process wrote the index itself.
            allow_dangerous_deserialization=True,
        )

    # 2️⃣ Download PDF from HF Dataset. Use the path hf_hub_download returns
    # instead of os.rename(): rename failed on re-runs (source already moved)
    # and across filesystems. `local_dir_use_symlinks` is deprecated and
    # ignored by current huggingface_hub, so it is omitted.
    pdf_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename=PDF_NAME,
        repo_type="dataset",
        local_dir="/tmp",
    )

    # 3️⃣ Load and split PDF into overlapping chunks for retrieval.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    texts = splitter.split_documents(documents)

    # 4️⃣ Create embeddings + FAISS
    vectorstore = FAISS.from_documents(texts, embeddings)

    # 5️⃣ Save FAISS (runtime only) so later calls take the fast path above.
    vectorstore.save_local(FAISS_DIR)
    return vectorstore