# Medical-Chatbot / utils/create_faiss_from_dataset.py
# Author: deepak-cse-jha
# Build FAISS at runtime from HF dataset (commit 617291c)
import os
from huggingface_hub import hf_hub_download
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Hugging Face dataset repo that hosts the source PDF.
DATASET_REPO = "deepak-cse-jha/medibot-data"
# Filename of the PDF inside the dataset repo.
PDF_NAME = "The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf"
# Runtime-only locations under /tmp — rebuilt on each fresh container start,
# never persisted with the app.
FAISS_DIR = "/tmp/faiss_index"
PDF_PATH = "/tmp/medical.pdf"
def get_or_create_faiss():
    """Return a FAISS vectorstore, loading it from disk or building it.

    If an index already exists at FAISS_DIR (a previous call in this runtime
    built it), it is loaded and returned. Otherwise the source PDF is
    downloaded from the HF dataset, split into overlapping chunks, embedded,
    indexed, and the index is saved to FAISS_DIR for subsequent calls.

    Returns:
        FAISS: the loaded or freshly built vectorstore.
    """
    # The same embedding model is needed on both paths — build it once.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 1️⃣ If FAISS already exists, load it
    if os.path.exists(FAISS_DIR):
        return FAISS.load_local(
            FAISS_DIR,
            embeddings,
            # Required by recent langchain versions to unpickle a local index;
            # safe here because this process wrote the index itself.
            allow_dangerous_deserialization=True,
        )

    # 2️⃣ Download PDF from HF Dataset. Use the path hf_hub_download returns
    # instead of os.rename(): rename failed on re-runs (source already moved)
    # and across filesystems. `local_dir_use_symlinks` is deprecated and
    # ignored by current huggingface_hub, so it is omitted.
    pdf_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename=PDF_NAME,
        repo_type="dataset",
        local_dir="/tmp",
    )

    # 3️⃣ Load and split PDF into overlapping chunks for retrieval.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    texts = splitter.split_documents(documents)

    # 4️⃣ Create embeddings + FAISS
    vectorstore = FAISS.from_documents(texts, embeddings)

    # 5️⃣ Save FAISS (runtime only) so later calls take the fast path above.
    vectorstore.save_local(FAISS_DIR)
    return vectorstore