# Spaces: Sleeping — Hugging Face Spaces status banner captured when this
# source was copied from the Space page; kept as a comment so the file parses.
import os
from huggingface_hub import hf_hub_download
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# HF dataset repo that hosts the source PDF.
DATASET_REPO = "deepak-cse-jha/medibot-data"
# File inside that dataset repo to download.
PDF_NAME = "The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf"
# Runtime-only locations under /tmp (wiped whenever the Space restarts).
FAISS_DIR = "/tmp/faiss_index"  # where the FAISS index is saved / reloaded
PDF_PATH = "/tmp/medical.pdf"   # path the downloaded PDF is renamed to
def get_or_create_faiss():
    """Return a FAISS vector store over the medical encyclopedia PDF.

    Reuses a previously saved index from ``FAISS_DIR`` when one exists;
    otherwise downloads the PDF from the HF dataset repo, splits it into
    overlapping chunks, embeds them, builds the index, and saves it to
    ``FAISS_DIR`` so subsequent calls are fast.

    Returns:
        FAISS: the loaded or freshly built vector store.
    """
    # Single embeddings instance — needed both for loading an existing
    # index and for building a new one (was constructed twice before).
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 1) Reuse a previously built index if present.
    if os.path.exists(FAISS_DIR):
        # allow_dangerous_deserialization: the pickle being loaded was
        # written by this same app, so it is trusted.
        return FAISS.load_local(
            FAISS_DIR,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    # 2) Download the PDF from the HF dataset. hf_hub_download returns the
    #    local file path, so no os.rename shuffle is needed afterwards.
    #    (local_dir_use_symlinks is deprecated/ignored in current
    #    huggingface_hub and has been dropped.)
    pdf_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename=PDF_NAME,
        repo_type="dataset",
        local_dir="/tmp",
    )

    # 3) Load the PDF and split it into overlapping chunks.
    documents = PyPDFLoader(pdf_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    texts = splitter.split_documents(documents)

    # 4) Embed the chunks and build the FAISS index.
    vectorstore = FAISS.from_documents(texts, embeddings)

    # 5) Persist for later calls (runtime-only storage on Spaces).
    vectorstore.save_local(FAISS_DIR)
    return vectorstore