deepak-cse-jha committed on
Commit
617291c
·
1 Parent(s): d77f07f

Build FAISS at runtime from HF dataset

Browse files
.gitignore CHANGED
@@ -2,3 +2,5 @@
2
  data/
3
  vectorstore/
4
  venv/
 
 
 
2
  data/
3
  vectorstore/
4
  venv/
5
+ *.faiss
6
+ *.pkl
app.py CHANGED
@@ -8,25 +8,45 @@ from langchain.chains import RetrievalQA
8
  from langchain_core.prompts import PromptTemplate
9
  from langchain_groq import ChatGroq
10
 
 
 
 
 
11
  load_dotenv()
12
 
13
- DB_FAISS_PATH = "vectorstore/db_faiss"
 
 
14
 
15
  @st.cache_resource
16
  def get_vectorstore():
17
- if not os.path.exists(DB_FAISS_PATH):
18
- st.error("FAISS vectorstore not found")
19
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
 
21
  embeddings = HuggingFaceEmbeddings(
22
  model_name="sentence-transformers/all-MiniLM-L6-v2"
23
  )
24
 
25
- return FAISS.load_local(
26
- DB_FAISS_PATH,
27
- embeddings,
28
- allow_dangerous_deserialization=True
29
- )
30
 
31
 
32
  def get_prompt():
@@ -63,7 +83,7 @@ def main():
63
  for msg in st.session_state.messages:
64
  st.chat_message(msg["role"]).markdown(msg["content"])
65
 
66
- user_input = st.chat_input("Ask your question")
67
 
68
  if user_input:
69
  st.chat_message("user").markdown(user_input)
@@ -95,12 +115,8 @@ def main():
95
  {"role": "assistant", "content": answer}
96
  )
97
 
98
- st.chat_message("assistant").markdown(
99
- "Source Docs:\n\n" + str(sources)
100
- )
101
- st.session_state.messages.append(
102
- {"role": "assistant", "content": str(sources)}
103
- )
104
 
105
 
106
  if __name__ == "__main__":
 
8
  from langchain_core.prompts import PromptTemplate
9
  from langchain_groq import ChatGroq
10
 
11
+ from huggingface_hub import hf_hub_download
12
+ from langchain.document_loaders import PyPDFLoader
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+
15
  load_dotenv()
16
 
17
+ HF_DATASET_REPO = "deepak-cse-jha/medibot-data"
18
+ PDF_FILENAME = "The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf"
19
+
20
 
21
@st.cache_resource
def get_vectorstore():
    """Build the FAISS index in memory from the dataset PDF.

    Cached by Streamlit (`st.cache_resource`), so the download, parse,
    split, and embedding work happens once per process.
    """
    # Fetch the source PDF out of the HF dataset repo into the local cache.
    pdf_path = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename=PDF_FILENAME,
        repo_type="dataset"
    )

    # Parse the PDF into per-page documents, then chunk them for retrieval.
    pages = PyPDFLoader(pdf_path).load()
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150
    ).split_documents(pages)

    # Embed every chunk and assemble the in-memory FAISS store.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(chunks, embedder)
 
50
 
51
 
52
  def get_prompt():
 
83
  for msg in st.session_state.messages:
84
  st.chat_message(msg["role"]).markdown(msg["content"])
85
 
86
+ user_input = st.chat_input("Ask your medical question")
87
 
88
  if user_input:
89
  st.chat_message("user").markdown(user_input)
 
115
  {"role": "assistant", "content": answer}
116
  )
117
 
118
+ with st.expander("Source Documents"):
119
+ st.write(sources)
 
 
 
 
120
 
121
 
122
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -95,3 +95,4 @@ tzdata==2025.2; python_version >= '2'
95
  urllib3==2.5.0; python_version >= '3.9'
96
  yarl==1.20.1; python_version >= '3.9'
97
  zstandard==0.23.0; python_version >= '3.8'
 
 
95
  urllib3==2.5.0; python_version >= '3.9'
96
  yarl==1.20.1; python_version >= '3.9'
97
  zstandard==0.23.0; python_version >= '3.8'
98
+ huggingface_hub
utils/connect_memory_with_llm.py CHANGED
@@ -1,56 +1,62 @@
1
  import os
2
-
3
- from langchain_huggingface import HuggingFaceEndpoint
4
  from langchain_core.prompts import PromptTemplate
5
  from langchain.chains import RetrievalQA
6
- from langchain_huggingface import HuggingFaceEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
 
9
- from dotenv import load_dotenv, find_dotenv
10
- load_dotenv(find_dotenv())
11
 
 
 
12
 
13
- HF_TOKEN=os.environ.get("HF_TOKEN")
14
- HUGGINGFACE_REPO_ID="mistralai/Mistral-7B-Instruct-v0.3"
15
 
16
- def load_llm(huggingface_repo_id):
17
- llm=HuggingFaceEndpoint(
18
- repo_id=huggingface_repo_id,
19
  temperature=0.5,
20
- model_kwargs={"token":HF_TOKEN,
21
- "max_length":"512"}
 
 
22
  )
23
- return llm
24
 
25
 
26
  CUSTOM_PROMPT_TEMPLATE = """
27
  Use the pieces of information provided in the context to answer user's question.
28
- If you dont know the answer, just say that you dont know, dont try to make up an answer.
29
- Dont provide anything out of the given context
 
 
 
30
 
31
- Context: {context}
32
- Question: {question}
33
 
34
- Start the answer directly. No small talk please.
35
  """
36
 
37
- def set_custom_prompt(custom_prompt_template):
38
- prompt=PromptTemplate(template=custom_prompt_template, input_variables=["context", "question"])
39
- return prompt
40
-
41
- DB_FAISS_PATH= "../vectorstore/db_faiss"
42
- embedding_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
43
- db=FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
44
-
45
- qa_chain=RetrievalQA.from_chain_type(
46
- llm=load_llm(HUGGINGFACE_REPO_ID),
47
- chain_type="stuff",
48
- retriever=db.as_retriever(search_kwargs={'k':3}),
49
- return_source_documents=True,
50
- chain_type_kwargs={'prompt':set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)}
51
- )
52
-
53
- user_query=input("Write Query Here: ")
54
- response=qa_chain.invoke({'query': user_query})
55
- print("RESULT: ", response["result"])
56
- print("SOURCE DOCUMENTS: ", response["source_documents"])
 
 
 
 
 
 
1
  import os
2
+ from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
 
3
  from langchain_core.prompts import PromptTemplate
4
  from langchain.chains import RetrievalQA
 
 
5
 
6
+ from utils.create_faiss_from_dataset import get_or_create_faiss
7
+
8
 
9
+ HF_TOKEN = os.environ.get("HF_TOKEN")
10
+ HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
11
 
 
 
12
 
13
def load_llm(repo_id):
    """Create the HF Inference API LLM for the QA chain.

    NOTE(review): the previous version smuggled `token` and `max_length`
    through `model_kwargs`. `langchain_huggingface.HuggingFaceEndpoint`
    validates model kwargs and rejects `token` there — the API token must
    go in `huggingfacehub_api_token`, and generation length is the
    Inference API parameter `max_new_tokens` (a direct constructor arg),
    not `max_length`.

    Args:
        repo_id: HF Hub model repo (e.g. mistralai/Mistral-7B-Instruct-v0.3).

    Returns:
        A configured HuggingFaceEndpoint instance.
    """
    return HuggingFaceEndpoint(
        repo_id=repo_id,
        temperature=0.5,
        max_new_tokens=512,
        huggingfacehub_api_token=HF_TOKEN,
    )
 
22
 
23
 
24
  CUSTOM_PROMPT_TEMPLATE = """
25
  Use the pieces of information provided in the context to answer user's question.
26
+ If you don't know the answer, say you don't know.
27
+ Do not make up answers.
28
+
29
+ Context:
30
+ {context}
31
 
32
+ Question:
33
+ {question}
34
 
35
+ Answer directly.
36
  """
37
 
38
+
39
def set_custom_prompt():
    """Wrap CUSTOM_PROMPT_TEMPLATE in a PromptTemplate that expects
    `context` and `question` inputs (the stuff-chain contract)."""
    required_inputs = ["context", "question"]
    return PromptTemplate(
        template=CUSTOM_PROMPT_TEMPLATE,
        input_variables=required_inputs,
    )
44
+
45
+
46
def get_qa_chain():
    """Assemble the RetrievalQA pipeline.

    Wires together the runtime-built FAISS retriever, the Mistral
    endpoint LLM, and the custom prompt into a "stuff" chain that also
    returns its source documents.
    """
    # The FAISS index is created (or loaded from /tmp) on first use rather
    # than shipped with the repo.
    retriever = get_or_create_faiss().as_retriever(search_kwargs={"k": 3})

    return RetrievalQA.from_chain_type(
        llm=load_llm(HUGGINGFACE_REPO_ID),
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": set_custom_prompt()},
    )
utils/create_faiss_from_dataset.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import hf_hub_download
3
+ from langchain_community.document_loaders import PyPDFLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+
8
+ DATASET_REPO = "deepak-cse-jha/medibot-data"
9
+ PDF_NAME = "The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf"
10
+
11
+ FAISS_DIR = "/tmp/faiss_index"
12
+ PDF_PATH = "/tmp/medical.pdf"
13
+
14
+
15
def get_or_create_faiss():
    """Return a FAISS vectorstore, loading the cached index from FAISS_DIR
    when present, otherwise building it from the dataset PDF.

    Fixes over the previous version:
    - `hf_hub_download` already returns the local path of the downloaded
      file, so the fragile `local_dir="/tmp"` + `os.rename` dance (and the
      deprecated `local_dir_use_symlinks` argument) is dropped.
    - The embeddings model is constructed once instead of separately in
      the load and build branches.

    Returns:
        A FAISS vectorstore ready for `.as_retriever()`.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 1. Reuse an index persisted by an earlier call in this runtime.
    if os.path.exists(FAISS_DIR):
        return FAISS.load_local(
            FAISS_DIR,
            embeddings,
            # Safe here: we deserialize only the index this process wrote below.
            allow_dangerous_deserialization=True,
        )

    # 2. Download the PDF from the HF dataset; use the returned cache path.
    pdf_path = hf_hub_download(
        repo_id=DATASET_REPO,
        filename=PDF_NAME,
        repo_type="dataset",
    )

    # 3. Load and split the PDF into retrieval-sized chunks.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    texts = splitter.split_documents(documents)

    # 4. Embed chunks and build the FAISS index.
    vectorstore = FAISS.from_documents(texts, embeddings)

    # 5. Persist for later calls (ephemeral /tmp storage only).
    vectorstore.save_local(FAISS_DIR)

    return vectorstore