Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -137,88 +137,89 @@
|
|
| 137 |
# st.info("Upload a PDF to begin.")
|
| 138 |
|
| 139 |
|
| 140 |
-
|
| 141 |
import streamlit as st
|
| 142 |
from langchain_community.document_loaders import PyPDFLoader
|
| 143 |
-
from
|
|
|
|
| 144 |
from langchain_community.vectorstores import FAISS
|
| 145 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
| 146 |
from langchain.chains import RetrievalQA
|
| 147 |
from langchain.prompts import PromptTemplate
|
| 148 |
from langchain.llms import HuggingFaceHub
|
| 149 |
-
import os
|
| 150 |
|
| 151 |
-
# Set Hugging Face API
|
| 152 |
-
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
custom_prompt = PromptTemplate(
|
| 156 |
input_variables=["context", "question"],
|
| 157 |
template="""
|
| 158 |
-
You are a helpful assistant. Use the context
|
| 159 |
-
If the answer is not in the context,
|
| 160 |
|
| 161 |
Context:
|
| 162 |
{context}
|
| 163 |
|
| 164 |
-
Question:
|
| 165 |
-
{question}
|
| 166 |
|
| 167 |
-
Answer:
|
| 168 |
-
"""
|
| 169 |
)
|
| 170 |
|
| 171 |
-
# Load PDF and split into chunks
|
| 172 |
-
|
| 173 |
-
from langchain_community.document_loaders import PyPDFLoader
|
| 174 |
-
import tempfile
|
| 175 |
-
|
| 176 |
-
def load_and_split_pdf(uploaded_file):
    """Write an uploaded PDF to a temporary file, load it, and split it into chunks.

    ``PyPDFLoader`` is constructed from a filesystem path, so the uploaded
    bytes are first spooled to a named temporary file. That file is removed
    again once loading has returned (or failed), so uploads do not pile up in
    the temp directory.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using
        ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    # delete=False so the file can be reopened by path after this handle is
    # closed; cleanup is done explicitly below instead.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    try:
        documents = PyPDFLoader(tmp_file_path).load()
    finally:
        # Remove the spooled copy whether or not parsing succeeded.
        os.remove(tmp_file_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_documents(documents)
|
| 188 |
-
|
| 189 |
-
# Build vectorstore from document chunks
|
| 190 |
-
def build_vectorstore(chunks):
    """Embed the document chunks and return a FAISS index built from them.

    Args:
        chunks: Documents to index, as returned by ``load_and_split_pdf``.

    Returns:
        The store created by ``FAISS.from_documents`` using the
        ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(chunks, embedding=embedder)
|
| 194 |
-
|
| 195 |
# Build QA chain
|
| 196 |
def build_qa_chain(vectorstore):
|
| 197 |
-
llm =
|
| 198 |
qa_chain = RetrievalQA.from_chain_type(
|
| 199 |
llm=llm,
|
| 200 |
-
retriever=vectorstore.as_retriever(),
|
| 201 |
-
chain_type="stuff",
|
| 202 |
chain_type_kwargs={"prompt": custom_prompt}
|
| 203 |
)
|
| 204 |
return qa_chain
|
| 205 |
|
| 206 |
-
# Streamlit
|
| 207 |
-
|
| 208 |
-
st.
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
# st.info("Upload a PDF to begin.")
|
| 138 |
|
| 139 |
|
| 140 |
+
import os
|
| 141 |
import streamlit as st
|
| 142 |
from langchain_community.document_loaders import PyPDFLoader
|
| 143 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 144 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 145 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 146 |
from langchain.chains import RetrievalQA
|
| 147 |
from langchain.prompts import PromptTemplate
|
| 148 |
from langchain.llms import HuggingFaceHub
|
|
|
|
| 149 |
|
| 150 |
+
# Hugging Face API token. A token already present in the environment (for
# example one configured as a deployment secret) takes precedence; the
# placeholder is only used as a fallback so a real token is never overwritten.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "your_hf_token_here")
|
| 152 |
+
|
| 153 |
+
# Load and split PDF
|
| 154 |
+
def load_and_split_pdf(uploaded_file):
    """Write an uploaded PDF to a temporary file, load it, and split it into chunks.

    ``PyPDFLoader`` is constructed from a filesystem path, so the uploaded
    bytes are first written to disk. A unique temporary file is used instead
    of a fixed name in the working directory, so simultaneous uploads cannot
    overwrite each other, and the file is removed once loading has returned
    (or failed).

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using
        ``chunk_size=500`` and ``chunk_overlap=100``.
    """
    import tempfile  # local import: not part of this file's top-level imports

    # delete=False so the file can be reopened by path after this handle is
    # closed; cleanup is done explicitly below instead.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    try:
        documents = PyPDFLoader(tmp_path).load()
    finally:
        # Remove the on-disk copy whether or not parsing succeeded.
        os.remove(tmp_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return text_splitter.split_documents(documents)
|
| 163 |
|
| 164 |
+
# Build vectorstore
|
| 165 |
+
def build_vectorstore(chunks):
    """Embed the document chunks and return a FAISS index built from them.

    Args:
        chunks: Documents to index, as returned by ``load_and_split_pdf``.

    Returns:
        The store created by ``FAISS.from_documents`` using the
        ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.
    """
    return FAISS.from_documents(
        chunks,
        embedding=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
|
| 169 |
+
|
| 170 |
+
# Load Lamini or other HF model
|
| 171 |
+
def get_llm():
    """Return a ``HuggingFaceHub`` LLM configured for ``lamini/lamini-13b-chat``.

    The model is requested with a temperature of 0.2 and at most 512 newly
    generated tokens.
    """
    generation_settings = {"temperature": 0.2, "max_new_tokens": 512}
    return HuggingFaceHub(
        repo_id="lamini/lamini-13b-chat",
        model_kwargs=generation_settings,
    )
|
| 176 |
+
|
| 177 |
+
# Prompt used by the QA chain. It has two placeholders: {context} and
# {question}. The model is told to answer only from the supplied context and
# to reply "Not found in the document." when the context lacks the answer.
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the following context to answer the question as accurately as possible.
If the answer is not in the context, respond with "Not found in the document."

Context:
{context}

Question: {question}

Answer:"""
)
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# Build QA chain
|
| 193 |
def build_qa_chain(vectorstore):
    """Assemble a ``RetrievalQA`` chain over the given vector store.

    Args:
        vectorstore: Store exposing ``as_retriever``; it is queried with
            similarity search for the 5 closest chunks per question.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with the LLM from
        ``get_llm()`` and ``custom_prompt`` as its prompt.
    """
    language_model = get_llm()
    top_k_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )
    return RetrievalQA.from_chain_type(
        llm=language_model,
        retriever=top_k_retriever,
        chain_type_kwargs={"prompt": custom_prompt},
    )
|
| 201 |
|
| 202 |
+
# Streamlit UI
|
| 203 |
+
def main():
    """Render the Streamlit page: upload a PDF, index it, and answer questions.

    Streamlit re-executes this function on every widget interaction, including
    each submitted question. To avoid re-parsing and re-embedding the same PDF
    on every rerun, the QA chain is kept in ``st.session_state`` and rebuilt
    only when the uploaded file's content changes.
    """
    import hashlib  # local import: not part of this file's top-level imports

    st.set_page_config(page_title="PDF Chatbot", layout="wide")
    st.title("Chat with your PDF")

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if not uploaded_file:
        return

    st.success("PDF uploaded successfully!")

    # Fingerprint the upload by content so re-uploading the same file reuses
    # the cached chain while a different file triggers a rebuild. getvalue()
    # returns the full buffer without moving the read position.
    file_digest = hashlib.sha256(uploaded_file.getvalue()).hexdigest()

    if st.session_state.get("qa_chain_digest") != file_digest:
        with st.spinner("Processing PDF..."):
            chunks = load_and_split_pdf(uploaded_file)
            if not chunks:
                # No chunks means there is nothing to index or answer from.
                st.error("No extractable text was found in this PDF.")
                return
            vectorstore = build_vectorstore(chunks)
            st.session_state["qa_chain"] = build_qa_chain(vectorstore)
            st.session_state["qa_chain_digest"] = file_digest

    qa_chain = st.session_state["qa_chain"]
    st.success("Ready to chat!")

    user_question = st.text_input("Ask a question based on the PDF:")
    if user_question:
        with st.spinner("Generating answer..."):
            result = qa_chain.run(user_question)
        st.markdown("**Answer:**")
        st.write(result)


if __name__ == "__main__":
    main()
|