rbbist's picture
Update app.py
27bc93f verified
import streamlit as st
import PyPDF2
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# --- Page chrome -----------------------------------------------------------
# Configure the browser tab title and the wide layout, then render the
# on-page heading.
st.set_page_config(layout="wide", page_title="PDF QA App (CPU)")
st.title("📘 Ask Questions from Uploaded PDFs (Free & CPU Friendly)")

# --- Input widget ----------------------------------------------------------
# PDF-only uploader that accepts several files at once; the result is
# consumed further down the script, which iterates over the uploaded files.
uploaded_files = st.file_uploader(
    label="Upload multiple PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)
@st.cache_resource
def load_llm(max_new_tokens: int = 256):
    """Load the flan-t5-base model and wrap it for use as a LangChain LLM.

    The function is decorated with ``st.cache_resource`` so the tokenizer and
    model weights are loaded once and reused instead of being reloaded every
    time the Streamlit script re-executes.

    Args:
        max_new_tokens: Upper bound on the number of tokens generated per
            answer. Without an explicit bound the pipeline falls back to the
            library's default generation length, which is short enough to cut
            answers off mid-sentence (NOTE(review): 20 tokens in current
            transformers releases -- confirm against the installed version).

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text2text-generation``
        pipeline built from ``google/flan-t5-base``.
    """
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        # Forwarded to generation so answers are not truncated by the default.
        max_new_tokens=max_new_tokens,
    )
    return HuggingFacePipeline(pipeline=pipe)
def _extract_pdf_text(files):
    """Return the extractable text of every page of every PDF in *files*.

    Pages for which ``extract_text()`` returns nothing are skipped. Page
    texts are joined with a newline so the last word of one page does not
    fuse with the first word of the next.
    """
    page_texts = []
    for file in files:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text)
    return "\n".join(page_texts)


# Keyed on the corpus string: the script re-executes on every widget
# interaction (e.g. each new question), and without caching the embedding
# model would be reloaded and the whole index rebuilt each time.
# max_entries bounds how many distinct corpora are kept in memory.
@st.cache_resource(show_spinner=False, max_entries=4)
def _build_retriever(corpus):
    """Chunk *corpus*, embed the chunks and return a FAISS-backed retriever."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(corpus)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)
    return db.as_retriever()


if uploaded_files:
    st.info("Reading and processing PDFs...")
    all_text = _extract_pdf_text(uploaded_files)

    # Image-only / scanned PDFs yield no text; building a FAISS index from an
    # empty list of chunks fails, so report the problem and halt this run.
    if not all_text.strip():
        st.error("No extractable text was found in the uploaded PDFs.")
        st.stop()

    retriever = _build_retriever(all_text)
    llm = load_llm()
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    question = st.text_input("Ask a question based on the uploaded PDFs:")
    if question:
        with st.spinner("Generating answer..."):
            answer = qa_chain.run(question)
        st.success(answer)