"""Streamlit app: ask questions about uploaded PDFs using a free, CPU-only model.

Pipeline: PyPDF2 text extraction -> RecursiveCharacterTextSplitter chunking ->
sentence-transformer embeddings -> FAISS vector store -> RetrievalQA backed by
a local flan-t5-base text2text pipeline.
"""

import streamlit as st
import PyPDF2
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

st.set_page_config(page_title="PDF QA App (CPU)", layout="wide")
st.title("📘 Ask Questions from Uploaded PDFs (Free & CPU Friendly)")

uploaded_files = st.file_uploader(
    "Upload multiple PDF files", type=["pdf"], accept_multiple_files=True
)


@st.cache_resource
def load_llm():
    """Load a small CPU-friendly seq2seq model wrapped for LangChain.

    Returns:
        HuggingFacePipeline: LangChain LLM wrapper around a flan-t5-base
        text2text-generation pipeline.

    Cached with ``st.cache_resource`` so the model is downloaded and
    instantiated once per server process, not on every Streamlit rerun.
    """
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    return HuggingFacePipeline(pipeline=pipe)


def _extract_text(files):
    """Return the concatenated text of every page of every uploaded PDF.

    Pages are joined with newlines so the last word of one page is not
    fused with the first word of the next (the original concatenated pages
    back-to-back with no separator, corrupting chunk boundaries).
    """
    pages = []
    for file in files:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            # extract_text() can return None or "" for image-only pages.
            if text:
                pages.append(text)
    return "\n".join(pages)


if uploaded_files:
    st.info("Reading and processing PDFs...")
    # NOTE(review): Streamlit reruns this whole script on every widget
    # interaction (including each edit of the question box), so the PDFs are
    # re-read and re-embedded each time. Caching the FAISS index keyed on the
    # uploaded file bytes would avoid that — TODO confirm desired behavior.
    all_text = _extract_text(uploaded_files)

    if not all_text.strip():
        # Guard: FAISS.from_texts raises on an empty corpus (e.g. scanned
        # PDFs with no extractable text), which previously crashed the app.
        st.warning("No extractable text found in the uploaded PDFs.")
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        texts = text_splitter.split_text(all_text)

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        db = FAISS.from_texts(texts, embeddings)
        retriever = db.as_retriever()

        llm = load_llm()
        qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

        question = st.text_input("Ask a question based on the uploaded PDFs:")
        if question:
            with st.spinner("Generating answer..."):
                answer = qa_chain.run(question)
            st.success(answer)