Spaces:
Sleeping
Sleeping
File size: 1,813 Bytes
0e40d5c 430911f 27bc93f 0e40d5c 27bc93f 430911f 0e40d5c 873b7fb 430911f 27bc93f 430911f 27bc93f 430911f 27bc93f 430911f 873b7fb 430911f 873b7fb 430911f 873b7fb 430911f 27bc93f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import streamlit as st
import PyPDF2
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Page-level configuration and heading for the app.
st.set_page_config(
    layout="wide",
    page_title="PDF QA App (CPU)",
)
st.title("📘 Ask Questions from Uploaded PDFs (Free & CPU Friendly)")

# PDF-only uploader; multiple files may be selected at once, and the result
# is falsy until the user has uploaded something.
uploaded_files = st.file_uploader(
    "Upload multiple PDF files",
    accept_multiple_files=True,
    type=["pdf"],
)
@st.cache_resource
def load_llm():
    """Create the LangChain LLM wrapper around a FLAN-T5 generation pipeline.

    Loads the ``google/flan-t5-base`` tokenizer and seq2seq model, builds a
    ``text2text-generation`` pipeline from them, and wraps that pipeline in
    ``HuggingFacePipeline``. The function is decorated with
    ``st.cache_resource`` so the loaded model can be reused instead of being
    rebuilt on each call.

    Returns:
        A ``HuggingFacePipeline`` instance backed by the FLAN-T5 pipeline.
    """
    checkpoint = "google/flan-t5-base"
    # Tokenizer is loaded before the model, same order as before.
    flan_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    flan_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return HuggingFacePipeline(
        pipeline=pipeline(
            "text2text-generation",
            model=flan_model,
            tokenizer=flan_tokenizer,
        )
    )
def _extract_pdf_text(files):
    """Return the extractable text of every page in *files* as one string.

    Pages for which ``extract_text()`` yields nothing (e.g. image-only pages)
    are skipped. Page texts are joined with a newline so that the last word
    of one page is not fused with the first word of the next.
    """
    page_texts = []
    for pdf in files:
        reader = PyPDF2.PdfReader(pdf)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts)


# Main flow: runs only once at least one PDF has been uploaded.
if uploaded_files:
    st.info("Reading and processing PDFs...")
    all_text = _extract_pdf_text(uploaded_files)

    # Split the combined text into overlapping chunks for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(all_text)

    # Scanned / image-only PDFs produce no text at all. Building a FAISS
    # index from an empty list fails, so report the problem and halt this
    # script run instead of crashing with a traceback.
    if not texts:
        st.error("No extractable text was found in the uploaded PDFs.")
        st.stop()

    # Embed the chunks and index them for similarity search.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)
    retriever = db.as_retriever()

    # Wire the retriever and the LLM into a question-answering chain.
    llm = load_llm()
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    question = st.text_input("Ask a question based on the uploaded PDFs:")
    if question:
        with st.spinner("Generating answer..."):
            answer = qa_chain.run(question)
        st.success(answer)
|