# Hugging Face Space: Streamlit PDF question-answering app (CPU-only).
# (The original "Spaces: Sleeping" lines were scraped page-status residue.)
import streamlit as st
import PyPDF2
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Page chrome: wide layout gives the uploader and answer panel full width.
st.set_page_config(page_title="PDF QA App (CPU)", layout="wide")
st.title("📘 Ask Questions from Uploaded PDFs (Free & CPU Friendly)")

# Several PDFs may be uploaded at once; their text is pooled into one corpus below.
uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
@st.cache_resource(show_spinner="Loading language model...")
def load_llm():
    """Return a LangChain LLM backed by google/flan-t5-base (CPU friendly).

    Cached with ``st.cache_resource`` so the model and tokenizer are
    downloaded and instantiated once per server process instead of on
    every Streamlit rerun (the original reloaded them on each interaction).

    Returns:
        HuggingFacePipeline wrapping a text2text-generation pipeline.
    """
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    # Without an explicit bound, text2text-generation defaults to
    # max_length=20 tokens and silently truncates answers.
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=pipe)
if uploaded_files:
    st.info("Reading and processing PDFs...")

    # Collect text from every page of every PDF, then join with newlines so
    # words at page boundaries are not glued together (the original used
    # bare `all_text += text`, which also builds the string quadratically).
    page_texts = []
    for file in uploaded_files:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if text:  # extract_text() can return None/"" for image-only pages
                page_texts.append(text)
    all_text = "\n".join(page_texts)

    if not all_text.strip():
        # Scanned/image-only PDFs yield no extractable text; building an
        # index from zero chunks would raise deep inside FAISS instead.
        st.warning("No extractable text found in the uploaded PDFs.")
        st.stop()

    # Overlapping chunks keep answers that span a chunk boundary retrievable.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(all_text)

    # NOTE(review): embedding + index building re-runs on every Streamlit
    # rerun (e.g. each question submit); consider caching on file contents.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)
    retriever = db.as_retriever()

    llm = load_llm()
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    question = st.text_input("Ask a question based on the uploaded PDFs:")
    if question:
        with st.spinner("Generating answer..."):
            answer = qa_chain.run(question)
        st.success(answer)