Spaces:
Sleeping
Sleeping
File size: 2,304 Bytes
350649c 35efd38 99379ec 35efd38 42a870f 3c68338 35efd38 cce75d3 35efd38 cce75d3 35efd38 cce75d3 35efd38 66a3ddd 35efd38 66a3ddd 42a870f 66a3ddd 42a870f cce75d3 35efd38 42a870f 35efd38 cce75d3 35efd38 42a870f 35efd38 66a3ddd 35efd38 42a870f 35efd38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | import streamlit as st
import zipfile, io, os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
# Cache the QA initialization so ingestion runs once per session
@st.cache_resource
def init_qa(zip_bytes):
    """Build the retrieval + QA stack from an uploaded ZIP of PDFs.

    Args:
        zip_bytes: Raw bytes of a ZIP archive containing PDF files
            (possibly inside nested folders).

    Returns:
        Tuple ``(vector_store, qa_pipeline)``: a FAISS index over the
        chunked PDF text and a HuggingFace extractive-QA pipeline.

    Raises:
        ValueError: If the archive contains no PDF files.
    """
    import shutil

    tmp_dir = "tmp_pdfs"
    # Recreate the temp folder wholesale. The previous per-file cleanup
    # (os.remove on each entry) raised on subdirectories left behind by
    # archives that contain folders.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Extract the uploaded ZIP.
    # NOTE(review): extractall is vulnerable to path traversal ("zip slip")
    # with hostile archives; fine for a trusted-upload demo, but sanitize
    # member names before exposing this publicly.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        z.extractall(tmp_dir)

    # Load every PDF, recursing into subdirectories — the original only
    # scanned the top level and silently skipped nested PDFs.
    docs = []
    for root, _dirs, files in os.walk(tmp_dir):
        for fname in files:
            if fname.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(root, fname))
                docs.extend(loader.load())
    if not docs:
        # Fail loudly here rather than with an opaque error from FAISS.
        raise ValueError("No PDF files found in the uploaded ZIP.")

    # Split into overlapping chunks sized for retrieval.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)

    # Embed the chunks and build the FAISS vector store.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(split_docs, embeddings)

    # Extractive QA model: RoBERTa fine-tuned on SQuAD2.
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    return vector_store, qa_pipeline
# Streamlit UI: upload a ZIP once, then answer questions against it.
st.title("RoBERTa QA Streamlit App")
st.write("Upload a ZIP of PDFs to initialize the QA engine.")

zip_file = st.file_uploader("ZIP file", type=["zip"])
if zip_file:
    # init_qa is cached per session, so re-runs after the first upload are cheap.
    vector_store, qa = init_qa(zip_file.read())
    query = st.text_input("Ask a question:")
    # Guard against whitespace-only input: " " is truthy and the original
    # would run retrieval + QA on an effectively empty question.
    if query and query.strip():
        # Retrieve the 4 most similar chunks and join them as the QA context.
        docs = vector_store.similarity_search(query, k=4)
        context = "\n\n".join(doc.page_content for doc in docs)
        result = qa(question=query, context=context)
        # `get` with a default never fires when the key exists with an empty
        # string; `or` also covers the empty-answer case.
        answer = result.get("answer") or "No answer found."
        st.write(answer)
else:
    st.info("Awaiting ZIP upload.")