RoBERTa QA Streamlit App — upload a ZIP of PDFs, index them with FAISS, and ask extractive questions.
import io
import os
import shutil
import zipfile

import streamlit as st
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
def init_qa(zip_bytes):
    """Build the retrieval-QA engine from a ZIP archive of PDFs.

    Args:
        zip_bytes: Raw bytes of the uploaded ZIP file containing PDFs.

    Returns:
        Tuple of (vector_store, qa_pipeline): a FAISS index over chunked
        PDF text, and a HuggingFace extractive question-answering pipeline.

    Note: this is expensive (PDF parsing, embedding, model download) —
    callers should cache the result rather than invoking it per rerun.
    """
    tmp_dir = "tmp_pdfs"
    # Recreate the temp folder from scratch. rmtree also removes any
    # subdirectories a previous ZIP may have created, which the old
    # per-file os.remove loop crashed on (IsADirectoryError).
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)
    # Extract the uploaded ZIP.
    # NOTE(review): extractall on an untrusted archive is vulnerable to
    # path traversal ("zip slip") on Python < 3.12 — consider validating
    # member names before extracting.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        z.extractall(tmp_dir)
    # Load every PDF, including ones nested in subfolders of the archive
    # (a top-level listdir would silently skip those).
    docs = []
    for root, _dirs, files in os.walk(tmp_dir):
        for fname in files:
            if fname.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(root, fname))
                docs.extend(loader.load())
    # Split pages into overlapping chunks sized for retrieval.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    # Embed the chunks and build the FAISS vector store.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(split_docs, embeddings)
    # Load the extractive QA model and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    return vector_store, qa_pipeline
# --- Streamlit UI ---
st.title("RoBERTa QA Streamlit App")
st.write("Upload a ZIP of PDFs to initialize the QA engine.")
zip_file = st.file_uploader("ZIP file", type=["zip"])
if zip_file:
    # Streamlit re-executes this whole script on every widget interaction,
    # so calling init_qa unconditionally would re-ingest the PDFs and
    # reload the model for every question asked. Cache the engine in
    # session_state, rebuilding only when a different file is uploaded.
    file_key = (zip_file.name, zip_file.size)
    if st.session_state.get("qa_file_key") != file_key:
        st.session_state["qa_engine"] = init_qa(zip_file.read())
        st.session_state["qa_file_key"] = file_key
    vector_store, qa = st.session_state["qa_engine"]
    query = st.text_input("Ask a question:")
    if query:
        # Retrieve the most relevant chunks and join them as the context.
        docs = vector_store.similarity_search(query, k=4)
        context = "\n\n".join(doc.page_content for doc in docs)
        # Run extractive QA over the retrieved context.
        result = qa(question=query, context=context)
        answer = result.get("answer", "No answer found.")
        st.write(answer)
else:
    st.info("Awaiting ZIP upload.")