File size: 2,304 Bytes
350649c
35efd38
99379ec
35efd38
 
 
 
42a870f
3c68338
35efd38
 
 
 
 
 
 
 
 
 
cce75d3
35efd38
 
 
cce75d3
35efd38
 
 
 
 
 
cce75d3
35efd38
 
 
66a3ddd
35efd38
 
 
66a3ddd
42a870f
 
 
 
66a3ddd
42a870f
cce75d3
35efd38
42a870f
 
35efd38
cce75d3
35efd38
42a870f
35efd38
66a3ddd
35efd38
 
42a870f
 
 
35efd38
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import io
import os
import shutil
import zipfile

import streamlit as st
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Cache the QA initialization so ingestion runs once per session
@st.cache_resource
def init_qa(zip_bytes):
    """Build the retrieval-QA stack from an uploaded ZIP of PDFs.

    Args:
        zip_bytes: Raw bytes of a ZIP archive containing PDF files
            (possibly nested in subfolders).

    Returns:
        Tuple of (vector_store, qa_pipeline): a FAISS index over the
        chunked PDF text and a HuggingFace extractive-QA pipeline.
    """
    import shutil  # stdlib; used for recursive temp-dir cleanup

    tmp_dir = "tmp_pdfs"
    # Recreate the temp folder from scratch. rmtree also removes
    # subdirectories left over from a previous archive — the old
    # per-file os.remove loop raised on any directory entry.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Extract the uploaded ZIP.
    # NOTE(security): extractall on an untrusted archive is exposed to
    # zip-slip path traversal via crafted member names; validate member
    # paths if uploads can come from untrusted users.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        z.extractall(tmp_dir)

    # Load every PDF, including ones nested in subfolders of the
    # archive (a flat listdir would silently miss those).
    docs = []
    for root, _dirs, files in os.walk(tmp_dir):
        for fname in files:
            if fname.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(root, fname))
                docs.extend(loader.load())

    # Split into overlapping chunks sized for embedding + QA context.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)

    # Build the vector store over sentence-transformer embeddings.
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(split_docs, embeddings)

    # Load the extractive QA model and tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
    model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    return vector_store, qa_pipeline

# Streamlit UI: collect the archive, then answer questions against it.
st.title("RoBERTa QA Streamlit App")
st.write("Upload a ZIP of PDFs to initialize the QA engine.")
uploaded = st.file_uploader("ZIP file", type=["zip"])

if not uploaded:
    st.info("Awaiting ZIP upload.")
else:
    store, answerer = init_qa(uploaded.read())
    question = st.text_input("Ask a question:")
    if question:
        # Retrieve the top-4 matching chunks and merge them into one context.
        hits = store.similarity_search(question, k=4)
        merged_context = "\n\n".join(hit.page_content for hit in hits)
        # Run extractive QA over the retrieved context.
        outcome = answerer(question=question, context=merged_context)
        st.write(outcome.get("answer", "No answer found."))