# Spaces: aaporosh / SmartPDF_Q_A
# Status: Sleeping
# File size: 6,503 Bytes

# ------------- app.py -------------
import html
import logging
import os
import re
import time
from io import BytesIO
from pathlib import Path

import numpy as np
import pdfplumber
import pytesseract
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Root logging at INFO, plus a module-level logger for this app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

###############################################################################
# Page layout
###############################################################################
# Custom CSS injected into the page: tighter container padding, tab spacing,
# and the "chat-msg" bubble classes used when rendering the conversation.
_PAGE_CSS = """
<style>
    .block-container { padding-top: 1rem; padding-bottom: 0; }
    .stTabs [data-baseweb="tab-list"] { gap: 4px; }
    .stTabs [data-baseweb="tab"] { padding: 8px 24px; }
    .chat-msg { padding: 0.5rem 1rem; border-radius: 8px; margin: 0.3rem 0; }
    .user   { background-color: #e3f2fd; margin-left: 20%; }
    .assistant { background-color: #f1f3f4; margin-right: 20%; }
</style>
"""

# Page configuration is issued before any other Streamlit output.
st.set_page_config(layout="wide", page_title="PDF Chat & Summarize")
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

###############################################################################
# Cached heavy objects
###############################################################################
@st.cache_resource(show_spinner=False)
def load_embed():
    """Return the sentence-embedding model used to vectorise text chunks.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the model object
    is shared instead of being rebuilt on each script run.
    """
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformer(model_name)

@st.cache_resource(show_spinner=False)
def load_qa():
    """Return the text2text-generation pipeline used for question answering.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the pipeline is
    shared instead of being rebuilt on each script run.
    """
    generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        max_length=512,
    )
    return generator

@st.cache_resource(show_spinner=False)
def load_sum():
    """Return the summarization pipeline used by the "Summarize" tab.

    Wrapped in ``st.cache_resource`` (spinner disabled) so the pipeline is
    shared instead of being rebuilt on each script run.
    """
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        max_length=250,
    )
    return summarizer

# Module-level handles to the three models (embedding, Q&A, summarization);
# the loaders are evaluated left to right, in the same order as before.
embed, qa_pipe, sum_pipe = load_embed(), load_qa(), load_sum()

###############################################################################
# Helpers
###############################################################################
def extract_pdf(uploaded_file):
    """Extract the text and the embedded images of an uploaded PDF.

    For every page the text layer is read (layout-preserving mode first,
    plain mode as fallback). Pages without any usable text are rendered to
    a bitmap and passed through Tesseract OCR. Each embedded image is
    cropped out of the page and rendered as a PIL image; images that cannot
    be cropped are skipped (best effort) and the reason is logged.

    Args:
        uploaded_file: Object exposing ``getbuffer()`` with the PDF bytes
            (here: the value returned by ``st.file_uploader``).

    Returns:
        A ``(text, images)`` tuple: the page texts joined by newlines and
        stripped, and the list of rendered images.
    """
    page_texts = []
    images = []
    with pdfplumber.open(BytesIO(uploaded_file.getbuffer())) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            # Layout-preserving extraction is an option of extract_text();
            # there is no separate ``extract_text_layout`` page method.
            txt = page.extract_text(layout=True)
            if not (txt and txt.strip()):
                txt = page.extract_text()
            if not (txt and txt.strip()):
                # No text layer (e.g. a scanned page): fall back to OCR.
                rendered = page.to_image(resolution=200).original
                txt = pytesseract.image_to_string(rendered)
            page_texts.append(txt or "")

            for image_obj in page.images:
                # within_bbox expects (x0, top, x1, bottom) measured from the
                # top of the page; "y0"/"y1" are measured from the bottom and
                # would select the wrong region.
                bbox = (
                    image_obj["x0"],
                    image_obj["top"],
                    image_obj["x1"],
                    image_obj["bottom"],
                )
                try:
                    pil = page.within_bbox(bbox).to_image(resolution=200).original
                except Exception:
                    # Best effort: one bad image must not abort the whole
                    # extraction, but leave a trace instead of hiding it.
                    logger.warning(
                        "Skipping image on page %d (bbox=%s)",
                        page_no,
                        bbox,
                        exc_info=True,
                    )
                    continue
                images.append(pil)
    return "\n".join(page_texts).strip(), images

def build_index(text):
    """Split *text* into chunks, embed them and build a FAISS index.

    Args:
        text: Plain text of the document.

    Returns:
        A FAISS vector store over the chunks, or ``None`` when *text* is
        empty / whitespace-only or yields no chunks (there is nothing to
        index, and building from an empty list would raise).
    """
    if not text or not text.strip():
        return None

    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=80)
    chunks = splitter.split_text(text)
    if not chunks:
        return None

    vectors = embed.encode(chunks, show_progress_bar=False, batch_size=64)
    # The store needs something it can call with a query string to get a
    # vector. The model object itself is not callable that way, so hand over
    # its ``encode`` method instead.
    return FAISS.from_embeddings(list(zip(chunks, vectors)), embed.encode)

def summarize(text, max_chars=3000):
    """Summarize the beginning of *text* with the summarization pipeline.

    Args:
        text: Document text; ``None`` is treated as empty.
        max_chars: Number of leading characters handed to the model
            (default 3000, the previous hard-coded value).

    Returns:
        The generated summary, or a fixed notice when the stripped text has
        fewer than 50 characters.
    """
    cleaned = (text or "").strip()
    if len(cleaned) < 50:
        return "Document too short to summarize."

    # A character budget only approximates the model's token limit, so also
    # ask the pipeline to truncate over-long inputs instead of failing.
    excerpt = cleaned[:max_chars]
    result = sum_pipe(
        excerpt,
        max_length=250,
        min_length=60,
        do_sample=False,
        truncation=True,
    )
    return result[0]["summary_text"]

def answer(question, index):
    """Answer *question* from the passages most similar to it in *index*.

    Retrieves the four closest chunks, joins them into a context section and
    asks the Q&A pipeline to answer from that context only. When *index* is
    ``None`` a hint to upload a PDF is returned instead.
    """
    if index is not None:
        hits = index.similarity_search(question, k=4)
        context = "\n".join(hit.page_content for hit in hits)
        prompt = (
            "Answer the question using ONLY the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}"
        )
        generated = qa_pipe(prompt, max_length=256, do_sample=False)
        return generated[0]["generated_text"]
    return "Please upload & process a PDF first."

###############################################################################
# Session init
###############################################################################
# Seed each session-state entry once; values already present are left alone.
for _key, _initial in (
    ("messages", []),   # chat history as (role, text) tuples
    ("index", None),    # vector index of the processed PDF
    ("raw_text", ""),   # extracted document text
    ("images", []),     # images extracted from the PDF
):
    if _key not in st.session_state:
        st.session_state[_key] = _initial

###############################################################################
# Sidebar
###############################################################################
with st.sidebar:
    st.subheader("📁 Upload PDF")
    uploaded = st.file_uploader("Choose a file", type="pdf", label_visibility="collapsed")
    if uploaded and st.button("Process PDF"):
        with st.spinner("Extracting text & images…"):
            # Work on locals first so a failure cannot leave the session with
            # the new text paired with the previous document's index.
            try:
                new_text, new_images = extract_pdf(uploaded)
                # Nothing to index when no text could be extracted.
                new_index = build_index(new_text) if new_text else None
            except Exception:
                # UI boundary: log the traceback and tell the user instead of
                # crashing the whole app.
                logger.exception("Failed to process uploaded PDF")
                st.error("Could not process this PDF. Please try another file.")
            else:
                if new_index is None:
                    st.warning("No readable text was found in this PDF.")
                else:
                    st.session_state.raw_text = new_text
                    st.session_state.images = new_images
                    st.session_state.index = new_index
                    # A new document starts a new conversation.
                    st.session_state.messages = []
                    st.toast("PDF ready!")

    if st.session_state.images:
        st.subheader("🖼️ Extracted Images")
        for im in st.session_state.images:
            st.image(im, use_column_width=True)

###############################################################################
# Main Tabs
###############################################################################
tab_chat, tab_sum = st.tabs(["💬 Chat", "📄 Summarize"])


def _render_chat_message(role, text):
    """Render one chat bubble; *text* is HTML-escaped before being embedded.

    The bubble is emitted as raw HTML (``unsafe_allow_html=True``), so the
    user's question and the model output (which derives from the uploaded
    PDF) must not be interpolated verbatim.
    """
    css = "user" if role == "user" else "assistant"
    safe_text = html.escape(str(text))
    st.markdown(f'<div class="chat-msg {css}">{safe_text}</div>', unsafe_allow_html=True)


with tab_chat:
    if st.session_state.index is None:
        st.info("Upload & process a PDF first using the sidebar.")
    else:
        # Replay the conversation so far.
        for role, msg in st.session_state.messages:
            _render_chat_message(role, msg)

        # New question: store it, show it, then answer from the index.
        if question := st.chat_input("Ask anything about the PDF…"):
            st.session_state.messages.append(("user", question))
            _render_chat_message("user", question)

            with st.spinner("Thinking…"):
                resp = answer(question, st.session_state.index)
            st.session_state.messages.append(("assistant", resp))
            _render_chat_message("assistant", resp)

with tab_sum:
    if not st.session_state.raw_text:
        st.info("Upload & process a PDF first.")
    else:
        if st.button("Generate Summary"):
            with st.spinner("Summarizing…"):
                summary = summarize(st.session_state.raw_text)
            st.subheader("Summary")
            st.write(summary)