File size: 6,503 Bytes
11694c7
f513b53
11694c7
 
 
 
fecb449
11694c7
 
fecb449
11694c7
56d0815
11694c7
 
56d0815
11694c7
 
 
 
 
 
 
 
 
 
 
 
 
 
56d0815
11694c7
 
 
 
 
 
56d0815
11694c7
 
 
fecb449
11694c7
 
 
56d0815
11694c7
 
 
56d0815
11694c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9893e1
11694c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# ------------- app.py -------------
import html
import logging
import os
import re
import time
from io import BytesIO
from pathlib import Path

import numpy as np
import pdfplumber
import pytesseract
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Root logging at INFO level; `logger` is this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

###############################################################################
# Page layout
###############################################################################
# Custom CSS injected into the page: tightens container padding, spaces the
# tab headers, and styles the `.chat-msg` bubbles used by the chat tab
# (`.user` is pushed right, `.assistant` is pushed left).
_PAGE_CSS = """
<style>
    .block-container { padding-top: 1rem; padding-bottom: 0; }
    .stTabs [data-baseweb="tab-list"] { gap: 4px; }
    .stTabs [data-baseweb="tab"] { padding: 8px 24px; }
    .chat-msg { padding: 0.5rem 1rem; border-radius: 8px; margin: 0.3rem 0; }
    .user   { background-color: #e3f2fd; margin-left: 20%; }
    .assistant { background-color: #f1f3f4; margin-right: 20%; }
</style>
"""

# set_page_config is issued before any other Streamlit output call.
st.set_page_config(page_title="PDF Chat & Summarize", layout="wide")
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

###############################################################################
# Cached heavy objects
###############################################################################
@st.cache_resource(show_spinner=False)
def load_embed():
    """Build the sentence-embedding model used for indexing and retrieval.

    Decorated with ``st.cache_resource`` so the returned object is cached by
    Streamlit instead of being rebuilt on every script run.

    Returns:
        A ``SentenceTransformer`` loaded from ``all-MiniLM-L6-v2``.
    """
    model_name = "all-MiniLM-L6-v2"
    encoder = SentenceTransformer(model_name)
    return encoder

@st.cache_resource(show_spinner=False)
def load_qa():
    """Build the text-to-text generation pipeline used to answer questions.

    Decorated with ``st.cache_resource`` so the returned object is cached by
    Streamlit instead of being rebuilt on every script run.

    Returns:
        A ``transformers`` ``text2text-generation`` pipeline backed by
        ``google/flan-t5-large`` with a default ``max_length`` of 512.
    """
    generator = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        max_length=512,
    )
    return generator

@st.cache_resource(show_spinner=False)
def load_sum():
    """Build the summarization pipeline used by the Summarize tab.

    Decorated with ``st.cache_resource`` so the returned object is cached by
    Streamlit instead of being rebuilt on every script run.

    Returns:
        A ``transformers`` ``summarization`` pipeline backed by
        ``facebook/bart-large-cnn`` with a default ``max_length`` of 250.
    """
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        max_length=250,
    )
    return summarizer

# Module-level model handles used by the helpers below; each loader is
# wrapped in st.cache_resource, so these calls go through Streamlit's cache.
embed, qa_pipe, sum_pipe = load_embed(), load_qa(), load_sum()

###############################################################################
# Helpers
###############################################################################
def extract_pdf(uploaded_file):
    """Extract the text and the embedded images of an uploaded PDF.

    For every page the layout-preserving text extraction is tried first, then
    the plain extraction; if neither yields any text the page is rendered to
    an image and passed through Tesseract OCR. Each image object reported by
    pdfplumber is cropped out of the page and rendered at 200 dpi.

    Args:
        uploaded_file: Object exposing ``getbuffer()`` that returns the raw
            PDF bytes (the sidebar passes Streamlit's uploaded file).

    Returns:
        ``(text, images)``: the text of all pages, one trailing newline per
        page and stripped at both ends, plus a list of rendered images.
        Images that cannot be cropped are skipped (and logged).
    """
    page_texts = []
    images = []
    with pdfplumber.open(BytesIO(uploaded_file.getbuffer())) as pdf:
        for page in pdf.pages:
            # pdfplumber exposes layout-aware extraction as a keyword of
            # extract_text(); there is no Page.extract_text_layout().
            page_text = page.extract_text(layout=True) or page.extract_text()
            if not page_text or not page_text.strip():
                # No text layer (e.g. a scanned page): fall back to OCR.
                rendered = page.to_image(resolution=200).original
                page_text = pytesseract.image_to_string(rendered)
            page_texts.append(page_text + "\n")

            for image_meta in page.images:
                # within_bbox() expects (x0, top, x1, bottom) measured from
                # the top of the page; "y0"/"y1" are measured from the
                # bottom and would select the wrong region.
                bbox = (
                    image_meta["x0"],
                    image_meta["top"],
                    image_meta["x1"],
                    image_meta["bottom"],
                )
                try:
                    cropped = page.within_bbox(bbox).to_image(resolution=200).original
                except Exception:
                    # Best effort: one bad image must not abort the whole
                    # extraction, but leave a trace instead of hiding it.
                    logger.warning(
                        "Skipping image on page %s: could not crop bbox %s",
                        page.page_number,
                        bbox,
                        exc_info=True,
                    )
                    continue
                images.append(cropped)
    return "".join(page_texts).strip(), images

def build_index(text):
    """Split *text* into chunks, embed them, and build a FAISS vector store.

    Args:
        text: Full document text.

    Returns:
        A LangChain ``FAISS`` store over 600-character chunks (80 overlap),
        or ``None`` when *text* produces no chunks (e.g. it is empty), since
        an index cannot be built from zero vectors.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=80)
    chunks = splitter.split_text(text)
    if not chunks:
        return None

    vectors = embed.encode(chunks, show_progress_bar=False, batch_size=64)

    def embed_query(query):
        """Encode a single query string with the same model as the chunks."""
        return embed.encode(query, show_progress_bar=False).tolist()

    # The store needs something it can use to embed queries at search time.
    # A raw SentenceTransformer is not a LangChain Embeddings object, and
    # calling the model directly on a string does not produce a vector, so a
    # plain callable is passed instead.
    # NOTE(review): langchain_community's FAISS accepts a callable here but
    # logs a warning that this form is slated for removal - confirm against
    # the installed version.
    pairs = [(chunk, vector.tolist()) for chunk, vector in zip(chunks, vectors)]
    return FAISS.from_embeddings(pairs, embed_query)

def summarize(text):
    """Return an abstractive summary of the beginning of *text*.

    Args:
        text: Document text to summarize.

    Returns:
        The summary string produced by ``sum_pipe``, or the fixed message
        ``"Document too short to summarize."`` when *text* has fewer than 50
        non-padding characters.
    """
    # Measure the stripped text so whitespace-only input is rejected too.
    if len(text.strip()) < 50:
        return "Document too short to summarize."
    # Only the first 3k characters are summarized to keep the input small.
    truncated = text[:3000]
    # A character cut does not bound the token count (dense or OCR-noisy text
    # can tokenize to far more tokens), so also let the tokenizer truncate to
    # the model's maximum input length.
    result = sum_pipe(
        truncated,
        max_length=250,
        min_length=60,
        do_sample=False,
        truncation=True,
    )
    return result[0]["summary_text"]

def answer(question, index):
    """Answer *question* from the four most similar chunks in *index*.

    Args:
        question: The user's question text.
        index: Vector store exposing ``similarity_search``; ``None`` means no
            PDF has been processed yet.

    Returns:
        The generated answer text from ``qa_pipe``, or a fixed hint asking
        the user to upload a PDF when *index* is ``None``.
    """
    if index is not None:
        hits = index.similarity_search(question, k=4)
        context = "\n".join(hit.page_content for hit in hits)
        prompt = (
            "Answer the question using ONLY the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}"
        )
        outputs = qa_pipe(prompt, max_length=256, do_sample=False)
        return outputs[0]["generated_text"]
    return "Please upload & process a PDF first."

###############################################################################
# Session init
###############################################################################
# Seed each session-state slot only if it is missing, so values already
# stored for this session are never overwritten.
for _state_key, _initial_value in (
    ("messages", []),   # chat history as (role, text) tuples
    ("index", None),    # vector store for the processed PDF
    ("raw_text", ""),   # extracted document text
    ("images", []),     # images extracted from the PDF
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

###############################################################################
# Sidebar
###############################################################################
with st.sidebar:
    st.subheader("📁 Upload PDF")
    uploaded = st.file_uploader("Choose a file", type="pdf", label_visibility="collapsed")
    if uploaded and st.button("Process PDF"):
        with st.spinner("Extracting text & images…"):
            try:
                # Work on locals first so a failure part-way through cannot
                # leave the session with text from the new PDF but the index
                # of the previous one.
                new_text, new_images = extract_pdf(uploaded)
                # An index cannot be built from empty text.
                new_index = build_index(new_text) if new_text else None
            except Exception:
                # UI boundary: log the traceback and show a message instead
                # of crashing the whole script run.
                logger.exception("Failed to process uploaded PDF")
                st.error("Could not process this PDF. Please check the file and try again.")
            else:
                st.session_state.raw_text = new_text
                st.session_state.images = new_images
                st.session_state.index = new_index
                # A new document invalidates the previous conversation.
                st.session_state.messages = []
                if new_index is None:
                    st.warning("No extractable text was found in this PDF.")
                else:
                    st.toast("PDF ready!")

    if st.session_state.images:
        st.subheader("🖼️ Extracted Images")
        for im in st.session_state.images:
            st.image(im, use_column_width=True)

###############################################################################
# Main Tabs
###############################################################################
def _render_chat_message(role, text):
    """Render one chat bubble; *role* ``"user"`` selects the user style.

    The text is HTML-escaped because the bubble is emitted with
    ``unsafe_allow_html=True`` and both the question (typed by the user) and
    the answer (generated from PDF content) are untrusted.
    """
    css = "user" if role == "user" else "assistant"
    safe_text = html.escape(str(text))
    st.markdown(f'<div class="chat-msg {css}">{safe_text}</div>', unsafe_allow_html=True)


tab_chat, tab_sum = st.tabs(["💬 Chat", "📄 Summarize"])

with tab_chat:
    if st.session_state.index is None:
        st.info("Upload & process a PDF first using the sidebar.")
    else:
        # history: replay every stored (role, text) pair on each rerun
        for role, msg in st.session_state.messages:
            _render_chat_message(role, msg)

        # input: messages are stored raw and escaped only when rendered
        if question := st.chat_input("Ask anything about the PDF…"):
            st.session_state.messages.append(("user", question))
            _render_chat_message("user", question)

            with st.spinner("Thinking…"):
                resp = answer(question, st.session_state.index)
            st.session_state.messages.append(("assistant", resp))
            _render_chat_message("assistant", resp)

with tab_sum:
    if st.session_state.raw_text:
        # The summary is produced on demand, only when the button is clicked.
        generate_clicked = st.button("Generate Summary")
        if generate_clicked:
            with st.spinner("Summarizing…"):
                summary = summarize(st.session_state.raw_text)
            st.subheader("Summary")
            st.write(summary)
    else:
        # Nothing has been extracted yet, so there is nothing to summarize.
        st.info("Upload & process a PDF first.")