File size: 7,194 Bytes
8c3af37
 
 
 
 
 
9f04e43
2669e06
 
8c3af37
 
 
 
c2df356
9f04e43
ec4459b
32e478b
fb33619
9f04e43
ec4459b
c2df356
9f04e43
 
 
 
 
 
 
 
 
 
 
8c3af37
9f04e43
8c3af37
 
 
 
 
 
9f04e43
8c3af37
32e478b
9f04e43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32e478b
9f04e43
 
 
 
 
 
 
8c3af37
 
 
 
 
 
9f04e43
 
8c3af37
9f04e43
 
8c3af37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261d957
8c3af37
 
 
 
 
 
261d957
8c3af37
32e478b
9f04e43
8c3af37
 
c2df356
8c3af37
ec4459b
c2df356
 
2669e06
c2df356
 
 
 
 
 
 
 
 
 
 
 
 
ec4459b
 
 
 
 
 
 
 
 
8c3af37
 
9f04e43
8c3af37
 
 
 
3a7fbd1
8c3af37
 
 
 
 
 
 
9f04e43
8c3af37
 
 
 
 
 
 
 
 
9f04e43
8c3af37
261d957
8c3af37
261d957
8c3af37
 
 
32e478b
8c3af37
 
 
 
9f04e43
8c3af37
 
 
 
9f04e43
8c3af37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec4459b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f04e43
8c3af37
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""TEXT SUMMARIZATION Web APP"""

# Importing Packages
import base64
import io
import mimetypes
import os
import textwrap

import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf2image import convert_from_path
from PIL import Image
from reportlab.pdfgen import canvas
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline


# Streamlit Page Configuration
st.set_page_config(layout="wide")


# Load the tokenizer and model (cached to avoid reloads on rerun)
@st.cache_resource
def load_model(checkpoint="Lamini-1"):
    """Load the T5 tokenizer and conditional-generation model.

    The function is decorated with ``st.cache_resource`` so the loaded pair
    can be reused instead of being rebuilt for the same ``checkpoint``.

    Args:
        checkpoint: Identifier or directory handed to ``from_pretrained`` for
            both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    # Gather the model loading options in one place before the call.
    model_options = {
        "device_map": "auto",
        "torch_dtype": torch.float32,
        "offload_folder": "offload",
    }
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint, **model_options)

    return t5_tokenizer, t5_model


# Load the shared tokenizer and model at import time; llm_pipeline() reads
# these two module-level names when it builds the summarization pipeline.
tokenizer, base_model = load_model()


# File Loader & Processing
def file_processing(file):
    """Load a PDF and cut its pages into overlapping text chunks.

    Args:
        file: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter`` (configured
        with ``chunk_size=500`` and ``chunk_overlap=50``) when applied to the
        loaded pages.
    """
    pdf_pages = PyPDFLoader(file).load_and_split()
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return chunker.split_documents(pdf_pages)


# Recursive Summarization
def recursive_summarize(texts, pipe_summ, chunk_summary_len=150, final_summary_len=400):
    """Summarize each chunk, then summarize the joined chunk summaries.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        pipe_summ: Callable invoked as
            ``pipe_summ(text, max_length=..., min_length=...)`` that returns a
            sequence whose first item is a mapping with a ``"summary_text"``
            key.
        chunk_summary_len: ``max_length`` for every per-chunk summary.
        final_summary_len: ``max_length`` for the final, combined summary.

    Returns:
        The final summary string, or ``""`` when there was nothing to combine
        (no chunks, or every chunk failed).
    """
    # Never request a minimum length larger than the maximum the caller asked
    # for; with the default arguments these stay at the original 50 and 100.
    chunk_min_len = min(50, chunk_summary_len)
    final_min_len = min(100, final_summary_len)

    summaries = []
    for chunk in texts:
        try:
            result = pipe_summ(
                chunk.page_content,
                max_length=chunk_summary_len,
                min_length=chunk_min_len,
            )[0]["summary_text"]
        except Exception as e:
            # Best effort: report the failing chunk and keep going with the rest.
            st.error(f"Error summarizing chunk: {e}")
        else:
            summaries.append(result)

    # Nothing was summarized, so skip the second pass instead of feeding the
    # summarizer an empty string.
    if not summaries:
        return ""

    combined = " ".join(summaries)

    # Summarize again to compress further.
    final = pipe_summ(
        combined,
        max_length=final_summary_len,
        min_length=final_min_len,
    )[0]["summary_text"]

    return final


# Language Model Pipeline -> Summarization
def llm_pipeline(filepath, summary_length):
    """Summarize the PDF at ``filepath`` with the module-level model.

    Args:
        filepath: Path of the PDF to summarize.
        summary_length: ``max_length`` used for the final summary pass.

    Returns:
        Whatever ``recursive_summarize`` returns for the document's chunks.
    """
    # Wrap the already-loaded model and tokenizer in a summarization pipeline.
    summarizer = pipeline("summarization", model=base_model, tokenizer=tokenizer)
    document_chunks = file_processing(filepath)
    final_summary = recursive_summarize(
        document_chunks,
        summarizer,
        chunk_summary_len=200,
        final_summary_len=summary_length,
    )
    return final_summary


# Display Background
def add_bg_from_local(image_file):
    """Use a local image as the background of the Streamlit app.

    The file is read, base64-encoded and injected through ``st.markdown`` as a
    CSS ``data:`` URI on the ``.stApp`` selector.

    Args:
        image_file: Path of the image on disk.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Advertise the real media type (e.g. image/jpeg for a .jpg) rather than
    # always claiming PNG; fall back to PNG when the extension is not a known
    # image type, which matches the previous behaviour.
    mime_type, _ = mimetypes.guess_type(image_file)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Give the handle its own name so the path argument is not shadowed.
    with open(image_file, "rb") as image_handle:
        encoded_string = base64.b64encode(image_handle.read()).decode()

    st.markdown(
        f"""
    <style>
    .stApp {{
        background-image: url(data:{mime_type};base64,{encoded_string});
        background-size: cover;
        opacity:0.9;
    }}
    </style>
    """,
        unsafe_allow_html=True,
    )


# Page background
add_bg_from_local("Images/background.jpg")

# Font Style: inline the project stylesheet into the page.
with open("font.css") as css_file:
    font_css = css_file.read()
st.markdown(f"<style>{font_css}</style>", unsafe_allow_html=True)

# Sidebar: branding, short description and the summary-length control whose
# value main() forwards to llm_pipeline().
sidebar = st.sidebar
sidebar.image("Images/sidebar_pic.png")
sidebar.title("ABOUT THE APP")
sidebar.write("SummaScribe: Your PDF wingman! 🚀 Now with **chunk-wise recursive summarization** and inline PDF preview.")
selected_summary_length = sidebar.slider(
    "SELECT SUMMARY STRENGTH",
    min_value=200,
    max_value=1500,
    value=500,
)


# Display PDF as images 
def display(file, max_pages=10):
    """Show the first pages of a PDF as a horizontally scrolling image strip.

    Each page is rasterized with ``convert_from_path``, encoded as an inline
    PNG and placed in an HTML flex container. If anything in that process
    fails, an error is shown and a download button for the original file is
    offered instead.

    Args:
        file: Path of the PDF on disk.
        max_pages: Number of leading pages to render (defaults to the
            previously hard-coded 10).
    """
    try:
        images = convert_from_path(file, dpi=100, first_page=1, last_page=max_pages)

        # Collect the tags in a list and join once instead of growing a
        # string with += on every page.
        img_tags = []
        for img in images:
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            b64 = base64.b64encode(buf.getvalue()).decode()
            img_tags.append(
                f'<img src="data:image/png;base64,{b64}" style="height:500px; margin-right:10px;" />'
            )
        image_strip = "".join(img_tags)

        html = f"""
        <div style="display:flex; overflow-x:auto; white-space:nowrap; border:1px solid #ccc; padding:10px;">
            {image_strip}
        </div>
        """

        st.components.v1.html(html, height=550, scrolling=True)

    except Exception as e:
        # Preview is best effort: report the problem and still let the user
        # get the file back.
        st.error(f"Could not render PDF preview: {e}")
        with open(file, "rb") as f:
            st.download_button(
                label="Download Uploaded PDF",
                data=f,
                # basename handles whatever separator the platform uses.
                file_name=os.path.basename(file),
                mime="application/pdf"
            )


# Title Styling
# Title Styling: CSS for the hover effects on the gradient title.
title_css = """
    <style>
    .summascribe-title {
        font-size: 50px;
        text-align: center;
        transition: transform 0.2s ease-in-out;
    }
    .summascribe-title span {
        transition: color 0.2s ease-in-out;
    }
    .summascribe-title:hover span {
        color: #f5fefd;
    }
    .summascribe-title:hover {
        transform: scale(1.15);
    }
    </style>
    """
st.markdown(title_css, unsafe_allow_html=True)

# Wrap every letter of the title in its own <span>; the lightness starts at
# 70% and decreases by 10/len(text) per character.
text = "SummaScribe"
span_template = '<span style="color: hsl(220, 60%, {}%);">{}</span>'
colored_text = ''.join(
    span_template.format(70 - (idx * 10 / len(text)), glyph)
    for idx, glyph in enumerate(text)
)

# Append the trailing star glyph in the starting colour.
colored_text_with_malt = ''.join(
    (colored_text, ' <span style="color: hsl(220, 60%, 70%);">&#x2727;</span>')
)
title_html = f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>'
st.markdown(title_html, unsafe_allow_html=True)

# Subtitle
subtitle_html = '<h2 style="font-size:25px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>'
st.markdown(subtitle_html, unsafe_allow_html=True)


# Main content
def _summary_to_pdf(summary, wrap_width=90, left=40, top=800, bottom=40):
    """Render ``summary`` into an in-memory PDF, wrapping and paginating it.

    Args:
        summary: Text to write; ``"\\n"`` separates paragraphs.
        wrap_width: Maximum number of characters per output line.
        left: X coordinate where each text block starts.
        top: Y coordinate where each page's text block starts.
        bottom: Y coordinate below which a new page is started.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the PDF.
    """
    pdf_buffer = io.BytesIO()
    c = canvas.Canvas(pdf_buffer)
    text_obj = c.beginText(left, top)
    for paragraph in summary.split("\n"):
        # textwrap.wrap() returns [] for blank input; keep the blank line.
        for line in textwrap.wrap(paragraph, width=wrap_width) or [""]:
            if text_obj.getY() < bottom:
                # Page is full: flush it and continue on a fresh one.
                c.drawText(text_obj)
                c.showPage()
                text_obj = c.beginText(left, top)
            text_obj.textLine(line)
    c.drawText(text_obj)
    c.save()
    pdf_buffer.seek(0)
    return pdf_buffer


# Main content
def main():
    """Drive the upload -> preview -> summarize -> download flow.

    Renders the file uploader and a note. Once a PDF is uploaded and the
    "Summarize" button is pressed, the upload is saved under ``data/``,
    previewed in the left column and summarized in the right column, where
    TXT and PDF download buttons for the summary are also shown.
    """
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images."
        )

    # The button is only rendered once a file is present (short-circuit).
    if uploaded_file is None or not st.button("Summarize"):
        return

    col1, col2 = st.columns((1, 1))

    # Make sure the target directory exists and drop any directory part from
    # the client-supplied name before building the path.
    os.makedirs("data", exist_ok=True)
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the full upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        display(filepath)

    with col2:
        st.info("Summary")
        # st.spinner only shows while its ``with`` block is running.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")

        # --- Download options (side by side, full width) ---
        col_txt, col_pdf = st.columns(2)

        with col_txt:
            st.download_button(
                label="Download Summary as TXT",
                data=summary,
                file_name="summary.txt",
                mime="text/plain",
                use_container_width=True
            )

        with col_pdf:
            st.download_button(
                label="Download Summary as PDF",
                data=_summary_to_pdf(summary),
                file_name="summary.pdf",
                mime="application/pdf",
                use_container_width=True
            )


# Entry point: build the main page content only when this file is the module
# being executed, not when it is imported.
if __name__ == "__main__":
    main()