# SummaScribe
#
# Build error
#
# File size: 7,194 Bytes

"""TEXT SUMMARIZATION Web APP"""

# Importing Packages
import base64
import streamlit as st
import torch
import io
from pdf2image import convert_from_path
from PIL import Image
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
from reportlab.pdfgen import canvas   


# Streamlit page configuration: request the "wide" page layout.
st.set_page_config(layout="wide")


# Load the tokenizer and model (cached to avoid reloads on rerun)
@st.cache_resource
def load_model(checkpoint="Lamini-1"):
    """Load the T5 tokenizer and conditional-generation model for ``checkpoint``.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are reused instead of being reloaded on every Streamlit rerun.

    Args:
        checkpoint: Identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    # Keyword options forwarded unchanged to the model loader.
    model_options = {
        "device_map": "auto",
        "torch_dtype": torch.float32,
        "offload_folder": "offload",
    }
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint, **model_options)

    return t5_tokenizer, t5_model


# Module-level tokenizer and model; llm_pipeline() reads these globals.
tokenizer, base_model = load_model()


# File Loader & Processing
def file_processing(file):
    """Load a PDF and cut its content into overlapping text chunks.

    Args:
        file: Location of the PDF, passed directly to ``PyPDFLoader``.

    Returns:
        The documents returned by a ``RecursiveCharacterTextSplitter``
        (``chunk_size=500``, ``chunk_overlap=50``) applied to the loaded pages.
    """
    pdf_pages = PyPDFLoader(file).load_and_split()
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
    return splitter.split_documents(pdf_pages)


# Recursive Summarization
def recursive_summarize(texts, pipe_summ, chunk_summary_len=150, final_summary_len=400):
    """Summarize each chunk, then summarize the joined chunk summaries.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        pipe_summ: Callable invoked as
            ``pipe_summ(text, max_length=..., min_length=...)``; its result's
            first item must provide a ``"summary_text"`` entry.
        chunk_summary_len: ``max_length`` used for every per-chunk summary.
        final_summary_len: ``max_length`` used for the final summary.

    Returns:
        The final summary string, or ``""`` when no chunk summary could be
        produced (no chunks at all, or every chunk failed).

    Raises:
        Whatever ``pipe_summ`` raises during the final pass; failures on
        individual chunks are reported through ``st.error`` and skipped.
    """
    # Never request a minimum length above the maximum the caller asked for.
    chunk_min_len = min(50, chunk_summary_len)
    final_min_len = min(100, final_summary_len)

    summaries = []
    for chunk in texts:
        try:
            result = pipe_summ(
                chunk.page_content,
                max_length=chunk_summary_len,
                min_length=chunk_min_len,
            )[0]["summary_text"]
        except Exception as e:
            # Best effort: one bad chunk must not abort the whole document.
            st.error(f"Error summarizing chunk: {e}")
        else:
            summaries.append(result)

    # Nothing to compress: do not run the model on an empty string.
    if not summaries:
        return ""

    combined = " ".join(summaries)

    # Summarize again to compress the chunk summaries further.
    return pipe_summ(
        combined,
        max_length=final_summary_len,
        min_length=final_min_len,
    )[0]["summary_text"]


# Language Model Pipeline -> Summarization
def llm_pipeline(filepath, summary_length):
    """Produce a summary of the PDF stored at ``filepath``.

    Builds a ``"summarization"`` pipeline from the module-level model and
    tokenizer, splits the file into chunks and runs the two-stage
    summarization over them.

    Args:
        filepath: Path of the PDF to summarize.
        summary_length: Passed on as ``final_summary_len``.

    Returns:
        The value returned by ``recursive_summarize``.
    """
    summarizer = pipeline("summarization", tokenizer=tokenizer, model=base_model)
    document_chunks = file_processing(filepath)
    final_summary = recursive_summarize(
        document_chunks,
        summarizer,
        chunk_summary_len=200,
        final_summary_len=summary_length,
    )
    return final_summary


# Display Background
def add_bg_from_local(image_file):
    """Use a local image file as the background of the Streamlit app.

    The file is read, base64-encoded and injected through ``st.markdown`` as
    a CSS ``background-image`` data URI on the ``.stApp`` selector.

    Args:
        image_file: Path of the image to embed.

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    import mimetypes  # local import: only this function needs it

    # Declare the media type that matches the file (e.g. image/jpeg for a
    # .jpg) instead of always claiming PNG; keep PNG as the fallback.
    mime_type, _ = mimetypes.guess_type(image_file)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # A distinct handle name avoids shadowing the ``image_file`` parameter.
    with open(image_file, "rb") as image_handle:
        encoded_image = base64.b64encode(image_handle.read()).decode()

    st.markdown(
        f"""
    <style>
    .stApp {{
        background-image: url(data:{mime_type};base64,{encoded_image});
        background-size: cover;
        opacity:0.9;
    }}
    </style>
    """,
        unsafe_allow_html=True,
    )


# Page background, injected as CSS by add_bg_from_local().
add_bg_from_local("Images/background.jpg")

# Font style: inline the contents of font.css inside a <style> tag.
with open("font.css") as f:
    st.markdown("<style>{}</style>".format(f.read()), unsafe_allow_html=True)

# Sidebar: picture, title, short description and the summary-length slider.
st.sidebar.image("Images/sidebar_pic.png")
st.sidebar.title("ABOUT THE APP")
st.sidebar.write("SummaScribe: Your PDF wingman! 🚀 Now with **chunk-wise recursive summarization** and inline PDF preview.")
# The slider value (200-1500, default 500) is read by main() and passed to
# llm_pipeline() as the summary length.
selected_summary_length = st.sidebar.slider("SELECT SUMMARY STRENGTH", min_value=200, max_value=1500, value=500)


# Display PDF as images 
def display(file):
    """Preview a PDF as a horizontally scrollable strip of page images.

    Pages 1 through 10 are rasterized at 100 dpi with ``convert_from_path``,
    each one is embedded as a base64 PNG ``<img>`` tag, and the strip is
    rendered through ``st.components.v1.html``. If anything in that process
    raises, the error is shown and a download button for the file is offered
    instead.

    Args:
        file: Path of the PDF file (a string; its last ``/``-separated part
            is used as the download file name).
    """
    try:
        page_images = convert_from_path(file, dpi=100, first_page=1, last_page=10)

        tags = []
        for page in page_images:
            png_bytes = io.BytesIO()
            page.save(png_bytes, format="PNG")
            encoded = base64.b64encode(png_bytes.getvalue()).decode()
            tags.append(
                f'<img src="data:image/png;base64,{encoded}" style="height:500px; margin-right:10px;" />'
            )
        img_tags = "".join(tags)

        strip_markup = f"""
        <div style="display:flex; overflow-x:auto; white-space:nowrap; border:1px solid #ccc; padding:10px;">
            {img_tags}
        </div>
        """

        st.components.v1.html(strip_markup, height=550, scrolling=True)

    except Exception as preview_error:
        st.error(f"Could not render PDF preview: {preview_error}")
        download_name = file.rsplit("/", 1)[-1]
        with open(file, "rb") as pdf_handle:
            st.download_button(
                label="Download Uploaded PDF",
                data=pdf_handle,
                file_name=download_name,
                mime="application/pdf"
            )


# Title Styling: CSS for the .summascribe-title heading (size, centering and
# the hover effects that recolor the letters and scale the title up).
st.markdown(
    """
    <style>
    .summascribe-title {
        font-size: 50px;
        text-align: center;
        transition: transform 0.2s ease-in-out;
    }
    .summascribe-title span {
        transition: color 0.2s ease-in-out;
    }
    .summascribe-title:hover span {
        color: #f5fefd;
    }
    .summascribe-title:hover {
        transform: scale(1.15);
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Build the title one character at a time: every letter gets its own <span>
# whose HSL lightness starts at 70% and drops by 10/len(text) per character.
text = "SummaScribe"
colored_text = ''.join(
    ['<span style="color: hsl(220, 60%, {}%);">{}</span>'.format(70 - (i * 10 / len(text)), char) for i, char in
     enumerate(text)])
# Append a trailing decorative glyph (HTML entity &#x2727;) at 70% lightness.
colored_text_with_malt = colored_text + ' <span style="color: hsl(220, 60%, 70%);">&#x2727;</span>'
st.markdown(f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>', unsafe_allow_html=True)

# Centered subtitle shown under the title.
st.markdown(
    '<h2 style="font-size:25px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>',
    unsafe_allow_html=True,
)


# Main content
def _build_summary_pdf(summary, wrap_width=80, lines_per_page=50):
    """Render ``summary`` into an in-memory PDF, wrapping and paginating it.

    Args:
        summary: Text to write; newlines start a new paragraph.
        wrap_width: Maximum number of characters per rendered line.
        lines_per_page: Number of lines written before a new page is started.

    Returns:
        An ``io.BytesIO`` positioned at the start of the finished PDF.
    """
    import textwrap  # local import: only this helper needs it

    pdf_buffer = io.BytesIO()
    pdf = canvas.Canvas(pdf_buffer)
    text_obj = pdf.beginText(40, 800)
    lines_on_page = 0

    for paragraph in summary.split("\n"):
        # textwrap.wrap() returns [] for blank input; keep the blank line.
        for line in textwrap.wrap(paragraph, width=wrap_width) or [""]:
            if lines_on_page == lines_per_page:
                # Page is full: flush it and continue on a fresh one.
                pdf.drawText(text_obj)
                pdf.showPage()
                text_obj = pdf.beginText(40, 800)
                lines_on_page = 0
            text_obj.textLine(line)
            lines_on_page += 1

    pdf.drawText(text_obj)
    pdf.save()
    pdf_buffer.seek(0)
    return pdf_buffer


def main():
    """Render the upload widget and, on request, the preview and summary.

    Once a PDF is uploaded and the "Summarize" button is pressed, the file is
    stored under ``data/``, previewed in the left column and summarized in the
    right column, where the summary can be downloaded as TXT or PDF.
    """
    import os  # local import: only needed to prepare the upload path

    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images."
        )

    # The button is only rendered once a file is present (short-circuit).
    if uploaded_file is None or not st.button("Summarize"):
        return

    col1, col2 = st.columns((1, 1))

    # Make sure the target directory exists and drop any directory part of
    # the client-supplied name so the file always lands inside data/.
    os.makedirs("data", exist_ok=True)
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the full content regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        display(filepath)

    with col2:
        st.info("Summary")
        # st.spinner only shows while used as a context manager.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")

        # --- Download options (side by side, full width) ---
        col_txt, col_pdf = st.columns(2)

        with col_txt:
            st.download_button(
                label="Download Summary as TXT",
                data=summary,
                file_name="summary.txt",
                mime="text/plain",
                use_container_width=True
            )

        with col_pdf:
            st.download_button(
                label="Download Summary as PDF",
                data=_build_summary_pdf(summary),
                file_name="summary.pdf",
                mime="application/pdf",
                use_container_width=True
            )


# Run the app's main content when this file is executed as the main module.
if __name__ == "__main__":
    main()