"""TEXT SUMMARIZATION Web APP""" # Importing Packages import base64 import streamlit as st import torch import io from pdf2image import convert_from_path from PIL import Image from langchain.document_loaders import PyPDFLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from transformers import T5Tokenizer, T5ForConditionalGeneration from transformers import pipeline from reportlab.pdfgen import canvas # Streamlit Page Configuration st.set_page_config(layout="wide") # Load the tokenizer and model (cached to avoid reloads on rerun) @st.cache_resource def load_model(checkpoint="Lamini-1"): tokenizer = T5Tokenizer.from_pretrained(checkpoint) model = T5ForConditionalGeneration.from_pretrained( checkpoint, device_map="auto", torch_dtype=torch.float32, offload_folder="offload" ) return tokenizer, model tokenizer, base_model = load_model() # File Loader & Processing def file_processing(file): loader = PyPDFLoader(file) pages = loader.load_and_split() text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) texts = text_splitter.split_documents(pages) return texts # Recursive Summarization def recursive_summarize(texts, pipe_summ, chunk_summary_len=150, final_summary_len=400): summaries = [] for chunk in texts: try: result = pipe_summ( chunk.page_content, max_length=chunk_summary_len, min_length=50 )[0]["summary_text"] summaries.append(result) except Exception as e: st.error(f"Error summarizing chunk: {e}") combined = " ".join(summaries) # Summarize Again to Compress Further final = pipe_summ( combined, max_length=final_summary_len, min_length=100 )[0]["summary_text"] return final # Language Model Pipeline -> Summarization def llm_pipeline(filepath, summary_length): pipe_summ = pipeline( "summarization", model=base_model, tokenizer=tokenizer ) texts = file_processing(filepath) return recursive_summarize(texts, pipe_summ, chunk_summary_len=200, final_summary_len=summary_length) # Display Background def add_bg_from_local(image_file): with open(image_file, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()) st.markdown( f""" """, unsafe_allow_html=True, ) add_bg_from_local("Images/background.jpg") # Font Style with open("font.css") as f: st.markdown("".format(f.read()), unsafe_allow_html=True) # Sidebar st.sidebar.image("Images/sidebar_pic.png") st.sidebar.title("ABOUT THE APP") st.sidebar.write("SummaScribe: Your PDF wingman! 🚀 Now with **chunk-wise recursive summarization** and inline PDF preview.") selected_summary_length = st.sidebar.slider("SELECT SUMMARY STRENGTH", min_value=200, max_value=1500, value=500) # Display PDF as images def display(file): try: images = convert_from_path(file, dpi=100, first_page=1, last_page=10) img_tags = "" for i, img in enumerate(images): buf = io.BytesIO() img.save(buf, format="PNG") b64 = base64.b64encode(buf.getvalue()).decode() img_tags += f'' html = f"""
{img_tags}
""" st.components.v1.html(html, height=550, scrolling=True) except Exception as e: st.error(f"Could not render PDF preview: {e}") with open(file, "rb") as f: st.download_button( label="Download Uploaded PDF", data=f, file_name=file.split("/")[-1], mime="application/pdf" ) # Title Styling st.markdown( """ """, unsafe_allow_html=True, ) text = "SummaScribe" colored_text = ''.join( ['{}'.format(70 - (i * 10 / len(text)), char) for i, char in enumerate(text)]) colored_text_with_malt = colored_text + ' ' st.markdown(f'

{colored_text_with_malt}

', unsafe_allow_html=True) st.markdown( '

Text Document Summarization using LLMs

', unsafe_allow_html=True, ) # Main content def main(): uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"]) with st.expander("NOTE"): st.write( "Summascribe currently accepts PDF documents that contain only text and no images." ) if uploaded_file is not None: if st.button("Summarize"): col1, col2 = st.columns((1, 1)) filepath = "data/" + uploaded_file.name with open(filepath, "wb") as temp_file: temp_file.write(uploaded_file.read()) with col1: st.info("Uploaded File") display(filepath) with col2: st.spinner(text="In progress...") st.info("Summary") summary = llm_pipeline(filepath, selected_summary_length) st.success(summary, icon="✅") # --- Download options (side by side, full width) --- col_txt, col_pdf = st.columns(2) with col_txt: st.download_button( label="Download Summary as TXT", data=summary, file_name="summary.txt", mime="text/plain", use_container_width=True ) with col_pdf: pdf_buffer = io.BytesIO() c = canvas.Canvas(pdf_buffer) text_obj = c.beginText(40, 800) for line in summary.split("\n"): text_obj.textLine(line) c.drawText(text_obj) c.save() pdf_buffer.seek(0) st.download_button( label="Download Summary as PDF", data=pdf_buffer, file_name="summary.pdf", mime="application/pdf", use_container_width=True ) if __name__ == "__main__": main()