"""Docurizzer: a Streamlit app that summarizes text, images (via OCR) and PDFs.

The script renders three tabs. Each tab obtains some text (typed, OCR'd from
an uploaded image, or extracted from an uploaded PDF) and summarizes it with
the ``t5-small`` model.

Results are kept in ``st.session_state`` rather than rendered only inside the
``if st.button(...)`` branch: a button is True for a single rerun, and
clicking the download button triggers another rerun, which would otherwise
make the summary (and the download button itself) disappear.
"""

import io

import pdfplumber
import pytesseract
import streamlit as st
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_NAME = "t5-small"
# Input is cut to this many characters before tokenization ...
MAX_INPUT_CHARS = 4000
# ... and the tokenizer then truncates to this many tokens.
MAX_INPUT_TOKENS = 512
# Only this many characters of extracted PDF text are shown in the preview.
PDF_PREVIEW_CHARS = 5000

st.set_page_config(page_title="Docurizzer", layout="centered")
st.title("📄 Docurizzer")
st.write("Summarize text, images, or PDFs with AI")

st.sidebar.header("Summarization Settings")
min_len = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40, step=10)
max_len = st.sidebar.slider("Max Length", min_value=100, max_value=500, value=150, step=50)


@st.cache_resource
def load_model():
    """Load the tokenizer and seq2seq model for ``MODEL_NAME``.

    Decorated with ``st.cache_resource`` so the objects are reused across
    script reruns instead of being reloaded each time.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    return tokenizer, model


tokenizer, model = load_model()


def extract_text_from_image(image):
    """Extract text from image using OCR.

    Args:
        image: The image to run Tesseract OCR on (the app passes a PIL image).

    Returns:
        The recognized text as returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image)


def extract_text_from_pdf(pdf_file):
    """Extract text from PDF file.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The text of every page that yielded any text, each followed by a
        newline. Pages for which ``extract_text()`` returns nothing are
        skipped, so the result is ``""`` when no page has extractable text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # join() instead of repeated ``+=`` keeps this linear in document size.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def summarize_text(text, min_Len, max_Len):
    """Summarize the given text.

    Args:
        text: The text to summarize. Only the first ``MAX_INPUT_CHARS``
            characters are used, and the tokenizer further truncates the
            prompt to ``MAX_INPUT_TOKENS`` tokens.
        min_Len: ``min_length`` passed to ``model.generate``.
        max_Len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded summary string, or ``None`` when ``text`` is empty or
        whitespace only.
    """
    if not text.strip():
        return None

    # T5 selects its task from a textual prefix on the input.
    input_text = "summarize: " + text[:MAX_INPUT_CHARS]
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs.input_ids,
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs.attention_mask,
        max_length=max_Len,
        min_length=min_Len,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def _forget_result(state_key):
    """Drop a stored result, if any, from the session state."""
    if state_key in st.session_state:
        del st.session_state[state_key]


def _upload_token(uploaded_file):
    """Return a value identifying one upload, used to detect a changed file."""
    return (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )


def _preview(text, limit):
    """Return ``text`` cut to ``limit`` characters plus "..." when it is longer."""
    if limit is None or len(text) <= limit:
        return text
    return text[:limit] + "..."


def _show_summary(summary, file_name, download_key):
    """Render a summary together with its download button."""
    st.subheader("Summary")
    st.success(summary)
    st.download_button("Download Summary", summary, file_name, key=download_key)


def _summarize_document(
    slot,
    source,
    clicked,
    extract,
    *,
    extract_message,
    empty_message,
    file_name,
    preview_chars=None,
):
    """Extract, preview and summarize an uploaded document for one tab.

    Args:
        slot: Short tab name; used to build the session-state and widget keys.
        source: Token identifying the current input and settings. A stored
            result is shown only while this token is unchanged, so a result
            never appears next to a different file or different settings.
        clicked: Whether the tab's action button was pressed on this run.
        extract: Zero-argument callable returning the extracted text.
        extract_message: Spinner text shown while ``extract`` runs.
        empty_message: Warning shown when no text could be extracted.
        file_name: File name offered by the download button.
        preview_chars: Maximum characters shown in the extracted-text
            preview, or ``None`` to show everything.
    """
    state_key = f"result_{slot}"

    if clicked:
        with st.spinner(extract_message):
            try:
                extracted = extract()
            except Exception as exc:  # UI boundary: report instead of a traceback
                _forget_result(state_key)
                st.error(f"Could not extract text: {exc}")
                return
        st.session_state[state_key] = {
            "source": source,
            "extracted": extracted,
            "summarized": False,
            "summary": None,
        }

    result = st.session_state.get(state_key)
    if not result or result["source"] != source:
        return

    extracted = result["extracted"]
    if not extracted.strip():
        st.warning(empty_message)
        return

    st.subheader("Extracted Text")
    with st.expander("View extracted text"):
        st.text(_preview(extracted, preview_chars))

    # Summarize after the preview is on screen, and only once per extraction;
    # later reruns (e.g. a download click) reuse the stored summary.
    if not result["summarized"]:
        with st.spinner("Summarizing..."):
            result["summary"] = summarize_text(extracted, min_len, max_len)
        result["summarized"] = True

    if result["summary"]:
        _show_summary(result["summary"], file_name, f"download_{slot}")


def _render_text_tab():
    """Render the tab that summarizes text typed by the user."""
    st.subheader("Text Summarization")
    text = st.text_area("Enter your text:", height=200, key="text_input")
    source = (text, min_len, max_len)

    if st.button("Summarize Text", key="summarize_text"):
        if text.strip():
            with st.spinner("Summarizing..."):
                summary = summarize_text(text, min_len, max_len)
            st.session_state["result_text"] = {"source": source, "summary": summary}
        else:
            _forget_result("result_text")
            st.warning("Please enter some text first.")

    result = st.session_state.get("result_text")
    if result and result["source"] == source and result["summary"]:
        _show_summary(result["summary"], "summary.txt", "download_text")


def _render_image_tab():
    """Render the tab that OCRs an uploaded image and summarizes the text."""
    st.subheader("Image Summarization")
    st.info("Upload an image containing text to extract and summarize it using OCR.")
    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["png", "jpg", "jpeg", "bmp", "tiff"],
        key="image_upload",
    )
    if not uploaded_image:
        return

    try:
        image = Image.open(uploaded_image)
    except OSError as exc:  # includes PIL's UnidentifiedImageError
        st.error(f"Could not open the image: {exc}")
        return

    st.image(image, caption="Uploaded Image", use_container_width=True)
    clicked = st.button("Extract & Summarize", key="summarize_image")
    _summarize_document(
        "image",
        (_upload_token(uploaded_image), min_len, max_len),
        clicked,
        lambda: extract_text_from_image(image),
        extract_message="Extracting text from image...",
        empty_message="No text could be extracted from the image. Please try a clearer image.",
        file_name="image_summary.txt",
    )


def _render_pdf_tab():
    """Render the tab that extracts text from an uploaded PDF and summarizes it."""
    st.subheader("PDF Summarization")
    st.info("Upload a PDF document to extract and summarize its content.")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"], key="pdf_upload")
    if not uploaded_pdf:
        return

    st.success(f"Uploaded: {uploaded_pdf.name}")
    clicked = st.button("Extract & Summarize", key="summarize_pdf")
    _summarize_document(
        "pdf",
        (_upload_token(uploaded_pdf), min_len, max_len),
        clicked,
        lambda: extract_text_from_pdf(uploaded_pdf),
        extract_message="Extracting text from PDF...",
        empty_message="No text could be extracted from the PDF. The PDF might be image-based or empty.",
        file_name="pdf_summary.txt",
        preview_chars=PDF_PREVIEW_CHARS,
    )


tab1, tab2, tab3 = st.tabs(["📝 Text", "🖼️ Image", "📄 PDF"])

with tab1:
    _render_text_tab()

with tab2:
    _render_image_tab()

with tab3:
    _render_pdf_tab()

st.divider()
st.caption("Powered by T5 AI Model | Built with Streamlit | v1.1")