# docurizer / app.py
# the-carnage's picture
# Add sidebar sliders for summary length control
# 94fa78d
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import pytesseract
import pdfplumber
import io
# Configure the page and render the header.
st.set_page_config(page_title="Docurizzer", layout="centered")
st.title("📄 Docurizzer")
st.write("Summarize text, images, or PDFs with AI")

# Sidebar controls for the summary length bounds; both values are handed to
# summarize_text() by each tab. The two slider ranges meet at 100, so min_len
# can never exceed max_len.
st.sidebar.header("Summarization Settings")
min_len = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40, step=10)
max_len = st.sidebar.slider("Max Length", min_value=100, max_value=500, value=150, step=50)
@st.cache_resource
def load_model():
    """Load the pretrained "t5-small" tokenizer and seq2seq model.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    are reused instead of being rebuilt on every script run.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "t5-small"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


# Module-level handles used by summarize_text().
tokenizer, model = load_model()
def extract_text_from_image(image):
    """Return the text that pytesseract's OCR recognizes in *image*."""
    recognized = pytesseract.image_to_string(image)
    return recognized
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, one page per line group.

    The file is opened with pdfplumber. Pages whose ``extract_text()`` result
    is empty or ``None`` are skipped; every other page's text is followed by
    a newline. The result is an empty string when no page yields text.
    """
    with pdfplumber.open(pdf_file) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Consumed inside the ``with`` so pages are read while the PDF is open.
        return "".join(f"{chunk}\n" for chunk in page_texts if chunk)
def summarize_text(text, min_Len, max_Len):
    """Summarize *text* with the module-level T5 ``tokenizer`` and ``model``.

    Args:
        text: Text to summarize. ``None`` is treated the same as blank text.
        min_Len: Value passed to ``model.generate`` as ``min_length``.
        max_Len: Value passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded summary string, or ``None`` when *text* is ``None``,
        empty, or whitespace-only.
    """
    # Guard against None as well as blank input so callers get None back
    # instead of an AttributeError from ``None.strip()``.
    if text is None or not text.strip():
        return None

    # Cap the raw text at 4000 characters before tokenizing; the tokenizer
    # then truncates the encoded prompt to at most 512 tokens.
    input_text = "summarize: " + text[:4000]
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Pass the attention mask explicitly rather than letting generate() infer it.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_Len,
        min_length=min_Len,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# One tab per input type: typed text, an image run through OCR, or a PDF.
tab1, tab2, tab3 = st.tabs(["📝 Text", "🖼️ Image", "📄 PDF"])
# Text tab: summarize whatever the user typed into the text area.
with tab1:
    st.subheader("Text Summarization")
    text = st.text_area("Enter your text:", height=200, key="text_input")
    summarize_clicked = st.button("Summarize Text", key="summarize_text")

    # Nothing happens until the button is pressed; blank input only warns.
    if summarize_clicked and not text.strip():
        st.warning("Please enter some text first.")
    elif summarize_clicked:
        with st.spinner("Summarizing..."):
            text_summary = summarize_text(text, min_len, max_len)
        # The result section is rendered only for a non-empty summary.
        if text_summary:
            st.subheader("Summary")
            st.success(text_summary)
            st.download_button(
                "Download Summary",
                text_summary,
                "summary.txt",
                key="download_text",
            )
# Image tab: OCR an uploaded image, then summarize the recognized text.
with tab2:
    st.subheader("Image Summarization")
    st.info("Upload an image containing text to extract and summarize it using OCR.")
    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["png", "jpg", "jpeg", "bmp", "tiff"],
        key="image_upload",
    )

    if uploaded_image:
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        if st.button("Extract & Summarize", key="summarize_image"):
            with st.spinner("Extracting text from image..."):
                ocr_text = extract_text_from_image(image)

            if not ocr_text.strip():
                # OCR returned nothing usable, so there is nothing to summarize.
                st.warning("No text could be extracted from the image. Please try a clearer image.")
            else:
                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    st.text(ocr_text)

                with st.spinner("Summarizing..."):
                    image_summary = summarize_text(ocr_text, min_len, max_len)

                # The result section is rendered only for a non-empty summary.
                if image_summary:
                    st.subheader("Summary")
                    st.success(image_summary)
                    st.download_button(
                        "Download Summary",
                        image_summary,
                        "image_summary.txt",
                        key="download_image",
                    )
# PDF tab: pull the text out of an uploaded PDF, then summarize it.
with tab3:
    st.subheader("PDF Summarization")
    st.info("Upload a PDF document to extract and summarize its content.")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"], key="pdf_upload")

    if uploaded_pdf:
        st.success(f"Uploaded: {uploaded_pdf.name}")

        if st.button("Extract & Summarize", key="summarize_pdf"):
            with st.spinner("Extracting text from PDF..."):
                pdf_text = extract_text_from_pdf(uploaded_pdf)

            if not pdf_text.strip():
                # No page produced any text, so there is nothing to summarize.
                st.warning("No text could be extracted from the PDF. The PDF might be image-based or empty.")
            else:
                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    # Show at most the first 5000 characters, marking any cut with "...".
                    preview = pdf_text[:5000]
                    if len(pdf_text) > 5000:
                        preview += "..."
                    st.text(preview)

                with st.spinner("Summarizing..."):
                    pdf_summary = summarize_text(pdf_text, min_len, max_len)

                # The result section is rendered only for a non-empty summary.
                if pdf_summary:
                    st.subheader("Summary")
                    st.success(pdf_summary)
                    st.download_button(
                        "Download Summary",
                        pdf_summary,
                        "pdf_summary.txt",
                        key="download_pdf",
                    )
# Page footer: a horizontal rule followed by a one-line credit caption.
footer_text = "Powered by T5 AI Model | Built with Streamlit | v1.1"
st.divider()
st.caption(footer_text)