Spaces:
Running
Running
| import streamlit as st | |
| from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM | |
| from PIL import Image | |
| import pytesseract | |
| import pdfplumber | |
| import io | |
# Browser-tab title and page layout, followed by the page heading and tagline.
st.set_page_config(layout="centered", page_title="Docurizzer")
st.title("π Docurizzer")
st.write("Summarize text, images, or PDFs with AI")

# Sidebar controls: the two slider values are forwarded to summarize_text()
# as the minimum and maximum generation lengths.
with st.sidebar:
    st.header("Summarization Settings")
    min_len = st.slider("Min Length", min_value=10, max_value=100, value=40, step=10)
    max_len = st.slider("Max Length", min_value=100, max_value=500, value=150, step=50)
@st.cache_resource
def load_model():
    """Load the ``t5-small`` tokenizer and sequence-to-sequence model.

    The loader is wrapped in ``st.cache_resource`` because Streamlit re-runs
    the whole script on every widget interaction; without the cache, both
    objects would be re-instantiated on each rerun. With it, the same
    tokenizer/model pair is reused across reruns and sessions.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    return tokenizer, model


# Module-level handles used by summarize_text().
tokenizer, model = load_model()
def extract_text_from_image(image):
    """Run OCR on ``image`` via pytesseract and return the recognized text."""
    ocr_text = pytesseract.image_to_string(image)
    return ocr_text
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in ``pdf_file``, one page after another.

    Pages for which pdfplumber yields no text (``None`` or an empty string)
    are skipped; each remaining page's text is followed by a newline. The
    result is an empty string when no page produced any text.
    """
    with pdfplumber.open(pdf_file) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "".join(f"{chunk}\n" for chunk in page_texts if chunk)
def summarize_text(text, min_Len, max_Len):
    """Generate a summary of ``text`` with the module-level tokenizer/model.

    Args:
        text: Source text to summarize.
        min_Len: Passed to ``model.generate`` as ``min_length``.
        max_Len: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded summary string, or ``None`` when ``text`` is empty or
        contains only whitespace.
    """
    if not text.strip():
        return None

    # Only the first 4000 characters are used; the tokenizer then truncates
    # the prefixed prompt to at most 512 tokens.
    prompt = f"summarize: {text[:4000]}"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = {
        "max_length": max_Len,
        "min_length": min_Len,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    summary_ids = model.generate(encoded.input_ids, **generation_options)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# One tab per input type.
# NOTE(review): the leading glyphs in these labels look like mis-encoded
# emoji; confirm the intended characters against the original file.
tab1, tab2, tab3 = st.tabs(["π Text", "πΌοΈ Image", "π PDF"])

# --- Text tab: summarize text typed or pasted by the user -------------------
with tab1:
    st.subheader("Text Summarization")
    text = st.text_area("Enter your text:", height=200, key="text_input")

    if st.button("Summarize Text", key="summarize_text"):
        if not text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Summarizing..."):
                summary = summarize_text(text, min_len, max_len)
            # Only render the result section when a summary came back.
            if summary:
                st.subheader("Summary")
                st.success(summary)
                st.download_button(
                    "Download Summary",
                    summary,
                    "summary.txt",
                    key="download_text",
                )
# --- Image tab: OCR an uploaded image, then summarize the extracted text ----
with tab2:
    st.subheader("Image Summarization")
    st.info("Upload an image containing text to extract and summarize it using OCR.")
    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["png", "jpg", "jpeg", "bmp", "tiff"],
        key="image_upload",
    )

    if uploaded_image:
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_container_width=True)

        if st.button("Extract & Summarize", key="summarize_image"):
            with st.spinner("Extracting text from image..."):
                extracted_text = extract_text_from_image(image)

            if not extracted_text.strip():
                st.warning("No text could be extracted from the image. Please try a clearer image.")
            else:
                # Show the raw OCR output in a collapsible section.
                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    st.text(extracted_text)

                with st.spinner("Summarizing..."):
                    summary = summarize_text(extracted_text, min_len, max_len)
                if summary:
                    st.subheader("Summary")
                    st.success(summary)
                    st.download_button(
                        "Download Summary",
                        summary,
                        "image_summary.txt",
                        key="download_image",
                    )
# --- PDF tab: extract text from an uploaded PDF, then summarize it ----------
with tab3:
    st.subheader("PDF Summarization")
    st.info("Upload a PDF document to extract and summarize its content.")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"], key="pdf_upload")

    if uploaded_pdf:
        st.success(f"Uploaded: {uploaded_pdf.name}")

        if st.button("Extract & Summarize", key="summarize_pdf"):
            with st.spinner("Extracting text from PDF..."):
                extracted_text = extract_text_from_pdf(uploaded_pdf)

            if not extracted_text.strip():
                st.warning("No text could be extracted from the PDF. The PDF might be image-based or empty.")
            else:
                # The preview shows at most the first 5000 characters, with an
                # ellipsis appended when the text was cut; the summary below
                # still receives the full extracted text.
                preview = extracted_text[:5000]
                if len(extracted_text) > 5000:
                    preview += "..."

                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    st.text(preview)

                with st.spinner("Summarizing..."):
                    summary = summarize_text(extracted_text, min_len, max_len)
                if summary:
                    st.subheader("Summary")
                    st.success(summary)
                    st.download_button(
                        "Download Summary",
                        summary,
                        "pdf_summary.txt",
                        key="download_pdf",
                    )
# Page footer: a horizontal rule followed by a small credits line.
footer_text = "Powered by T5 AI Model | Built with Streamlit | v1.1"
st.divider()
st.caption(footer_text)