Spaces:
Sleeping
Sleeping
File size: 5,799 Bytes
6d84ee5 bfc5e7a 5460734 6d84ee5 5460734 f232bf7 066352b 94fa78d 6d84ee5 bfc5e7a 6d84ee5 bfc5e7a 6d84ee5 5460734 f232bf7 5460734 f232bf7 5460734 f232bf7 5460734 bfc5e7a bd0a950 216c20d f232bf7 216c20d f232bf7 216c20d bfc5e7a 5460734 94fa78d 5460734 2a54e7f 5460734 94fa78d 5460734 2a54e7f 5460734 94fa78d 5460734 f232bf7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import pytesseract
import pdfplumber
import io
# Page-level configuration and header shown above the tabs.
# NOTE(review): the leading "π" in the title looks like a mis-decoded emoji
# — confirm the intended glyph against the original file.
st.set_page_config(page_title="Docurizzer", layout="centered")
st.title("π Docurizzer")
st.write("Intelligent document summarization tool")

# Sidebar sliders whose values are handed to summarize_text() as the
# requested minimum / maximum summary lengths.
with st.sidebar:
    st.header("Summarization Settings")
    min_len = st.slider("Min Length", min_value=10, max_value=100, value=40, step=10)
    max_len = st.slider("Max Length", min_value=100, max_value=500, value=150, step=50)
@st.cache_resource
def load_model():
    """Load the ``t5-small`` tokenizer and seq2seq model.

    The function is wrapped in ``st.cache_resource``, so the loaded objects
    are meant to be reused rather than rebuilt on every script rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "t5-small"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )


# Module-level handles used by summarize_text().
tokenizer, model = load_model()
def extract_text_from_image(image):
    """Run OCR on *image* and return the recognised text.

    Args:
        image: Object forwarded unchanged to ``pytesseract.image_to_string``
            (the caller in this file passes a PIL image).

    Returns:
        The extracted text, or ``""`` if OCR raised; in that case the error
        is reported to the user through ``st.error``.
    """
    try:
        ocr_text = pytesseract.image_to_string(image)
    except Exception as e:
        # UI boundary: surface the failure in the page instead of crashing.
        st.error(f"Error extracting text from image: {str(e)}")
        ocr_text = ""
    return ocr_text
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path or file-like object accepted by ``pdfplumber.open``
            (the caller in this file passes a Streamlit upload).

    Returns:
        The text of each page that yielded any, each followed by a newline
        and concatenated in page order. Returns ``""`` if anything raised;
        the error is then reported through ``st.error``.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages with no extractable text (None / empty string) are skipped.
        return "".join(chunk + "\n" for chunk in page_texts if chunk)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
def summarize_text(text, min_len, max_len):
    """Summarize *text* with the module-level T5 tokenizer and model.

    Args:
        text: Raw input text. Only the first 4000 characters are used, and
            the tokenizer further truncates the prompt to 512 tokens.
        min_len: Requested minimum summary length in tokens. It is clamped
            so that it always stays below the effective maximum.
        max_len: Requested maximum summary length in tokens. The effective
            maximum is additionally capped at 60% of the input token count
            (but that cap never goes below 20).

    Returns:
        ``None`` if *text* is empty or whitespace only; the stripped input
        itself if it tokenizes to fewer than 15 tokens; otherwise the decoded
        summary string.
    """
    stripped = text.strip()
    if not stripped:
        return None

    inputs = tokenizer(
        "summarize: " + text[:4000],
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    input_token_count = inputs.input_ids.shape[1]

    # Too short to be worth summarizing: hand the text back unchanged.
    if input_token_count < 15:
        return stripped

    effective_max = min(max_len, max(int(input_token_count * 0.6), 20))
    # Honor the caller's min_len, but keep it below the maximum so that
    # generation is never asked for an impossible length range.
    effective_min = max(1, min(min_len, effective_max - 5))

    generation_kwargs = {
        "max_length": effective_max,
        "min_length": effective_min,
    }
    if input_token_count < 50:
        # Short inputs: plain greedy decoding. early_stopping only applies
        # to beam search, so it is not passed here.
        generation_kwargs.update(do_sample=False, num_beams=1)
    else:
        generation_kwargs.update(length_penalty=2.0, num_beams=4, early_stopping=True)

    # Unpacking the encoding forwards the attention mask along with input_ids.
    summary_ids = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def _render_summary(summary, file_name, download_key):
    """Show *summary* with a download button, or warn when there is none."""
    if not summary:
        # An empty result would otherwise render nothing, making the button
        # click look like a no-op.
        st.warning("No summary could be generated for this content.")
        return
    st.subheader("Summary")
    st.success(summary)
    st.download_button("Download Summary", summary, file_name, key=download_key)


def _summarize_extracted(extracted_text, preview_text, file_name, download_key, empty_warning):
    """Preview text extracted from an upload, then summarize and render it."""
    if not extracted_text.strip():
        st.warning(empty_warning)
        return
    st.subheader("Extracted Text")
    with st.expander("View extracted text"):
        st.text(preview_text)
    with st.spinner("Summarizing..."):
        summary = summarize_text(extracted_text, min_len, max_len)
    _render_summary(summary, file_name, download_key)


# NOTE(review): the leading glyphs in the tab labels look like mis-decoded
# emoji — confirm the intended icons against the original file.
tab1, tab2, tab3 = st.tabs(["π Text", "πΌοΈ Image", "π PDF"])

with tab1:
    st.subheader("Text Summarization")
    text = st.text_area("Enter your text:", height=200, key="text_input")
    if st.button("Summarize Text", key="summarize_text"):
        if text.strip():
            with st.spinner("Summarizing..."):
                summary = summarize_text(text, min_len, max_len)
            _render_summary(summary, "summary.txt", "download_text")
        else:
            st.warning("Please enter some text first.")

with tab2:
    st.subheader("Image Summarization")
    st.info("Upload an image containing text to extract and summarize it using OCR.")
    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["png", "jpg", "jpeg", "bmp", "tiff"],
        key="image_upload",
    )
    if uploaded_image:
        try:
            image = Image.open(uploaded_image)
        except OSError as e:
            # A corrupt or mislabeled upload should produce a message in the
            # page, not an unhandled traceback.
            image = None
            st.error(f"Could not open the uploaded image: {e}")
        if image is not None:
            st.image(image, caption="Uploaded Image", use_container_width=True)
            if st.button("Extract & Summarize", key="summarize_image"):
                with st.spinner("Extracting text from image..."):
                    extracted_text = extract_text_from_image(image)
                _summarize_extracted(
                    extracted_text,
                    extracted_text,
                    "image_summary.txt",
                    "download_image",
                    "No text could be extracted from the image. Please try a clearer image.",
                )

with tab3:
    st.subheader("PDF Summarization")
    st.info("Upload a PDF document to extract and summarize its content.")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"], key="pdf_upload")
    if uploaded_pdf:
        st.success(f"Uploaded: {uploaded_pdf.name}")
        if st.button("Extract & Summarize", key="summarize_pdf"):
            with st.spinner("Extracting text from PDF..."):
                extracted_text = extract_text_from_pdf(uploaded_pdf)
            # Only the first 5000 characters are previewed; the full text is
            # still what gets passed on for summarization.
            preview = extracted_text[:5000] + ("..." if len(extracted_text) > 5000 else "")
            _summarize_extracted(
                extracted_text,
                preview,
                "pdf_summary.txt",
                "download_pdf",
                "No text could be extracted from the PDF. The PDF might be image-based or empty.",
            )

st.divider()
st.caption("Built with Streamlit | v1.2")
|