File size: 5,799 Bytes
6d84ee5
bfc5e7a
5460734
 
 
 
6d84ee5
5460734
 
f232bf7
066352b
94fa78d
 
 
 
6d84ee5
 
bfc5e7a
 
 
6d84ee5
bfc5e7a
6d84ee5
5460734
f232bf7
 
 
 
 
5460734
 
 
f232bf7
 
 
 
 
 
 
 
 
 
5460734
f232bf7
5460734
 
bfc5e7a
 
bd0a950
216c20d
 
 
 
f232bf7
 
216c20d
 
 
 
 
 
 
 
 
f232bf7
 
216c20d
 
 
 
 
 
 
 
 
 
 
 
bfc5e7a
5460734
 
 
 
 
 
 
 
 
 
94fa78d
5460734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a54e7f
5460734
 
 
 
 
 
94fa78d
5460734
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a54e7f
5460734
 
 
 
 
 
 
 
 
 
94fa78d
5460734
 
 
 
 
 
 
 
 
f232bf7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from PIL import Image
import pytesseract
import pdfplumber
import io

# Page chrome: browser-tab title, centered layout, and the visible heading.
st.set_page_config(page_title="Docurizzer", layout="centered")
# "\U0001F4C4" is the "page facing up" emoji. It is written as an escape so the
# title cannot be corrupted again if the file is re-saved with the wrong codec
# (the literal had previously degraded into mojibake).
st.title("\U0001F4C4 Docurizzer")
st.write("Intelligent document summarization tool")

# Sidebar controls for the requested summary length bounds; both values are
# passed to summarize_text() by every tab.
st.sidebar.header("Summarization Settings")
min_len = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40, step=10)
max_len = st.sidebar.slider("Max Length", min_value=100, max_value=500, value=150, step=50)

@st.cache_resource
def load_model():
    """Load the ``t5-small`` tokenizer and seq2seq model.

    The function is wrapped in ``st.cache_resource``, so the returned objects
    are meant to be created once and reused rather than rebuilt on each call.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "t5-small"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )

# Module-level tokenizer/model pair used by summarize_text(); load_model() is
# decorated with st.cache_resource.
tokenizer, model = load_model()

def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognized text.

    Args:
        image: The image object handed to ``pytesseract.image_to_string``.

    Returns:
        The OCR output, or an empty string if OCR raised. In the failure case
        the error is also shown to the user via ``st.error``.
    """
    try:
        ocr_output = pytesseract.image_to_string(image)
    except Exception as exc:
        # Best-effort: surface the problem in the UI and let the caller treat
        # the result as "no text found".
        st.error(f"Error extracting text from image: {str(exc)}")
        return ""
    return ocr_output

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the UI passes the
            uploaded file object).

    Returns:
        The text of all pages that yielded any, each followed by a newline,
        concatenated in page order. Returns an empty string if anything
        raised; the error is then also shown via ``st.error``.
    """
    try:
        page_chunks = []
        with pdfplumber.open(pdf_file) as document:
            for page in document.pages:
                content = page.extract_text()
                # Pages without extractable text are skipped entirely.
                if content:
                    page_chunks.append(content + "\n")
        return "".join(page_chunks)
    except Exception as exc:
        st.error(f"Error extracting text from PDF: {str(exc)}")
        return ""

def summarize_text(text, min_len, max_len):
    """Summarize ``text`` with the module-level T5 ``tokenizer`` and ``model``.

    Args:
        text: Text to summarize. Only the first 4000 characters are used, and
            the tokenizer additionally truncates the prompt to 512 tokens.
        min_len: Requested minimum summary length in tokens. It is honored
            unless it is not below the effective maximum, in which case it is
            lowered to sit just under that maximum.
        max_len: Requested maximum summary length in tokens. The effective
            maximum is additionally capped at 60% of the input token count
            (but never below 20) so short inputs are not padded out.

    Returns:
        ``None`` if ``text`` is empty, whitespace-only, or ``None``.
        The stripped input itself if the tokenized prompt is shorter than 15
        tokens (too short to be worth summarizing).
        Otherwise the decoded summary string.
    """
    if not text or not text.strip():
        return None

    input_text = "summarize: " + text[:4000]
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    input_token_count = inputs.input_ids.shape[1]

    if input_token_count < 15:
        return text.strip()

    effective_max = min(max_len, max(int(input_token_count * 0.6), 20))

    # Respect the caller's minimum (previously this was hard-coded to 5, which
    # made the parameter a no-op). generate() needs min_length < max_length,
    # so fall back to a value just under the effective maximum when the
    # request cannot be satisfied.
    effective_min = min_len
    if effective_min >= effective_max:
        effective_min = max(1, effective_max - 5)

    generation_kwargs = {"max_length": effective_max, "min_length": effective_min}
    if input_token_count < 50:
        # Short inputs: plain greedy decoding. early_stopping only applies to
        # beam search, so it is not passed here (it would merely raise a
        # warning with num_beams=1).
        generation_kwargs.update(do_sample=False, num_beams=1)
    else:
        # Longer inputs: beam search with a length penalty favoring fuller
        # summaries.
        generation_kwargs.update(length_penalty=2.0, num_beams=4, early_stopping=True)

    summary_ids = model.generate(inputs.input_ids, **generation_kwargs)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# One tab per input source. The label emoji are written as escapes (memo,
# framed picture + variation selector, page facing up) so they cannot be
# corrupted by a wrong-codec save; the literals had degraded into mojibake.
tab1, tab2, tab3 = st.tabs(["\U0001F4DD Text", "\U0001F5BC\uFE0F Image", "\U0001F4C4 PDF"])

# Text tab: summarize text typed or pasted into the text area.
with tab1:
    st.subheader("Text Summarization")
    text = st.text_area(label="Enter your text:", height=200, key="text_input")

    summarize_clicked = st.button("Summarize Text", key="summarize_text")
    if summarize_clicked and not text.strip():
        # Nothing but whitespace was entered.
        st.warning("Please enter some text first.")
    elif summarize_clicked:
        with st.spinner("Summarizing..."):
            summary = summarize_text(text, min_len, max_len)
        if summary:
            st.subheader("Summary")
            st.success(summary)
            st.download_button(
                label="Download Summary",
                data=summary,
                file_name="summary.txt",
                key="download_text",
            )

# Image tab: OCR an uploaded image, then summarize the recognized text.
with tab2:
    st.subheader("Image Summarization")
    st.info("Upload an image containing text to extract and summarize it using OCR.")

    uploaded_image = st.file_uploader(
        "Upload an image",
        type=["png", "jpg", "jpeg", "bmp", "tiff"],
        key="image_upload",
    )

    image = None
    if uploaded_image:
        try:
            image = Image.open(uploaded_image)
            # Image.open is lazy; load() forces decoding now so a corrupt or
            # truncated upload is reported here instead of crashing the
            # st.image / OCR calls below.
            image.load()
        except Exception as e:
            # UI boundary: same report-and-continue style as the extraction
            # helpers, rather than aborting the script with a traceback.
            image = None
            st.error(f"Error opening image: {str(e)}")

    if image is not None:
        st.image(image, caption="Uploaded Image", use_container_width=True)

        if st.button("Extract & Summarize", key="summarize_image"):
            with st.spinner("Extracting text from image..."):
                extracted_text = extract_text_from_image(image)

            if extracted_text.strip():
                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    st.text(extracted_text)

                with st.spinner("Summarizing..."):
                    summary = summarize_text(extracted_text, min_len, max_len)

                if summary:
                    st.subheader("Summary")
                    st.success(summary)
                    st.download_button("Download Summary", summary, "image_summary.txt", key="download_image")
            else:
                st.warning("No text could be extracted from the image. Please try a clearer image.")

# PDF tab: pull the text out of an uploaded PDF, then summarize it.
with tab3:
    st.subheader("PDF Summarization")
    st.info("Upload a PDF document to extract and summarize its content.")

    uploaded_pdf = st.file_uploader(label="Upload a PDF", type=["pdf"], key="pdf_upload")

    if uploaded_pdf:
        st.success(f"Uploaded: {uploaded_pdf.name}")

        if st.button("Extract & Summarize", key="summarize_pdf"):
            with st.spinner("Extracting text from PDF..."):
                extracted_text = extract_text_from_pdf(uploaded_pdf)

            if not extracted_text.strip():
                st.warning("No text could be extracted from the PDF. The PDF might be image-based or empty.")
            else:
                st.subheader("Extracted Text")
                with st.expander("View extracted text"):
                    # Show at most the first 5000 characters, marking the cut
                    # with an ellipsis when the text is longer.
                    preview = extracted_text[:5000]
                    if len(extracted_text) > 5000:
                        preview += "..."
                    st.text(preview)

                with st.spinner("Summarizing..."):
                    summary = summarize_text(extracted_text, min_len, max_len)

                if summary:
                    st.subheader("Summary")
                    st.success(summary)
                    st.download_button(
                        label="Download Summary",
                        data=summary,
                        file_name="pdf_summary.txt",
                        key="download_pdf",
                    )

# Page footer: a divider followed by a small caption line.
st.divider()
st.caption("Built with Streamlit | v1.2")