the-carnage committed on
Commit
5460734
·
1 Parent(s): 4864977

Add image and PDF summarization support

Browse files
Files changed (2) hide show
  1. app.py +103 -35
  2. packages.txt +1 -1
app.py CHANGED
@@ -1,10 +1,13 @@
1
  import streamlit as st
2
  from transformers import pipeline
 
 
 
 
3
 
4
- st.set_page_config(page_title="Docurizzer - Document Summarizer", layout="centered")
5
- st.title("Docurizzer")
6
- st.markdown("*Paste your own text to summarize*")
7
-
8
 
9
  @st.cache_resource
10
  def load_model():
@@ -12,36 +15,101 @@ def load_model():
12
 
13
  summarizer = load_model()
14
 
15
- st.subheader("Enter Your Text")
16
- text = st.text_area(
17
- "Paste or type your text here:",
18
- height=250,
19
- placeholder="Enter the text you want to summarize..."
20
- )
21
-
22
- # Summarize button
23
- if text.strip():
24
- if st.button("Summarize", type="primary"):
25
- with st.spinner("Summarizing..."):
26
- # Handle text length for T5 model
27
- input_text = text[:4000] if len(text) > 4000 else text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- summary = summarizer(
30
- input_text,
31
- max_length=150,
32
- min_length=40,
33
- do_sample=False
34
- )[0]["summary_text"]
35
-
36
- st.subheader("Summary")
37
- st.success(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Copy button
40
- st.download_button(
41
- label="Download Summary",
42
- data=summary,
43
- file_name="summary.txt",
44
- mime="text/plain"
45
- )
46
- else:
47
- st.info("Please provide some text using one of the methods above.")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
+ from PIL import Image
4
+ import pytesseract
5
+ import pdfplumber
6
+ import io
7
 
8
+ st.set_page_config(page_title="Docurizzer", layout="centered")
9
+ st.title("📄 Docurizzer")
10
+ st.write("Summarize text, images, or PDFs with AI")
 
11
 
12
  @st.cache_resource
13
  def load_model():
 
15
 
16
  summarizer = load_model()
17
 
18
+ def extract_text_from_image(image):
19
+ """Extract text from image using OCR"""
20
+ return pytesseract.image_to_string(image)
21
+
22
+ def extract_text_from_pdf(pdf_file):
23
+ """Extract text from PDF file"""
24
+ text = ""
25
+ with pdfplumber.open(pdf_file) as pdf:
26
+ for page in pdf.pages:
27
+ page_text = page.extract_text()
28
+ if page_text:
29
+ text += page_text + "\n"
30
+ return text
31
+
32
+ def summarize_text(text):
33
+ """Summarize the given text"""
34
+ if not text.strip():
35
+ return None
36
+ input_text = text[:4000]
37
+ result = summarizer(input_text, max_length=150, min_length=40, do_sample=False)
38
+ return result[0]["summary_text"]
39
+
40
+ tab1, tab2, tab3 = st.tabs(["📝 Text", "🖼️ Image", "📄 PDF"])
41
+
42
+ with tab1:
43
+ st.subheader("Text Summarization")
44
+ text = st.text_area("Enter your text:", height=200, key="text_input")
45
+
46
+ if st.button("Summarize Text", key="summarize_text"):
47
+ if text.strip():
48
+ with st.spinner("Summarizing..."):
49
+ summary = summarize_text(text)
50
+ if summary:
51
+ st.subheader("Summary")
52
+ st.success(summary)
53
+ st.download_button("Download Summary", summary, "summary.txt", key="download_text")
54
+ else:
55
+ st.warning("Please enter some text first.")
56
+
57
+ with tab2:
58
+ st.subheader("Image Summarization")
59
+ st.info("Upload an image containing text to extract and summarize it using OCR.")
60
+
61
+ uploaded_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg", "bmp", "tiff"], key="image_upload")
62
+
63
+ if uploaded_image:
64
+ image = Image.open(uploaded_image)
65
+ st.image(image, caption="Uploaded Image", use_container_width=True)
66
+
67
+ if st.button("Extract & Summarize", key="summarize_image"):
68
+ with st.spinner("Extracting text from image..."):
69
+ extracted_text = extract_text_from_image(image)
70
 
71
+ if extracted_text.strip():
72
+ st.subheader("Extracted Text")
73
+ with st.expander("View extracted text"):
74
+ st.text(extracted_text)
75
+
76
+ with st.spinner("Summarizing..."):
77
+ summary = summarize_text(extracted_text)
78
+
79
+ if summary:
80
+ st.subheader("Summary")
81
+ st.success(summary)
82
+ st.download_button("Download Summary", summary, "image_summary.txt", key="download_image")
83
+ else:
84
+ st.warning("No text could be extracted from the image. Please try a clearer image.")
85
+
86
+ with tab3:
87
+ st.subheader("PDF Summarization")
88
+ st.info("Upload a PDF document to extract and summarize its content.")
89
+
90
+ uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"], key="pdf_upload")
91
+
92
+ if uploaded_pdf:
93
+ st.success(f"Uploaded: {uploaded_pdf.name}")
94
 
95
+ if st.button("Extract & Summarize", key="summarize_pdf"):
96
+ with st.spinner("Extracting text from PDF..."):
97
+ extracted_text = extract_text_from_pdf(uploaded_pdf)
98
+
99
+ if extracted_text.strip():
100
+ st.subheader("Extracted Text")
101
+ with st.expander("View extracted text"):
102
+ st.text(extracted_text[:5000] + ("..." if len(extracted_text) > 5000 else ""))
103
+
104
+ with st.spinner("Summarizing..."):
105
+ summary = summarize_text(extracted_text)
106
+
107
+ if summary:
108
+ st.subheader("Summary")
109
+ st.success(summary)
110
+ st.download_button("Download Summary", summary, "pdf_summary.txt", key="download_pdf")
111
+ else:
112
+ st.warning("No text could be extracted from the PDF. The PDF might be image-based or empty.")
113
+
114
+ st.divider()
115
+ st.caption("Powered by T5 AI Model | Built with Streamlit")
packages.txt CHANGED
@@ -1 +1 @@
1
-
 
1
+ tesseract-ocr