aspendse committed on
Commit
a16a237
·
verified ·
1 Parent(s): c7a2237

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -45
app.py CHANGED
@@ -1,48 +1,28 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
- import PyPDF2
4
- import docx
5
- import os
6
 
7
- # Load a strong summarization model
8
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
9
 
10
- def extract_text_from_pdf(file):
11
- reader = PyPDF2.PdfReader(file.name)
12
- return "\n".join(page.extract_text() or "" for page in reader.pages)
13
-
14
- def extract_text_from_docx(file):
15
- doc = docx.Document(file.name)
16
- return "\n".join(para.text for para in doc.paragraphs)
17
-
18
- def summarize_input(pdf_file, docx_file, text_input):
19
- if pdf_file:
20
- text = extract_text_from_pdf(pdf_file)
21
- elif docx_file:
22
- text = extract_text_from_docx(docx_file)
23
- elif text_input:
24
- text = text_input
25
- else:
26
- return "❌ Please upload a file or paste text."
27
-
28
- if not text.strip():
29
- return "❌ Input could not be read or is empty."
30
-
31
- # Limit to 2000 chars to avoid model truncation
32
- if len(text) > 2000:
33
- text = text[:2000]
34
-
35
- summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
36
- return summary[0]["summary_text"]
37
-
38
- with gr.Blocks() as demo:
39
- gr.Markdown("## 🧠 Smart Summarizer")
40
- with gr.Row():
41
- pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"])
42
- docx_in = gr.File(label="Upload Word (.docx)", file_types=[".docx"])
43
- txt_in = gr.Textbox(lines=8, label="Or Paste Text")
44
- btn = gr.Button("Summarize")
45
- out = gr.Textbox(lines=8, label="Summary")
46
- btn.click(fn=summarize_input, inputs=[pdf_in, docx_in, txt_in], outputs=out)
47
-
48
- demo.launch()
 
1
import os
import textwrap

import docx2txt
import gradio as gr
import PyPDF2
from transformers import pipeline
2
+
3
+ # Load the summarizer model
 
 
4
 
 
5
  # Built once at module import; summarize_input reuses this shared pipeline
  # object for every chunk instead of constructing a new one per request.
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
6
 
7
+ # Extract text based on file type
8
+
9
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf, or .docx file.

    Args:
        file: The upload to read. May be a filesystem path (``str`` or
            ``os.PathLike``), an object that exposes the path as ``.name``
            (for example a temp-file wrapper), or ``None`` when nothing was
            uploaded.

    Returns:
        The extracted text; ``""`` when ``file`` is ``None``; or a
        human-readable message when the file extension is not supported.
    """
    if file is None:
        return ""

    # Resolve a path up front and always reopen from it. Calling file.read()
    # only works when the upload is an open binary handle, which is not
    # guaranteed for every way the upload component can deliver a file.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # splitext looks only at the final path component, so dots in directory
    # names or extension-less files cannot be mistaken for an extension.
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        with open(path, "rb") as handle:
            return handle.read().decode("utf-8")
    if ext == ".pdf":
        reader = PyPDF2.PdfReader(path)
        # Extract each page exactly once; pages with no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)
    if ext == ".docx":
        return docx2txt.process(path)
    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."
10
+
11
+ # Chunk long text for full-document summarization
12
+
13
def chunk_text(text, max_length=1000):
    """Split *text* into newline-delimited chunks shorter than *max_length*.

    Paragraphs (lines separated by ``"\\n"``) are packed greedily into chunks.
    A paragraph that is itself too long is first wrapped at word boundaries so
    that no chunk exceeds the limit.

    Args:
        text: The full document text.
        max_length: Upper bound, in characters, on the size of each chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        Blank or whitespace-only input yields an empty list.
    """
    chunks = []
    pending = []      # paragraphs collected for the chunk being built
    pending_len = 0   # size of pending, counting one newline per paragraph

    def flush():
        # Never emit empty chunks: they carry nothing to summarize.
        chunk = "\n".join(pending).strip()
        if chunk:
            chunks.append(chunk)
        pending.clear()

    # Widest piece that still satisfies the "< max_length" packing test.
    piece_width = max(1, max_length - 1)

    for para in text.split("\n"):
        if len(para) >= max_length:
            # Oversized paragraph: break it up instead of passing it through
            # as a single chunk far beyond the requested limit.
            pieces = textwrap.wrap(para, width=piece_width)
        else:
            pieces = [para]

        for piece in pieces:
            if pending_len + len(piece) >= max_length:
                flush()
                pending_len = 0
            pending.append(piece)
            pending_len += len(piece) + 1

    flush()
    return chunks
14
+
15
+ # Full summarization function
16
+
17
def summarize_input(text, file):
    """Summarize pasted text or, when none is given, an uploaded document.

    Args:
        text: Text typed into the UI; may be empty or ``None``.
        file: The uploaded file, or ``None`` when nothing was uploaded. It is
            only consulted when ``text`` is blank.

    Returns:
        The per-chunk summaries joined by blank lines, or a prompt asking for
        input when neither usable text nor a readable file was provided.
    """
    source_text = (text or "").strip()

    # Fall back to the upload only when there is one; calling the extractor
    # with no file would otherwise fail before the friendly message below.
    if not source_text and file is not None:
        source_text = (extract_text(file) or "").strip()

    if not source_text:
        return "Please enter text or upload a valid file."

    summaries = [
        summarizer(
            chunk,
            max_length=130,
            min_length=30,
            do_sample=False,
            # Guard against a chunk that still exceeds the model's input
            # window: truncate it rather than fail the whole request.
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunk_text(source_text)
        if chunk  # skip empty chunks; there is nothing to summarize in them
    ]
    return "\n\n".join(summaries)
22
+
23
+ # Gradio interface
24
+
25
# Web UI: a free-text box and a single file upload feed summarize_input, and
# the resulting summary is shown in one output textbox.
iface = gr.Interface(
    fn=summarize_input,
    inputs=[
        gr.Textbox(lines=8, label="Enter text (optional)"),
        gr.File(
            label="Upload file (.txt, .pdf, .docx)",
            file_types=[".pdf", ".docx", ".txt"],
        ),
    ],
    outputs=gr.Textbox(label="Summary"),
    title="📄 Smart Document Summarizer",
    description="Paste text or upload a document to get a full summary using a Hugging Face transformer.",
)

# Start serving the interface.
iface.launch()
28
+