Yatheshr committed on
Commit
e4777cb
·
verified ·
1 Parent(s): c03ebba

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ from transformers import pipeline
4
+ import textwrap
5
+
6
+ # Load summarization model
7
# Build the summarization pipeline once at import time so every request
# reuses the same loaded model.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
8
+
9
+ # Extract text from PDF
10
def extract_text_from_pdf(file_obj):
    """Return the text of every page of a PDF as one space-separated string.

    Args:
        file_obj: The uploaded PDF. Accepted forms:
            * a binary file-like object exposing ``read()``;
            * a filesystem path (``str`` or ``os.PathLike``);
            * an object whose ``name`` attribute is such a path;
            * ``None`` (nothing uploaded).

    Returns:
        The concatenated page text with surrounding whitespace stripped and
        newlines replaced by single spaces. An empty string is returned when
        ``file_obj`` is ``None`` or no text could be extracted.

    Raises:
        TypeError: If ``file_obj`` is none of the supported forms.
    """
    import os

    # Nothing uploaded: report "no text" instead of failing on ``None.read``.
    if file_obj is None:
        return ""

    # NOTE(review): what gr.File hands to the callback depends on the Gradio
    # version and its ``type`` setting (file object vs. path) -- accept both.
    if hasattr(file_obj, "read"):
        doc = fitz.open(stream=file_obj.read(), filetype="pdf")
    elif isinstance(file_obj, (str, os.PathLike)):
        doc = fitz.open(os.fspath(file_obj), filetype="pdf")
    elif isinstance(getattr(file_obj, "name", None), (str, os.PathLike)):
        doc = fitz.open(os.fspath(file_obj.name), filetype="pdf")
    else:
        raise TypeError(
            "expected a file-like object or a path to a PDF, "
            f"got {type(file_obj).__name__}"
        )

    # The context manager closes the document even if text extraction fails.
    with doc:
        text = "".join(page.get_text() for page in doc)
    return text.strip().replace("\n", " ")
16
+
17
+ # Chunk long text into manageable sizes
18
def chunk_text(text, max_chunk_len=1000):
    """Split *text* into word-wrapped chunks of at most *max_chunk_len* chars.

    Splitting is delegated to ``textwrap.wrap`` with its default settings, so
    chunks break on whitespace where possible and words longer than the limit
    are broken to fit.

    Args:
        text: The text to split. ``None`` is treated as empty text.
        max_chunk_len: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings; empty when there is nothing to split.

    Raises:
        ValueError: If ``max_chunk_len`` is not positive (raised by
            ``textwrap.wrap``).
    """
    # ``textwrap.wrap`` fails with AttributeError on None; normalise it to an
    # empty string so the result is simply an empty list.
    if text is None:
        text = ""
    return textwrap.wrap(text, max_chunk_len)
20
+
21
+ # Summarize long PDFs by chunking
22
def summarize_long_pdf(file_obj):
    """Summarize a PDF chunk by chunk and return the combined summaries.

    The PDF text is extracted with ``extract_text_from_pdf``, split into
    chunks of up to 1000 characters with ``chunk_text``, and each chunk is
    summarized independently by the module-level ``summarizer`` pipeline.

    Args:
        file_obj: The uploaded PDF, passed through unchanged to
            ``extract_text_from_pdf``.

    Returns:
        One entry per chunk, joined by blank lines. Each entry is either the
        chunk's summary or an error note for that chunk. If extraction yields
        no text, a single "no readable text" message is returned instead.
    """
    full_text = extract_text_from_pdf(file_obj)
    if not full_text:
        return "❌ No readable text extracted from the PDF."

    summaries = []
    for part_no, chunk in enumerate(chunk_text(full_text, max_chunk_len=1000), start=1):
        try:
            # truncation=True guards against chunks whose token count exceeds
            # the model's input limit (character length does not bound the
            # token count), so such chunks are truncated instead of failing.
            result = summarizer(
                chunk,
                max_length=120,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(f"🔹 Part {part_no}: {result[0]['summary_text']}")
        except Exception as e:
            # Best effort by design: one failing chunk is reported in place
            # and must not abort the remaining chunks.
            summaries.append(f"⚠️ Error summarizing part {part_no}: {e}")

    return "\n\n".join(summaries)
38
+
39
+ # Gradio UI
40
# Wire the summarizer into a single-input, single-output Gradio interface
# and start serving it.
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="📥 Upload Multi-page PDF"),
    outputs=gr.Textbox(label="📝 Full Summary"),
    title="📘 Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)
demo.launch()