File size: 1,560 Bytes
e4777cb 129b0a2 e4777cb 8998039 e4777cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import gradio as gr
import fitz # PyMuPDF
from transformers import pipeline
import textwrap
# Load summarization model
# Built once at module import so every summarization call reuses the same
# pipeline object; "facebook/bart-large-cnn" is the checkpoint name passed to
# transformers' pipeline() factory for the "summarization" task.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one newline-free string.

    Args:
        pdf_file: The uploaded PDF. May be a path string, an object that
            exposes the on-disk path as ``.name`` (the only form the
            previous implementation accepted), or ``None`` when nothing
            was uploaded.

    Returns:
        The page texts concatenated in page order, with surrounding
        whitespace stripped and every newline replaced by a single space.
        An empty string when ``pdf_file`` is ``None`` or no text could be
        extracted.
    """
    # Nothing uploaded: report "no text" instead of failing on ``None.name``.
    if pdf_file is None:
        return ""

    # A plain string is already the path; anything else is expected to carry
    # the path in ``.name`` (temp-file style upload objects).
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with ``+=`` per page.
        text = "".join(page.get_text() for page in doc)

    return text.strip().replace("\n", " ")
# Chunk long text into manageable sizes
def chunk_text(text, max_chunk_len=1000):
    """Split *text* into pieces of at most *max_chunk_len* characters.

    The splitting is done by the standard ``textwrap`` line wrapper with its
    default settings, so the pieces are the wrapped "lines" it produces.

    Args:
        text: The string to split.
        max_chunk_len: Maximum length of each returned piece.

    Returns:
        A list of strings; empty when *text* contains nothing to wrap.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_len)
    return wrapper.wrap(text)
# Summarize long PDFs by chunking
def summarize_long_pdf(file_obj):
    """Summarize an uploaded PDF piece by piece and return the combined text.

    The PDF text is extracted, split into chunks of at most 1000 characters,
    and each chunk is passed to the module-level ``summarizer``. A chunk that
    fails to summarize does not abort the run; its error is reported in place
    of its summary.

    Args:
        file_obj: The uploaded file, forwarded unchanged to
            ``extract_text_from_pdf``.

    Returns:
        One labelled entry per chunk, separated by blank lines, or a single
        message when no text could be extracted.

    NOTE(review): the leading symbols in the user-facing strings look like
    mis-decoded emoji; they are reproduced unchanged here -- confirm the
    file's encoding before editing them.
    """
    document_text = extract_text_from_pdf(file_obj)
    if not document_text:
        return "β No readable text extracted from the PDF."

    pieces = chunk_text(document_text, max_chunk_len=1000)
    entries = []
    for part_no, piece in enumerate(pieces, start=1):
        try:
            result = summarizer(piece, max_length=120, min_length=30, do_sample=False)
            entry = f"πΉ Part {part_no}: {result[0]['summary_text']}"
        except Exception as err:
            # Deliberate best effort: keep going and surface the failure
            # for this part inside the output.
            entry = f"β οΈ Error summarizing part {part_no}: {err}"
        entries.append(entry)

    return "\n\n".join(entries)
# Gradio UI
# One PDF upload in, one text box out, wired to summarize_long_pdf.
# The interface is bound to a name so it can be referenced after construction;
# the stray trailing "|" that followed launch() (a syntax error) is removed.
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="π₯ Upload Multi-page PDF"),
    outputs=gr.Textbox(label="π Full Summary"),
    title="π Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)
# Launch unconditionally, exactly as the original statement did.
demo.launch()