|
|
import textwrap

import fitz  # PyMuPDF
import gradio as gr
from transformers import pipeline

# Build the summarization pipeline once, at import time, so that every call
# to summarize_long_pdf reuses the same pipeline object instead of creating a
# new one per request.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one single-line string.

    Args:
        pdf_file: The PDF to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the on-disk path
            through a ``name`` attribute. ``None`` is accepted and treated
            as "no document".

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped and every newline replaced by a space.
        ``""`` when ``pdf_file`` is ``None`` or no text was extracted.
    """
    if pdf_file is None:
        return ""

    import os  # local import: only needed to recognise path-like arguments

    # A plain path is used as-is; anything else is expected to carry its
    # path in ``.name``. Checking the type first matters because path
    # objects also have a ``.name``, but it holds only the base name.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    with fitz.open(pdf_path) as doc:
        # Join once rather than growing a string page by page.
        text = "".join(page.get_text() for page in doc)

    return text.strip().replace("\n", " ")
|
|
|
|
|
|
|
|
def chunk_text(text: str, max_chunk_len: int = 1000) -> list:
    """Split *text* into pieces of at most *max_chunk_len* characters.

    The split is delegated to ``textwrap.wrap``, so pieces end on whitespace
    where possible, the whitespace at a split point is dropped, and a single
    word longer than the limit is cut to fit.

    Args:
        text: The text to split.
        max_chunk_len: Maximum length of each piece, in characters.

    Returns:
        The pieces in order, as a list of strings; an empty list when *text*
        is empty or contains only whitespace.
    """
    return textwrap.wrap(text, width=max_chunk_len)
|
|
|
|
|
|
|
|
def summarize_long_pdf(file_obj):
    """Summarize an uploaded PDF piece by piece and return one report string.

    The document text is extracted with ``extract_text_from_pdf``, split into
    1000-character pieces with ``chunk_text``, and each piece is passed to the
    module-level ``summarizer``. A failure on one piece is reported in place
    and does not stop the remaining pieces from being processed.

    Args:
        file_obj: The uploaded file as handed over by the UI, or ``None``
            when nothing was uploaded.

    Returns:
        A message string: either a single failure notice (no upload / no
        extractable text) or one entry per piece, numbered from 1 and
        separated by blank lines.
    """
    # The status glyphs are spelled as escapes so the source stays ASCII and
    # cannot be mis-decoded again (they had degraded to "πΉ" and "β οΈ").
    # NOTE(review): the failure glyph was damaged beyond recovery; U+274C
    # (cross mark) is assumed here - confirm against the original file.
    if file_obj is None:
        return "\u274c No PDF file was uploaded."

    full_text = extract_text_from_pdf(file_obj)
    if not full_text:
        return "\u274c No readable text extracted from the PDF."

    summaries = []
    for part_no, chunk in enumerate(chunk_text(full_text, max_chunk_len=1000), start=1):
        try:
            # truncation=True clips a piece that is still too long for the
            # model instead of letting the call fail.
            result = summarizer(
                chunk,
                max_length=120,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(f"\U0001F539 Part {part_no}: {result[0]['summary_text']}")
        except Exception as e:
            # Deliberate best effort: report the failing piece and carry on.
            summaries.append(f"\u26a0\ufe0f Error summarizing part {part_no}: {e}")

    return "\n\n".join(summaries)
|
|
|
|
|
|
|
|
# Web UI: a single file upload is passed to summarize_long_pdf and the
# returned string is shown in a text box.
#
# The upload label's glyph is spelled as an escape (U+1F4E5, recovered from
# the mis-decoded "π₯").
# NOTE(review): the leading "π" in the output label and in the title is also
# a mis-decoded emoji, but too little of it survived to tell which one; both
# strings are left exactly as found - restore the intended glyphs.
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="\U0001F4E5 Upload Multi-page PDF"),
    outputs=gr.Textbox(label="π Full Summary"),
    title="π Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)

# Start the server only when the file is run as a script, so that importing
# the module does not block on a running web server.
if __name__ == "__main__":
    demo.launch()