File size: 1,560 Bytes
e4777cb
 
 
 
 
 
 
 
 
129b0a2
e4777cb
8998039
 
e4777cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gradio as gr
import fitz  # PyMuPDF
from transformers import pipeline
import textwrap

# Load summarization model
# Built once at import time and stored in a module-level name so that
# summarize_long_pdf() reuses the same pipeline object for every chunk
# instead of constructing it per call.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one single-line string.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute (for
            example a temp-file wrapper handed over by an upload widget).
            ``None`` is accepted and treated as "no file supplied".

    Returns:
        The concatenated page text with leading/trailing whitespace removed
        and every newline replaced by a space. An empty string is returned
        when ``pdf_file`` is ``None`` or the document contains no text.
    """
    import os  # local import keeps this block self-contained

    # No upload: report "no text" rather than failing on ``None.name``.
    if pdf_file is None:
        return ""

    # Plain paths are used as-is; anything else is expected to carry the
    # path in ``.name`` (the original calling convention).
    if isinstance(pdf_file, (str, os.PathLike)):
        path = os.fspath(pdf_file)
    else:
        path = pdf_file.name

    with fitz.open(path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)
    return text.strip().replace("\n", " ")

# Chunk long text into manageable sizes
def chunk_text(text, max_chunk_len=1000):
    """Split *text* into a list of pieces no longer than *max_chunk_len*.

    Splitting uses the standard ``textwrap`` defaults: breaks happen at
    whitespace where possible, whitespace characters are normalised to
    spaces, and a single word longer than the limit is broken up.
    Empty input yields an empty list.
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_len)
    return wrapper.wrap(text)

# Summarize long PDFs by chunking
def summarize_long_pdf(file_obj):
    """Summarize an uploaded PDF chunk by chunk and return a text report.

    The PDF text is extracted with ``extract_text_from_pdf``, split with
    ``chunk_text`` into pieces of at most 1000 characters, and each piece is
    passed to the module-level ``summarizer`` pipeline.

    Args:
        file_obj: The uploaded file as accepted by ``extract_text_from_pdf``,
            or ``None`` when nothing was uploaded.

    Returns:
        A string with one paragraph per chunk, separated by blank lines.
        Each paragraph is either the chunk's summary or an error note for
        that chunk. A short message is returned instead when no file was
        supplied or no text could be extracted.
    """
    # Submitting without an upload should yield a message, not a traceback.
    if file_obj is None:
        return "❌ No file uploaded. Please upload a PDF."

    full_text = extract_text_from_pdf(file_obj)
    if not full_text:
        return "❌ No readable text extracted from the PDF."

    summaries = []
    for part_no, chunk in enumerate(chunk_text(full_text, max_chunk_len=1000), start=1):
        try:
            result = summarizer(chunk, max_length=120, min_length=30, do_sample=False)
            summary = result[0]['summary_text']
        except Exception as e:
            # Deliberate best-effort at the UI boundary: one failing chunk is
            # reported inline and must not discard the other summaries.
            summaries.append(f"⚠️ Error summarizing part {part_no}: {e}")
        else:
            summaries.append(f"🔹 Part {part_no}: {summary}")

    return "\n\n".join(summaries)

# Gradio UI
# One file-upload input wired to summarize_long_pdf, one textbox output.
# The interface is bound to ``demo`` before launching so it stays reachable
# by name; the launch itself still happens at module level as before.
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="📥 Upload Multi-page PDF"),
    outputs=gr.Textbox(label="📝 Full Summary"),
    title="📘 Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)
demo.launch()