"""Gradio app that summarizes multi-page PDFs chunk by chunk with BART."""

import textwrap
from typing import List

import fitz  # PyMuPDF
import gradio as gr
from transformers import pipeline

# Load summarization model once at start-up so every request reuses it.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF as one single-line string.

    Args:
        pdf_file: Either an object exposing the file path as ``.name`` or
            the path itself.
            NOTE(review): which of the two ``gr.File`` delivers depends on
            the installed Gradio version / ``type`` setting -- confirm.

    Returns:
        The concatenated page texts, stripped of surrounding whitespace and
        with newlines replaced by spaces. Empty if no text was found.
    """
    # Fall back to the value itself when there is no ``.name`` attribute.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return text.strip().replace("\n", " ")


def chunk_text(text: str, max_chunk_len: int = 1000) -> List[str]:
    """Split ``text`` into pieces of at most ``max_chunk_len`` characters.

    Splitting is delegated to ``textwrap.wrap``, so breaks fall on
    whitespace where possible.
    """
    return textwrap.wrap(text, max_chunk_len)


def summarize_long_pdf(file_obj) -> str:
    """Summarize an uploaded PDF chunk by chunk.

    Args:
        file_obj: The value supplied by the ``gr.File`` input, or ``None``
            when nothing was uploaded.

    Returns:
        One "Part N" summary per chunk, separated by blank lines, or a
        message explaining why no summary could be produced.
    """
    # Submitting the form without a file passes None; report it instead of
    # failing with an AttributeError further down.
    if file_obj is None:
        return "❌ No PDF uploaded. Please upload a file first."

    full_text = extract_text_from_pdf(file_obj)
    if not full_text:
        return "❌ No readable text extracted from the PDF."

    summaries = []
    chunks = chunk_text(full_text, max_chunk_len=1000)
    for part_no, chunk in enumerate(chunks, start=1):
        try:
            result = summarizer(
                chunk, max_length=120, min_length=30, do_sample=False
            )
            summary = result[0]["summary_text"]
        except Exception as e:
            # Deliberately best-effort: one failing chunk is reported inline
            # and must not discard the summaries of the other chunks.
            summaries.append(f"⚠️ Error summarizing part {part_no}: {e}")
        else:
            summaries.append(f"🔹 Part {part_no}: {summary}")

    return "\n\n".join(summaries)


# Gradio UI
demo = gr.Interface(
    fn=summarize_long_pdf,
    inputs=gr.File(label="📥 Upload Multi-page PDF"),
    outputs=gr.Textbox(label="📝 Full Summary"),
    title="📘 Multi-Page PDF Summarizer",
    description="Upload long PDFs (e.g., Morningstar reports). Summarized in chunks using BART.",
)

if __name__ == "__main__":
    # Start the web server only when run as a script, not on import.
    demo.launch()