Text-summarizer / app.py
aspendse's picture
Upload 2 files
a5c6c99 verified
import gradio as gr
from transformers import pipeline
import docx2txt
import PyPDF2
from docx import Document
from fpdf import FPDF
import os
from io import BytesIO
# Load models once at import time so every request reuses the same pipelines.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# One translation pipeline per direction; the unpacking order below matches
# the order of the checkpoint names, so models load in the same sequence.
translator_hi_en, translator_mr_en, translator_en_hi, translator_en_mr = (
    pipeline("translation", model=checkpoint)
    for checkpoint in (
        "Helsinki-NLP/opus-mt-hi-en",
        "Helsinki-NLP/opus-mt-mr-en",
        "Helsinki-NLP/opus-mt-en-hi",
        "Helsinki-NLP/opus-mt-en-mr",
    )
)
# Extract text based on file type
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf or .docx file.

    Args:
        file: The upload to read. May be ``None`` (nothing uploaded), a
            filesystem path (``str`` or ``os.PathLike``), or an object that
            exposes its path as ``.name`` and optionally a ``.read()`` method.

    Returns:
        The extracted text, ``""`` when ``file`` is ``None``, or an
        "Unsupported file type" message when the extension is not handled.
    """
    if file is None:
        return ""

    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    ext = path.split(".")[-1].lower()

    # Use the open stream when the upload provides one; otherwise fall back to
    # the path, which the PDF and DOCX readers are handed directly.
    has_stream = hasattr(file, "read")
    source = file if has_stream else path

    if ext == "txt":
        if has_stream:
            raw = file.read()
        else:
            with open(path, "rb") as handle:
                raw = handle.read()
        # A text-mode stream already yields str; only bytes need decoding.
        return raw.decode("utf-8") if isinstance(raw, bytes) else raw

    if ext == "pdf":
        reader = PyPDF2.PdfReader(source)
        # Extract each page once and drop pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == "docx":
        return docx2txt.process(source)

    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."
# Chunk long text for translation and summarization
def _split_oversized(paragraph, max_length):
    """Break one over-long paragraph into pieces shorter than ``max_length``."""
    limit = max(1, max_length - 1)  # keep at least one character per piece
    pieces = []
    current = ""
    for word in paragraph.split():
        # A single word longer than the limit has to be cut mid-word.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_length=1000):
    """Group the lines of ``text`` into chunks of roughly ``max_length`` chars.

    Lines are accumulated (joined by newlines) while the running length stays
    below ``max_length``. A single line that is itself too long is split on
    whitespace first, so no chunk grows without bound.

    Args:
        text: The text to split; lines are separated by ``"\\n"``.
        max_length: Character budget per chunk.

    Returns:
        A list of stripped, non-empty chunks. Empty or whitespace-only input
        yields an empty list.
    """
    chunks = []
    current = ""
    for para in text.split("\n"):
        if len(para) < max_length:
            pieces = [para]
        else:
            pieces = _split_oversized(para, max_length)
        for piece in pieces:
            if len(current) + len(piece) < max_length:
                current += piece + "\n"
                continue
            # Flush only real content: this avoids emitting "" when the very
            # first piece already fills the budget.
            if current.strip():
                chunks.append(current.strip())
            current = piece + "\n"
    if current.strip():
        chunks.append(current.strip())
    return chunks
# Translate to English from selected language
def translate_to_english(text, lang):
    """Translate Hindi or Marathi ``text`` into English.

    The text is cut into 500-character chunks, each chunk is translated on
    its own, and the translations are joined with single spaces. Any other
    ``lang`` value returns ``text`` unchanged.
    """
    if lang not in ("Hindi", "Marathi"):
        return text

    translated = []
    for chunk in chunk_text(text, 500):
        if lang == "Hindi":
            result = translator_hi_en(chunk)
        else:
            result = translator_mr_en(chunk)
        translated.append(result[0]['translation_text'])
    return " ".join(translated)
# Translate from English to selected output language
def translate_from_english(text, lang):
    """Translate English ``text`` into Hindi or Marathi.

    The text is cut into 500-character chunks, each chunk is translated on
    its own, and the translations are joined with single spaces. Any other
    ``lang`` value returns ``text`` unchanged.
    """
    if lang not in ("Hindi", "Marathi"):
        return text

    translated = []
    for chunk in chunk_text(text, 500):
        if lang == "Hindi":
            result = translator_en_hi(chunk)
        else:
            result = translator_en_mr(chunk)
        translated.append(result[0]['translation_text'])
    return " ".join(translated)
# Save summary to DOCX
def generate_docx(text):
    """Build a Word document containing the summary and return it in memory.

    The document has a level-0 "Summary" heading followed by ``text`` as one
    paragraph. The returned ``BytesIO`` is rewound to the start.
    """
    document = Document()
    document.add_heading("Summary", level=0)
    document.add_paragraph(text)

    stream = BytesIO()
    document.save(stream)
    stream.seek(0)  # rewind so a consumer reads from the beginning
    return stream
# Save summary to PDF
def generate_pdf(text):
    """Render the summary as a PDF and return it as an in-memory buffer.

    Each line of ``text`` is written with ``multi_cell`` in 12pt Arial.

    NOTE: Arial here is a built-in core font, which only covers Latin-1.
    Characters outside Latin-1 (for example Devanagari or curly quotes) are
    replaced with ``?`` instead of aborting the whole request. Rendering
    Hindi/Marathi properly needs a Unicode TTF registered via ``add_font``.

    Args:
        text: The summary text; lines are separated by ``"\\n"``.

    Returns:
        A ``BytesIO`` positioned at the start of the PDF bytes.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        printable = line.encode("latin-1", errors="replace").decode("latin-1")
        # Start every cell at the left margin: some fpdf2 releases leave the
        # cursor at the right edge after multi_cell, which makes the next
        # full-width cell fail. This is a no-op where the cursor is reset.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)

    # Ask for the document as a value rather than passing a file object:
    # classic PyFPDF returns a latin-1 str here, fpdf2 returns a bytearray.
    rendered = pdf.output(dest="S")
    if isinstance(rendered, str):
        rendered = rendered.encode("latin-1")
    return BytesIO(bytes(rendered))
# Main summarization function
def _save_download(buffer, filename):
    """Write an in-memory document into a fresh temp directory; return its path."""
    import tempfile  # local import: not among the file's top-level imports

    # A dedicated directory lets the download keep a clean file name.
    path = os.path.join(tempfile.mkdtemp(), filename)
    with open(path, "wb") as handle:
        handle.write(buffer.getvalue())
    return path


def summarize_input(text, file, length, input_lang, output_lang):
    """Summarize pasted text or an uploaded document.

    Pasted ``text`` takes priority over ``file``. The source is translated to
    English when needed, summarized chunk by chunk, translated into the
    output language when needed, and exported as DOCX and PDF.

    Args:
        text: Text pasted by the user; may be empty or ``None``.
        file: The uploaded file, or ``None`` when nothing was uploaded.
        length: Summary-length label chosen in the UI.
        input_lang: Language of the source ("English", "Hindi", "Marathi").
        output_lang: Language wanted for the summary.

    Returns:
        ``(summary, docx_path, pdf_path)``. When there is nothing to
        summarize the result is ``("", None, None)``.
    """
    pasted = (text or "").strip()
    if pasted:
        source_text = pasted
    elif file is not None:
        source_text = (extract_text(file) or "").strip()
    else:
        source_text = ""  # neither text nor upload was provided
    if not source_text:
        return "", None, None

    # Translate to English if needed
    if input_lang != "English":
        source_text = translate_to_english(source_text, input_lang)

    # Set summary length
    if length == "Short (1–2 sentences)":
        min_len, max_len = 20, 60
    elif length == "Detailed (paragraph)":
        min_len, max_len = 80, 200
    else:
        min_len, max_len = 40, 130

    # Never hand an empty chunk to the model.
    chunks = [chunk for chunk in chunk_text(source_text) if chunk.strip()]
    if not chunks:
        return "", None, None
    summaries = [
        summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    final_summary = "\n\n".join(summaries)

    # Translate from English to output language
    if output_lang != "English":
        final_summary = translate_from_english(final_summary, output_lang)

    # File outputs are given paths on disk rather than in-memory buffers.
    docx_path = _save_download(generate_docx(final_summary), "summary.docx")
    pdf_path = _save_download(generate_pdf(final_summary), "summary.pdf")
    return final_summary, docx_path, pdf_path
# Gradio interface: build the widgets first, then wire them to summarize_input.
text_input = gr.Textbox(lines=8, label="Enter text (optional)")
file_input = gr.File(
    label="Upload file (.txt, .pdf, .docx)",
    file_types=[".pdf", ".docx", ".txt"],
)
length_input = gr.Radio(
    ["Short (1–2 sentences)", "Medium (3–5 sentences)", "Detailed (paragraph)"],
    label="Summary length",
    value="Medium (3–5 sentences)",
)
input_lang_input = gr.Dropdown(
    ["English", "Hindi", "Marathi"],
    label="Document Language",
    value="English",
)
output_lang_input = gr.Dropdown(
    ["English", "Hindi", "Marathi"],
    label="Summary Output Language",
    value="English",
)

summary_output = gr.Textbox(label="Summary")
docx_output = gr.File(label="Download as DOCX")
pdf_output = gr.File(label="Download as PDF")

iface = gr.Interface(
    fn=summarize_input,
    inputs=[text_input, file_input, length_input, input_lang_input, output_lang_input],
    outputs=[summary_output, docx_output, pdf_output],
    title="🌍 Multilingual Document Summarizer",
    description="Upload or paste a document in English, Hindi, or Marathi. App will translate if needed and summarize it into your chosen output language.",
)

# Start the web app as soon as the module runs.
iface.launch()