Spaces:
Sleeping
Sleeping
File size: 4,900 Bytes
a5c6c99 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import gradio as gr
from transformers import pipeline
import docx2txt
import PyPDF2
from docx import Document
from fpdf import FPDF
import os
from io import BytesIO
# Load models
# English summarization model used on every chunk of the source text.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Translators into English (applied before summarizing).
translator_hi_en = pipeline(task="translation", model="Helsinki-NLP/opus-mt-hi-en")
translator_mr_en = pipeline(task="translation", model="Helsinki-NLP/opus-mt-mr-en")

# Translators out of English (applied to the finished summary).
translator_en_hi = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-hi")
translator_en_mr = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-mr")
# Extract text based on file type
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf or .docx file.

    Args:
        file: The upload to read. May be ``None`` (nothing uploaded), a
            filesystem path string, or a file-like object exposing ``.name``.

    Returns:
        The extracted text; ``""`` when ``file`` is ``None``; or an
        "Unsupported file type" message for any other extension.
    """
    if file is None:
        return ""

    # Uploads may arrive either as a path string or as an open file object
    # carrying the path in ``.name``; support both.
    path = file if isinstance(file, str) else file.name
    ext = str(path).split(".")[-1].lower()

    if ext == "txt":
        if hasattr(file, "read"):
            data = file.read()
        else:
            with open(path, "rb") as handle:
                data = handle.read()
        # A text-mode handle already yields ``str``; only bytes need decoding.
        return data.decode("utf-8") if isinstance(data, bytes) else data

    if ext == "pdf":
        reader = PyPDF2.PdfReader(file)
        # Extract each page once and skip pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == "docx":
        return docx2txt.process(file)

    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."
# Chunk long text for translation and summarization
def _split_paragraph(para, max_length):
    """Break one paragraph into pieces of at most ``max_length`` characters."""
    if max_length < 1 or len(para) <= max_length:
        return [para]

    pieces = []
    current = ""
    for word in para.split():
        # A single token longer than the limit has to be cut mid-word.
        while len(word) > max_length:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_length])
            word = word[max_length:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_length:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text, max_length=1000):
    """Group the lines of ``text`` into chunks of roughly ``max_length`` chars.

    Lines are accumulated (joined by newlines) until adding the next one would
    reach ``max_length``. A single line longer than ``max_length`` is split on
    whitespace (and mid-word only if a word itself is too long) so that no
    chunk exceeds the limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks; ``[]`` for blank
        input.
    """
    chunks = []
    current_chunk = ""
    for para in text.split("\n"):
        for piece in _split_paragraph(para, max_length):
            if len(current_chunk) + len(piece) < max_length:
                current_chunk += piece + "\n"
                continue
            # Flush what has been gathered so far, but never emit an empty
            # chunk (happens when the very first piece is already too long).
            flushed = current_chunk.strip()
            if flushed:
                chunks.append(flushed)
            current_chunk = piece + "\n"

    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
# Translate to English from selected language
def translate_to_english(text, lang):
    """Translate ``text`` from ``lang`` ("Hindi" or "Marathi") into English.

    The text is translated in chunks of up to 500 characters and the results
    are joined with single spaces. Any other ``lang`` returns ``text`` as-is.
    """
    if lang == "Hindi":
        translator = translator_hi_en
    elif lang == "Marathi":
        translator = translator_mr_en
    else:
        return text

    translated = []
    for chunk in chunk_text(text, 500):
        translated.append(translator(chunk)[0]['translation_text'])
    return " ".join(translated)
# Translate from English to selected output language
def translate_from_english(text, lang):
    """Translate English ``text`` into ``lang`` ("Hindi" or "Marathi").

    The text is translated in chunks of up to 500 characters and the results
    are joined with single spaces. Any other ``lang`` returns ``text`` as-is.
    """
    if lang not in ("Hindi", "Marathi"):
        return text

    translator = translator_en_hi if lang == "Hindi" else translator_en_mr
    pieces = (
        translator(chunk)[0]['translation_text']
        for chunk in chunk_text(text, 500)
    )
    return " ".join(pieces)
# Save summary to DOCX
def generate_docx(text):
    """Build a Word document with a "Summary" title followed by ``text``.

    Returns:
        An in-memory ``BytesIO`` holding the saved document, rewound to the
        start so it can be read immediately.
    """
    document = Document()
    document.add_heading("Summary", level=0)
    document.add_paragraph(text)

    stream = BytesIO()
    document.save(stream)
    stream.seek(0)
    return stream
# Save summary to PDF
def generate_pdf(text):
    """Render ``text`` as a simple one-font PDF.

    Each newline-separated line of ``text`` is written with ``multi_cell`` so
    long lines wrap.

    Args:
        text: The text to render.

    Returns:
        An in-memory ``BytesIO`` holding the PDF, rewound to the start.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # The built-in "Arial" core font only covers Latin-1. Characters
        # outside it (Devanagari, curly quotes, en dashes, ...) would make the
        # export raise, so they are replaced with "?" here.
        # TODO: register a Unicode TTF font to render Hindi/Marathi properly.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # NOTE(review): newer fpdf2 releases appear to leave the cursor at the
        # right edge after multi_cell; resetting to the left margin keeps a
        # full-width cell available on every line and is harmless otherwise.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)
    buffer = BytesIO()
    # NOTE(review): passing a file object relies on fpdf2; the legacy
    # "fpdf" 1.7.x package expects a file name here - confirm the dependency.
    pdf.output(buffer)
    buffer.seek(0)
    return buffer
# Main summarization function
def _save_download(buffer, filename):
    """Write an in-memory buffer to a fresh temp directory and return its path."""
    import tempfile  # local import: only needed when a summary is produced

    target = os.path.join(tempfile.mkdtemp(), filename)
    with open(target, "wb") as handle:
        handle.write(buffer.getvalue())
    return target


def summarize_input(text, file, length, input_lang, output_lang):
    """Summarize pasted text or an uploaded file, translating as requested.

    Pasted ``text`` takes priority; ``file`` is only read when the text box is
    blank. Non-English input is translated to English, summarized chunk by
    chunk, and the joined summary is translated to ``output_lang`` if that is
    not English.

    Args:
        text: Text pasted by the user (may be empty or ``None``).
        file: Uploaded file, or ``None`` when nothing was uploaded.
        length: One of the summary-length labels; anything other than the
            "Short" and "Detailed" labels uses the medium setting.
        input_lang: Language of the source document.
        output_lang: Language the summary should be returned in.

    Returns:
        A 3-tuple ``(summary, docx_path, pdf_path)``. The two paths point to
        generated ``summary.docx`` / ``summary.pdf`` files, and are ``None``
        when there was nothing to summarize or the file type is unsupported
        (in which case ``summary`` carries the explanatory message).
    """
    pasted = (text or "").strip()
    if pasted:
        source_text = pasted
    elif file is not None:
        source_text = extract_text(file)
        # Surface the extractor's error message instead of summarizing it.
        if source_text.startswith("Unsupported file type"):
            return source_text, None, None
    else:
        source_text = ""

    if not source_text.strip():
        return "", None, None

    # Translate to English if needed
    if input_lang != "English":
        source_text = translate_to_english(source_text, input_lang)

    # Set summary length
    length_bounds = {
        "Short (1–2 sentences)": (20, 60),
        "Detailed (paragraph)": (80, 200),
    }
    min_len, max_len = length_bounds.get(length, (40, 130))

    # Blank chunks carry nothing to summarize and can upset the model.
    chunks = [chunk for chunk in chunk_text(source_text) if chunk.strip()]
    summaries = [
        summarizer(
            chunk, max_length=max_len, min_length=min_len, do_sample=False
        )[0]['summary_text']
        for chunk in chunks
    ]
    final_summary = "\n\n".join(summaries)

    # Translate from English to output language
    if output_lang != "English":
        final_summary = translate_from_english(final_summary, output_lang)

    # gr.File outputs expect file paths, so persist the in-memory documents.
    docx_path = _save_download(generate_docx(final_summary), "summary.docx")
    pdf_path = _save_download(generate_pdf(final_summary), "summary.pdf")
    return final_summary, docx_path, pdf_path
# Gradio interface
# Choices offered by the input widgets.
_LENGTH_OPTIONS = [
    "Short (1–2 sentences)",
    "Medium (3–5 sentences)",
    "Detailed (paragraph)",
]
_LANGUAGE_OPTIONS = ["English", "Hindi", "Marathi"]

# Inputs, in the positional order summarize_input expects them.
_input_widgets = [
    gr.Textbox(lines=8, label="Enter text (optional)"),
    gr.File(
        label="Upload file (.txt, .pdf, .docx)",
        file_types=[".pdf", ".docx", ".txt"],
    ),
    gr.Radio(
        list(_LENGTH_OPTIONS),
        label="Summary length",
        value="Medium (3–5 sentences)",
    ),
    gr.Dropdown(
        list(_LANGUAGE_OPTIONS), label="Document Language", value="English"
    ),
    gr.Dropdown(
        list(_LANGUAGE_OPTIONS), label="Summary Output Language", value="English"
    ),
]

# Outputs: the summary text plus the two downloadable documents.
_output_widgets = [
    gr.Textbox(label="Summary"),
    gr.File(label="Download as DOCX"),
    gr.File(label="Download as PDF"),
]

iface = gr.Interface(
    fn=summarize_input,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="🌍 Multilingual Document Summarizer",
    description="Upload or paste a document in English, Hindi, or Marathi. App will translate if needed and summarize it into your chosen output language.",
)

# Start the web app as soon as the script is run.
iface.launch()
|