File size: 4,900 Bytes
a5c6c99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

import gradio as gr
from transformers import pipeline
import docx2txt
import PyPDF2
from docx import Document
from fpdf import FPDF
import os
from io import BytesIO

# Load models
# All pipelines are created once at import time and reused by every request.
# English summarizer.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
# Translators into English (Hindi -> English, Marathi -> English).
translator_hi_en = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-hi-en",
)
translator_mr_en = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-mr-en",
)
# Translators out of English (English -> Hindi, English -> Marathi).
translator_en_hi = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-en-hi",
)
translator_en_mr = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-en-mr",
)

# Extract text based on file type
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf, or .docx file.

    Args:
        file: The upload to read. May be ``None`` (nothing uploaded), a
            filesystem path, or an object exposing a ``name`` attribute
            (and, for .txt files, optionally a ``read()`` method).

    Returns:
        The extracted text. An empty string is returned when ``file`` is
        ``None``. For any other extension a fixed "Unsupported file type"
        message is returned instead of raising.

    Raises:
        UnicodeDecodeError: If a .txt upload is not valid UTF-8.
    """
    # Nothing was uploaded: report "no text" instead of failing on `.name`.
    if file is None:
        return ""

    # Uploads may arrive as file-like objects (with `.name`) or as plain paths.
    name = str(getattr(file, "name", file))
    ext = name.rsplit(".", 1)[-1].lower()

    if ext == "txt":
        if hasattr(file, "read"):
            raw = file.read()
        else:
            with open(name, "rb") as handle:
                raw = handle.read()
        # A handle opened in text mode already yields str.
        return raw.decode("utf-8") if isinstance(raw, bytes) else raw

    if ext == "pdf":
        reader = PyPDF2.PdfReader(file)
        # Extract each page exactly once; skip pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == "docx":
        return docx2txt.process(file)

    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."

# Chunk long text for translation and summarization
def chunk_text(text, max_length=1000):
    """Split *text* into newline-aligned chunks of bounded size.

    Consecutive lines are packed into one chunk while the accumulated length
    stays below ``max_length``. A single line longer than ``max_length`` is
    wrapped at whitespace (over-long words are hard-split) so that no chunk
    exceeds the limit. Empty and whitespace-only chunks are never returned.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; expected to be
            positive.

    Returns:
        A list of non-empty, stripped chunk strings (possibly empty).
    """
    import textwrap  # local import keeps this function self-contained

    chunks = []
    current_chunk = ""

    def flush():
        # Emit the pending chunk, dropping it if it is only whitespace.
        nonlocal current_chunk
        cleaned = current_chunk.strip()
        if cleaned:
            chunks.append(cleaned)
        current_chunk = ""

    for para in text.split("\n"):
        if len(current_chunk) + len(para) < max_length:
            current_chunk += para + "\n"
            continue

        flush()
        if max_length > 0 and len(para) > max_length:
            # The line alone is too large: break it into pieces that fit.
            chunks.extend(
                textwrap.wrap(para, width=max_length, break_on_hyphens=False)
            )
        else:
            current_chunk = para + "\n"

    flush()
    return chunks

# Translate to English from selected language
def translate_to_english(text, lang):
    """Translate Hindi or Marathi *text* into English.

    The text is cut into chunks of up to 500 characters with ``chunk_text``,
    each chunk is translated separately, and the results are joined with
    single spaces. For any other ``lang`` the text is returned unchanged.
    """
    if lang != "Hindi" and lang != "Marathi":
        return text

    # Only the translator that is actually needed gets looked up.
    translate = translator_hi_en if lang == "Hindi" else translator_mr_en

    translated_parts = []
    for segment in chunk_text(text, 500):
        outputs = translate(segment)
        translated_parts.append(outputs[0]['translation_text'])
    return " ".join(translated_parts)

# Translate from English to selected output language
def translate_from_english(text, lang):
    """Translate English *text* into Hindi or Marathi.

    The text is cut into chunks of up to 500 characters with ``chunk_text``;
    every chunk is translated on its own and the pieces are joined with
    single spaces. Any other ``lang`` leaves the text untouched.
    """
    if lang == "Hindi":
        translate = translator_en_hi
    elif lang == "Marathi":
        translate = translator_en_mr
    else:
        # No translation requested (or unknown language): pass through.
        return text

    segments = chunk_text(text, 500)
    return " ".join(
        translate(segment)[0]['translation_text'] for segment in segments
    )

# Save summary to DOCX
def generate_docx(text):
    """Render *text* into a Word document held in memory.

    The document consists of a level-0 "Summary" heading followed by one
    paragraph containing *text*.

    Returns:
        A ``BytesIO`` with the saved document, rewound to the start.
    """
    document = Document()
    document.add_heading("Summary", level=0)
    document.add_paragraph(text)

    stream = BytesIO()
    document.save(stream)
    # Rewind so a consumer reads the document from its first byte.
    stream.seek(0)
    return stream

# Save summary to PDF
def generate_pdf(text):
    """Render *text* into a single-font PDF held in memory.

    Each line of *text* is written with ``multi_cell`` in 12pt Arial.
    Characters that cannot be encoded as Latin-1 (for example Devanagari
    script, en-dashes, or curly quotes) are replaced with ``?`` so that PDF
    generation does not fail on them.

    Args:
        text: The text to render; lines are separated by ``"\\n"``.

    Returns:
        A ``BytesIO`` with the PDF data, rewound to the start.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # The built-in (core) fonts only cover Latin-1; anything else would
        # raise an encoding error, so degrade those characters to "?".
        # NOTE(review): rendering Hindi/Marathi properly needs a Unicode TTF
        # registered via add_font - no such font is bundled with this file.
        printable = line.encode("latin-1", errors="replace").decode("latin-1")
        # Start every line at the left margin; newer fpdf2 releases leave the
        # cursor at the right edge after multi_cell, which would leave no
        # width for the next full-width cell.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)
    buffer = BytesIO()
    # NOTE(review): assumes an FPDF implementation whose output() accepts a
    # file-like target (fpdf2 does) - confirm against the installed package.
    pdf.output(buffer)
    buffer.seek(0)
    return buffer

# Main summarization function (with its private helpers)
def _summary_length_bounds(length):
    """Map a summary-length label to the (min_length, max_length) pair."""
    if length == "Short (1–2 sentences)":
        return 20, 60
    if length == "Detailed (paragraph)":
        return 80, 200
    # Any other label (including "Medium") uses the medium bounds.
    return 40, 130


def _save_download(buffer, filename):
    """Write an in-memory buffer to a fresh temp directory; return the path."""
    import os
    import tempfile

    # A dedicated directory keeps the friendly file name for the download.
    # NOTE: the directory is not cleaned up here.
    path = os.path.join(tempfile.mkdtemp(prefix="summary_"), filename)
    with open(path, "wb") as handle:
        handle.write(buffer.getvalue())
    return path


def summarize_input(text, file, length, input_lang, output_lang):
    """Summarize pasted text or an uploaded document.

    Pasted text takes precedence over the uploaded file. The source is
    translated to English when ``input_lang`` is not "English", summarized
    chunk by chunk, translated to ``output_lang`` when that is not
    "English", and finally exported as DOCX and PDF files.

    Args:
        text: Text typed into the UI; may be empty or ``None``.
        file: Uploaded file, or ``None`` when nothing was uploaded.
        length: Summary-length label selected in the UI.
        input_lang: Language of the source document.
        output_lang: Language the summary should be returned in.

    Returns:
        A tuple ``(summary, docx_path, pdf_path)``. The two paths point to
        files on disk, which is what file-download components consume. When
        there is nothing to summarize the result is ``("", None, None)``.
    """
    pasted = (text or "").strip()
    if pasted:
        source_text = pasted
    elif file is not None:
        source_text = extract_text(file)
    else:
        # Neither pasted text nor an upload was provided.
        source_text = ""
    if not source_text:
        return "", None, None

    # Translate to English if needed
    if input_lang != "English":
        source_text = translate_to_english(source_text, input_lang)

    min_len, max_len = _summary_length_bounds(length)

    # Never hand an empty chunk to the summarization model.
    chunks = [chunk for chunk in chunk_text(source_text) if chunk]
    summaries = [
        summarizer(
            chunk, max_length=max_len, min_length=min_len, do_sample=False
        )[0]['summary_text']
        for chunk in chunks
    ]
    final_summary = "\n\n".join(summaries)

    # Translate from English to output language
    if output_lang != "English":
        final_summary = translate_from_english(final_summary, output_lang)

    # File outputs are given real file paths rather than in-memory buffers.
    docx_path = _save_download(generate_docx(final_summary), "summary.docx")
    pdf_path = _save_download(generate_pdf(final_summary), "summary.pdf")
    return final_summary, docx_path, pdf_path

# Gradio interface
# Input widgets, in the same order as summarize_input's parameters:
# text, file, length, input_lang, output_lang.
input_components = [
    gr.Textbox(lines=8, label="Enter text (optional)"),
    gr.File(
        label="Upload file (.txt, .pdf, .docx)",
        file_types=[".pdf", ".docx", ".txt"],
    ),
    gr.Radio(
        ["Short (1–2 sentences)", "Medium (3–5 sentences)", "Detailed (paragraph)"],
        label="Summary length",
        value="Medium (3–5 sentences)",
    ),
    gr.Dropdown(
        ["English", "Hindi", "Marathi"],
        label="Document Language",
        value="English",
    ),
    gr.Dropdown(
        ["English", "Hindi", "Marathi"],
        label="Summary Output Language",
        value="English",
    ),
]

# Output widgets, matching summarize_input's three return values.
output_components = [
    gr.Textbox(label="Summary"),
    gr.File(label="Download as DOCX"),
    gr.File(label="Download as PDF"),
]

iface = gr.Interface(
    fn=summarize_input,
    inputs=input_components,
    outputs=output_components,
    title="🌍 Multilingual Document Summarizer",
    description="Upload or paste a document in English, Hindi, or Marathi. App will translate if needed and summarize it into your chosen output language.",
)

# Start the web app as soon as the module is executed.
iface.launch()