Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline | |
| import docx2txt | |
| import PyPDF2 | |
| from docx import Document | |
| from fpdf import FPDF | |
| import os | |
| from io import BytesIO | |
# Load models
def _load_translator(model_name):
    """Build a translation pipeline backed by the given model checkpoint."""
    return pipeline("translation", model=model_name)


# English summarization model used for every summary.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Source language -> English translators.
translator_hi_en = _load_translator("Helsinki-NLP/opus-mt-hi-en")
translator_mr_en = _load_translator("Helsinki-NLP/opus-mt-mr-en")

# English -> target language translators.
translator_en_hi = _load_translator("Helsinki-NLP/opus-mt-en-hi")
translator_en_mr = _load_translator("Helsinki-NLP/opus-mt-en-mr")
| # Extract text based on file type | |
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf, or .docx file.

    Args:
        file: A filesystem path (``str`` or ``os.PathLike``), an object that
            exposes the path of an on-disk file as ``.name``, or ``None``.

    Returns:
        The extracted text. ``""`` when ``file`` is ``None``. For any other
        extension the fixed "Unsupported file type..." message is returned
        (not raised), exactly as before.
    """
    if file is None:
        # Nothing was uploaded; let the caller treat this as "no input".
        return ""

    # Always work from the path instead of calling ``file.read()``: the
    # upload may arrive as a plain path string or as a wrapper whose read
    # mode (text vs. binary) is not under our control.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    ext = path.split(".")[-1].lower()

    if ext == "txt":
        with open(path, "rb") as handle:
            return handle.read().decode("utf-8")

    if ext == "pdf":
        reader = PyPDF2.PdfReader(path)
        # Extract each page once; pages without extractable text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == "docx":
        return docx2txt.process(path)

    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."
| # Chunk long text for translation and summarization | |
def chunk_text(text, max_length=1000):
    """Split ``text`` into chunks of roughly ``max_length`` characters.

    Lines (split on ``"\\n"``) are packed greedily into a chunk while the
    running length stays below ``max_length``. A single line longer than
    ``max_length`` is wrapped on whitespace into pieces of at most
    ``max_length`` characters so that no oversized chunk is produced.

    Args:
        text: The text to split. ``None`` or ``""`` yields an empty list.
        max_length: Positive character budget per chunk.

    Returns:
        A list of non-empty, stripped chunk strings, in document order.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    import textwrap

    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not text:
        return []

    chunks = []
    current = ""
    for para in text.split("\n"):
        if len(current) + len(para) < max_length:
            current += para + "\n"
            continue

        # The running chunk is full. Emit it only if it holds real content,
        # so an over-long first line no longer produces a leading "" chunk.
        if current.strip():
            chunks.append(current.strip())

        if len(para) > max_length:
            # Break an over-long line into pieces that fit the budget. The
            # last piece stays open so following short lines can join it.
            pieces = textwrap.wrap(para, width=max_length, break_on_hyphens=False)
            chunks.extend(pieces[:-1])
            para = pieces[-1] if pieces else ""

        current = para + "\n"

    if current.strip():
        chunks.append(current.strip())
    return chunks
| # Translate to English from selected language | |
def translate_to_english(text, lang):
    """Translate ``text`` from ``lang`` into English.

    Args:
        text: The source text.
        lang: ``"Hindi"`` or ``"Marathi"`` selects the matching translation
            pipeline; any other value returns ``text`` unchanged.

    Returns:
        The translated chunks joined with single spaces, or ``text`` itself
        when no translation applies.
    """
    if lang == "Hindi":
        translator = translator_hi_en
    elif lang == "Marathi":
        translator = translator_mr_en
    else:
        return text

    # Translate in 500-character chunks and never send a blank chunk to the
    # model.
    pieces = [chunk for chunk in chunk_text(text, 500) if chunk.strip()]
    return " ".join(translator(chunk)[0]["translation_text"] for chunk in pieces)
| # Translate from English to selected output language | |
def translate_from_english(text, lang):
    """Translate English ``text`` into ``lang``.

    Args:
        text: The English text.
        lang: ``"Hindi"`` or ``"Marathi"`` selects the matching translation
            pipeline; any other value returns ``text`` unchanged.

    Returns:
        The translated chunks joined with single spaces, or ``text`` itself
        when no translation applies.
    """
    if lang == "Hindi":
        translator = translator_en_hi
    elif lang == "Marathi":
        translator = translator_en_mr
    else:
        return text

    # Translate in 500-character chunks and never send a blank chunk to the
    # model.
    pieces = [chunk for chunk in chunk_text(text, 500) if chunk.strip()]
    return " ".join(translator(chunk)[0]["translation_text"] for chunk in pieces)
| # Save summary to DOCX | |
def generate_docx(text):
    """Render ``text`` under a "Summary" heading into an in-memory DOCX.

    Returns:
        A ``BytesIO`` holding the saved document, rewound to position 0.
    """
    document = Document()
    document.add_heading("Summary", level=0)
    document.add_paragraph(text)

    stream = BytesIO()
    document.save(stream)
    # Rewind so the caller can read the document from the beginning.
    stream.seek(0)
    return stream
| # Save summary to PDF | |
def generate_pdf(text):
    """Render ``text`` line by line into an in-memory PDF.

    The built-in "Arial" core font can only encode Latin-1. Characters
    outside that range (Devanagari, typographic quotes and dashes, ...) are
    replaced with "?" instead of aborting the whole request with an encoding
    error. Rendering such text faithfully would require registering a
    Unicode TrueType font with ``add_font``.

    Args:
        text: The text to render; each ``"\\n"``-separated line becomes one
            ``multi_cell``.

    Returns:
        A ``BytesIO`` holding the PDF bytes, positioned at 0.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        printable = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable)

    buffer = BytesIO()
    try:
        pdf.output(buffer)
    except TypeError:
        # NOTE(review): legacy PyFPDF 1.x only accepts a file name here and
        # rejects a stream; fall back to its in-memory string output, which
        # is Latin-1 encoded text. Confirm against the installed fpdf package.
        rendered = pdf.output(dest="S")
        if isinstance(rendered, str):
            rendered = rendered.encode("latin-1")
        buffer = BytesIO(bytes(rendered))
    buffer.seek(0)
    return buffer
| # Main summarization function | |
def _save_download(buffer, filename):
    """Write an in-memory file to a fresh temp directory and return its path."""
    import tempfile

    # A dedicated directory keeps the friendly file name ("summary.docx")
    # without clashing with files written by concurrent requests.
    out_dir = tempfile.mkdtemp(prefix="summary_")
    path = os.path.join(out_dir, filename)
    with open(path, "wb") as handle:
        handle.write(buffer.getvalue())
    return path


def summarize_input(text, file, length, input_lang, output_lang):
    """Summarize pasted text or an uploaded file, translating as needed.

    Pasted text takes precedence over the uploaded file. Non-English input is
    translated to English, summarized chunk by chunk, and the joined summary
    is translated to ``output_lang`` when that is not English.

    Args:
        text: Pasted text; may be empty or ``None``.
        file: Uploaded file handed to ``extract_text``; may be ``None``.
        length: One of the summary-length labels. "Short (1–2 sentences)" and
            "Detailed (paragraph)" select their own bounds; anything else
            uses the medium bounds.
        input_lang: Language of the source text.
        output_lang: Language the summary should be returned in.

    Returns:
        ``(summary, docx_path, pdf_path)``. The two downloads are returned as
        filesystem paths because they feed ``gr.File`` outputs. When there is
        nothing to summarize the result is ``("", None, None)``.
    """
    pasted = (text or "").strip()
    if pasted:
        source_text = pasted
    elif file is not None:
        source_text = extract_text(file)
    else:
        source_text = ""

    if not source_text or not source_text.strip():
        return "", None, None

    # Translate to English if needed
    if input_lang != "English":
        source_text = translate_to_english(source_text, input_lang)

    # Set summary length
    if length == "Short (1–2 sentences)":
        min_len, max_len = 20, 60
    elif length == "Detailed (paragraph)":
        min_len, max_len = 80, 200
    else:
        min_len, max_len = 40, 130

    # Blank chunks carry nothing to summarize, so they are dropped.
    chunks = [chunk for chunk in chunk_text(source_text) if chunk.strip()]
    if not chunks:
        return "", None, None

    summaries = [
        summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # Guard against a chunk that still exceeds the model's input limit.
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]
    final_summary = "\n\n".join(summaries)

    # Translate from English to output language
    if output_lang != "English":
        final_summary = translate_from_english(final_summary, output_lang)

    docx_path = _save_download(generate_docx(final_summary), "summary.docx")
    pdf_path = _save_download(generate_pdf(final_summary), "summary.pdf")
    return final_summary, docx_path, pdf_path
# Gradio interface
# The length labels must stay in sync with the strings compared inside
# summarize_input.
_LENGTH_CHOICES = [
    "Short (1–2 sentences)",
    "Medium (3–5 sentences)",
    "Detailed (paragraph)",
]
_LANGUAGE_CHOICES = ["English", "Hindi", "Marathi"]

_text_input = gr.Textbox(lines=8, label="Enter text (optional)")
_file_input = gr.File(
    label="Upload file (.txt, .pdf, .docx)",
    file_types=[".pdf", ".docx", ".txt"],
)
_length_input = gr.Radio(
    _LENGTH_CHOICES,
    label="Summary length",
    value="Medium (3–5 sentences)",
)
_input_lang = gr.Dropdown(
    list(_LANGUAGE_CHOICES),
    label="Document Language",
    value="English",
)
_output_lang = gr.Dropdown(
    list(_LANGUAGE_CHOICES),
    label="Summary Output Language",
    value="English",
)

iface = gr.Interface(
    fn=summarize_input,
    inputs=[_text_input, _file_input, _length_input, _input_lang, _output_lang],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.File(label="Download as DOCX"),
        gr.File(label="Download as PDF"),
    ],
    title="🌍 Multilingual Document Summarizer",
    description="Upload or paste a document in English, Hindi, or Marathi. App will translate if needed and summarize it into your chosen output language.",
)

iface.launch()