aspendse committed on
Commit
a5c6c99
·
verified ·
1 Parent(s): ba56e29

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +137 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ import docx2txt
5
+ import PyPDF2
6
+ from docx import Document
7
+ from fpdf import FPDF
8
+ import os
9
+ from io import BytesIO
10
+
11
+ # Load models
12
# Summarization pipeline used for every chunk of (English) source text.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Translation pipelines into English, one per supported input language.
translator_hi_en = pipeline(task="translation", model="Helsinki-NLP/opus-mt-hi-en")
translator_mr_en = pipeline(task="translation", model="Helsinki-NLP/opus-mt-mr-en")

# Translation pipelines out of English, one per supported output language.
translator_en_hi = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-hi")
translator_en_mr = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-mr")
17
+
18
+ # Extract text based on file type
19
def extract_text(file):
    """Return the plain text of an uploaded .txt, .pdf or .docx file.

    Args:
        file: The upload to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing that path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        The extracted text, ``""`` when ``file`` is ``None``, or a fixed
        "Unsupported file type" message for any other extension.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a .txt file is not valid UTF-8.
    """
    if file is None:
        return ""

    # Always read by path rather than calling ``file.read()``.
    # NOTE(review): newer Gradio releases appear to hand uploads over as a
    # path string (possibly with a ``.name`` attribute) instead of an open
    # file object -- confirm against the pinned Gradio version.
    path = os.fspath(getattr(file, "name", file))
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        with open(path, "rb") as handle:
            return handle.read().decode("utf-8")

    if ext == ".pdf":
        reader = PyPDF2.PdfReader(path)
        # Extract each page once; pages that yield no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == ".docx":
        return docx2txt.process(path)

    return "Unsupported file type. Please upload a .pdf, .docx, or .txt file."
30
+
31
+ # Chunk long text for translation and summarization
32
def chunk_text(text, max_length=1000):
    """Split *text* into chunks of at most *max_length* characters.

    Text is split on newlines and consecutive paragraphs are packed into
    the same chunk while they fit. A single paragraph longer than
    *max_length* is wrapped at whitespace (breaking over-long words if
    necessary) so no chunk exceeds the limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty, stripped chunks. Empty or whitespace-only
        input gives ``[]``.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    import textwrap

    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Break oversize paragraphs first so every piece fits in one chunk.
    pieces = []
    for para in text.split("\n"):
        if len(para) > max_length:
            pieces.extend(textwrap.wrap(para, max_length))
        else:
            pieces.append(para)

    chunks = []
    current_chunk = ""
    for piece in pieces:
        if len(current_chunk) + len(piece) < max_length:
            current_chunk += piece + "\n"
            continue
        # Flush only real content: an empty chunk would otherwise be sent
        # to the downstream model when the first piece is already too long.
        flushed = current_chunk.strip()
        if flushed:
            chunks.append(flushed)
        current_chunk = piece + "\n"

    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
45
+
46
+ # Translate to English from selected language
47
def translate_to_english(text, lang):
    """Translate *text* from *lang* into English.

    Args:
        text: Source text.
        lang: ``"Hindi"`` or ``"Marathi"``. Any other value means the text
            is returned untouched.

    Returns:
        The translated chunks joined by single spaces, or *text* unchanged
        when it is blank or *lang* has no translator.
    """
    # Nothing to translate: avoid calling the model on empty input.
    if not text or not text.strip():
        return text

    if lang == "Hindi":
        translator = translator_hi_en
    elif lang == "Marathi":
        translator = translator_mr_en
    else:
        return text

    # Translate in 500-character chunks, skipping any blank chunk.
    translated = [
        translator(chunk)[0]['translation_text']
        for chunk in chunk_text(text, 500)
        if chunk.strip()
    ]
    return " ".join(translated)
53
+
54
+ # Translate from English to selected output language
55
def translate_from_english(text, lang):
    """Translate English *text* into *lang*.

    Args:
        text: English source text.
        lang: ``"Hindi"`` or ``"Marathi"``. Any other value means the text
            is returned untouched.

    Returns:
        The translated chunks joined by single spaces, or *text* unchanged
        when it is blank or *lang* has no translator.
    """
    # Nothing to translate: avoid calling the model on empty input.
    if not text or not text.strip():
        return text

    if lang == "Hindi":
        translator = translator_en_hi
    elif lang == "Marathi":
        translator = translator_en_mr
    else:
        return text

    # Translate in 500-character chunks, skipping any blank chunk.
    translated = [
        translator(chunk)[0]['translation_text']
        for chunk in chunk_text(text, 500)
        if chunk.strip()
    ]
    return " ".join(translated)
61
+
62
+ # Save summary to DOCX
63
def generate_docx(text):
    """Build an in-memory Word document containing *text*.

    The document has a "Summary" heading (level 0) followed by one
    paragraph holding the whole text.

    Args:
        text: The summary text to write.

    Returns:
        A ``BytesIO`` positioned at the start of the saved document.
    """
    document = Document()
    document.add_heading("Summary", level=0)
    document.add_paragraph(text)

    stream = BytesIO()
    document.save(stream)
    # Rewind so callers can read the document from the beginning.
    stream.seek(0)
    return stream
71
+
72
+ # Save summary to PDF
73
def generate_pdf(text):
    """Build an in-memory PDF containing *text*, one cell per input line.

    Args:
        text: The summary text to write; split on newlines.

    Returns:
        A ``BytesIO`` positioned at the start of the rendered PDF.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        # The built-in "Arial" font only covers Latin-1, so characters
        # outside it (e.g. Devanagari) are replaced with "?" instead of
        # raising during rendering.
        # TODO(review): register a Unicode TTF font so Hindi/Marathi
        # summaries render properly in the PDF.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable)

    # Ask for the document as a value rather than passing the buffer as the
    # first argument, which the legacy ``fpdf`` package treats as a filename.
    # NOTE(review): legacy fpdf returns a Latin-1 ``str`` here while fpdf2
    # returns a ``bytearray`` -- both are handled; confirm which is installed.
    rendered = pdf.output(dest="S")
    if isinstance(rendered, str):
        payload = rendered.encode("latin-1")
    else:
        payload = bytes(rendered)
    return BytesIO(payload)
83
+
84
+ # Main summarization function
85
def _write_temp_download(buffer, suffix):
    """Persist an in-memory file to a temp file and return its path."""
    import tempfile

    with tempfile.NamedTemporaryFile(
        prefix="summary_", suffix=suffix, delete=False
    ) as handle:
        handle.write(buffer.getvalue())
        return handle.name


def summarize_input(text, file, length, input_lang, output_lang):
    """Summarize pasted text or an uploaded document.

    Pasted text takes priority over the uploaded file. The source is
    translated to English when needed, summarized chunk by chunk, and the
    summary is translated into the requested output language.

    Args:
        text: Text pasted by the user; may be empty or ``None``.
        file: Uploaded file, or ``None`` when nothing was uploaded.
        length: Summary length label chosen in the UI. Unknown labels use
            the medium preset.
        input_lang: Language of the source (``"English"``, ``"Hindi"`` or
            ``"Marathi"``).
        output_lang: Language the summary should be returned in.

    Returns:
        A tuple ``(summary, docx_path, pdf_path)``. When there is nothing
        to summarize the result is ``("", None, None)``.
    """
    pasted = (text or "").strip()
    if pasted:
        source_text = pasted
    elif file is not None:
        source_text = extract_text(file)
    else:
        source_text = ""
    if not source_text or not source_text.strip():
        return "", None, None

    # Translate to English if needed
    if input_lang != "English":
        source_text = translate_to_english(source_text, input_lang)

    # (min_length, max_length) passed to the summarizer for each preset.
    length_presets = {
        "Short (1–2 sentences)": (20, 60),
        "Detailed (paragraph)": (80, 200),
    }
    min_len, max_len = length_presets.get(length, (40, 130))

    summaries = [
        summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # Guard against chunks that tokenize past the model's limit.
            truncation=True,
        )[0]['summary_text']
        for chunk in chunk_text(source_text)
        if chunk.strip()
    ]
    if not summaries:
        return "", None, None
    final_summary = "\n\n".join(summaries)

    # Translate from English to output language
    if output_lang != "English":
        final_summary = translate_from_english(final_summary, output_lang)

    # The download outputs are given file paths, so the in-memory
    # documents are written to temp files first.
    docx_path = _write_temp_download(generate_docx(final_summary), ".docx")
    pdf_path = _write_temp_download(generate_pdf(final_summary), ".pdf")
    return final_summary, docx_path, pdf_path
113
+
114
+ # Gradio interface
115
# Choices shared by the UI widgets below.
_LENGTH_CHOICES = [
    "Short (1–2 sentences)",
    "Medium (3–5 sentences)",
    "Detailed (paragraph)",
]
_LANGUAGE_CHOICES = ["English", "Hindi", "Marathi"]

# Input widgets, in the same order as summarize_input's parameters.
_input_components = [
    gr.Textbox(lines=8, label="Enter text (optional)"),
    gr.File(
        label="Upload file (.txt, .pdf, .docx)",
        file_types=[".pdf", ".docx", ".txt"],
    ),
    gr.Radio(
        list(_LENGTH_CHOICES),
        label="Summary length",
        value="Medium (3–5 sentences)",
    ),
    gr.Dropdown(
        list(_LANGUAGE_CHOICES),
        label="Document Language",
        value="English",
    ),
    gr.Dropdown(
        list(_LANGUAGE_CHOICES),
        label="Summary Output Language",
        value="English",
    ),
]

# Output widgets, in the same order as summarize_input's return values.
_output_components = [
    gr.Textbox(label="Summary"),
    gr.File(label="Download as DOCX"),
    gr.File(label="Download as PDF"),
]

iface = gr.Interface(
    fn=summarize_input,
    inputs=_input_components,
    outputs=_output_components,
    title="🌍 Multilingual Document Summarizer",
    description="Upload or paste a document in English, Hindi, or Marathi. App will translate if needed and summarize it into your chosen output language.",
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers>=4.40.0
2
+ torch>=2.1.0
3
+ gradio>=4.26.0
4
+ docx2txt==0.8
5
+ PyPDF2>=3.0.1
6
+ fpdf
7
+ python-docx