Azidan commited on
Commit
e6b80d2
·
verified ·
1 Parent(s): bb331f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +216 -98
app.py CHANGED
@@ -1,42 +1,59 @@
1
  import gradio as gr
2
  import re
3
- from transformers import pipeline, AutoTokenizer
4
  from PyPDF2 import PdfReader
5
  import tempfile
 
6
 
7
  # =========================
8
- # Model setup (CPU-safe)
9
  # =========================
10
- # Use smaller, faster models to speed up processing
11
- MODEL_NAME = "sshleifer/distilbart-cnn-6-6" # Smaller than 12-6, faster on CPU
12
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 
 
 
 
 
13
  summarizer = pipeline(
14
  "summarization",
15
- model=MODEL_NAME,
16
- tokenizer=tokenizer,
17
  device=-1 # CPU only
18
  )
19
 
20
- # Use smaller flan-t5-small for faster advice generation
21
- advice_generator = pipeline(
22
  "text2text-generation",
23
- model="google/flan-t5-small",
24
  device=-1 # CPU only
25
  )
26
 
27
- CHUNK_SIZE = 900 # safe margin under typical max input
 
 
 
 
 
 
 
 
 
 
28
 
29
  # =========================
30
  # Utilities
31
  # =========================
32
  def clean_text(text: str) -> str:
33
  """Fix quotes, spacing, repetition, broken punctuation."""
34
- text = text.replace("\u2018", "'").replace("\u2019", "'")
35
- text = text.replace("\u201C", '"').replace("\u201D", '"')
36
  text = re.sub(r"[.]{2,}", ".", text)
37
  text = re.sub(r"[']{2,}", "'", text)
38
  text = re.sub(r"\s+", " ", text)
39
- sentences = re.split(r'(?<=[.!?])\s+', text)
40
  seen = set()
41
  result = []
42
  for s in sentences:
@@ -46,7 +63,7 @@ def clean_text(text: str) -> str:
46
  result.append(s.strip())
47
  return " ".join(result)
48
 
49
- def chunk_text(text: str):
50
  """Token-aware chunking to avoid model overflow."""
51
  tokens = tokenizer.encode(text, add_special_tokens=False)
52
  chunks = []
@@ -56,114 +73,179 @@ def chunk_text(text: str):
56
  chunks.append(chunk_text)
57
  return chunks
58
 
59
- def generate_ai_advice(summary: str) -> str:
60
- """Generate personalized study advice based on the paper summary."""
61
- truncated_summary = summary[:1000]
62
-
63
- prompt = (
64
- f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
65
- "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
66
- "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
67
- "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
68
- "Output only the 5 tips as bullet points, nothing else."
69
- )
70
-
71
- generated = advice_generator(
72
- prompt,
73
- max_length=250,
74
- num_return_sequences=1,
75
- do_sample=False,
76
- temperature=0.7
77
- )[0]["generated_text"]
78
 
79
- # Try to clean into bullet points
80
- tips = [tip.strip() for tip in generated.split('\n') if tip.strip().startswith('-') or tip.strip()]
81
- if not tips or len(tips) < 3:
82
- tips = [t.strip() for t in generated.split('.') if t.strip()]
83
-
84
- advice_md = "\n\n---\n\n### 📚 AI-Generated Study Tips\n\n"
85
- for i, tip in enumerate(tips[:5], 1):
86
- clean_tip = tip.lstrip('- ').strip()
87
- advice_md += f"- {clean_tip}\n"
 
 
 
 
 
 
 
 
 
88
 
89
- advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
90
- return advice_md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def extract_possible_headings(text: str) -> str:
93
- """Attempt to extract potential titles and subtitles from raw text.
94
- This is a simple heuristic: short lines, all caps, or starting with numbers/sections."""
95
  lines = text.split('\n')
96
  headings = []
97
  for line in lines:
98
  stripped = line.strip()
99
- if stripped and (len(stripped) < 80) and (stripped.isupper() or re.match(r'^\d+\.?\s', stripped) or re.match(r'^[A-Z][a-z]+\s[A-Z]', stripped)):
 
 
 
 
 
100
  headings.append(stripped)
101
  if headings:
102
- return "### Extracted Possible Headings/Subtitles\n\n" + "\n- ".join([''] + headings) + "\n\n---\n\n"
103
  return ""
104
 
105
- def summarize_long_text(text: str, progress=gr.Progress()) -> str:
106
- """Summarize long text in chunks + add AI study advice.
107
- Now with longer summaries per chunk and formatted as bullet points."""
108
  if not text or len(text.strip()) == 0:
109
- return "No text provided."
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  progress(0, desc="Extracting headings...")
112
- # Extract possible headings first
113
  headings_section = extract_possible_headings(text)
114
 
115
  progress(0.1, desc="Chunking text...")
116
- chunks = chunk_text(text)
117
 
118
  summaries = []
119
  progress(0.2, desc="Summarizing chunks...")
 
 
 
 
 
120
  for i in progress.tqdm(range(len(chunks))):
121
  chunk = chunks[i]
122
  try:
 
 
 
123
  summary = summarizer(
124
  chunk,
125
- max_length=200, # Reduced slightly for speed (compromise between length and time)
126
- min_length=60, # Reduced for speed
127
- do_sample=False
 
128
  )[0]["summary_text"]
 
129
  cleaned = clean_text(summary)
130
- summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
131
- except Exception:
 
 
132
  pass # skip problematic chunks
133
 
134
- # Format summaries as bullet points
135
- summary_md = "### Detailed Summary (in Bullet Points)\n\n"
 
136
  for s in summaries:
137
  summary_md += f"- {s}\n"
138
 
139
- progress(0.8, desc="Generating AI advice...")
140
- ai_advice = generate_ai_advice(summary_md) # Use the bulleted summary for advice generation
141
 
142
  progress(1, desc="Done!")
143
- return headings_section + summary_md + ai_advice
144
 
145
  def read_pdf(file) -> str:
146
  """Safely extract text from PDF."""
147
  try:
148
  reader = PdfReader(file)
149
  pages = [page.extract_text() or "" for page in reader.pages]
150
- return "\n".join(pages) # Join with newlines to preserve line breaks for heading detection
151
  except Exception as e:
152
  return f"PDF read error: {str(e)}"
153
 
154
- # =========================
155
- # Download helper
156
- # =========================
157
  def create_download_file(content: str) -> str:
158
- """Create temporary file for Gradio file download component"""
159
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as tmp:
160
  tmp.write(content)
161
  return tmp.name
162
 
163
- # =========================
164
- # Main handler
165
- # =========================
166
- def process_input(text: str, file, progress=gr.Progress()):
167
  input_text = ""
168
 
169
  progress(0, desc="Reading input...")
@@ -174,7 +256,14 @@ def process_input(text: str, file, progress=gr.Progress()):
174
  else:
175
  return "Please paste some text or upload a PDF.", None
176
 
177
- result = summarize_long_text(input_text, progress)
 
 
 
 
 
 
 
178
  download_path = create_download_file(result)
179
 
180
  return result, download_path
@@ -182,46 +271,75 @@ def process_input(text: str, file, progress=gr.Progress()):
182
  # =========================
183
  # Gradio UI
184
  # =========================
185
- with gr.Blocks() as demo:
186
- gr.Markdown("# 📄 Long Text Summarizer + AI Study Assistant")
 
 
187
  gr.Markdown(
188
- " Handles very long documents (thousands of words)\n"
189
- "• Supports **PDF** upload or direct paste\n"
190
- "• Runs on CPU works on free hardware\n"
191
- "• Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
192
- "• Includes **5 AI-generated study tips** tailored to the content\n"
193
- "• Download result as .txt file\n"
194
- "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
195
  )
196
 
197
  with gr.Row():
198
- text_input = gr.Textbox(
199
- lines=10,
200
- label="Paste your text here (optional)",
201
- placeholder="Paste lecture notes, article, book chapter...",
202
- )
203
- file_input = gr.File(
204
- label="Or upload a PDF",
205
- file_types=[".pdf"]
206
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
209
 
210
  output = gr.Textbox(
211
- lines=16,
212
- label="Summary + AI-generated study advice",
213
  interactive=False
214
  )
215
 
216
  download_output = gr.File(
217
- label="Download full result (.txt)",
218
  interactive=False
219
  )
220
 
221
  summarize_btn.click(
222
  fn=process_input,
223
- inputs=[text_input, file_input],
224
  outputs=[output, download_output]
225
  )
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  demo.launch()
 
1
  import gradio as gr
2
  import re
3
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
+ import torch
7
 
8
# =========================
# Model setup (CPU-safe, Multi-language)
# =========================
# One multilingual summarizer serves both UI languages (mBART-50 covers
# English and Arabic); flan-t5-base handles the question-generation prompts.
SUMMARIZER_MODEL = "facebook/mbart-large-50-many-to-many-mmt"
QA_MODEL = "google/flan-t5-base"  # Better for question generation

print("Loading models... This may take a minute on first run.")

# Summarizer with mBART (supports Arabic). Tokenizer and model are loaded
# explicitly so the tokenizer can be reused for chunking and src_lang control.
summarizer_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL)
summarizer = pipeline(
    "summarization",
    model=summarizer_model,
    tokenizer=summarizer_tokenizer,
    device=-1,  # CPU only
)

# Question generator (text-to-text, prompted in English or Arabic).
question_generator = pipeline(
    "text2text-generation",
    model=QA_MODEL,
    device=-1,  # CPU only
)

# Conservative chunk size (tokens) to stay well under mBART's input limit.
CHUNK_SIZE = 512
35
+
36
+ # =========================
37
+ # Language Detection
38
+ # =========================
39
def detect_language(text: str) -> str:
    """Return an mBART language code via a simple script heuristic.

    Any character in the Arabic Unicode block (U+0600-U+06FF) marks the
    text as Arabic; otherwise English is assumed.
    """
    if re.search(r'[\u0600-\u06FF]', text):
        return "ar_AR"  # Arabic
    return "en_XX"  # English
45
 
46
  # =========================
47
  # Utilities
48
  # =========================
49
  def clean_text(text: str) -> str:
50
  """Fix quotes, spacing, repetition, broken punctuation."""
51
+ text = text.replace("\u2018", "'").replace("\u2019", "'")
52
+ text = text.replace("\u201C", '"').replace("\u201D", '"')
53
  text = re.sub(r"[.]{2,}", ".", text)
54
  text = re.sub(r"[']{2,}", "'", text)
55
  text = re.sub(r"\s+", " ", text)
56
+ sentences = re.split(r'(?<=[.!?؟])\s+', text) # Added Arabic question mark
57
  seen = set()
58
  result = []
59
  for s in sentences:
 
63
  result.append(s.strip())
64
  return " ".join(result)
65
 
66
+ def chunk_text(text: str, tokenizer):
67
  """Token-aware chunking to avoid model overflow."""
68
  tokens = tokenizer.encode(text, add_special_tokens=False)
69
  chunks = []
 
73
  chunks.append(chunk_text)
74
  return chunks
75
 
76
def generate_questions(summary: str, language: str) -> str:
    """Generate comprehension and critical-thinking questions from a summary.

    Args:
        summary: Markdown summary text; only the first 800 characters are
            sent to the generator to keep the CPU prompt short.
        language: mBART-style code, "ar_AR" or "en_XX"; selects the prompt,
            section headers, and fallback questions.

    Returns:
        A Markdown section with up to 7 numbered study questions, or an
        error note if generation raised.
    """
    truncated_summary = summary[:800]

    if language == "ar_AR":
        prompt = (
            f"اقرأ هذا الملخص: '{truncated_summary}'\n\n"
            "أنشئ 7 أسئلة مختلفة:\n"
            "- 3 أسئلة فهم (ماذا، من، أين)\n"
            "- 2 أسئلة تطبيقية (كيف يمكن استخدام هذا؟)\n"
            "- 2 أسئلة تحليلية (لماذا، ما العلاقة بين؟)\n"
            "اكتب الأسئلة فقط، كل سؤال في سطر جديد."
        )
    else:
        prompt = (
            f"Read this summary: '{truncated_summary}'\n\n"
            "Generate exactly 7 diverse questions:\n"
            "- 3 comprehension questions (What, Who, When, Where)\n"
            "- 2 application questions (How can this be used? What if?)\n"
            "- 2 analytical questions (Why, What's the relationship between?)\n"
            "Write only the questions, one per line, numbered 1-7."
        )

    try:
        generated = question_generator(
            prompt,
            max_length=400,
            num_return_sequences=1,
            do_sample=True,  # sampling (with temperature/top_p) varies questions per run
            temperature=0.8,
            top_p=0.9
        )[0]["generated_text"]

        # Parse model output: drop leading "1." / "2)" numbering and keep
        # lines ending in a question mark (Latin or Arabic) or long enough
        # to plausibly be a question.
        questions = []
        for line in generated.split('\n'):
            line = re.sub(r'^\d+[\.\)]\s*', '', line.strip())
            if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
                questions.append(line)

        if len(questions) < 3:
            # Fallback: generic questions. FIX: both languages now provide 7
            # entries — the Arabic list previously held only 5, inconsistent
            # with the "exactly 7" prompt contract and the English fallback.
            if language == "ar_AR":
                questions = [
                    "ما هي الفكرة الرئيسية في هذا النص؟",
                    "من هم الأشخاص أو الجهات الرئيسية المذكورة؟",
                    "كيف يمكن تطبيق هذه المعلومات في الحياة الواقعية؟",
                    "ما هي النقاط الأكثر أهمية في الملخص؟",
                    "لماذا هذا الموضوع مهم؟",
                    "ما العلاقة بين الأفكار الرئيسية المذكورة؟",
                    "ما هي الأسئلة التي ما زالت بحاجة إلى إجابة؟"
                ]
            else:
                questions = [
                    "What is the main idea of this text?",
                    "Who are the key people or entities mentioned?",
                    "How can this information be applied in real life?",
                    "What are the most important points in the summary?",
                    "Why is this topic significant?",
                    "What connections can you make to other knowledge?",
                    "What questions remain unanswered?"
                ]

        # Format as a numbered Markdown list capped at 7 questions.
        header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
        questions_md = header
        for i, q in enumerate(questions[:7], 1):
            questions_md += f"{i}. {q}\n"

        footer = "\n**Tip**: Answer these questions without looking at the text to test your understanding!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة على هذه الأسئلة دون النظر إلى النص لاختبار فهمك!"
        questions_md += footer

        return questions_md
    except Exception as e:
        # Best-effort feature: surface the failure in the document rather
        # than crashing the whole summarization request.
        return f"\n\n---\n\nError generating questions: {str(e)}\n"
152
 
153
def extract_possible_headings(text: str) -> str:
    """Attempt to extract potential titles and subtitles from raw text.

    Heuristics: a heading is a short line (< 80 chars) that is ALL CAPS,
    starts with a section number ("1." / "2 "), looks like Title Case,
    or begins with a chapter/section keyword (English or Arabic).

    Returns a Markdown section listing up to 10 candidates, or "" if none.
    """
    lines = text.split('\n')
    headings = []
    for line in lines:
        stripped = line.strip()
        if stripped and (len(stripped) < 80) and (
            stripped.isupper() or
            re.match(r'^\d+\.?\s', stripped) or
            re.match(r'^[A-Z][a-z]+\s[A-Z]', stripped) or
            # BUG FIX: the original pattern r'^[الفصل|Chapter|Section]' was a
            # character CLASS, matching ANY single one of those characters
            # (so e.g. any line starting with 'c', 'h', or 'S' qualified).
            # Use alternation with a word boundary instead.
            re.match(r'^(الفصل|Chapter|Section)\b', stripped, re.IGNORECASE)
        ):
            headings.append(stripped)
    if headings:
        return "### 📋 Extracted Headings\n\n" + "\n- ".join([''] + headings[:10]) + "\n\n---\n\n"
    return ""
169
 
170
def summarize_long_text(text: str, summary_length: str, language: str, progress=gr.Progress()) -> str:
    """Summarize long text chunk-by-chunk, then append study questions.

    Args:
        text: Raw input text (pasted or extracted from a PDF).
        summary_length: UI label ("Short (25%)", "Medium (50%)", ... in
            English or Arabic); mapped to per-chunk generation limits.
        language: mBART code "en_XX" or "ar_AR"; controls generation
            language and the Markdown labels.
        progress: Gradio progress tracker (Gradio injects a fresh tracker
            per request, so the default instance is not shared state).

    Returns:
        Markdown: extracted headings + bulleted chunk summaries + questions.
    """
    if not text or len(text.strip()) == 0:
        return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."

    # Map the UI length choice (English or Arabic label) to generation limits;
    # unknown labels fall back to the medium setting.
    length_map = {
        "Short (25%)": {"max": 150, "min": 40},
        "Medium (50%)": {"max": 250, "min": 80},
        "Long (75%)": {"max": 400, "min": 120},
        "قصير (25%)": {"max": 150, "min": 40},
        "متوسط (50%)": {"max": 250, "min": 80},
        "طويل (75%)": {"max": 400, "min": 120}
    }
    length_params = length_map.get(summary_length, {"max": 250, "min": 80})

    progress(0, desc="Extracting headings...")
    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text, summarizer_tokenizer)

    summaries = []
    progress(0.2, desc="Summarizing chunks...")

    # Keep source and target languages identical so mBART's forced BOS token
    # makes the output stay in the input language.
    # NOTE(review): mbart-large-50-many-to-many-mmt is a translation
    # fine-tune, not a summarization one — verify output quality for this task.
    src_lang = language
    tgt_lang = language

    # FIX: hoist the loop-invariant chunk-label word out of the loop
    # (previously the language check ran on every iteration).
    label = "Chunk" if language == "en_XX" else "الجزء"

    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            # mBART needs the source language set on the tokenizer.
            summarizer_tokenizer.src_lang = src_lang

            summary = summarizer(
                chunk,
                max_length=length_params["max"],
                min_length=length_params["min"],
                do_sample=False,
                forced_bos_token_id=summarizer_tokenizer.lang_code_to_id[tgt_lang]
            )[0]["summary_text"]

            cleaned = clean_text(summary)
            summaries.append(f"**{label} {i+1}:** {cleaned}")
        except Exception as e:
            # Skip problematic chunks but leave a trace in the logs.
            # (FIX: removed the dead `pass` that followed this print.)
            print(f"Error in chunk {i}: {str(e)}")

    # Format summaries as a bulleted Markdown list.
    header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
    summary_md = header
    for s in summaries:
        summary_md += f"- {s}\n"

    progress(0.8, desc="Generating questions...")
    questions = generate_questions(summary_md, language)

    progress(1, desc="Done!")
    return headings_section + summary_md + questions
232
 
233
def read_pdf(file) -> str:
    """Safely extract text from PDF.

    Pages with no extractable text contribute an empty string; pages are
    joined with newlines so heading detection can see line boundaries.
    On failure the error is returned as text rather than raised.
    """
    try:
        extracted = []
        for page in PdfReader(file).pages:
            extracted.append(page.extract_text() or "")
        return "\n".join(extracted)
    except Exception as e:
        return f"PDF read error: {str(e)}"
241
 
 
 
 
242
def create_download_file(content: str) -> str:
    """Write *content* to a persistent temp .txt file and return its path.

    The file is created with delete=False so Gradio's File component can
    serve it after this function returns.
    """
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    )
    with tmp:
        tmp.write(content)
    return tmp.name
247
 
248
+ def process_input(text: str, file, summary_length: str, language: str, progress=gr.Progress()):
 
 
 
249
  input_text = ""
250
 
251
  progress(0, desc="Reading input...")
 
256
  else:
257
  return "Please paste some text or upload a PDF.", None
258
 
259
+ # Auto-detect language if not specified
260
+ if language == "Auto-detect":
261
+ detected_lang = detect_language(input_text)
262
+ language = detected_lang
263
+ else:
264
+ language = "ar_AR" if "Arabic" in language or "عربي" in language else "en_XX"
265
+
266
+ result = summarize_long_text(input_text, summary_length, language, progress)
267
  download_path = create_download_file(result)
268
 
269
  return result, download_path
 
271
  # =========================
272
  # Gradio UI
273
  # =========================
274
# Build the two-column Gradio interface and wire the button to process_input.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Bilingual page titles (English, then Arabic).
    gr.Markdown("# 📄 Multilingual Text Summarizer + Study Assistant")
    gr.Markdown("# ملخص النصوص متعدد اللغات + مساعد الدراسة")

    # Feature overview shown above the inputs.
    gr.Markdown(
        "### Features / المميزات:\n"
        "• **English & Arabic support** / دعم اللغة العربية والإنجليزية\n"
        "• **PDF upload** / رفع ملفات PDF\n"
        "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
        "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
        "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
        "⚠️ **Note**: First run may take 2-3 minutes to load models. Be patient!"
    )

    with gr.Row():
        # Left column: the text / PDF inputs.
        with gr.Column():
            text_input = gr.Textbox(
                label="📝 Paste your text / الصق نصك هنا",
                placeholder="Paste lecture notes, article, research paper...\nالصق ملاحظات المحاضرة، مقال، ورقة بحثية...",
                lines=10,
            )
            file_input = gr.File(
                label="📎 Or upload PDF / أو ارفع ملف PDF",
                file_types=[".pdf"],
            )

        # Right column: language and summary-length options.
        with gr.Column():
            language_choice = gr.Radio(
                label="🌐 Language / اللغة",
                choices=["Auto-detect", "English", "Arabic / عربي"],
                value="Auto-detect",
            )
            length_choice = gr.Radio(
                label="📏 Summary Length / طول الملخص",
                choices=["Short (25%)", "Medium (50%)", "Long (75%)"],
                value="Medium (50%)",
                info="Short = concise, Long = detailed",
            )

    summarize_btn = gr.Button(
        "Summarize & Generate Questions", variant="primary", size="lg"
    )

    # Results: read-only text area plus a downloadable .txt copy.
    output = gr.Textbox(
        label="📋 Summary + Study Questions / الملخص + الأسئلة الدراسية",
        lines=20,
        interactive=False,
    )
    download_output = gr.File(
        label="💾 Download Result (.txt) / تحميل النتيجة",
        interactive=False,
    )

    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input, length_choice, language_choice],
        outputs=[output, download_output],
    )

    # Bilingual usage tips shown below the interface.
    gr.Markdown(
        "---\n"
        "### Tips for best results:\n"
        "• For Arabic text, select 'Arabic' language for better results\n"
        "• Longer texts work better (500+ words)\n"
        "• PDF quality affects extraction - clear text works best\n\n"
        "### نصائح لأفضل النتائج:\n"
        "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
        "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
        "• جودة PDF تؤثر على الاستخراج - النص الواضح يعمل بشكل أفضل"
    )

demo.launch()