Azidan committed on
Commit
5dbff08
·
verified ·
1 Parent(s): ecd8def

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -108
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import re
3
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
  import torch
@@ -8,29 +8,29 @@ import torch
8
  # =========================
9
  # Model setup (CPU-safe, Multi-language)
10
  # =========================
11
- # Use T5-based models that support text2text-generation
12
- EN_SUMMARIZER_MODEL = "google/flan-t5-base" # English - works with text2text
13
- AR_SUMMARIZER_MODEL = "csebuetnlp/mT5_multilingual_XLSum" # Multilingual (includes Arabic)
14
- QA_MODEL = "google/flan-t5-small" # Question generation
15
-
16
  print("Loading models... This may take a minute on first run.")
17
 
18
- # English summarizer using text2text-generation
19
- en_tokenizer = AutoTokenizer.from_pretrained(EN_SUMMARIZER_MODEL)
20
- en_model = AutoModelForSeq2SeqLM.from_pretrained(EN_SUMMARIZER_MODEL)
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Multilingual summarizer (for Arabic and other languages)
23
- ar_tokenizer = AutoTokenizer.from_pretrained(AR_SUMMARIZER_MODEL)
24
- ar_model = AutoModelForSeq2SeqLM.from_pretrained(AR_SUMMARIZER_MODEL)
25
 
26
- # Question generator
27
- question_generator = pipeline(
28
- "text2text-generation",
29
- model=QA_MODEL,
30
- device=-1 # CPU only
31
- )
32
 
33
- CHUNK_SIZE = 400 # Conservative chunk size for T5 models
34
 
35
  # =========================
36
  # Language Detection
@@ -39,8 +39,8 @@ def detect_language(text: str) -> str:
39
  """Simple heuristic: detect if text contains Arabic characters."""
40
  arabic_pattern = re.compile(r'[\u0600-\u06FF]')
41
  if arabic_pattern.search(text):
42
- return "ar_AR" # Arabic
43
- return "en_XX" # English
44
 
45
  # =========================
46
  # Utilities
@@ -52,7 +52,7 @@ def clean_text(text: str) -> str:
52
  text = re.sub(r"[.]{2,}", ".", text)
53
  text = re.sub(r"[']{2,}", "'", text)
54
  text = re.sub(r"\s+", " ", text)
55
- sentences = re.split(r'(?<=[.!?؟])\s+', text) # Added Arabic question mark
56
  seen = set()
57
  result = []
58
  for s in sentences:
@@ -64,7 +64,6 @@ def clean_text(text: str) -> str:
64
 
65
  def chunk_text(text: str, language: str):
66
  """Token-aware chunking to avoid model overflow."""
67
- # Use appropriate tokenizer based on language
68
  tokenizer = ar_tokenizer if language == "ar_AR" else en_tokenizer
69
 
70
  tokens = tokenizer.encode(text, add_special_tokens=False)
@@ -75,85 +74,113 @@ def chunk_text(text: str, language: str):
75
  chunks.append(chunk_text)
76
  return chunks
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def generate_questions(summary: str, language: str) -> str:
79
- """Generate comprehension and critical thinking questions based on the summary."""
80
- truncated_summary = summary[:800]
81
 
82
  if language == "ar_AR":
83
  prompt = (
84
- f"اقرأ هذا الملخص: '{truncated_summary}'\n\n"
85
- "أنشئ 7 أسئلة مختلفة:\n"
86
- "- 3 أسئلة فهم (ماذا، من، أين)\n"
87
- "- 2 أسئلة تطبيقية (كيف يمكن استخدام هذا؟)\n"
88
- "- 2 أسئلة تحليلية (لماذا، ما العلاقة بين؟)\n"
89
- "اكتب الأسئلة فقط، كل سؤال في سطر جديد."
90
  )
91
  else:
92
  prompt = (
93
- f"Read this summary: '{truncated_summary}'\n\n"
94
- "Generate exactly 7 diverse questions:\n"
95
- "- 3 comprehension questions (What, Who, When, Where)\n"
96
- "- 2 application questions (How can this be used? What if?)\n"
97
- "- 2 analytical questions (Why, What's the relationship between?)\n"
98
- "Write only the questions, one per line, numbered 1-7."
99
  )
100
 
101
  try:
102
- generated = question_generator(
103
- prompt,
104
- max_length=400,
105
- num_return_sequences=1,
106
- do_sample=True,
107
- temperature=0.8,
108
- top_p=0.9
109
- )[0]["generated_text"]
 
 
110
 
111
  # Parse questions
112
  questions = []
113
- lines = generated.split('\n')
114
- for line in lines:
115
  line = line.strip()
116
- # Remove numbering if present
117
  line = re.sub(r'^\d+[\.\)]\s*', '', line)
118
  if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
119
  questions.append(line)
120
 
 
121
  if not questions or len(questions) < 3:
122
- # Fallback: generate basic questions
123
  if language == "ar_AR":
124
  questions = [
125
  "ما هي الفكرة الرئيسية في هذا النص؟",
126
- "من هم الأشخاص أو الجهات الرئيسية المذكورة؟",
127
- "كيف يمكن تطبيق هذه المعلومات في الحياة الواقعية؟",
128
- "ما هي النقاط الأكثر أهمية في الملخص؟",
129
- "لماذا هذا الموضوع مهم؟"
130
  ]
131
  else:
132
  questions = [
133
  "What is the main idea of this text?",
134
- "Who are the key people or entities mentioned?",
135
- "How can this information be applied in real life?",
136
- "What are the most important points in the summary?",
137
- "Why is this topic significant?",
138
- "What connections can you make to other knowledge?",
139
  "What questions remain unanswered?"
140
  ]
141
 
142
- # Format questions
143
  header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
144
  questions_md = header
145
  for i, q in enumerate(questions[:7], 1):
146
  questions_md += f"{i}. {q}\n"
147
 
148
- footer = "\n**Tip**: Answer these questions without looking at the text to test your understanding!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة على هذه الأسئلة دون النظر إلى النص لاختبار فهمك!"
149
  questions_md += footer
150
 
151
  return questions_md
152
  except Exception as e:
153
- return f"\n\n---\n\nError generating questions: {str(e)}\n"
 
154
 
155
  def extract_possible_headings(text: str) -> str:
156
- """Attempt to extract potential titles and subtitles from raw text."""
157
  lines = text.split('\n')
158
  headings = []
159
  for line in lines:
@@ -174,17 +201,17 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
174
  if not text or len(text.strip()) == 0:
175
  return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."
176
 
177
- # Length mapping (for T5 models, these are approximate)
178
  length_map = {
179
  "Short (25%)": {"max": 128, "min": 30},
180
- "Medium (50%)": {"max": 256, "min": 60},
181
- "Long (75%)": {"max": 400, "min": 100},
182
  "قصير (25%)": {"max": 128, "min": 30},
183
- "متوسط (50%)": {"max": 256, "min": 60},
184
- "طويل (75%)": {"max": 400, "min": 100}
185
  }
186
 
187
- length_params = length_map.get(summary_length, {"max": 256, "min": 60})
188
 
189
  progress(0, desc="Extracting headings...")
190
  headings_section = extract_possible_headings(text)
@@ -195,41 +222,16 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
195
  summaries = []
196
  progress(0.2, desc="Summarizing chunks...")
197
 
198
- for i in progress.tqdm(range(len(chunks))):
 
199
  chunk = chunks[i]
200
- try:
201
- if language == "ar_AR":
202
- # Use mT5 for Arabic with direct model inference
203
- inputs = ar_tokenizer(chunk, return_tensors="pt", max_length=512, truncation=True)
204
- summary_ids = ar_model.generate(
205
- inputs["input_ids"],
206
- max_length=length_params["max"],
207
- min_length=length_params["min"],
208
- length_penalty=2.0,
209
- num_beams=4,
210
- early_stopping=True
211
- )
212
- summary = ar_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
213
- else:
214
- # Use FLAN-T5 for English with summarization prompt
215
- prompt = f"Summarize the following text in detail:\n\n{chunk}"
216
- inputs = en_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
217
- summary_ids = en_model.generate(
218
- inputs["input_ids"],
219
- max_length=length_params["max"],
220
- min_length=length_params["min"],
221
- num_beams=4,
222
- early_stopping=True
223
- )
224
- summary = en_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
225
-
226
- cleaned = clean_text(summary)
227
- if cleaned: # Only add non-empty summaries
228
- chunk_label = f"**Chunk {i+1}:**" if language == "en_XX" else f"**الجزء {i+1}:**"
229
- summaries.append(f"{chunk_label} {cleaned}")
230
- except Exception as e:
231
- print(f"Error in chunk {i}: {str(e)}")
232
- continue # skip problematic chunks
233
 
234
  # Format summaries
235
  header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
@@ -247,7 +249,7 @@ def summarize_long_text(text: str, summary_length: str, language: str, progress=
247
  return headings_section + summary_md + questions
248
 
249
  def read_pdf(file) -> str:
250
- """Safely extract text from PDF."""
251
  try:
252
  reader = PdfReader(file)
253
  pages = [page.extract_text() or "" for page in reader.pages]
@@ -298,7 +300,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
298
  "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
299
  "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
300
  "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
301
- "⚠️ **Note**: First run may take 2-3 minutes to load models. Be patient!"
302
  )
303
 
304
  with gr.Row():
@@ -349,13 +351,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
349
  gr.Markdown(
350
  "---\n"
351
  "### Tips for best results:\n"
352
- "• For Arabic text, select 'Arabic' language for better results\n"
353
  "• Longer texts work better (500+ words)\n"
354
- "• PDF quality affects extraction - clear text works best\n\n"
355
  "### نصائح لأفضل النتائج:\n"
356
  "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
357
  "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
358
- "• جودة PDF تؤثر على الاستخراج - النص الواضح يعمل بشكل أفضل"
359
  )
360
 
361
- demo.launch()
 
 
1
  import gradio as gr
2
  import re
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  from PyPDF2 import PdfReader
5
  import tempfile
6
  import torch
 
8
  # =========================
9
  # Model setup (CPU-safe, Multi-language)
10
  # =========================
 
 
 
 
 
11
print("Loading models... This may take a minute on first run.")

# Use T5 models - load directly without pipeline
EN_MODEL_NAME = "google/flan-t5-base"
AR_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
QA_MODEL_NAME = "google/flan-t5-small"

# English summarizer (instruction-tuned FLAN-T5).
print("Loading English model...")
en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME)
en_model = AutoModelForSeq2SeqLM.from_pretrained(EN_MODEL_NAME)

# Arabic / multilingual summarizer (mT5 fine-tuned on XLSum).
print("Loading Arabic model...")
ar_tokenizer = AutoTokenizer.from_pretrained(AR_MODEL_NAME)
ar_model = AutoModelForSeq2SeqLM.from_pretrained(AR_MODEL_NAME)

# Question generator (small FLAN-T5 keeps CPU latency manageable).
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_NAME)

# Token budget per chunk when splitting long documents.
CHUNK_SIZE = 400

print("Models loaded successfully!")
34
 
35
  # =========================
36
  # Language Detection
 
39
  """Simple heuristic: detect if text contains Arabic characters."""
40
  arabic_pattern = re.compile(r'[\u0600-\u06FF]')
41
  if arabic_pattern.search(text):
42
+ return "ar_AR"
43
+ return "en_XX"
44
 
45
  # =========================
46
  # Utilities
 
52
  text = re.sub(r"[.]{2,}", ".", text)
53
  text = re.sub(r"[']{2,}", "'", text)
54
  text = re.sub(r"\s+", " ", text)
55
+ sentences = re.split(r'(?<=[.!?؟])\s+', text)
56
  seen = set()
57
  result = []
58
  for s in sentences:
 
64
 
65
  def chunk_text(text: str, language: str):
66
  """Token-aware chunking to avoid model overflow."""
 
67
  tokenizer = ar_tokenizer if language == "ar_AR" else en_tokenizer
68
 
69
  tokens = tokenizer.encode(text, add_special_tokens=False)
 
74
  chunks.append(chunk_text)
75
  return chunks
76
 
77
def generate_summary(text: str, language: str, max_length: int, min_length: int) -> str:
    """Summarize *text* with the model matching *language*.

    Args:
        text: Raw text to summarize (input truncated to 512 tokens).
        language: "ar_AR" routes to the multilingual XLSum model; any
            other value routes to FLAN-T5 with an instruction prompt.
        max_length: Maximum length (in tokens) of the generated summary.
        min_length: Minimum length (in tokens) of the generated summary.

    Returns:
        The cleaned summary string, or "" if generation fails.
    """
    try:
        if language == "ar_AR":
            # XLSum is a dedicated summarization model: feed the text directly.
            tokenizer, model = ar_tokenizer, ar_model
            inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
            extra_kwargs = {"length_penalty": 2.0}
        else:
            # FLAN-T5 is instruction-tuned: wrap the text in a summarization prompt.
            tokenizer, model = en_tokenizer, en_model
            prompt = f"Summarize this text in detail:\n\n{text}"
            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
            extra_kwargs = {}

        with torch.no_grad():
            summary_ids = model.generate(
                inputs["input_ids"],
                # BUGFIX: pass the attention mask so padded positions are
                # ignored; omitting it triggers warnings and can degrade
                # output quality.
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                min_length=min_length,
                num_beams=4,
                early_stopping=True,
                **extra_kwargs,
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return clean_text(summary)
    except Exception as e:
        # Best-effort: a failed chunk must not abort the whole document.
        print(f"Error generating summary: {str(e)}")
        return ""
111
+
112
def generate_questions(summary: str, language: str) -> str:
    """Generate study questions for *summary*, formatted as Markdown.

    Args:
        summary: Summary text to base the questions on; only the first
            600 characters are used to stay within the model's input budget.
        language: "ar_AR" for an Arabic prompt/fallback, anything else
            (expected "en_XX") for English.

    Returns:
        A Markdown section with up to 7 numbered questions and a tip
        footer, or a short error note if generation fails entirely.
    """
    truncated_summary = summary[:600]

    if language == "ar_AR":
        prompt = (
            f"اقرأ هذا النص: {truncated_summary}\n\n"
            "اكتب 5 أسئلة مهمة عن هذا النص. كل سؤال في سطر جديد."
        )
    else:
        prompt = (
            f"Read this text: {truncated_summary}\n\n"
            "Write 5 important questions about this text. One question per line."
        )

    try:
        inputs = qa_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():
            question_ids = qa_model.generate(
                inputs["input_ids"],
                # Pass the attention mask so padded positions are ignored.
                attention_mask=inputs["attention_mask"],
                max_length=300,
                num_beams=4,
                early_stopping=True,
                # BUGFIX: removed temperature=0.8 — temperature only
                # applies when do_sample=True and is ignored (with a
                # warning) under pure beam search.
            )
        generated = qa_tokenizer.decode(question_ids[0], skip_special_tokens=True)

        # Parse: strip leading numbering, keep lines that look like
        # questions (end in ?/؟) or are long enough to plausibly be one.
        questions = []
        for line in generated.split('\n'):
            line = line.strip()
            line = re.sub(r'^\d+[\.\)]\s*', '', line)
            if line and (line.endswith('?') or line.endswith('؟') or len(line) > 10):
                questions.append(line)

        # Fallback: canned questions when the model produced too few usable lines.
        if not questions or len(questions) < 3:
            if language == "ar_AR":
                questions = [
                    "ما هي الفكرة الرئيسية في هذا النص؟",
                    "ما هي النقاط المهمة المذكورة؟",
                    "كيف يمكن تطبيق هذه المعلومات؟",
                    "لماذا هذا الموضوع مهم؟",
                    "ما هي الاستنتاجات الرئيسية؟"
                ]
            else:
                questions = [
                    "What is the main idea of this text?",
                    "What are the key points mentioned?",
                    "How can this information be applied?",
                    "Why is this topic important?",
                    "What are the main conclusions?",
                    "What connections can you make to other topics?",
                    "What questions remain unanswered?"
                ]

        # Format as a numbered Markdown list, capped at 7 questions.
        header = "\n\n---\n\n### 🤔 Study Questions\n\n" if language == "en_XX" else "\n\n---\n\n### 🤔 أسئلة للدراسة\n\n"
        questions_md = header
        for i, q in enumerate(questions[:7], 1):
            questions_md += f"{i}. {q}\n"

        footer = "\n**Tip**: Try to answer these without looking at the text!" if language == "en_XX" else "\n**نصيحة**: حاول الإجابة دون النظر إلى النص!"
        questions_md += footer

        return questions_md
    except Exception as e:
        # Question generation is optional: never fail the summarization
        # pipeline over it.
        print(f"Error generating questions: {str(e)}")
        return "\n\n---\n\nUnable to generate questions.\n"
181
 
182
  def extract_possible_headings(text: str) -> str:
183
+ """Extract potential titles and subtitles from raw text."""
184
  lines = text.split('\n')
185
  headings = []
186
  for line in lines:
 
201
  if not text or len(text.strip()) == 0:
202
  return "No text provided." if language == "en_XX" else "لم يتم تقديم نص."
203
 
204
+ # Length mapping
205
  length_map = {
206
  "Short (25%)": {"max": 128, "min": 30},
207
+ "Medium (50%)": {"max": 200, "min": 50},
208
+ "Long (75%)": {"max": 300, "min": 80},
209
  "قصير (25%)": {"max": 128, "min": 30},
210
+ "متوسط (50%)": {"max": 200, "min": 50},
211
+ "طويل (75%)": {"max": 300, "min": 80}
212
  }
213
 
214
+ length_params = length_map.get(summary_length, {"max": 200, "min": 50})
215
 
216
  progress(0, desc="Extracting headings...")
217
  headings_section = extract_possible_headings(text)
 
222
  summaries = []
223
  progress(0.2, desc="Summarizing chunks...")
224
 
225
+ total_chunks = len(chunks)
226
+ for i in range(total_chunks):
227
  chunk = chunks[i]
228
+ progress((0.2 + 0.6 * i / total_chunks), desc=f"Summarizing chunk {i+1}/{total_chunks}...")
229
+
230
+ summary = generate_summary(chunk, language, length_params["max"], length_params["min"])
231
+
232
+ if summary:
233
+ chunk_label = f"**Chunk {i+1}:**" if language == "en_XX" else f"**الجزء {i+1}:**"
234
+ summaries.append(f"{chunk_label} {summary}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
  # Format summaries
237
  header = "### 📝 Detailed Summary\n\n" if language == "en_XX" else "### 📝 ملخص تفصيلي\n\n"
 
249
  return headings_section + summary_md + questions
250
 
251
  def read_pdf(file) -> str:
252
+ """Extract text from PDF."""
253
  try:
254
  reader = PdfReader(file)
255
  pages = [page.extract_text() or "" for page in reader.pages]
 
300
  "• **Adjustable summary length** / طول ملخص قابل للتعديل\n"
301
  "• **Intelligent study questions** / أسئلة دراسية ذكية\n"
302
  "• **Free CPU-compatible** / يعمل على المعالج المجاني\n\n"
303
+ "⚠️ **Note**: First run may take 2-3 minutes to load models. Processing is slower on CPU."
304
  )
305
 
306
  with gr.Row():
 
351
  gr.Markdown(
352
  "---\n"
353
  "### Tips for best results:\n"
354
+ "• For Arabic text, select 'Arabic' for better results\n"
355
  "• Longer texts work better (500+ words)\n"
356
+ "• Processing may take 30-60 seconds on CPU\n\n"
357
  "### نصائح لأفضل النتائج:\n"
358
  "• للنصوص العربية، اختر 'عربي' للحصول على نتائج أفضل\n"
359
  "• النصوص الأطول تعمل بشكل أفضل (500+ كلمة)\n"
360
+ "• قد تستغرق المعالجة 30-60 ثانية على CPU"
361
  )
362
 
363
+ if __name__ == "__main__":
364
+ demo.launch()