Leen172 commited on
Commit
262640f
·
verified ·
1 Parent(s): 641ee3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +464 -432
app.py CHANGED
@@ -1,432 +1,464 @@
1
- # app.py
2
- # -*- coding: utf-8 -*-
3
-
4
- import os, io, json, uuid, random, textwrap, unicodedata, tempfile
5
- from dataclasses import dataclass
6
- from typing import List, Tuple
7
-
8
- import pandas as pd
9
- from PIL import Image
10
- from pypdf import PdfReader
11
- import fitz # PyMuPDF
12
-
13
- import torch
14
- from transformers import pipeline
15
- from tqdm import tqdm
16
-
17
- import regex as re2
18
- import yake
19
- import gradio as gr
20
-
21
- # -----------------------------
22
- # ثوابت عامة
23
- # -----------------------------
24
- random.seed(42)
25
- DEFAULT_LANG = "ar"
26
- DEFAULT_NUM_QUESTIONS = 8
27
- DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed" # أخف من large
28
- DEFAULT_TROCR_ZOOM = 2.8
29
-
30
- # -----------------------------
31
- # 2) دوال استخراج النص من PDF
32
- # -----------------------------
33
- def extract_text_with_pypdf(pdf_path: str) -> str:
34
- reader = PdfReader(pdf_path)
35
- texts = []
36
- for i, page in enumerate(reader.pages):
37
- try:
38
- t = page.extract_text() or ""
39
- except Exception:
40
- t = ""
41
- texts.append(t)
42
- return "\n".join(texts).strip()
43
-
44
- def pdf_pages_to_images(pdf_path: str, zoom: float = 2.5) -> List[Image.Image]:
45
- doc = fitz.open(pdf_path)
46
- imgs = []
47
- mat = fitz.Matrix(zoom, zoom)
48
- for page in doc:
49
- pix = page.get_pixmap(matrix=mat, alpha=False)
50
- img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
51
- imgs.append(img)
52
- doc.close()
53
- return imgs
54
-
55
- def extract_text_with_ocr(pdf_path: str, model_id: str, zoom: float = 2.5, disable_tqdm: bool = True) -> str:
56
- device = 0 if torch.cuda.is_available() else -1
57
- ocr = pipeline("image-to-text", model=model_id, device=device)
58
-
59
- images = pdf_pages_to_images(pdf_path, zoom=zoom)
60
- page_texts = []
61
- pbar = tqdm(images, desc="TrOCR", unit="p", disable=disable_tqdm)
62
- for idx, img in enumerate(pbar):
63
- try:
64
- out = ocr(img)
65
- txt = out[0]["generated_text"].strip() if out and "generated_text" in out[0] else ""
66
- except Exception:
67
- txt = ""
68
- page_texts.append(f"--- [Page {idx+1}] ---\n{txt}")
69
- return "\n\n".join(page_texts).strip()
70
-
71
- def is_extraction_good(text: str, min_chars: int = 250, min_alpha_ratio: float = 0.15) -> bool:
72
- if len(text) < min_chars:
73
- return False
74
- alnum = sum(ch.isalnum() for ch in text)
75
- ratio = alnum / max(1, len(text))
76
- return ratio >= min_alpha_ratio
77
-
78
- def save_text(text: str, out_path: str) -> None:
79
- os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
80
- with open(out_path, "w", encoding="utf-8") as f:
81
- f.write(text)
82
-
83
- def pdf_to_txt(pdf_path: str, out_txt_path: str = None,
84
- ocr_model: str = DEFAULT_TROCR_MODEL,
85
- ocr_zoom: float = DEFAULT_TROCR_ZOOM) -> Tuple[str, str, str]:
86
- assert os.path.isfile(pdf_path), f"File not found: {pdf_path}"
87
-
88
- embedded_text = extract_text_with_pypdf(pdf_path)
89
- if is_extraction_good(embedded_text):
90
- final_text = embedded_text
91
- method = "embedded (pypdf)"
92
- else:
93
- final_text = extract_text_with_ocr(pdf_path, model_id=ocr_model, zoom=ocr_zoom)
94
- method = "OCR (Hugging Face TrOCR)"
95
-
96
- if out_txt_path is None:
97
- base, _ = os.path.splitext(pdf_path)
98
- out_txt_path = base + ".txt"
99
-
100
- header = f"[[ Extraction method: {method} ]]\n\n"
101
- save_text(header + final_text, out_txt_path)
102
- return final_text, out_txt_path, method
103
-
104
- # -----------------------------
105
- # 3) تصحيح/تطبيع النص العربي
106
- # -----------------------------
107
- def strip_page_headers(text: str) -> str:
108
- lines = text.splitlines()
109
- out = []
110
- for ln in lines:
111
- if re2.match(r"^\s*--- \[Page \d+\] ---\s*$", ln):
112
- continue
113
- if re2.match(r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$", ln):
114
- continue
115
- if re2.match(r"^\s*[-–—_*]{3,}\s*$", ln):
116
- continue
117
- out.append(ln)
118
- return "\n".join(out)
119
-
120
- AR_DIAC = r"[ًٌٍَُِّْ]"
121
- def normalize_arabic(text: str) -> str:
122
- text = unicodedata.normalize("NFKC", text)
123
- text = re2.sub(r"[ـ]", "", text)
124
- text = re2.sub(AR_DIAC, "", text)
125
- text = re2.sub(r"[إأآا]", "ا", text)
126
- text = re2.sub(r"[يى]", "ي", text)
127
- text = re2.sub(r"\s+", " ", text)
128
- return text.strip()
129
-
130
- def arabic_ocr_fixes(text: str) -> str:
131
- fixes = {
132
- " الصطناعي": " الاصطناعي",
133
- "صطناعي": "اصطناعي",
134
- "التعل م": "التعلم",
135
- "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
136
- "ذكاء صطناعي": "ذكاء اصطناعي",
137
- "الذكاء الاصطناعي.": "الذكاء الاصطناعي.",
138
- "التعليم ": "التعليم ",
139
- " مع غني": " غني",
140
- "مع غني ": " غني ",
141
- " غير المشبعة": " غي��ُ المشبعة",
142
- }
143
- for wrong, right in fixes.items():
144
- text = text.replace(wrong, right)
145
- return text
146
-
147
- def postprocess_text(raw_text: str, lang: str = "ar") -> str:
148
- t = strip_page_headers(raw_text)
149
- t = t.replace("\r", "\n")
150
- t = re2.sub(r"\n{3,}", "\n\n", t)
151
- t = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", t)
152
- t = re2.sub(r"\[\d+\]", " ", t)
153
-
154
- if lang == "ar":
155
- t = normalize_arabic(t)
156
- t = arabic_ocr_fixes(t)
157
- return t
158
-
159
- # -----------------------------
160
- # 4) YAKE + تقسيم الجمل
161
- # -----------------------------
162
- SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
163
- AR_STOP = set("""
164
- في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي اللواتي اللواتيا أو أم إن أن كان تكون كانوا كانت كنت كنا كانا كانتِ ثم قد لقد ربما بل لكن لكنَّ إلا سوى حتى حيث كما لما لماّ لماَّ لماً ما ماذا لماذا متى أين كيف أي أيّ أيُّ هناك هنا هناكَ تلك ذلكم ذلكن أولئك هؤلاء هما هن هم أنتِ أنتَ أنتما أنتن أنتم أنا نحن هي هو هنَّ همَّ
165
- و أو كما بين بسبب بدون خلال عبر لدى لدىً حتى حيث ضمن عبره عليها عليه عليهم علي على إلي إليك إليه إليها لديك لديكِ لديه لديها لكم لكنكما لكنكن ولكن
166
- هذا هذه ذلك تلك هؤلاء أولئك كل بعض أي أيّ أيًا أحد شيء شيئًا أشياء
167
- "وهنا","اليه","الي","له","لها","لدي","لديه","لديها","لنا","عنده","عندها","مع","عبر","ضمن","حسب","حيث","كما","قد","بل","لكن","إذ","اذ","اذا","إن","أن","أيضا","فإن","فانه","فإنه","انه","إنه","مثلا","مثلاً","مثلاَ"
168
- """.split())
169
-
170
- def top_keywords_yake(text: str, max_k: int = 120, lan: str = 'ar') -> List[str]:
171
- kw_extractor = yake.KeywordExtractor(lan=lan, n=1, top=max_k)
172
- candidates = [kw for kw, _ in kw_extractor.extract_keywords(text)]
173
- seen, out = set(), []
174
- for k in candidates:
175
- kk = k.strip()
176
- if not kk or kk in seen:
177
- continue
178
- if lan == "ar" and kk in AR_STOP:
179
- continue
180
- if len(kk) < 3:
181
- continue
182
- if re2.match(r"^[\p{P}\p{S}]+$", kk):
183
- continue
184
- seen.add(kk)
185
- out.append(kk)
186
- return out
187
-
188
- # -----------------------------
189
- # 5) مُولِّد MCQ
190
- # -----------------------------
191
- @dataclass
192
- class MCQ:
193
- id: str
194
- question: str
195
- choices: List[str]
196
- answer_index: int
197
- explanation: str
198
-
199
- def split_sentences(text: str) -> List[str]:
200
- sents = [s.strip() for s in SENT_SPLIT.split(text) if s.strip()]
201
- return [s for s in sents if len(s) >= 25]
202
-
203
- def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
204
- cand = []
205
- for w in pool:
206
- if not w:
207
- continue
208
- w2 = w.strip()
209
- if w2 == correct.strip():
210
- continue
211
- if len(w2) < 3:
212
- continue
213
- if w2 in AR_STOP:
214
- continue
215
- cand.append(w2)
216
-
217
- random.shuffle(cand)
218
- out = []
219
- for w in cand:
220
- out.append(w)
221
- if len(out) == k:
222
- break
223
-
224
- fillers = ["—", "-", "—-"]
225
- while len(out) < k:
226
- out.append(random.choice(fillers))
227
- return out
228
-
229
- def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
230
- sentences = split_sentences(text)
231
- if not sentences:
232
- raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")
233
-
234
- keywords = top_keywords_yake(text, max_k=160, lan=lang)
235
- if not keywords:
236
- toks = re2.findall(r"[\p{L}\p{N}_]+", text)
237
- toks = [t for t in toks if not (lang == "ar" and t in AR_STOP)]
238
- freq = {}
239
- for t in toks:
240
- freq[t] = freq.get(t, 0) + 1
241
- keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]
242
-
243
- sent_for_kw = {}
244
- for s in sentences:
245
- for kw in keywords:
246
- if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
247
- sent_for_kw[kw] = s
248
-
249
- items: List[MCQ] = []
250
- used_sents = set()
251
- pool_iter = [kw for kw in keywords if kw in sent_for_kw]
252
-
253
- for kw in pool_iter:
254
- if len(items) >= n:
255
- break
256
- s = sent_for_kw[kw]
257
- if s in used_sents:
258
- continue
259
- blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
260
- correct = kw
261
- distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
262
- choices = distractors + [correct]
263
- random.shuffle(choices)
264
- ans_idx = choices.index(correct)
265
- exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
266
- items.append(MCQ(
267
- id=str(uuid.uuid4())[:8],
268
- question=blanked,
269
- choices=choices,
270
- answer_index=ans_idx,
271
- explanation=exp
272
- ))
273
- used_sents.add(s)
274
-
275
- if not items:
276
- raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
277
- return items
278
-
279
- # -----------------------------
280
- # 7) أدوات تنظيف خيارات/إخراج JSON
281
- # -----------------------------
282
- AR_PUNCT = "،؛؟"
283
- EN_PUNCT = ",;?"
284
-
285
- def normalize_punct(s: str) -> str:
286
- if not s:
287
- return ""
288
- s = s.replace(",", "،").replace(";", "؛").replace("?", "؟")
289
- return s.strip().strip(AR_PUNCT + EN_PUNCT).strip()
290
-
291
- def is_bad_choice(txt: str) -> bool:
292
- if not txt:
293
- return True
294
- txt = txt.strip()
295
- BAD_NOISE = {"وهنا","اليه","الي","ليبق","لان","لانها","لانّه","ذلك","هذا","هذه"}
296
- if txt in BAD_NOISE:
297
- return True
298
- if len(txt) > 18 and " " not in txt:
299
- return True
300
- if len(txt) < 2:
301
- return True
302
- if txt in AR_STOP:
303
- return True
304
- if re2.match(r"^[\p{P}\p{S}]+$", txt):
305
- return True
306
- return False
307
-
308
- def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str):
309
- json_data = []
310
- letters = ["A", "B", "C", "D"]
311
- for it in items:
312
- opts = []
313
- seen = set()
314
- for idx, lbl in enumerate(letters):
315
- raw = it.choices[idx] if idx < len(it.choices) else ""
316
- txt = normalize_punct(raw)
317
- if is_bad_choice(txt):
318
- txt = "—"
319
- if txt in seen:
320
- txt += " "
321
- seen.add(txt)
322
- opts.append({
323
- "id": lbl,
324
- "text": txt,
325
- "is_correct": (it.answer_index == idx)
326
- })
327
- q_clean = normalize_punct(it.question)
328
- exp_clean = normalize_punct(it.explanation)
329
- record = {
330
- "id": it.id,
331
- "question": q_clean,
332
- "options": opts,
333
- "explanation": exp_clean,
334
- "meta": {
335
- "lang": lang,
336
- "normalized": True,
337
- "source_pdf": source_pdf,
338
- "extraction_method": method
339
- }
340
- }
341
- json_data.append(record)
342
- return json_data
343
-
344
- # -----------------------------
345
- # الدالة الأساسية: تستقبل PDF وترجع JSON + ملف JSON
346
- # -----------------------------
347
- def process_pdf(pdf_file, num_questions=DEFAULT_NUM_QUESTIONS,
348
- lang=DEFAULT_LANG, trocr_model=DEFAULT_TROCR_MODEL,
349
- trocr_zoom=DEFAULT_TROCR_ZOOM):
350
- logs = []
351
- if pdf_file is None:
352
- return {}, None, "يرجى رفع ملف PDF أولاً."
353
-
354
- try:
355
- # خزّنه بمجلد مؤقت
356
- workdir = tempfile.mkdtemp(prefix="mcq_")
357
- pdf_path = os.path.join(workdir, pdf_file.name if hasattr(pdf_file, "name") else "input.pdf")
358
- with open(pdf_path, "wb") as f:
359
- f.write(pdf_file.read())
360
- logs.append(f"تم حفظ الملف في: {pdf_path}")
361
-
362
- # 1) استخراج نص
363
- raw_text, out_txt_path, method = pdf_to_txt(
364
- pdf_path=pdf_path,
365
- ocr_model=trocr_model,
366
- ocr_zoom=float(trocr_zoom)
367
- )
368
- logs.append(f"طريقة الاستخراج: {method}")
369
-
370
- # 2) تصحيح/تطبيع
371
- cleaned_text = postprocess_text(raw_text, lang=lang)
372
- cleaned_txt_path = os.path.join(workdir, "cleaned.txt")
373
- save_text(cleaned_text, cleaned_txt_path)
374
- logs.append("تم تنظيف/تطبيع النص.")
375
-
376
- # 3) توليد أسئلة
377
- items = make_mcqs_from_text(cleaned_text, n=int(num_questions), lang=lang)
378
- logs.append(f"تم توليد {len(items)} سؤالاً.")
379
-
380
- # 4) بناء JSON
381
- json_records = build_json_records(items, lang=lang, source_pdf=os.path.basename(pdf_path), method=method)
382
- json_str = json.dumps(json_records, ensure_ascii=False, indent=2)
383
-
384
- # 5) حفظ JSON للتحميل
385
- json_path = os.path.join(workdir, "mcqs.json")
386
- with open(json_path, "w", encoding="utf-8") as fj:
387
- fj.write(json_str)
388
- logs.append(f"تم إنشاء ملف JSON: {json_path}")
389
-
390
- # الإرجاع: كائن JSON، مسار ملف للتنزيل، سجل
391
- return json_records, json_path, "\n".join(logs)
392
-
393
- except Exception as e:
394
- logs.append(f"خطأ: {e}")
395
- return {}, None, "\n".join(logs)
396
-
397
- # -----------------------------
398
- # واجهة Gradio
399
- # -----------------------------
400
- with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE/TroCR)") as demo:
401
- gr.Markdown("## تحوي�� PDF إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")
402
-
403
- with gr.Row():
404
- inp_pdf = gr.File(label="ارفع PDF", file_count="single", file_types=[".pdf"])
405
- with gr.Column():
406
- num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
407
- trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="دقة تحويل PDF لصور (Zoom)")
408
- trocr_model = gr.Dropdown(
409
- choices=[
410
- "microsoft/trocr-base-printed",
411
- "microsoft/trocr-large-printed",
412
- "microsoft/trocr-base-handwritten",
413
- "microsoft/trocr-large-handwritten",
414
- ],
415
- value=DEFAULT_TROCR_MODEL,
416
- label="موديل TrOCR"
417
- )
418
-
419
- btn = gr.Button("تشغيل المعالجة")
420
- out_json = gr.JSON(label="النتيجة (JSON)")
421
- out_file = gr.File(label="تحميل ملف JSON")
422
- out_log = gr.Textbox(label="Logs", lines=10)
423
-
424
- btn.click(
425
- fn=process_pdf,
426
- inputs=[inp_pdf, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
427
- outputs=[out_json, out_file, out_log]
428
- )
429
-
430
- # ملاحظة: Spaces تتوقع اسم متغير "demo"
431
- if __name__ == "__main__":
432
- demo.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import io
6
+ import json
7
+ import uuid
8
+ import random
9
+ import tempfile
10
+ import shutil
11
+ import unicodedata
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import List, Tuple
15
+
16
+ import pandas as pd
17
+ from PIL import Image
18
+ from pypdf import PdfReader
19
+ import fitz # PyMuPDF
20
+ import regex as re2
21
+ import yake
22
+ from tqdm import tqdm
23
+
24
+ # ملاحظة: سنستورد torch/transformers داخل الدوال (تحميل كسول) لسرعة الإقلاع.
25
+
26
# =========================
# Global settings
# =========================
random.seed(42)  # fixed seed: deterministic choice/distractor shuffling across runs
DEFAULT_LANG = "ar"
DEFAULT_NUM_QUESTIONS = 8
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"  # faster than the large variant
DEFAULT_TROCR_ZOOM = 2.8  # render scale used when rasterizing PDF pages for OCR
34
+
35
# Simple in-process cache of OCR pipelines, keyed by model id.
_OCR_PIPE = {}

def _get_ocr_pipeline(model_id: str):
    """Lazily build (and cache) an image-to-text TrOCR pipeline.

    torch/transformers are imported here rather than at module top so the
    app starts quickly; the heavy imports (and the CUDA probe) now only run
    on a cache miss instead of on every call.
    """
    if model_id not in _OCR_PIPE:
        from transformers import pipeline  # deferred heavy import
        import torch  # deferred heavy import

        # First GPU when available, otherwise CPU (-1).
        device = 0 if torch.cuda.is_available() else -1
        _OCR_PIPE[model_id] = pipeline("image-to-text", model=model_id, device=device)
    return _OCR_PIPE[model_id]
45
+
46
+ # =========================
47
+ # 2) استخراج النص من PDF
48
+ # =========================
49
def extract_text_with_pypdf(pdf_path: str) -> str:
    """Pull the embedded text layer out of a PDF, page by page."""
    collected = []
    for page in PdfReader(pdf_path).pages:
        try:
            extracted = page.extract_text() or ""
        except Exception:
            # A single unreadable page should not abort the whole document.
            extracted = ""
        collected.append(extracted)
    return "\n".join(collected).strip()
59
+
60
def pdf_pages_to_images(pdf_path: str, zoom: float = 2.5) -> List[Image.Image]:
    """Render every page of the PDF to an RGB PIL image at the given zoom."""
    matrix = fitz.Matrix(zoom, zoom)
    document = fitz.open(pdf_path)
    rendered = []
    for page in document:
        pixmap = page.get_pixmap(matrix=matrix, alpha=False)
        rendered.append(
            Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
        )
    document.close()
    return rendered
70
+
71
def extract_text_with_ocr(pdf_path: str, model_id: str, zoom: float = 2.5, disable_tqdm: bool = True) -> str:
    """OCR each PDF page with TrOCR and join the per-page texts with page markers."""
    ocr = _get_ocr_pipeline(model_id)
    pages = pdf_pages_to_images(pdf_path, zoom=zoom)
    chunks = []
    progress = tqdm(pages, desc="TrOCR OCR", unit="p", disable=disable_tqdm)
    for page_no, image in enumerate(progress, start=1):
        try:
            result = ocr(image)
            recognized = result[0]["generated_text"].strip() if result and "generated_text" in result[0] else ""
        except Exception:
            # Keep going: a failed page becomes an empty entry.
            recognized = ""
        chunks.append(f"--- [Page {page_no}] ---\n{recognized}")
    return "\n\n".join(chunks).strip()
84
+
85
def is_extraction_good(text: str, min_chars: int = 250, min_alpha_ratio: float = 0.15) -> bool:
    """Heuristic check: enough characters AND a high-enough alphanumeric density."""
    if len(text) < min_chars:
        return False
    alnum_count = sum(1 for ch in text if ch.isalnum())
    return alnum_count / max(1, len(text)) >= min_alpha_ratio
91
+
92
def save_text(text: str, out_path: str) -> None:
    """Write *text* to *out_path* as UTF-8, creating parent directories as needed."""
    parent_dir = os.path.dirname(out_path) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(text)
96
+
97
def pdf_to_txt(pdf_path: str, out_txt_path: str = None,
               ocr_model: str = DEFAULT_TROCR_MODEL,
               ocr_zoom: float = DEFAULT_TROCR_ZOOM) -> Tuple[str, str, str]:
    """Extract text from *pdf_path*, preferring the embedded layer over OCR.

    Returns a tuple (extracted_text, saved_txt_path, extraction_method_label).
    When *ocr_model* is falsy, falls back to the weak embedded text instead
    of running OCR.
    """
    assert os.path.isfile(pdf_path), f"File not found: {pdf_path}"

    embedded_text = extract_text_with_pypdf(pdf_path)
    if is_extraction_good(embedded_text):
        final_text, method = embedded_text, "embedded (pypdf)"
    elif not ocr_model:
        # No-OCR mode: keep whatever weak embedded text we got.
        final_text, method = embedded_text, "embedded (pypdf: weak)"
    else:
        final_text = extract_text_with_ocr(pdf_path, model_id=ocr_model, zoom=ocr_zoom)
        method = "OCR (Hugging Face TrOCR)"

    if out_txt_path is None:
        root, _ = os.path.splitext(pdf_path)
        out_txt_path = root + ".txt"

    header = f"[[ Extraction method: {method} ]]\n\n"
    save_text(header + final_text, out_txt_path)
    return final_text, out_txt_path, method
122
+
123
+ # =========================
124
+ # 3) تطبيع/تصحيح عربي
125
+ # =========================
126
def strip_page_headers(text: str) -> str:
    """Drop page-marker and horizontal-rule lines left over from extraction."""
    noise_patterns = (
        r"^\s*--- \[Page \d+\] ---\s*$",      # OCR page marker we inserted
        r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$",   # bare page numbers (EN/AR)
        r"^\s*[-–—_*]{3,}\s*$",               # separator rules
    )
    kept = [
        line
        for line in text.splitlines()
        if not any(re2.match(pattern, line) for pattern in noise_patterns)
    ]
    return "\n".join(kept)
138
+
139
# Arabic diacritic marks (tanwin, short vowels, shadda, sukun) to strip.
AR_DIAC = r"[ًٌٍَُِّْ]"
def normalize_arabic(text: str) -> str:
    """Canonicalize Arabic text: NFKC, drop tatweel/diacritics, unify letter forms."""
    normalized = unicodedata.normalize("NFKC", text)
    normalized = re2.sub(r"[ـ]", "", normalized)      # remove tatweel (kashida)
    normalized = re2.sub(AR_DIAC, "", normalized)     # remove diacritics
    normalized = re2.sub(r"[إأآا]", "ا", normalized)  # unify alef variants
    normalized = re2.sub(r"[يى]", "ي", normalized)    # unify yaa variants
    normalized = re2.sub(r"\s+", " ", normalized)     # collapse whitespace
    return normalized.strip()
148
+
149
def arabic_ocr_fixes(text: str) -> str:
    """Apply hard-coded substring replacements for common Arabic OCR errors.

    The mapping is applied sequentially (dict insertion order) with plain
    str.replace, so earlier fixes can feed later ones.
    """
    # NOTE(review): a few entries map a string to itself (no-ops) — presumably
    # leftovers from earlier tuning; harmless, kept as-is.
    fixes = {
        " الصطناعي": " الاصطناعي",
        "صطناعي": "اصطناعي",
        "التعل م": "التعلم",
        "الذكاء الاصطناعيي": "الذكاء الاصطناعي",
        "ذكاء صطناعي": "ذكاء اصطناعي",
        "الذكاء الاصطناعي.": "الذكاء الاصطناعي.",
        "التعليم ": "التعليم ",
        " مع غني": " غني",
        "مع غني ": " غني ",
        " غير المشبعة": " غيرُ المشبعة",
    }
    for wrong, right in fixes.items():
        text = text.replace(wrong, right)
    return text
165
+
166
def postprocess_text(raw_text: str, lang: str = "ar") -> str:
    """Clean extracted text: drop page noise and citation markers; normalize Arabic."""
    cleaned = strip_page_headers(raw_text)
    cleaned = cleaned.replace("\r", "\n")
    cleaned = re2.sub(r"\n{3,}", "\n\n", cleaned)                    # squeeze blank-line runs
    cleaned = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", cleaned)   # numbered citation noise
    cleaned = re2.sub(r"\[\d+\]", " ", cleaned)                      # bracketed reference numbers
    if lang == "ar":
        cleaned = arabic_ocr_fixes(normalize_arabic(cleaned))
    return cleaned
176
+
177
+ # =========================
178
+ # 4) YAKE + تقسيم الجمل
179
+ # =========================
180
# Sentence boundary: whitespace following ., !, Arabic ؟, or ?.
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")

# Raw, whitespace-separated Arabic stop-word list.
_AR_STOP_RAW = """
في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي اللواتي اللواتيا أو أم إن أن كان تكون كانوا كانت كنت كنا كانا كانتِ ثم قد لقد ربما بل لكن لكنَّ إلا سوى حتى حيث كما لما لماّ لماَّ لماً ما ماذا لماذا متى أين كيف أي أيّ أيُّ هناك هنا هناكَ تلك ذلكم ذلكن أولئك هؤلاء هما هن هم أنتِ أنتَ أنتما أنتن أنتم أنا نحن هي هو هنَّ همَّ
و أو كما بين بسبب بدون خلال عبر لدى لدىً حتى حيث ضمن عبره عليها عليه عليهم علي على إلي إليك إليه إليها لديك لديكِ لديه لديها لكم لكنكما لكنكن ولكن
هذا هذه ذلك تلك هؤلاء أولئك كل بعض أي أيّ أيًا أحد شيء شيئًا أشياء
"وهنا","اليه","الي","له","لها","لدي","لديه","لديها","لنا","عنده","عندها","مع","عبر","ضمن","حسب","حيث","كما","قد","بل","لكن","إذ","اذ","اذا","إن","أن","أيضا","فإن","فانه","فإنه","انه","إنه","مثلا","مثلاً","مثلاَ"
"""

# BUG FIX: the last line of the raw list is written as "w1","w2",... with no
# spaces, so a plain .split() kept it as ONE giant token and none of those
# words were ever filtered as stop-words. Strip the quote/comma punctuation so
# every quoted word becomes its own entry; unquoted tokens pass through intact.
AR_STOP = {
    word
    for token in _AR_STOP_RAW.split()
    for word in token.replace('"', ',').split(',')
    if word
}
187
+
188
def top_keywords_yake(text: str, max_k: int = 120, lan: str = 'ar') -> List[str]:
    """Extract up to *max_k* deduplicated single-word keywords with YAKE.

    Filters out stop-words (Arabic only), very short tokens, and
    punctuation/symbol-only candidates, preserving YAKE's ranking order.
    """
    extractor = yake.KeywordExtractor(lan=lan, n=1, top=max_k)
    keywords, seen = [], set()
    for candidate, _score in extractor.extract_keywords(text):
        word = candidate.strip()
        if not word or word in seen:
            continue
        if lan == "ar" and word in AR_STOP:
            continue
        if len(word) < 3:
            continue
        if re2.match(r"^[\p{P}\p{S}]+$", word):  # punctuation/symbols only
            continue
        seen.add(word)
        keywords.append(word)
    return keywords
205
+
206
+ # =========================
207
+ # 5) مُولِّد MCQ
208
+ # =========================
209
@dataclass
class MCQ:
    """One fill-in-the-blank multiple-choice question generated from the text."""
    id: str                  # short unique id (first 8 chars of a uuid4)
    question: str            # source sentence with the keyword blanked as "_____"
    choices: List[str]       # shuffled answer options
    answer_index: int        # index into `choices` of the correct answer
    explanation: str         # quote of the source sentence the item came from
216
+
217
def split_sentences(text: str) -> List[str]:
    """Split on sentence-ending punctuation and keep only substantial sentences."""
    stripped = (part.strip() for part in SENT_SPLIT.split(text))
    return [sentence for sentence in stripped if len(sentence) >= 25]
220
+
221
def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
    """Pick *k* wrong-answer options from *pool*, padding with dashes if short.

    Candidates equal to the correct answer, shorter than 3 chars, or in the
    stop-word list are skipped; the survivors are shuffled before selection.
    """
    target = correct.strip()
    candidates = []
    for word in pool:
        if not word:
            continue
        cleaned = word.strip()
        if cleaned == target or len(cleaned) < 3 or cleaned in AR_STOP:
            continue
        candidates.append(cleaned)

    random.shuffle(candidates)
    chosen = candidates[:k]

    # Not enough real distractors: pad with placeholder dashes.
    fillers = ["—", "-", "—-"]
    while len(chosen) < k:
        chosen.append(random.choice(fillers))
    return chosen
246
+
247
def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
    """Generate up to *n* fill-in-the-blank MCQs from cleaned *text*.

    Pipeline: split into sentences, rank candidate keywords (YAKE, with a
    word-frequency fallback), map each keyword to the first sentence that
    contains it as a whole word, then blank the keyword out and attach
    shuffled distractors.

    Raises:
        ValueError: if no usable sentences were found.
        RuntimeError: if no questions could be generated at all.
    """
    sentences = split_sentences(text)
    if not sentences:
        raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")

    keywords = top_keywords_yake(text, max_k=160, lan=lang)
    if not keywords:
        # Fallback: rank raw word frequencies when YAKE yields nothing.
        toks = re2.findall(r"[\p{L}\p{N}_]+", text)
        toks = [t for t in toks if not (lang == "ar" and t in AR_STOP)]
        freq = {}
        for t in toks:
            freq[t] = freq.get(t, 0) + 1
        keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]

    # First sentence containing each keyword as a whole word
    # (\p{L} lookarounds prevent matching inside a longer word).
    sent_for_kw = {}
    for s in sentences:
        for kw in keywords:
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
                sent_for_kw[kw] = s

    items: List[MCQ] = []
    used_sents = set()
    pool_iter = [kw for kw in keywords if kw in sent_for_kw]

    for kw in pool_iter:
        if len(items) >= n:
            break
        s = sent_for_kw[kw]
        # One question per sentence at most.
        if s in used_sents:
            continue
        # Blank out only the first whole-word occurrence of the keyword.
        blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
        correct = kw
        distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
        choices = distractors + [correct]
        random.shuffle(choices)
        ans_idx = choices.index(correct)
        # Explanation quotes (at most) the first 220 chars of the source sentence.
        exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
        items.append(MCQ(
            id=str(uuid.uuid4())[:8],
            question=blanked,
            choices=choices,
            answer_index=ans_idx,
            explanation=exp
        ))
        used_sents.add(s)

    if not items:
        raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
    return items
296
+
297
+ # =========================
298
+ # 6) بناء JSON للإخراج
299
+ # =========================
300
# Arabic and Latin punctuation stripped from the edges of cleaned strings.
AR_PUNCT = "،؛؟"
EN_PUNCT = ",;?"

def normalize_punct(s: str) -> str:
    """Convert Latin punctuation to Arabic equivalents and trim edge punctuation."""
    if not s:
        return ""
    arabized = s.replace(",", "،").replace(";", "؛").replace("?", "؟")
    return arabized.strip().strip(AR_PUNCT + EN_PUNCT).strip()
308
+
309
def is_bad_choice(txt: str) -> bool:
    """Return True for option texts that should not be shown to the user.

    Rejects: empty strings, known OCR noise words, long run-on tokens with
    no spaces, very short strings, stop-words, and punctuation/symbol-only
    strings.
    """
    if not txt:
        return True
    candidate = txt.strip()
    noise_words = {"وهنا","اليه","الي","ليبق","لان","لانها","لانّه","ذلك","هذا","هذه"}
    return (
        candidate in noise_words
        or (len(candidate) > 18 and " " not in candidate)
        or len(candidate) < 2
        or candidate in AR_STOP
        or bool(re2.match(r"^[\p{P}\p{S}]+$", candidate))
    )
325
+
326
def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str):
    """Serialize MCQ items into JSON-ready dicts with A–D labelled options.

    Each record carries the cleaned question/explanation, four options
    (bad choices blanked out), and provenance metadata.
    """
    json_data = []
    letters = ["A", "B", "C", "D"]
    for it in items:
        opts = []
        seen = set()
        for idx, lbl in enumerate(letters):
            raw = it.choices[idx] if idx < len(it.choices) else ""
            txt = normalize_punct(raw)
            if is_bad_choice(txt):
                txt = ""
            # Disambiguate duplicates by appending spaces until unique.
            # (A single `if txt in seen` check could still emit duplicate
            # option texts when three or more options collided.)
            while txt in seen:
                txt += " "
            seen.add(txt)
            opts.append({
                "id": lbl,
                "text": txt,
                "is_correct": (it.answer_index == idx)
            })
        q_clean = normalize_punct(it.question)
        exp_clean = normalize_punct(it.explanation)
        record = {
            "id": it.id,
            "question": q_clean,
            "options": opts,
            "explanation": exp_clean,
            "meta": {
                "lang": lang,
                "normalized": True,
                "source_pdf": source_pdf,
                "extraction_method": method
            }
        }
        json_data.append(record)
    return json_data
361
+
362
+ # =========================
363
+ # 7) الدالة الرئيسية (تتعامل مع Filepath من Gradio)
364
+ # =========================
365
def process_pdf(pdf_file_path,
                num_questions=DEFAULT_NUM_QUESTIONS,
                lang=DEFAULT_LANG,
                trocr_model=DEFAULT_TROCR_MODEL,
                trocr_zoom=DEFAULT_TROCR_ZOOM):
    """End-to-end pipeline: PDF path -> (json_records, json_file_path, log_text).

    On any failure returns ({}, None, logs) instead of raising, so the
    Gradio UI always receives displayable outputs.
    """
    logs = []
    try:
        if not pdf_file_path:
            return {}, None, "يرجى رفع ملف PDF أولاً."

        # pdf_file_path may be a str or a Gradio NamedString -> treat it as a path.
        src_path = str(pdf_file_path)
        # Derive a reasonable file name for the working copy.
        name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
        filename = Path(name_guess).name or Path(src_path).name or "input.pdf"
        if not Path(filename).suffix:
            filename += ".pdf"

        # Copy the upload into a private temp dir so all artifacts live together.
        workdir = tempfile.mkdtemp(prefix="mcq_")
        pdf_path = os.path.join(workdir, filename)
        shutil.copy(src_path, pdf_path)
        logs.append(f"تم نسخ الملف إلى: {pdf_path}")

        # 1) Text extraction (embedded layer, falling back to OCR).
        raw_text, out_txt_path, method = pdf_to_txt(
            pdf_path=pdf_path,
            ocr_model=trocr_model,
            ocr_zoom=float(trocr_zoom)
        )
        logs.append(f"طريقة الاستخراج: {method}")

        # 2) Cleaning / normalization.
        cleaned_text = postprocess_text(raw_text, lang=lang)
        save_text(cleaned_text, os.path.join(workdir, "cleaned.txt"))
        logs.append("تم تنظيف/تطبيع النص.")

        # 3) Question generation.
        items = make_mcqs_from_text(cleaned_text, n=int(num_questions), lang=lang)
        logs.append(f"تم توليد {len(items)} سؤالاً.")

        # 4) Build the JSON records.
        json_records = build_json_records(items, lang=lang, source_pdf=Path(filename).name, method=method)
        json_str = json.dumps(json_records, ensure_ascii=False, indent=2)

        # 5) Save a JSON file for download.
        json_path = os.path.join(workdir, "mcqs.json")
        with open(json_path, "w", encoding="utf-8") as fj:
            fj.write(json_str)
        logs.append("تم إنشاء ملف mcqs.json.")

        return json_records, json_path, "\n".join(logs)

    except Exception as e:
        # Boundary handler: surface the error in the log textbox instead of crashing the UI.
        logs.append(f"خطأ: {e}")
        return {}, None, "\n".join(logs)
420
+
421
+ # =========================
422
+ # 8) واجهة Gradio (v5)
423
+ # =========================
424
import gradio as gr

# Gradio UI: upload a PDF, tune question count / OCR settings, get MCQ JSON.
with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
    gr.Markdown("## تحويل PDF إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")

    with gr.Row():
        inp_pdf = gr.File(
            label="ارفع PDF",
            file_count="single",
            file_types=[".pdf"],
            type="filepath",  # important: hands process_pdf a file PATH, not a file object
        )
        with gr.Column():
            num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
            trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="دقة تحويل PDF لصور (Zoom)")
            trocr_model = gr.Dropdown(
                choices=[
                    "microsoft/trocr-base-printed",
                    "microsoft/trocr-large-printed",
                    "microsoft/trocr-base-handwritten",
                    "microsoft/trocr-large-handwritten",
                ],
                value=DEFAULT_TROCR_MODEL,
                label="موديل TrOCR"
            )

    btn = gr.Button("تشغيل المعالجة", variant="primary")
    out_json = gr.JSON(label="النتيجة (JSON)")
    out_file = gr.File(label="تحميل ملف JSON")
    out_log = gr.Textbox(label="Logs", lines=10)

    # gr.State pins the language to Arabic; inputs map positionally onto
    # process_pdf(pdf_file_path, num_questions, lang, trocr_model, trocr_zoom).
    btn.click(
        fn=process_pdf,
        inputs=[inp_pdf, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
        outputs=[out_json, out_file, out_log]
    )

# NOTE: Hugging Face Spaces auto-detects the module-level variable "demo".
# For a local run:
if __name__ == "__main__":
    demo.queue().launch()