Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
# +
|
| 5 |
|
| 6 |
import os, json, uuid, random, unicodedata
|
| 7 |
from dataclasses import dataclass
|
|
@@ -15,7 +15,7 @@ import regex as re2
|
|
| 15 |
import yake
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
-
# ------------------ إعدادات عامة ------------------
|
| 19 |
random.seed(42)
|
| 20 |
DEFAULT_NUM_QUESTIONS = 6
|
| 21 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
|
@@ -23,7 +23,7 @@ DEFAULT_TROCR_ZOOM = 2.6
|
|
| 23 |
QUESTION_MODES = ["فراغ", "فهم مباشر"]
|
| 24 |
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
|
| 25 |
|
| 26 |
-
#
|
| 27 |
try:
|
| 28 |
from rank_bm25 import BM25Okapi
|
| 29 |
_HAS_BM25 = True
|
|
@@ -41,7 +41,7 @@ def get_ocr(model_id: str):
|
|
| 41 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 42 |
return _OCR[model_id]
|
| 43 |
except Exception:
|
| 44 |
-
#
|
| 45 |
return lambda im: [{"generated_text": ""}]
|
| 46 |
|
| 47 |
# ------------------ PDF/TXT → نص ------------------
|
|
@@ -120,47 +120,6 @@ def postprocess(raw:str)->str:
|
|
| 120 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 121 |
return norm_ar(t)
|
| 122 |
|
| 123 |
-
def word_tokens(s: str) -> List[str]:
|
| 124 |
-
s = norm_ar(s)
|
| 125 |
-
return re2.findall(r"\p{L}+", s)
|
| 126 |
-
|
| 127 |
-
def token_set(s: str) -> set:
|
| 128 |
-
return set([t for t in word_tokens(s) if t not in AR_STOP])
|
| 129 |
-
|
| 130 |
-
def jaccard(a: str, b: str) -> float:
|
| 131 |
-
A, B = token_set(a), token_set(b)
|
| 132 |
-
if not A or not B: return 0.0
|
| 133 |
-
return len(A & B) / max(1, len(A | B))
|
| 134 |
-
|
| 135 |
-
def is_sub_or_super(a: str, b: str) -> bool:
|
| 136 |
-
# هل a包含 b أو b包含 a (بعد التطبيع)
|
| 137 |
-
A, B = norm_ar(a), norm_ar(b)
|
| 138 |
-
return (A in B) or (B in A)
|
| 139 |
-
|
| 140 |
-
def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
|
| 141 |
-
# اعتبرها قصاصة مزعجة إذا هي مقطع متصل من نفس الجملة بعدد كلمات ≥ 3
|
| 142 |
-
toks = word_tokens(w)
|
| 143 |
-
if len(toks) < 3:
|
| 144 |
-
return False
|
| 145 |
-
return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None
|
| 146 |
-
|
| 147 |
-
def choice_length_ok(w: str) -> bool:
|
| 148 |
-
n = len(word_tokens(w))
|
| 149 |
-
return 1 <= n <= 6
|
| 150 |
-
|
| 151 |
-
def paragraph_index_map(text: str, sentences: List[str]) -> dict:
|
| 152 |
-
# قسّم النص لفقرات، وأعط كل جملة رقم الفقرة التي تحتويها
|
| 153 |
-
paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
|
| 154 |
-
mapping = {}
|
| 155 |
-
for i, s in enumerate(sentences):
|
| 156 |
-
ns = norm_ar(s)
|
| 157 |
-
pid = None
|
| 158 |
-
for j, p in enumerate(paras):
|
| 159 |
-
if ns and ns in p:
|
| 160 |
-
pid = j; break
|
| 161 |
-
mapping[s] = pid if pid is not None else -1
|
| 162 |
-
return mapping
|
| 163 |
-
|
| 164 |
# ------------------ بنية السؤال ------------------
|
| 165 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 166 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
|
@@ -176,8 +135,8 @@ def split_sents(t:str)->List[str]:
|
|
| 176 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 177 |
return [x for x in s if len(x)>=25]
|
| 178 |
|
| 179 |
-
# ====== (1) عبارات مفتاحية
|
| 180 |
-
def yake_keywords(t: str, k: int =
|
| 181 |
phrases = []
|
| 182 |
seen = set()
|
| 183 |
for n in [3, 2, 1]:
|
|
@@ -200,7 +159,7 @@ def yake_keywords(t: str, k: int = 200) -> List[str]:
|
|
| 200 |
def good_kw(kw:str)->bool:
|
| 201 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 202 |
|
| 203 |
-
# ======
|
| 204 |
_HAS_CAMEL = False
|
| 205 |
try:
|
| 206 |
from camel_tools.morphology.analyzer import Analyzer
|
|
@@ -211,11 +170,10 @@ try:
|
|
| 211 |
except Exception:
|
| 212 |
_HAS_CAMEL = False
|
| 213 |
|
| 214 |
-
NER_TAGS = {"PER","LOC","ORG","MISC"}
|
| 215 |
|
| 216 |
def ar_pos(word: str) -> str:
|
| 217 |
if not _HAS_CAMEL:
|
| 218 |
-
# fallback مبسّط
|
| 219 |
if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
|
| 220 |
if re2.match(r"^[\p{N}]+$", word): return "NUM"
|
| 221 |
if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
|
|
@@ -239,7 +197,7 @@ def is_named_entity(token: str) -> bool:
|
|
| 239 |
return False
|
| 240 |
|
| 241 |
def is_clean_sentence(s: str) -> bool:
|
| 242 |
-
if not (
|
| 243 |
if re2.search(r"https?://|www\.", s): return False
|
| 244 |
if re2.search(r"\d{2,}", s): return False
|
| 245 |
return True
|
|
@@ -250,7 +208,7 @@ def safe_keyword(k: str) -> bool:
|
|
| 250 |
if ar_pos(k) in {"PRON","PART"}: return False
|
| 251 |
return True
|
| 252 |
|
| 253 |
-
# ======
|
| 254 |
_EMB = None
|
| 255 |
def get_embedder():
|
| 256 |
global _EMB
|
|
@@ -272,7 +230,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str,
|
|
| 272 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 273 |
t, C = vecs[0], vecs[1:]
|
| 274 |
import numpy as np
|
| 275 |
-
sims = (C @ t)
|
| 276 |
idx = np.argsort(-sims)[:k]
|
| 277 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 278 |
|
|
@@ -307,19 +265,6 @@ def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List
|
|
| 307 |
except Exception:
|
| 308 |
return []
|
| 309 |
|
| 310 |
-
def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
|
| 311 |
-
L=len(correct.strip()); cand=[]
|
| 312 |
-
for w in pool:
|
| 313 |
-
w=w.strip()
|
| 314 |
-
if not w or w==correct or w in AR_STOP: continue
|
| 315 |
-
if re2.match(r"^[\p{P}\p{S}\d_]+$", w): continue
|
| 316 |
-
if abs(len(w)-L)<=3: cand.append(w)
|
| 317 |
-
random.shuffle(cand)
|
| 318 |
-
out=cand[:k]
|
| 319 |
-
while len(out)<k: out.append("…")
|
| 320 |
-
return out
|
| 321 |
-
|
| 322 |
-
# ====== Cross-Encoder اختياري للترتيب ======
|
| 323 |
_CE = None
|
| 324 |
def get_cross_encoder():
|
| 325 |
global _CE
|
|
@@ -331,15 +276,6 @@ def get_cross_encoder():
|
|
| 331 |
_CE = False
|
| 332 |
return _CE
|
| 333 |
|
| 334 |
-
def pos_compatible(a: str, b: str) -> bool:
|
| 335 |
-
pa, pb = ar_pos(a), ar_pos(b)
|
| 336 |
-
if "X" in (pa, pb):
|
| 337 |
-
return True
|
| 338 |
-
return pa == pb
|
| 339 |
-
|
| 340 |
-
def length_close(a: str, b: str) -> bool:
|
| 341 |
-
return abs(len(a) - len(b)) <= max(6, len(b)//2)
|
| 342 |
-
|
| 343 |
def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
| 344 |
ce = get_cross_encoder()
|
| 345 |
if not ce or not candidates:
|
|
@@ -352,7 +288,89 @@ def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
|
| 352 |
except Exception:
|
| 353 |
return candidates
|
| 354 |
|
| 355 |
-
# --------- أدوات
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
def tokenize_ar(s: str) -> List[str]:
|
| 357 |
s = norm_ar(s)
|
| 358 |
toks = re2.findall(r"\p{L}+", s)
|
|
@@ -377,84 +395,50 @@ def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top:
|
|
| 377 |
pool.add(tok)
|
| 378 |
return list(pool)
|
| 379 |
|
| 380 |
-
def pick_clean_sentences(sents: List[str], difficulty: str) -> List[str]:
|
| 381 |
-
out = []
|
| 382 |
-
for s in sents:
|
| 383 |
-
if not is_clean_sentence(s):
|
| 384 |
-
continue
|
| 385 |
-
L = len(s)
|
| 386 |
-
if difficulty == "سهل" and not (70 <= L <= 180):
|
| 387 |
-
continue
|
| 388 |
-
if difficulty == "متوسط" and not (70 <= L <= 220):
|
| 389 |
-
continue
|
| 390 |
-
if difficulty == "صعب" and not (60 <= L <= 240):
|
| 391 |
-
continue
|
| 392 |
-
out.append(s)
|
| 393 |
-
return out
|
| 394 |
-
|
| 395 |
-
def similarity_caps(difficulty: str):
|
| 396 |
-
if difficulty == "سهل":
|
| 397 |
-
return 0.88
|
| 398 |
-
if difficulty == "صعب":
|
| 399 |
-
return 0.95
|
| 400 |
-
return 0.92
|
| 401 |
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
|
| 402 |
-
"""مشتّتات شكلية: تع
|
| 403 |
a = norm_ar(answer)
|
| 404 |
vars = set()
|
| 405 |
-
# تعريف/تنكير
|
| 406 |
if a.startswith("ال"):
|
| 407 |
vars.add(a[2:])
|
| 408 |
else:
|
| 409 |
vars.add("ال" + a)
|
| 410 |
-
# ي/ى
|
| 411 |
vars.add(a.replace("ي", "ى"))
|
| 412 |
vars.add(a.replace("ى", "ي"))
|
| 413 |
-
# ة/ه
|
| 414 |
vars.add(a.replace("ة", "ه"))
|
| 415 |
vars.add(a.replace("ه", "ة"))
|
| 416 |
-
# حذف حرف وسطي إن الكلمة طويلة
|
| 417 |
if len(a) > 5:
|
| 418 |
mid = len(a)//2
|
| 419 |
vars.add(a[:mid] + a[mid+1:])
|
| 420 |
-
# تنظيف نهائي
|
| 421 |
out = [v for v in vars if v and norm_ar(v) != norm_ar(a)]
|
| 422 |
return out[:k]
|
| 423 |
|
| 424 |
-
# ====== مشتّتات ذكية
|
| 425 |
-
def
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
for v in typo_like_variants(correct, k=4):
|
| 431 |
-
base.append(v)
|
| 432 |
-
|
| 433 |
-
# (أ) جيران دلاليين من العبارات
|
| 434 |
-
base.extend([w for w, _ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
if w not in base:
|
| 439 |
-
base.append(w)
|
| 440 |
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
| 445 |
-
if w not in base:
|
| 446 |
-
base.append(w)
|
| 447 |
|
|
|
|
|
|
|
| 448 |
|
| 449 |
# (أ) جيران دلاليين
|
| 450 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 451 |
|
| 452 |
# (ب) FILL-MASK
|
| 453 |
-
for w in mlm_distractors(sentence.replace(
|
| 454 |
if w not in base:
|
| 455 |
base.append(w)
|
| 456 |
|
| 457 |
-
# (ج) BM25
|
| 458 |
if all_sentences:
|
| 459 |
bm, corp = bm25_build(all_sentences)
|
| 460 |
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
|
@@ -462,9 +446,9 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 462 |
base.append(w)
|
| 463 |
|
| 464 |
# فلترة صارمة
|
| 465 |
-
clean = []
|
| 466 |
for w in base:
|
| 467 |
-
w = w.strip()
|
| 468 |
if not w or w == correct:
|
| 469 |
continue
|
| 470 |
if not choice_length_ok(w):
|
|
@@ -483,7 +467,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 483 |
continue
|
| 484 |
clean.append(w)
|
| 485 |
|
| 486 |
-
# ترتيب (اختياري) + فلتر قرب دلالي
|
| 487 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*4, k)]
|
| 488 |
cap = similarity_caps(difficulty)
|
| 489 |
try:
|
|
@@ -499,7 +483,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 499 |
except Exception:
|
| 500 |
pass
|
| 501 |
|
| 502 |
-
#
|
| 503 |
out, seen = [], set()
|
| 504 |
for w in clean:
|
| 505 |
if w in seen:
|
|
@@ -508,7 +492,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 508 |
if len(out) >= k:
|
| 509 |
break
|
| 510 |
|
| 511 |
-
# تعويض ا
|
| 512 |
if len(out) < k:
|
| 513 |
extras = [w for w in phrase_pool
|
| 514 |
if w not in out and w != correct and choice_length_ok(w)
|
|
@@ -517,185 +501,10 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 517 |
and jaccard(w, correct) < 0.5]
|
| 518 |
out.extend(extras[:(k-len(out))])
|
| 519 |
if len(out) < k:
|
| 520 |
-
out.extend(
|
| 521 |
return out[:k]
|
| 522 |
|
| 523 |
-
|
| 524 |
-
return ":" in s and s.index(":") < max(10, len(s)//6)
|
| 525 |
-
|
| 526 |
-
def is_nouny_phrase(w: str) -> bool:
|
| 527 |
-
# heuristics بسيطة: 1–4 كلمات، بدون أفعال ظاهرة، وتنتهي باسم/صفة شائعة
|
| 528 |
-
toks = word_tokens(w)
|
| 529 |
-
if not (1 <= len(toks) <= 4): return False
|
| 530 |
-
if re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w): return False
|
| 531 |
-
return True
|
| 532 |
-
|
| 533 |
-
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
|
| 534 |
-
if looks_like_title_fragment(sentence):
|
| 535 |
-
# لا تفرّغ من شق العنوان قبل النقطتين
|
| 536 |
-
parts = sentence.split(":", 1)
|
| 537 |
-
sentence = parts[1] if len(parts) > 1 else sentence
|
| 538 |
-
try:
|
| 539 |
-
ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
|
| 540 |
-
pairs = ex.extract_keywords(sentence)
|
| 541 |
-
except Exception:
|
| 542 |
-
pairs = []
|
| 543 |
-
cands = []
|
| 544 |
-
for w, _ in pairs:
|
| 545 |
-
w = re2.sub(r"\s+", " ", w.strip())
|
| 546 |
-
if not w or not good_kw(w) or not safe_keyword(w):
|
| 547 |
-
continue
|
| 548 |
-
if not is_nouny_phrase(w):
|
| 549 |
-
continue
|
| 550 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
|
| 551 |
-
continue
|
| 552 |
-
freq_weight = global_text.count(w)
|
| 553 |
-
cands.append((w, len(w) + 0.7*freq_weight))
|
| 554 |
-
if not cands:
|
| 555 |
-
toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
|
| 556 |
-
toks = [t for t in toks if is_nouny_phrase(t)]
|
| 557 |
-
toks.sort(key=len, reverse=True)
|
| 558 |
-
return toks[0] if toks else None
|
| 559 |
-
cands.sort(key=lambda x: -x[1])
|
| 560 |
-
return cands[0][0]
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
# ====== (4-أ) مُولِّد أسئلة "فراغ" ======
|
| 564 |
-
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 565 |
-
"""
|
| 566 |
-
توليد أسئلة فراغ ذكية مع:
|
| 567 |
-
- تنويع على مستوى الفقرات (حد أقصى لكل فقرة)
|
| 568 |
-
- تعبئة العدد المطلوب إلى n
|
| 569 |
-
- فلترة مشتّتات شكلية/قصاصات/تشابه عالي
|
| 570 |
-
المتطلبات الخارجية: paragraph_index_map, best_keyword_in_sentence, smart_distractors,
|
| 571 |
-
choice_length_ok, appears_as_long_fragment_in_sentence, is_sub_or_super, jaccard,
|
| 572 |
-
split_sents, pick_clean_sentences, yake_keywords, safe_keyword
|
| 573 |
-
"""
|
| 574 |
-
# 1) تجهيز الجمل والعبارات
|
| 575 |
-
all_sents = split_sents(text)
|
| 576 |
-
sents = pick_clean_sentences(all_sents, difficulty) or all_sents[:]
|
| 577 |
-
if not sents:
|
| 578 |
-
raise ValueError("النص قصير أو غير صالح.")
|
| 579 |
-
|
| 580 |
-
keyphrases = yake_keywords(text, k=260)
|
| 581 |
-
keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
|
| 582 |
-
|
| 583 |
-
# 2) ربط مرن (لا يشترط ظهور وحيد)
|
| 584 |
-
sent_for: dict = {}
|
| 585 |
-
for s in sents:
|
| 586 |
-
for kp in keyphrases:
|
| 587 |
-
if kp in sent_for:
|
| 588 |
-
continue
|
| 589 |
-
if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
|
| 590 |
-
sent_for[kp] = s
|
| 591 |
-
if len(sent_for) >= n * 5:
|
| 592 |
-
break
|
| 593 |
-
|
| 594 |
-
# 3) إعداد التنويع حسب الفقرات
|
| 595 |
-
para_map = paragraph_index_map(text, sents)
|
| 596 |
-
used_sentences: set = set()
|
| 597 |
-
items: List[MCQ] = []
|
| 598 |
-
|
| 599 |
-
MAX_PER_PARA = 2 # حد أعلى للأسئلة من نفس الفقرة
|
| 600 |
-
para_count: dict = {} # pid -> count
|
| 601 |
-
|
| 602 |
-
def add_item_from_pair(sentence: str, kp: str) -> bool:
|
| 603 |
-
"""يبني سؤالاً من (جملة، كلمة مفتاحية) ويضيفه إلى items عند النجاح."""
|
| 604 |
-
nonlocal items, used_sentences, para_count
|
| 605 |
-
|
| 606 |
-
# حد الفقرة
|
| 607 |
-
pid = para_map.get(sentence, -1)
|
| 608 |
-
if para_count.get(pid, 0) >= MAX_PER_PARA:
|
| 609 |
-
return False
|
| 610 |
-
|
| 611 |
-
# لازم الكلمة موجودة نصياً بالجملة
|
| 612 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
|
| 613 |
-
return False
|
| 614 |
-
|
| 615 |
-
# ابنِ سؤال الفراغ
|
| 616 |
-
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
|
| 617 |
-
|
| 618 |
-
# حضّر الخيارات
|
| 619 |
-
pool = [x for x in keyphrases if x != kp] or keyphrases[:]
|
| 620 |
-
ch = smart_distractors(
|
| 621 |
-
kp, pool, sentence, k=3, all_sentences=all_sents, difficulty=difficulty
|
| 622 |
-
) + [kp]
|
| 623 |
-
|
| 624 |
-
# تنظيف وتثبيت 4 خيارات حقيقية
|
| 625 |
-
choices, seen = [], set()
|
| 626 |
-
for c in ch:
|
| 627 |
-
c = (c or "").strip()
|
| 628 |
-
if not c or c in seen:
|
| 629 |
-
continue
|
| 630 |
-
if not choice_length_ok(c):
|
| 631 |
-
continue
|
| 632 |
-
if appears_as_long_fragment_in_sentence(c, sentence):
|
| 633 |
-
continue
|
| 634 |
-
if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
|
| 635 |
-
continue
|
| 636 |
-
seen.add(c)
|
| 637 |
-
choices.append(c)
|
| 638 |
-
|
| 639 |
-
# أضف الجواب إن لم يُدرج بعد
|
| 640 |
-
if kp not in choices:
|
| 641 |
-
choices.append(kp)
|
| 642 |
-
seen.add(kp)
|
| 643 |
-
|
| 644 |
-
# نفضّل حذف السؤال على ملء "…"؛ لازم 4 خيارات فعلية
|
| 645 |
-
if len(choices) < 4:
|
| 646 |
-
return False
|
| 647 |
-
|
| 648 |
-
choices = choices[:4]
|
| 649 |
-
random.shuffle(choices)
|
| 650 |
-
ans = choices.index(kp)
|
| 651 |
-
|
| 652 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
|
| 653 |
-
used_sentences.add(sentence)
|
| 654 |
-
para_count[pid] = para_count.get(pid, 0) + 1
|
| 655 |
-
return True
|
| 656 |
-
|
| 657 |
-
# 4) تمريرة أولى: أسئلة من فقرات مختلفة أولاً (ثم نسمح حتى حد MAX_PER_PARA)
|
| 658 |
-
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
| 659 |
-
if len(items) >= n:
|
| 660 |
-
break
|
| 661 |
-
s = sent_for[kp]
|
| 662 |
-
if s in used_sentences:
|
| 663 |
-
continue
|
| 664 |
-
_ = add_item_from_pair(s, kp)
|
| 665 |
-
|
| 666 |
-
# 5) تعبئة النقص: من جمل فقرات لم تبلغ حدّها، ثم أي جمل باقية
|
| 667 |
-
def fill_from_sentences(candidates: List[str]):
|
| 668 |
-
for s in candidates:
|
| 669 |
-
if len(items) >= n:
|
| 670 |
-
break
|
| 671 |
-
if s in used_sentences:
|
| 672 |
-
continue
|
| 673 |
-
# جرّب بكلمة رُبطت بهذه الجملة إن وجدت، وإلا اختَر هدفًا من نفس الجملة
|
| 674 |
-
kp = None
|
| 675 |
-
for kpp, ss in sent_for.items():
|
| 676 |
-
if ss == s:
|
| 677 |
-
kp = kpp
|
| 678 |
-
break
|
| 679 |
-
if kp is None:
|
| 680 |
-
kp = best_keyword_in_sentence(s, text)
|
| 681 |
-
if not kp:
|
| 682 |
-
continue
|
| 683 |
-
_ = add_item_from_pair(s, kp)
|
| 684 |
-
|
| 685 |
-
if len(items) < n:
|
| 686 |
-
remaining_new_para = [s for s in sents if para_count.get(para_map.get(s, -1), 0) < MAX_PER_PARA]
|
| 687 |
-
fill_from_sentences(remaining_new_para)
|
| 688 |
-
|
| 689 |
-
if len(items) < n:
|
| 690 |
-
leftovers = [s for s in sents if s not in used_sentences]
|
| 691 |
-
fill_from_sentences(leftovers)
|
| 692 |
-
|
| 693 |
-
if not items:
|
| 694 |
-
raise RuntimeError("تعذّر توليد أسئلة.")
|
| 695 |
-
return items[:n]
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
# ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
|
| 699 |
_MT5 = {"tok": None, "model": None, "ok": False}
|
| 700 |
def get_mt5():
|
| 701 |
if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
|
|
@@ -727,7 +536,7 @@ def comp_prompt(sentence: str) -> str:
|
|
| 727 |
"أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
|
| 728 |
"من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
|
| 729 |
"أعد فقط JSON بهذا الشكل:\n"
|
| 730 |
-
"{"
|
| 731 |
"\"question\": \"...\",\n"
|
| 732 |
"\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
|
| 733 |
"\"answer_index\": 0\n"
|
|
@@ -766,18 +575,15 @@ def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MC
|
|
| 766 |
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 767 |
tok, model, ok = get_mt5()
|
| 768 |
if not ok:
|
| 769 |
-
# عدم توفر mT5 → ارجعي لأسئلة الفراغ
|
| 770 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 771 |
|
| 772 |
sents_all = split_sents(text)
|
| 773 |
-
sents =
|
| 774 |
-
if not sents:
|
| 775 |
-
sents = sents_all[:]
|
| 776 |
if not sents:
|
| 777 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 778 |
|
| 779 |
-
# ج
|
| 780 |
-
def make_chunks(sents, max_len=
|
| 781 |
chunks = []
|
| 782 |
i = 0
|
| 783 |
while i < len(sents):
|
|
@@ -818,6 +624,105 @@ def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> Lis
|
|
| 818 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 819 |
return items[:n]
|
| 820 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
|
| 822 |
# ------------------ تحويل إلى سجلات العرض ------------------
|
| 823 |
def clean_option_text(t: str) -> str:
|
|
@@ -825,6 +730,8 @@ def clean_option_text(t: str) -> str:
|
|
| 825 |
t = re2.sub(AR_DIAC, "", t)
|
| 826 |
t = re2.sub(r"\s+", " ", t)
|
| 827 |
t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
|
|
|
|
|
|
|
| 828 |
return t or "…"
|
| 829 |
|
| 830 |
def to_records(items:List[MCQ])->List[dict]:
|
|
@@ -835,9 +742,8 @@ def to_records(items:List[MCQ])->List[dict]:
|
|
| 835 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 836 |
txt=(it.choices[i] if i<len(it.choices) else "…")
|
| 837 |
txt=clean_option_text(txt.replace(",", "،").replace("?", "؟").replace(";", "؛"))
|
| 838 |
-
txt = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", txt) # قصّ الخيار لـ ~60 حرف
|
| 839 |
if txt in used:
|
| 840 |
-
txt = f"
|
| 841 |
used.add(txt)
|
| 842 |
opts.append({"id":lbl,"text":txt,"is_correct":(i==it.answer_index)})
|
| 843 |
recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
|
|
@@ -876,7 +782,7 @@ def render_quiz_html(records: List[dict]) -> str:
|
|
| 876 |
""")
|
| 877 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 878 |
|
| 879 |
-
# ------------------
|
| 880 |
def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
| 881 |
text_area = (text_area or "").strip()
|
| 882 |
if not text_area and not file_path:
|
|
@@ -903,7 +809,6 @@ def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
|
| 903 |
warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
|
| 904 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
|
| 905 |
|
| 906 |
-
|
| 907 |
# ------------------ CSS ------------------
|
| 908 |
CSS = """
|
| 909 |
:root{
|
|
@@ -914,12 +819,12 @@ body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif;
|
|
| 914 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 915 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 916 |
|
| 917 |
-
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير أبعاده */
|
| 918 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 919 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 920 |
.small{opacity:.9;color:#d9dee8}
|
| 921 |
|
| 922 |
-
/*
|
| 923 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 924 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 925 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
@@ -947,14 +852,14 @@ textarea{min-height:120px}
|
|
| 947 |
|
| 948 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 949 |
.q-actions .q-submit{
|
| 950 |
-
background:#2dd4bf;border:none;color:#
|
| 951 |
}
|
| 952 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 953 |
.q-note{color:#ffd1d6}
|
| 954 |
.q-note.warn{color:#ffd1d6}
|
| 955 |
"""
|
| 956 |
|
| 957 |
-
# ------------------ JS: ربط Submit
|
| 958 |
ATTACH_LISTENERS_JS = """
|
| 959 |
() => {
|
| 960 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
|
@@ -1013,7 +918,6 @@ ATTACH_LISTENERS_JS = """
|
|
| 1013 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 1014 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 1015 |
|
| 1016 |
-
# الصفحة 1: إدخال ثابت لا تتغير أبعاده
|
| 1017 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 1018 |
with page1:
|
| 1019 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
@@ -1022,7 +926,6 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 1022 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 1023 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 1024 |
|
| 1025 |
-
# خيارات إضافية بدون تغيير البنية العامة
|
| 1026 |
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 1027 |
difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
|
| 1028 |
|
|
@@ -1041,13 +944,11 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 1041 |
btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
|
| 1042 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 1043 |
|
| 1044 |
-
# الصفحة 2: الأسئلة
|
| 1045 |
page2 = gr.Group(visible=False)
|
| 1046 |
with page2:
|
| 1047 |
quiz_html = gr.HTML("")
|
| 1048 |
-
js_wired = gr.Textbox(visible=False)
|
| 1049 |
|
| 1050 |
-
# بناء الامتحان + تبديل الصفحات + ربط الـJS
|
| 1051 |
btn_build.click(
|
| 1052 |
build_quiz,
|
| 1053 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
# Question Generator — Final Publishable Build (Lite/Full)
|
| 3 |
+
# صفحات ثابتة + Submit لكل سؤال فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 4 |
+
# طور "فراغ" + طور "فهم مباشر" (mT5) مع fallbacks، صعوبة، BM25، فلترة قوية للمشتّتات، وتنويع على مستوى الفقرات.
|
| 5 |
|
| 6 |
import os, json, uuid, random, unicodedata
|
| 7 |
from dataclasses import dataclass
|
|
|
|
| 15 |
import yake
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
+
# ------------------ إعدادات عامّة ------------------
|
| 19 |
random.seed(42)
|
| 20 |
DEFAULT_NUM_QUESTIONS = 6
|
| 21 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
|
|
|
| 23 |
QUESTION_MODES = ["فراغ", "فهم مباشر"]
|
| 24 |
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
|
| 25 |
|
| 26 |
+
# BM25 (اختياري)
|
| 27 |
try:
|
| 28 |
from rank_bm25 import BM25Okapi
|
| 29 |
_HAS_BM25 = True
|
|
|
|
| 41 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 42 |
return _OCR[model_id]
|
| 43 |
except Exception:
|
| 44 |
+
# بديل آمن: دالة تُعيد نصًا فارغًا
|
| 45 |
return lambda im: [{"generated_text": ""}]
|
| 46 |
|
| 47 |
# ------------------ PDF/TXT → نص ------------------
|
|
|
|
| 120 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 121 |
return norm_ar(t)
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
# ------------------ بنية السؤال ------------------
|
| 124 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 125 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
|
|
|
| 135 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 136 |
return [x for x in s if len(x)>=25]
|
| 137 |
|
| 138 |
+
# ====== (1) عبارات مفتاحية (YAKE) ======
|
| 139 |
+
def yake_keywords(t: str, k: int = 260) -> List[str]:
|
| 140 |
phrases = []
|
| 141 |
seen = set()
|
| 142 |
for n in [3, 2, 1]:
|
|
|
|
| 159 |
def good_kw(kw:str)->bool:
|
| 160 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 161 |
|
| 162 |
+
# ====== POS/NER اختياري ======
|
| 163 |
_HAS_CAMEL = False
|
| 164 |
try:
|
| 165 |
from camel_tools.morphology.analyzer import Analyzer
|
|
|
|
| 170 |
except Exception:
|
| 171 |
_HAS_CAMEL = False
|
| 172 |
|
| 173 |
+
NER_TAGS = {"PER","LOC","ORG","MISC"}
|
| 174 |
|
| 175 |
def ar_pos(word: str) -> str:
|
| 176 |
if not _HAS_CAMEL:
|
|
|
|
| 177 |
if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
|
| 178 |
if re2.match(r"^[\p{N}]+$", word): return "NUM"
|
| 179 |
if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
|
|
|
|
| 197 |
return False
|
| 198 |
|
| 199 |
def is_clean_sentence(s: str) -> bool:
|
| 200 |
+
if not (60 <= len(s) <= 240): return False
|
| 201 |
if re2.search(r"https?://|www\.", s): return False
|
| 202 |
if re2.search(r"\d{2,}", s): return False
|
| 203 |
return True
|
|
|
|
| 208 |
if ar_pos(k) in {"PRON","PART"}: return False
|
| 209 |
return True
|
| 210 |
|
| 211 |
+
# ====== Embeddings/Masking/Cross-Encoder (اختياري) ======
|
| 212 |
_EMB = None
|
| 213 |
def get_embedder():
|
| 214 |
global _EMB
|
|
|
|
| 230 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 231 |
t, C = vecs[0], vecs[1:]
|
| 232 |
import numpy as np
|
| 233 |
+
sims = (C @ t)
|
| 234 |
idx = np.argsort(-sims)[:k]
|
| 235 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 236 |
|
|
|
|
| 265 |
except Exception:
|
| 266 |
return []
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
_CE = None
|
| 269 |
def get_cross_encoder():
|
| 270 |
global _CE
|
|
|
|
| 276 |
_CE = False
|
| 277 |
return _CE
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
| 280 |
ce = get_cross_encoder()
|
| 281 |
if not ce or not candidates:
|
|
|
|
| 288 |
except Exception:
|
| 289 |
return candidates
|
| 290 |
|
| 291 |
+
# --------- أدوات مساعدة للمشتّتات ---------
|
| 292 |
+
def word_tokens(s: str) -> List[str]:
|
| 293 |
+
s = norm_ar(s)
|
| 294 |
+
return re2.findall(r"\p{L}+", s)
|
| 295 |
+
|
| 296 |
+
def token_set(s: str) -> set:
|
| 297 |
+
return set([t for t in word_tokens(s) if t not in AR_STOP])
|
| 298 |
+
|
| 299 |
+
def jaccard(a: str, b: str) -> float:
|
| 300 |
+
A, B = token_set(a), token_set(b)
|
| 301 |
+
if not A or not B: return 0.0
|
| 302 |
+
return len(A & B) / max(1, len(A | B))
|
| 303 |
+
|
| 304 |
+
def is_sub_or_super(a: str, b: str) -> bool:
|
| 305 |
+
A, B = norm_ar(a), norm_ar(b)
|
| 306 |
+
return (A in B) or (B in A)
|
| 307 |
+
|
| 308 |
+
def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
|
| 309 |
+
toks = word_tokens(w)
|
| 310 |
+
if len(toks) < 3:
|
| 311 |
+
return False
|
| 312 |
+
return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None
|
| 313 |
+
|
| 314 |
+
def choice_length_ok(w: str) -> bool:
|
| 315 |
+
n = len(word_tokens(w))
|
| 316 |
+
return 1 <= n <= 6
|
| 317 |
+
|
| 318 |
+
def paragraph_index_map(text: str, sentences: List[str]) -> dict:
|
| 319 |
+
paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
|
| 320 |
+
mapping = {}
|
| 321 |
+
for i, s in enumerate(sentences):
|
| 322 |
+
ns = norm_ar(s)
|
| 323 |
+
pid = None
|
| 324 |
+
for j, p in enumerate(paras):
|
| 325 |
+
if ns and ns in p:
|
| 326 |
+
pid = j; break
|
| 327 |
+
mapping[s] = pid if pid is not None else -1
|
| 328 |
+
return mapping
|
| 329 |
+
|
| 330 |
+
def looks_like_title_fragment(s: str) -> bool:
    """Heuristic: a colon appearing very early in *s* suggests a "Title: body" fragment.

    "Early" means before character max(10, len(s)//6).
    """
    pos = s.find(":")
    if pos == -1:
        return False
    return pos < max(10, len(s) // 6)
|
| 332 |
+
|
| 333 |
+
def is_nouny_phrase(w: str) -> bool:
    """Cheap noun-phrase filter: 1-4 tokens and no verb-like/modal ending."""
    n_tokens = len(word_tokens(w))
    if n_tokens < 1 or n_tokens > 4:
        return False
    # Reject phrases ending in common verbal / modal forms.
    return re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w) is None
|
| 338 |
+
|
| 339 |
+
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
    """Pick the best candidate keyword to blank out of *sentence*.

    Strategy: strip a leading "Title:" prefix if present, run YAKE over the
    sentence, keep nouny/safe candidates that occur whole-word in the
    sentence, and score them by length plus 0.7x their frequency in
    *global_text*. Falls back to the longest single safe token when YAKE
    yields nothing usable. Returns None when no candidate survives.
    """
    # Drop a short title prefix so the keyword comes from the body.
    if looks_like_title_fragment(sentence):
        parts = sentence.split(":", 1)
        sentence = parts[1] if len(parts) > 1 else sentence
    # YAKE may raise on degenerate input; treat that as "no candidates".
    try:
        ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
        pairs = ex.extract_keywords(sentence)
    except Exception:
        pairs = []
    cands = []
    for w, _ in pairs:
        w = re2.sub(r"\s+", " ", w.strip())
        if not w or not good_kw(w) or not safe_keyword(w):
            continue
        if not is_nouny_phrase(w):
            continue
        # Must occur as a whole word in the (possibly trimmed) sentence.
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
            continue
        # Longer and globally more frequent phrases score higher.
        freq_weight = global_text.count(w)
        cands.append((w, len(w) + 0.7*freq_weight))
    if not cands:
        # Fallback: longest single safe, nouny token from the sentence.
        toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
        toks = [t for t in toks if is_nouny_phrase(t)]
        toks.sort(key=len, reverse=True)
        return toks[0] if toks else None
    cands.sort(key=lambda x: -x[1])
    return cands[0][0]
|
| 366 |
+
|
| 367 |
+
def similarity_caps(difficulty: str):
    """Maximum allowed semantic-similarity cap for distractors per difficulty.

    Easy questions cap lower (distractors must be clearly different);
    hard ones allow closer distractors. Unknown levels get the medium cap.
    """
    caps = {"سهل": 0.88, "صعب": 0.95}
    return caps.get(difficulty, 0.92)
|
| 373 |
+
|
| 374 |
def tokenize_ar(s: str) -> List[str]:
|
| 375 |
s = norm_ar(s)
|
| 376 |
toks = re2.findall(r"\p{L}+", s)
|
|
|
|
| 395 |
pool.add(tok)
|
| 396 |
return list(pool)
|
| 397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
    """Shape-based distractors for *answer*: toggle the definite article,
    swap ي/ى and ة/ه, and delete one middle character.

    Variants that normalize back to the answer itself are discarded.
    Returns at most *k* variants, in deterministic (sorted) order.
    """
    a = norm_ar(answer)
    variants = set()  # renamed from `vars`, which shadowed the builtin
    # Toggle the definite article "ال".
    if a.startswith("ال"):
        variants.add(a[2:])
    else:
        variants.add("ال" + a)
    # Common orthographic confusions.
    variants.add(a.replace("ي", "ى"))
    variants.add(a.replace("ى", "ي"))
    variants.add(a.replace("ة", "ه"))
    variants.add(a.replace("ه", "ة"))
    # One-character deletion in the middle for longer words.
    if len(a) > 5:
        mid = len(a)//2
        variants.add(a[:mid] + a[mid+1:])
    # Sort for a deterministic result: set iteration order varies across
    # runs, which defeated the file's random.seed(42) reproducibility.
    out = sorted(v for v in variants if v and norm_ar(v) != norm_ar(a))
    return out[:k]
|
| 415 |
|
| 416 |
+
# ====== مشتّتات ذكية ======
|
| 417 |
+
def pos_compatible(a: str, b: str) -> bool:
    """Two terms are POS-compatible when their tags match or either is unknown ("X")."""
    pa, pb = ar_pos(a), ar_pos(b)
    return "X" in (pa, pb) or pa == pb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
+
def length_close(a: str, b: str) -> bool:
    """True when the character-length gap between *a* and *b* is small.

    The tolerance is the larger of 6 characters and half of len(b).
    """
    tolerance = max(6, len(b) // 2)
    return abs(len(a) - len(b)) <= tolerance
|
|
|
|
|
|
|
| 425 |
|
| 426 |
+
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
|
| 427 |
+
all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
|
| 428 |
+
base: List[str] = []
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
+
# (0) مشتّتات شكلية أولاً
|
| 431 |
+
base.extend(typo_like_variants(correct, k=4))
|
| 432 |
|
| 433 |
# (أ) جيران دلاليين
|
| 434 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
|
| 436 |
# (ب) FILL-MASK
|
| 437 |
+
for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
|
| 438 |
if w not in base:
|
| 439 |
base.append(w)
|
| 440 |
|
| 441 |
+
# (ج) BM25
|
| 442 |
if all_sentences:
|
| 443 |
bm, corp = bm25_build(all_sentences)
|
| 444 |
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
|
|
|
| 446 |
base.append(w)
|
| 447 |
|
| 448 |
# فلترة صارمة
|
| 449 |
+
clean: List[str] = []
|
| 450 |
for w in base:
|
| 451 |
+
w = (w or "").strip()
|
| 452 |
if not w or w == correct:
|
| 453 |
continue
|
| 454 |
if not choice_length_ok(w):
|
|
|
|
| 467 |
continue
|
| 468 |
clean.append(w)
|
| 469 |
|
| 470 |
+
# ترتيب (اختياري) + فلتر قرب دلالي
|
| 471 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*4, k)]
|
| 472 |
cap = similarity_caps(difficulty)
|
| 473 |
try:
|
|
|
|
| 483 |
except Exception:
|
| 484 |
pass
|
| 485 |
|
| 486 |
+
# تجميع أخير
|
| 487 |
out, seen = [], set()
|
| 488 |
for w in clean:
|
| 489 |
if w in seen:
|
|
|
|
| 492 |
if len(out) >= k:
|
| 493 |
break
|
| 494 |
|
| 495 |
+
# تعويض إذا لزم
|
| 496 |
if len(out) < k:
|
| 497 |
extras = [w for w in phrase_pool
|
| 498 |
if w not in out and w != correct and choice_length_ok(w)
|
|
|
|
| 501 |
and jaccard(w, correct) < 0.5]
|
| 502 |
out.extend(extras[:(k-len(out))])
|
| 503 |
if len(out) < k:
|
| 504 |
+
out.extend([w for w in ["…"]*(k-len(out))]) # لن تُقبل لاحقًا إن لم نكمل 4 خيارات
|
| 505 |
return out[:k]
|
| 506 |
|
| 507 |
+
# ====== mT5 (اختياري) ======
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
_MT5 = {"tok": None, "model": None, "ok": False}
|
| 509 |
def get_mt5():
|
| 510 |
if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
|
|
|
|
| 536 |
"أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
|
| 537 |
"من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
|
| 538 |
"أعد فقط JSON بهذا الشكل:\n"
|
| 539 |
+
"{\n"
|
| 540 |
"\"question\": \"...\",\n"
|
| 541 |
"\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
|
| 542 |
"\"answer_index\": 0\n"
|
|
|
|
| 575 |
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 576 |
tok, model, ok = get_mt5()
|
| 577 |
if not ok:
|
|
|
|
| 578 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 579 |
|
| 580 |
sents_all = split_sents(text)
|
| 581 |
+
sents = [s for s in sents_all if is_clean_sentence(s)] or sents_all[:]
|
|
|
|
|
|
|
| 582 |
if not sents:
|
| 583 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 584 |
|
| 585 |
+
# دمج جمل قصيرة لمقاطع مفيدة
|
| 586 |
+
def make_chunks(sents, max_len=220):
|
| 587 |
chunks = []
|
| 588 |
i = 0
|
| 589 |
while i < len(sents):
|
|
|
|
| 624 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 625 |
return items[:n]
|
| 626 |
|
| 627 |
+
# ------------------ مُولّد أسئلة "فراغ" (نهائي) ------------------
|
| 628 |
+
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    """Generate up to *n* fill-in-the-blank MCQs from Arabic *text*.

    Pipeline: split into sentences, extract YAKE keyphrases, pair each
    keyphrase with the first sentence containing it as a whole word, blank
    the keyphrase out, and build distractors via smart_distractors().
    At most MAX_PER_PARA questions are drawn from any single paragraph,
    and each sentence is used at most once.

    Raises:
        ValueError: when no usable sentences were found.
        RuntimeError: when no question could be generated at all.
    """
    all_sents = split_sents(text)
    # Prefer clean sentences; fall back to all of them if the filter rejects everything.
    sents = [s for s in all_sents if is_clean_sentence(s)] or all_sents[:]
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    keyphrases = yake_keywords(text, k=260)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]

    # First sentence (in order) containing each keyphrase as a whole word.
    sent_for: dict = {}
    for s in sents:
        for kp in keyphrases:
            if kp in sent_for:
                continue
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                sent_for[kp] = s
            # Stop collecting once we have ~5x more pairs than needed.
            if len(sent_for) >= n * 5:
                break

    para_map = paragraph_index_map(text, sents)
    used_sentences: set = set()
    items: List[MCQ] = []

    # Quota: at most this many questions from any one paragraph.
    MAX_PER_PARA = 2
    para_count: dict = {}

    def add_item_from_pair(sentence: str, kp: str) -> bool:
        """Try to turn (sentence, keyphrase) into one MCQ; return True on success."""
        nonlocal items, used_sentences, para_count
        pid = para_map.get(sentence, -1)
        if para_count.get(pid, 0) >= MAX_PER_PARA:
            return False
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
            return False
        # Blank only the first whole-word occurrence.
        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
        pool = [x for x in keyphrases if x != kp] or keyphrases[:]
        ch = smart_distractors(kp, pool, sentence, k=3,
                              all_sentences=all_sents, difficulty=difficulty) + [kp]

        # Filter distractors: dedupe, length bounds, not a long fragment of the
        # sentence itself, and not too similar to the correct answer.
        choices, seen = [], set()
        for c in ch:
            c = (c or "").strip()
            if not c or c in seen:
                continue
            if not choice_length_ok(c):
                continue
            if appears_as_long_fragment_in_sentence(c, sentence):
                continue
            if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
                continue
            seen.add(c); choices.append(c)

        # The correct answer always ends up in the choice list.
        if kp not in choices:
            choices.append(kp); seen.add(kp)
        if len(choices) < 4:
            return False

        choices = choices[:4]
        random.shuffle(choices)
        ans = choices.index(kp)

        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
        used_sentences.add(sentence)
        para_count[pid] = para_count.get(pid, 0) + 1
        return True

    # First pass: longest keyphrases first, diversified across paragraphs.
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items) >= n: break
        s = sent_for[kp]
        if s in used_sentences:
            continue
        _ = add_item_from_pair(s, kp)

    def fill_from_sentences(candidates: List[str]):
        """Second-chance pass: derive a keyword per sentence and try to add an item."""
        for s in candidates:
            if len(items) >= n: break
            if s in used_sentences:
                continue
            # Reuse a keyphrase already paired with this sentence if one exists.
            kp = None
            for kpp, ss in sent_for.items():
                if ss == s:
                    kp = kpp; break
            if kp is None:
                kp = best_keyword_in_sentence(s, text)
            if not kp:
                continue
            _ = add_item_from_pair(s, kp)

    # Top up from paragraphs still under quota, then from any unused sentence.
    if len(items) < n:
        remaining_new_para = [s for s in sents if para_count.get(para_map.get(s, -1), 0) < MAX_PER_PARA]
        fill_from_sentences(remaining_new_para)
    if len(items) < n:
        leftovers = [s for s in sents if s not in used_sentences]
        fill_from_sentences(leftovers)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items[:n]
|
| 726 |
|
| 727 |
# ------------------ تحويل إلى سجلات العرض ------------------
|
| 728 |
def clean_option_text(t: str) -> str:
|
|
|
|
| 730 |
t = re2.sub(AR_DIAC, "", t)
|
| 731 |
t = re2.sub(r"\s+", " ", t)
|
| 732 |
t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
|
| 733 |
+
# قصّ لطول معقول
|
| 734 |
+
t = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", t)
|
| 735 |
return t or "…"
|
| 736 |
|
| 737 |
def to_records(items:List[MCQ])->List[dict]:
|
|
|
|
| 742 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 743 |
txt=(it.choices[i] if i<len(it.choices) else "…")
|
| 744 |
txt=clean_option_text(txt.replace(",", "،").replace("?", "؟").replace(";", "؛"))
|
|
|
|
| 745 |
if txt in used:
|
| 746 |
+
txt = f"{txt}{i+1}"
|
| 747 |
used.add(txt)
|
| 748 |
opts.append({"id":lbl,"text":txt,"is_correct":(i==it.answer_index)})
|
| 749 |
recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
|
|
|
|
| 782 |
""")
|
| 783 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 784 |
|
| 785 |
+
# ------------------ بناء الامتحان وتبديل الصفحات ------------------
|
| 786 |
def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
| 787 |
text_area = (text_area or "").strip()
|
| 788 |
if not text_area and not file_path:
|
|
|
|
| 809 |
warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
|
| 810 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
|
| 811 |
|
|
|
|
| 812 |
# ------------------ CSS ------------------
|
| 813 |
CSS = """
|
| 814 |
:root{
|
|
|
|
| 819 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 820 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 821 |
|
| 822 |
+
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير أبعادها */
|
| 823 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 824 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 825 |
.small{opacity:.9;color:#d9dee8}
|
| 826 |
|
| 827 |
+
/* إخفاء معاينة الملف */
|
| 828 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 829 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 830 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
|
|
| 852 |
|
| 853 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 854 |
.q-actions .q-submit{
|
| 855 |
+
background:#2dd4bf;border:none;color:#0b0d10;font-weight:800;border-radius:10px;padding:8px 14px;cursor:pointer;
|
| 856 |
}
|
| 857 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 858 |
.q-note{color:#ffd1d6}
|
| 859 |
.q-note.warn{color:#ffd1d6}
|
| 860 |
"""
|
| 861 |
|
| 862 |
+
# ------------------ JS: ربط Submit + إبراز الصح ------------------
|
| 863 |
ATTACH_LISTENERS_JS = """
|
| 864 |
() => {
|
| 865 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
|
|
|
| 918 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 919 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 920 |
|
|
|
|
| 921 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 922 |
with page1:
|
| 923 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
|
|
| 926 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 927 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 928 |
|
|
|
|
| 929 |
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 930 |
difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
|
| 931 |
|
|
|
|
| 944 |
btn_build = gr.Button("generate question", elem_classes=["button-primary"])
|
| 945 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 946 |
|
|
|
|
| 947 |
page2 = gr.Group(visible=False)
|
| 948 |
with page2:
|
| 949 |
quiz_html = gr.HTML("")
|
| 950 |
+
js_wired = gr.Textbox(visible=False)
|
| 951 |
|
|
|
|
| 952 |
btn_build.click(
|
| 953 |
build_quiz,
|
| 954 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
|