Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
#
|
| 3 |
-
# صفحات ثابتة + Submit لكل سؤال فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 4 |
-
# طور "فراغ" + طور "فهم مباشر" (mT5) مع fallbacks، صعوبة، BM25، فلترة قوية للمشتّتات، وتنويع على مستوى الفقرات.
|
| 5 |
|
| 6 |
import os, json, uuid, random, unicodedata
|
| 7 |
from dataclasses import dataclass
|
| 8 |
from pathlib import Path
|
| 9 |
-
from typing import List, Tuple
|
| 10 |
|
| 11 |
from PIL import Image
|
| 12 |
from pypdf import PdfReader
|
|
@@ -15,34 +13,21 @@ import regex as re2
|
|
| 15 |
import yake
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
-
# ------------------ إعدادات عام
|
| 19 |
random.seed(42)
|
| 20 |
DEFAULT_NUM_QUESTIONS = 6
|
| 21 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 22 |
DEFAULT_TROCR_ZOOM = 2.6
|
| 23 |
-
QUESTION_MODES = ["فراغ", "فهم مباشر"]
|
| 24 |
-
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
|
| 25 |
-
|
| 26 |
-
# BM25 (اختياري)
|
| 27 |
-
try:
|
| 28 |
-
from rank_bm25 import BM25Okapi
|
| 29 |
-
_HAS_BM25 = True
|
| 30 |
-
except Exception:
|
| 31 |
-
_HAS_BM25 = False
|
| 32 |
|
| 33 |
# ------------------ OCR (تحميل كسول) ------------------
|
| 34 |
_OCR = {}
|
| 35 |
def get_ocr(model_id: str):
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
return _OCR[model_id]
|
| 43 |
-
except Exception:
|
| 44 |
-
# بديل آمن: دالة تُعيد نصًا فارغًا
|
| 45 |
-
return lambda im: [{"generated_text": ""}]
|
| 46 |
|
| 47 |
# ------------------ PDF/TXT → نص ------------------
|
| 48 |
def extract_text_with_pypdf(path: str) -> str:
|
|
@@ -120,7 +105,7 @@ def postprocess(raw:str)->str:
|
|
| 120 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 121 |
return norm_ar(t)
|
| 122 |
|
| 123 |
-
# ------------------
|
| 124 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 125 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 126 |
|
|
@@ -135,8 +120,8 @@ def split_sents(t:str)->List[str]:
|
|
| 135 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 136 |
return [x for x in s if len(x)>=25]
|
| 137 |
|
| 138 |
-
# ====== (1) عبارات مفتاحية
|
| 139 |
-
def yake_keywords(t: str, k: int =
|
| 140 |
phrases = []
|
| 141 |
seen = set()
|
| 142 |
for n in [3, 2, 1]:
|
|
@@ -159,56 +144,7 @@ def yake_keywords(t: str, k: int = 260) -> List[str]:
|
|
| 159 |
def good_kw(kw:str)->bool:
|
| 160 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 161 |
|
| 162 |
-
# ======
|
| 163 |
-
_HAS_CAMEL = False
|
| 164 |
-
try:
|
| 165 |
-
from camel_tools.morphology.analyzer import Analyzer
|
| 166 |
-
from camel_tools.ner import NERecognizer
|
| 167 |
-
_HAS_CAMEL = True
|
| 168 |
-
_AN = Analyzer.builtin_analyzer()
|
| 169 |
-
_NER = NERecognizer.pretrained()
|
| 170 |
-
except Exception:
|
| 171 |
-
_HAS_CAMEL = False
|
| 172 |
-
|
| 173 |
-
NER_TAGS = {"PER","LOC","ORG","MISC"}
|
| 174 |
-
|
| 175 |
-
def ar_pos(word: str) -> str:
|
| 176 |
-
if not _HAS_CAMEL:
|
| 177 |
-
if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
|
| 178 |
-
if re2.match(r"^[\p{N}]+$", word): return "NUM"
|
| 179 |
-
if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
|
| 180 |
-
return "X"
|
| 181 |
-
try:
|
| 182 |
-
ana = _AN.analyze(word)
|
| 183 |
-
if not ana: return "X"
|
| 184 |
-
from collections import Counter
|
| 185 |
-
pos_candidates = [a.get('pos','X') for a in ana]
|
| 186 |
-
return Counter(pos_candidates).most_common(1)[0][0] if pos_candidates else "X"
|
| 187 |
-
except Exception:
|
| 188 |
-
return "X"
|
| 189 |
-
|
| 190 |
-
def is_named_entity(token: str) -> bool:
|
| 191 |
-
if not _HAS_CAMEL:
|
| 192 |
-
return False
|
| 193 |
-
try:
|
| 194 |
-
tag = _NER.predict_sentence([token])[0]
|
| 195 |
-
return tag in NER_TAGS
|
| 196 |
-
except Exception:
|
| 197 |
-
return False
|
| 198 |
-
|
| 199 |
-
def is_clean_sentence(s: str) -> bool:
|
| 200 |
-
if not (60 <= len(s) <= 240): return False
|
| 201 |
-
if re2.search(r"https?://|www\.", s): return False
|
| 202 |
-
if re2.search(r"\d{2,}", s): return False
|
| 203 |
-
return True
|
| 204 |
-
|
| 205 |
-
def safe_keyword(k: str) -> bool:
|
| 206 |
-
if not good_kw(k): return False
|
| 207 |
-
if is_named_entity(k): return False
|
| 208 |
-
if ar_pos(k) in {"PRON","PART"}: return False
|
| 209 |
-
return True
|
| 210 |
-
|
| 211 |
-
# ====== Embeddings/Masking/Cross-Encoder (اختياري) ======
|
| 212 |
_EMB = None
|
| 213 |
def get_embedder():
|
| 214 |
global _EMB
|
|
@@ -217,10 +153,10 @@ def get_embedder():
|
|
| 217 |
from sentence_transformers import SentenceTransformer
|
| 218 |
_EMB = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 219 |
except Exception:
|
| 220 |
-
_EMB = False
|
| 221 |
return _EMB
|
| 222 |
|
| 223 |
-
def nearest_terms(target: str, pool: List[str], k: int =
|
| 224 |
emb = get_embedder()
|
| 225 |
if not emb:
|
| 226 |
return []
|
|
@@ -230,7 +166,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str,
|
|
| 230 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 231 |
t, C = vecs[0], vecs[1:]
|
| 232 |
import numpy as np
|
| 233 |
-
sims = (C @ t)
|
| 234 |
idx = np.argsort(-sims)[:k]
|
| 235 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 236 |
|
|
@@ -245,19 +181,19 @@ def get_masker():
|
|
| 245 |
_MLM = False
|
| 246 |
return _MLM
|
| 247 |
|
| 248 |
-
def mlm_distractors(sentence_with_blank: str, correct: str, k: int =
|
| 249 |
masker = get_masker()
|
| 250 |
if not masker:
|
| 251 |
return []
|
| 252 |
masked = sentence_with_blank.replace("_____", masker.tokenizer.mask_token)
|
| 253 |
try:
|
| 254 |
-
outs = masker(masked, top_k=max(
|
| 255 |
cands = []
|
| 256 |
for o in outs:
|
| 257 |
tok = o["token_str"].strip()
|
| 258 |
if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
|
| 259 |
cands.append(tok)
|
| 260 |
-
|
| 261 |
for w in cands:
|
| 262 |
if w not in seen:
|
| 263 |
uniq.append(w); seen.add(w)
|
|
@@ -265,487 +201,127 @@ def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List
|
|
| 265 |
except Exception:
|
| 266 |
return []
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
A, B = token_set(a), token_set(b)
|
| 301 |
-
if not A or not B: return 0.0
|
| 302 |
-
return len(A & B) / max(1, len(A | B))
|
| 303 |
-
|
| 304 |
-
def is_sub_or_super(a: str, b: str) -> bool:
|
| 305 |
-
A, B = norm_ar(a), norm_ar(b)
|
| 306 |
-
return (A in B) or (B in A)
|
| 307 |
-
|
| 308 |
-
def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
|
| 309 |
-
toks = word_tokens(w)
|
| 310 |
-
if len(toks) < 3:
|
| 311 |
-
return False
|
| 312 |
-
return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None
|
| 313 |
-
|
| 314 |
-
def choice_length_ok(w: str) -> bool:
|
| 315 |
-
n = len(word_tokens(w))
|
| 316 |
-
return 1 <= n <= 6
|
| 317 |
-
|
| 318 |
-
def paragraph_index_map(text: str, sentences: List[str]) -> dict:
|
| 319 |
-
paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
|
| 320 |
-
mapping = {}
|
| 321 |
-
for i, s in enumerate(sentences):
|
| 322 |
-
ns = norm_ar(s)
|
| 323 |
-
pid = None
|
| 324 |
-
for j, p in enumerate(paras):
|
| 325 |
-
if ns and ns in p:
|
| 326 |
-
pid = j; break
|
| 327 |
-
mapping[s] = pid if pid is not None else -1
|
| 328 |
-
return mapping
|
| 329 |
-
|
| 330 |
-
def looks_like_title_fragment(s: str) -> bool:
|
| 331 |
-
return ":" in s and s.index(":") < max(10, len(s)//6)
|
| 332 |
-
|
| 333 |
-
def is_nouny_phrase(w: str) -> bool:
|
| 334 |
-
toks = word_tokens(w)
|
| 335 |
-
if not (1 <= len(toks) <= 4): return False
|
| 336 |
-
if re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w): return False
|
| 337 |
-
return True
|
| 338 |
-
|
| 339 |
-
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
|
| 340 |
-
if looks_like_title_fragment(sentence):
|
| 341 |
-
parts = sentence.split(":", 1)
|
| 342 |
-
sentence = parts[1] if len(parts) > 1 else sentence
|
| 343 |
-
try:
|
| 344 |
-
ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
|
| 345 |
-
pairs = ex.extract_keywords(sentence)
|
| 346 |
-
except Exception:
|
| 347 |
-
pairs = []
|
| 348 |
-
cands = []
|
| 349 |
-
for w, _ in pairs:
|
| 350 |
-
w = re2.sub(r"\s+", " ", w.strip())
|
| 351 |
-
if not w or not good_kw(w) or not safe_keyword(w):
|
| 352 |
-
continue
|
| 353 |
-
if not is_nouny_phrase(w):
|
| 354 |
-
continue
|
| 355 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
|
| 356 |
-
continue
|
| 357 |
-
freq_weight = global_text.count(w)
|
| 358 |
-
cands.append((w, len(w) + 0.7*freq_weight))
|
| 359 |
-
if not cands:
|
| 360 |
-
toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
|
| 361 |
-
toks = [t for t in toks if is_nouny_phrase(t)]
|
| 362 |
-
toks.sort(key=len, reverse=True)
|
| 363 |
-
return toks[0] if toks else None
|
| 364 |
-
cands.sort(key=lambda x: -x[1])
|
| 365 |
-
return cands[0][0]
|
| 366 |
-
|
| 367 |
-
def similarity_caps(difficulty: str):
|
| 368 |
-
if difficulty == "سهل":
|
| 369 |
-
return 0.88
|
| 370 |
-
if difficulty == "صعب":
|
| 371 |
-
return 0.95
|
| 372 |
-
return 0.92
|
| 373 |
-
|
| 374 |
-
def tokenize_ar(s: str) -> List[str]:
|
| 375 |
-
s = norm_ar(s)
|
| 376 |
-
toks = re2.findall(r"\p{L}+", s)
|
| 377 |
-
return [t for t in toks if len(t) >= 2 and t not in AR_STOP]
|
| 378 |
-
|
| 379 |
-
def bm25_build(sentences: List[str]):
|
| 380 |
-
if not _HAS_BM25 or not sentences:
|
| 381 |
-
return None, []
|
| 382 |
-
corpus_tokens = [tokenize_ar(s) for s in sentences]
|
| 383 |
-
bm = BM25Okapi(corpus_tokens)
|
| 384 |
-
return bm, corpus_tokens
|
| 385 |
-
|
| 386 |
-
def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top: int = 20) -> List[str]:
|
| 387 |
-
if not bm: return []
|
| 388 |
-
q = tokenize_ar(correct)
|
| 389 |
-
scores = bm.get_scores(q)
|
| 390 |
-
idxs = sorted(range(len(scores)), key=lambda i: -scores[i])[:min(top, len(scores))]
|
| 391 |
-
pool = set()
|
| 392 |
-
for i in idxs:
|
| 393 |
-
for tok in corpus_tokens[i]:
|
| 394 |
-
if tok != correct and good_kw(tok):
|
| 395 |
-
pool.add(tok)
|
| 396 |
-
return list(pool)
|
| 397 |
-
|
| 398 |
-
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
|
| 399 |
-
"""مشتّتات شكلية: تعريف/تنكير، ي/ى، ة/ه، حذف حرف."""
|
| 400 |
-
a = norm_ar(answer)
|
| 401 |
-
vars = set()
|
| 402 |
-
if a.startswith("ال"):
|
| 403 |
-
vars.add(a[2:])
|
| 404 |
-
else:
|
| 405 |
-
vars.add("ال" + a)
|
| 406 |
-
vars.add(a.replace("ي", "ى"))
|
| 407 |
-
vars.add(a.replace("ى", "ي"))
|
| 408 |
-
vars.add(a.replace("ة", "ه"))
|
| 409 |
-
vars.add(a.replace("ه", "ة"))
|
| 410 |
-
if len(a) > 5:
|
| 411 |
-
mid = len(a)//2
|
| 412 |
-
vars.add(a[:mid] + a[mid+1:])
|
| 413 |
-
out = [v for v in vars if v and norm_ar(v) != norm_ar(a)]
|
| 414 |
-
return out[:k]
|
| 415 |
-
|
| 416 |
-
# ====== مشتّتات ذكية ======
|
| 417 |
-
def pos_compatible(a: str, b: str) -> bool:
|
| 418 |
-
pa, pb = ar_pos(a), ar_pos(b)
|
| 419 |
-
if "X" in (pa, pb):
|
| 420 |
-
return True
|
| 421 |
-
return pa == pb
|
| 422 |
-
|
| 423 |
-
def length_close(a: str, b: str) -> bool:
|
| 424 |
-
return abs(len(a) - len(b)) <= max(6, len(b)//2)
|
| 425 |
-
|
| 426 |
-
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
|
| 427 |
-
all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
|
| 428 |
-
base: List[str] = []
|
| 429 |
-
|
| 430 |
-
# (0) مشتّتات شكلية أولاً
|
| 431 |
-
base.extend(typo_like_variants(correct, k=4))
|
| 432 |
-
|
| 433 |
-
# (أ) جيران دلاليين
|
| 434 |
-
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
-
|
| 436 |
-
# (ب) FILL-MASK
|
| 437 |
-
for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
|
| 438 |
-
if w not in base:
|
| 439 |
-
base.append(w)
|
| 440 |
-
|
| 441 |
-
# (ج) BM25
|
| 442 |
-
if all_sentences:
|
| 443 |
-
bm, corp = bm25_build(all_sentences)
|
| 444 |
-
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
| 445 |
-
if w not in base:
|
| 446 |
-
base.append(w)
|
| 447 |
-
|
| 448 |
-
# فلترة صارمة
|
| 449 |
-
clean: List[str] = []
|
| 450 |
-
for w in base:
|
| 451 |
-
w = (w or "").strip()
|
| 452 |
-
if not w or w == correct:
|
| 453 |
-
continue
|
| 454 |
-
if not choice_length_ok(w):
|
| 455 |
-
continue
|
| 456 |
-
if appears_as_long_fragment_in_sentence(w, sentence):
|
| 457 |
-
continue
|
| 458 |
-
if is_named_entity(w):
|
| 459 |
-
continue
|
| 460 |
-
if not pos_compatible(w, correct):
|
| 461 |
-
continue
|
| 462 |
-
if not length_close(w, correct):
|
| 463 |
-
continue
|
| 464 |
-
if is_sub_or_super(w, correct):
|
| 465 |
continue
|
| 466 |
-
if
|
| 467 |
continue
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
# ترتيب (اختياري) + فلتر قرب دلالي
|
| 471 |
-
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*4, k)]
|
| 472 |
-
cap = similarity_caps(difficulty)
|
| 473 |
-
try:
|
| 474 |
-
emb = get_embedder()
|
| 475 |
-
if emb and clean:
|
| 476 |
-
vecs = emb.encode([correct] + clean, normalize_embeddings=True)
|
| 477 |
-
c, others = vecs[0], vecs[1:]
|
| 478 |
-
import numpy as np
|
| 479 |
-
sims = others @ c
|
| 480 |
-
filtered = [w for w, s in zip(clean, sims) if s < cap]
|
| 481 |
-
if len(filtered) >= k:
|
| 482 |
-
clean = filtered
|
| 483 |
-
except Exception:
|
| 484 |
-
pass
|
| 485 |
-
|
| 486 |
-
# تجميع أخير
|
| 487 |
-
out, seen = [], set()
|
| 488 |
-
for w in clean:
|
| 489 |
-
if w in seen:
|
| 490 |
continue
|
| 491 |
-
|
| 492 |
if len(out) >= k:
|
| 493 |
break
|
| 494 |
|
| 495 |
-
#
|
| 496 |
if len(out) < k:
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
and not appears_as_long_fragment_in_sentence(w, sentence)
|
| 500 |
-
and not is_sub_or_super(w, correct)
|
| 501 |
-
and jaccard(w, correct) < 0.5]
|
| 502 |
-
out.extend(extras[:(k-len(out))])
|
| 503 |
-
if len(out) < k:
|
| 504 |
-
out.extend([w for w in ["…"]*(k-len(out))]) # لن تُقبل لاحقًا إن لم نكمل 4 خيارات
|
| 505 |
-
return out[:k]
|
| 506 |
-
|
| 507 |
-
# ====== mT5 (اختياري) ======
|
| 508 |
-
_MT5 = {"tok": None, "model": None, "ok": False}
|
| 509 |
-
def get_mt5():
|
| 510 |
-
if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
|
| 511 |
-
return _MT5["tok"], _MT5["model"], _MT5["ok"]
|
| 512 |
-
try:
|
| 513 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 514 |
-
_MT5["tok"] = AutoTokenizer.from_pretrained("google/mt5-small")
|
| 515 |
-
_MT5["model"] = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
|
| 516 |
-
_MT5["ok"] = True
|
| 517 |
-
except Exception:
|
| 518 |
-
_MT5["tok"] = None; _MT5["model"] = None; _MT5["ok"] = False
|
| 519 |
-
return _MT5["tok"], _MT5["model"], _MT5["ok"]
|
| 520 |
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
except Exception:
|
| 525 |
-
pass
|
| 526 |
-
m = re2.search(r"\{.*\}", s, flags=re2.DOTALL)
|
| 527 |
-
if m:
|
| 528 |
-
try:
|
| 529 |
-
return json.loads(m.group(0))
|
| 530 |
-
except Exception:
|
| 531 |
-
return None
|
| 532 |
-
return None
|
| 533 |
-
|
| 534 |
-
def comp_prompt(sentence: str) -> str:
|
| 535 |
-
return (
|
| 536 |
-
"أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
|
| 537 |
-
"من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
|
| 538 |
-
"أعد فقط JSON بهذا الشكل:\n"
|
| 539 |
-
"{\n"
|
| 540 |
-
"\"question\": \"...\",\n"
|
| 541 |
-
"\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
|
| 542 |
-
"\"answer_index\": 0\n"
|
| 543 |
-
"}\n\n"
|
| 544 |
-
f"الجملة: {sentence}"
|
| 545 |
-
)
|
| 546 |
-
|
| 547 |
-
def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MCQ]:
|
| 548 |
-
try:
|
| 549 |
-
import torch
|
| 550 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 551 |
-
model = model.to(device)
|
| 552 |
-
inp = tok(comp_prompt(sentence), return_tensors="pt").to(device)
|
| 553 |
-
out = model.generate(
|
| 554 |
-
**inp,
|
| 555 |
-
max_new_tokens=max_new_tokens,
|
| 556 |
-
do_sample=True,
|
| 557 |
-
temperature=0.8,
|
| 558 |
-
top_p=0.9,
|
| 559 |
-
num_return_sequences=1,
|
| 560 |
-
eos_token_id=tok.eos_token_id
|
| 561 |
-
)
|
| 562 |
-
text = tok.decode(out[0], skip_special_tokens=True)
|
| 563 |
-
data = parse_json_block(text) or {}
|
| 564 |
-
q = str(data.get("question","")).strip()
|
| 565 |
-
choices = data.get("choices", [])
|
| 566 |
-
ai = data.get("answer_index", 0)
|
| 567 |
-
if not q or not isinstance(choices, list) or len(choices) < 4:
|
| 568 |
-
return None
|
| 569 |
-
choices = [str(c).strip() for c in choices][:4]
|
| 570 |
-
ai = ai if isinstance(ai, int) and 0 <= ai < 4 else 0
|
| 571 |
-
return MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ai)
|
| 572 |
-
except Exception:
|
| 573 |
-
return None
|
| 574 |
-
|
| 575 |
-
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 576 |
-
tok, model, ok = get_mt5()
|
| 577 |
-
if not ok:
|
| 578 |
-
return make_mcqs(text, n, difficulty=difficulty)
|
| 579 |
-
|
| 580 |
-
sents_all = split_sents(text)
|
| 581 |
-
sents = [s for s in sents_all if is_clean_sentence(s)] or sents_all[:]
|
| 582 |
-
if not sents:
|
| 583 |
-
return make_mcqs(text, n, difficulty=difficulty)
|
| 584 |
-
|
| 585 |
-
# دمج جمل قصيرة لمقاطع مفيدة
|
| 586 |
-
def make_chunks(sents, max_len=220):
|
| 587 |
-
chunks = []
|
| 588 |
-
i = 0
|
| 589 |
-
while i < len(sents):
|
| 590 |
-
cur = sents[i]
|
| 591 |
-
j = i + 1
|
| 592 |
-
while j < len(sents) and len(cur) + 1 + len(sents[j]) <= max_len:
|
| 593 |
-
cur = cur + " " + sents[j]
|
| 594 |
-
j += 1
|
| 595 |
-
chunks.append(cur)
|
| 596 |
-
i = j
|
| 597 |
-
return chunks
|
| 598 |
-
|
| 599 |
-
candidates = sents[:] + make_chunks(sents, max_len=220)
|
| 600 |
-
random.shuffle(candidates)
|
| 601 |
-
|
| 602 |
-
items: List[MCQ] = []
|
| 603 |
-
tried = 0
|
| 604 |
-
for s in candidates:
|
| 605 |
-
if len(items) >= n: break
|
| 606 |
-
mcq = gen_one_comp_q(s, tok, model)
|
| 607 |
-
tried += 1
|
| 608 |
-
if mcq:
|
| 609 |
-
q = re2.sub(r"\s+", " ", mcq.question).strip()
|
| 610 |
-
if not (12 <= len(q) <= 220):
|
| 611 |
-
continue
|
| 612 |
-
choices = [re2.sub(r"\s+", " ", c).strip() for c in mcq.choices]
|
| 613 |
-
seen=set(); clean=[]
|
| 614 |
-
for c in choices:
|
| 615 |
-
if c and c not in seen:
|
| 616 |
-
seen.add(c); clean.append(c)
|
| 617 |
-
clean = (clean + ["…","…","…","…"])[:4]
|
| 618 |
-
ai = mcq.answer_index if isinstance(mcq.answer_index,int) and 0<=mcq.answer_index<4 else 0
|
| 619 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=clean, answer_index=ai))
|
| 620 |
-
if tried >= n * 12:
|
| 621 |
-
break
|
| 622 |
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
# ------------------ مُولّد أسئلة "فراغ" (نهائي) ------------------
|
| 628 |
-
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 629 |
-
all_sents = split_sents(text)
|
| 630 |
-
sents = [s for s in all_sents if is_clean_sentence(s)] or all_sents[:]
|
| 631 |
-
if not sents:
|
| 632 |
raise ValueError("النص قصير أو غير صالح.")
|
| 633 |
|
| 634 |
-
|
| 635 |
-
keyphrases =
|
|
|
|
| 636 |
|
| 637 |
-
|
|
|
|
| 638 |
for s in sents:
|
|
|
|
|
|
|
| 639 |
for kp in keyphrases:
|
| 640 |
-
if kp in sent_for:
|
| 641 |
continue
|
| 642 |
if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
|
| 643 |
-
sent_for[kp]
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
para_count: dict = {}
|
| 653 |
-
|
| 654 |
-
def add_item_from_pair(sentence: str, kp: str) -> bool:
|
| 655 |
-
nonlocal items, used_sentences, para_count
|
| 656 |
-
pid = para_map.get(sentence, -1)
|
| 657 |
-
if para_count.get(pid, 0) >= MAX_PER_PARA:
|
| 658 |
-
return False
|
| 659 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
|
| 660 |
-
return False
|
| 661 |
-
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
|
| 662 |
-
pool = [x for x in keyphrases if x != kp] or keyphrases[:]
|
| 663 |
-
ch = smart_distractors(kp, pool, sentence, k=3,
|
| 664 |
-
all_sentences=all_sents, difficulty=difficulty) + [kp]
|
| 665 |
-
|
| 666 |
-
choices, seen = [], set()
|
| 667 |
-
for c in ch:
|
| 668 |
-
c = (c or "").strip()
|
| 669 |
-
if not c or c in seen:
|
| 670 |
-
continue
|
| 671 |
-
if not choice_length_ok(c):
|
| 672 |
continue
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
ans = choices.index(kp)
|
| 687 |
-
|
| 688 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
|
| 689 |
-
used_sentences.add(sentence)
|
| 690 |
-
para_count[pid] = para_count.get(pid, 0) + 1
|
| 691 |
-
return True
|
| 692 |
-
|
| 693 |
-
# تمريرة أولى: تنويع على الفقرات
|
| 694 |
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
| 695 |
-
if len(items)
|
| 696 |
-
s
|
| 697 |
-
if s in
|
| 698 |
continue
|
| 699 |
-
_ = add_item_from_pair(s, kp)
|
| 700 |
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
if len(items) >= n: break
|
| 704 |
-
if s in used_sentences:
|
| 705 |
-
continue
|
| 706 |
-
kp = None
|
| 707 |
-
for kpp, ss in sent_for.items():
|
| 708 |
-
if ss == s:
|
| 709 |
-
kp = kpp; break
|
| 710 |
-
if kp is None:
|
| 711 |
-
kp = best_keyword_in_sentence(s, text)
|
| 712 |
-
if not kp:
|
| 713 |
-
continue
|
| 714 |
-
_ = add_item_from_pair(s, kp)
|
| 715 |
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
leftovers = [s for s in sents if s not in used_sentences]
|
| 721 |
-
fill_from_sentences(leftovers)
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
return items[:n]
|
| 726 |
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
t = re2.sub(AR_DIAC, "", t)
|
| 731 |
-
t = re2.sub(r"\s+", " ", t)
|
| 732 |
-
t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
|
| 733 |
-
# قصّ لطول معقول
|
| 734 |
-
t = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", t)
|
| 735 |
-
return t or "…"
|
| 736 |
|
| 737 |
def to_records(items:List[MCQ])->List[dict]:
|
| 738 |
recs=[]
|
| 739 |
for it in items:
|
| 740 |
opts=[]
|
| 741 |
-
used=set()
|
| 742 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 743 |
-
txt=(it.choices[i] if i<len(it.choices) else "
|
| 744 |
-
txt=
|
| 745 |
-
|
| 746 |
-
txt = f"{txt}{i+1}"
|
| 747 |
-
used.add(txt)
|
| 748 |
-
opts.append({"id":lbl,"text":txt,"is_correct":(i==it.answer_index)})
|
| 749 |
recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
|
| 750 |
return recs
|
| 751 |
|
|
@@ -782,32 +358,19 @@ def render_quiz_html(records: List[dict]) -> str:
|
|
| 782 |
""")
|
| 783 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 784 |
|
| 785 |
-
# ------------------
|
| 786 |
-
def build_quiz(text_area, file_path, n, model_id, zoom
|
| 787 |
text_area = (text_area or "").strip()
|
| 788 |
if not text_area and not file_path:
|
| 789 |
return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."
|
| 790 |
-
|
|
|
|
|
|
|
|
|
|
| 791 |
cleaned = postprocess(raw)
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
if mode == "فهم مباشر":
|
| 796 |
-
tok, model, ok = get_mt5()
|
| 797 |
-
if ok:
|
| 798 |
-
items = make_comp_mcqs(cleaned, n=int(n), difficulty=difficulty)
|
| 799 |
-
else:
|
| 800 |
-
items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
|
| 801 |
-
used_mode = "فراغ (fallback)"
|
| 802 |
-
else:
|
| 803 |
-
items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
|
| 804 |
-
except Exception:
|
| 805 |
-
items = make_mcqs(cleaned, n=int(n), difficulty=difficulty)
|
| 806 |
-
used_mode = "فراغ (fallback)"
|
| 807 |
-
|
| 808 |
-
recs = to_records(items)
|
| 809 |
-
warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
|
| 810 |
-
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
|
| 811 |
|
| 812 |
# ------------------ CSS ------------------
|
| 813 |
CSS = """
|
|
@@ -819,12 +382,12 @@ body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif;
|
|
| 819 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 820 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 821 |
|
| 822 |
-
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير
|
| 823 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 824 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 825 |
.small{opacity:.9;color:#d9dee8}
|
| 826 |
|
| 827 |
-
/*
|
| 828 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 829 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 830 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
@@ -841,7 +404,7 @@ textarea{min-height:120px}
|
|
| 841 |
.q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
|
| 842 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 843 |
|
| 844 |
-
.q-text{color:
|
| 845 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 846 |
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
|
| 847 |
.opt input{accent-color:var(--accent2)}
|
|
@@ -852,16 +415,17 @@ textarea{min-height:120px}
|
|
| 852 |
|
| 853 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 854 |
.q-actions .q-submit{
|
| 855 |
-
background:#2dd4bf;border:none;color:#
|
| 856 |
}
|
| 857 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 858 |
.q-note{color:#ffd1d6}
|
| 859 |
.q-note.warn{color:#ffd1d6}
|
| 860 |
"""
|
| 861 |
|
| 862 |
-
# ------------------ JS: ربط Submit
|
| 863 |
ATTACH_LISTENERS_JS = """
|
| 864 |
() => {
|
|
|
|
| 865 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
| 866 |
window.__q_submit_bound_multi2 = true;
|
| 867 |
|
|
@@ -884,30 +448,22 @@ ATTACH_LISTENERS_JS = """
|
|
| 884 |
|
| 885 |
const chosenLabel = chosen.closest('.opt');
|
| 886 |
|
|
|
|
| 887 |
if (chosen.value === correct) {
|
| 888 |
chosenLabel.classList.add('ok');
|
| 889 |
if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
|
|
|
|
| 890 |
card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
|
| 891 |
e.target.disabled = true;
|
| 892 |
if (note) note.textContent = '';
|
| 893 |
-
|
| 894 |
-
const qNode = card.querySelector('.q-text');
|
| 895 |
-
if (qNode){
|
| 896 |
-
const full = qNode.textContent || qNode.innerText || '';
|
| 897 |
-
const correctText = [...card.querySelectorAll('.opt')].find(o =>
|
| 898 |
-
o.querySelector('input').value === correct
|
| 899 |
-
)?.querySelector('.opt-text')?.textContent || '';
|
| 900 |
-
if (full && correctText && full.includes('_____')){
|
| 901 |
-
const highlighted = full.replace('_____', `<mark style="background:#2dd4bf22;border:1px solid #2dd4bf55;border-radius:6px;padding:0 4px">${correctText}</mark>`);
|
| 902 |
-
qNode.innerHTML = highlighted;
|
| 903 |
-
}
|
| 904 |
-
}
|
| 905 |
return;
|
| 906 |
}
|
| 907 |
|
| 908 |
-
|
|
|
|
| 909 |
if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
|
| 910 |
if (note) note.textContent = '';
|
|
|
|
| 911 |
});
|
| 912 |
|
| 913 |
return 'wired-multi2';
|
|
@@ -918,6 +474,7 @@ ATTACH_LISTENERS_JS = """
|
|
| 918 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 919 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 920 |
|
|
|
|
| 921 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 922 |
with page1:
|
| 923 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
@@ -925,10 +482,6 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 925 |
file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
|
| 926 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 927 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 928 |
-
|
| 929 |
-
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 930 |
-
difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
|
| 931 |
-
|
| 932 |
with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
|
| 933 |
trocr_model = gr.Dropdown(
|
| 934 |
choices=[
|
|
@@ -940,18 +493,19 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 940 |
value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
|
| 941 |
)
|
| 942 |
trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
|
| 943 |
-
|
| 944 |
btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
|
| 945 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 946 |
|
|
|
|
| 947 |
page2 = gr.Group(visible=False)
|
| 948 |
with page2:
|
| 949 |
quiz_html = gr.HTML("")
|
| 950 |
-
js_wired = gr.Textbox(visible=False)
|
| 951 |
|
|
|
|
| 952 |
btn_build.click(
|
| 953 |
build_quiz,
|
| 954 |
-
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom
|
| 955 |
outputs=[quiz_html, page1, page2, warn]
|
| 956 |
).then(
|
| 957 |
None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
# صفحتان ثابتتان + Submit لكل سؤال يعمل فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
|
|
|
|
|
|
| 3 |
|
| 4 |
import os, json, uuid, random, unicodedata
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from pathlib import Path
|
| 7 |
+
from typing import List, Tuple
|
| 8 |
|
| 9 |
from PIL import Image
|
| 10 |
from pypdf import PdfReader
|
|
|
|
| 13 |
import yake
|
| 14 |
import gradio as gr
|
| 15 |
|
| 16 |
+
# ------------------ إعدادات عامة ------------------
|
| 17 |
random.seed(42)
|
| 18 |
DEFAULT_NUM_QUESTIONS = 6
|
| 19 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
| 20 |
DEFAULT_TROCR_ZOOM = 2.6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# ------------------ OCR (تحميل كسول) ------------------
|
| 23 |
_OCR = {}
|
| 24 |
def get_ocr(model_id: str):
|
| 25 |
+
from transformers import pipeline
|
| 26 |
+
import torch
|
| 27 |
+
dev = 0 if torch.cuda.is_available() else -1
|
| 28 |
+
if model_id not in _OCR:
|
| 29 |
+
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 30 |
+
return _OCR[model_id]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# ------------------ PDF/TXT → نص ------------------
|
| 33 |
def extract_text_with_pypdf(path: str) -> str:
|
|
|
|
| 105 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 106 |
return norm_ar(t)
|
| 107 |
|
| 108 |
+
# ------------------ توليد أسئلة (تحسينات كبيرة داخليًا فقط) ------------------
|
| 109 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 110 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
| 111 |
|
|
|
|
| 120 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 121 |
return [x for x in s if len(x)>=25]
|
| 122 |
|
| 123 |
+
# ====== (1) عبارات مفتاحية أذكى: n=3 ثم 2 ثم 1، مع فلترة ======
|
| 124 |
+
def yake_keywords(t: str, k: int = 160) -> List[str]:
|
| 125 |
phrases = []
|
| 126 |
seen = set()
|
| 127 |
for n in [3, 2, 1]:
|
|
|
|
| 144 |
def good_kw(kw:str)->bool:
    """Return True when *kw* is usable as a quiz keyword.

    A keyword is rejected when it is empty, shorter than two characters,
    listed in ``AR_STOP``, or made up solely of punctuation, symbols,
    digits and underscores.
    """
    # bool(...) honours the declared return type: the bare ``and`` chain
    # would hand back "" or None unchanged for falsy input.
    return bool(
        kw
        and len(kw) >= 2
        and kw not in AR_STOP
        and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
    )
|
| 146 |
|
| 147 |
+
# ====== (2) جيران دلاليًا + (3) FILL-MASK كبديل ======
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
_EMB = None
|
| 149 |
def get_embedder():
|
| 150 |
global _EMB
|
|
|
|
| 153 |
from sentence_transformers import SentenceTransformer
|
| 154 |
_EMB = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
|
| 155 |
except Exception:
|
| 156 |
+
_EMB = False # تشير لتعطّل التحميل
|
| 157 |
return _EMB
|
| 158 |
|
| 159 |
+
def nearest_terms(target: str, pool: List[str], k: int = 12) -> List[Tuple[str, float]]:
|
| 160 |
emb = get_embedder()
|
| 161 |
if not emb:
|
| 162 |
return []
|
|
|
|
| 166 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 167 |
t, C = vecs[0], vecs[1:]
|
| 168 |
import numpy as np
|
| 169 |
+
sims = (C @ t) # cosine لأن المتجهات مُطبّعة
|
| 170 |
idx = np.argsort(-sims)[:k]
|
| 171 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 172 |
|
|
|
|
| 181 |
_MLM = False
|
| 182 |
return _MLM
|
| 183 |
|
| 184 |
+
def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 8) -> List[str]:
|
| 185 |
masker = get_masker()
|
| 186 |
if not masker:
|
| 187 |
return []
|
| 188 |
masked = sentence_with_blank.replace("_____", masker.tokenizer.mask_token)
|
| 189 |
try:
|
| 190 |
+
outs = masker(masked, top_k=max(15, k+5))
|
| 191 |
cands = []
|
| 192 |
for o in outs:
|
| 193 |
tok = o["token_str"].strip()
|
| 194 |
if tok and tok != correct and len(tok) >= 2 and not re2.match(r"^[\p{P}\p{S}\d_]+$", tok):
|
| 195 |
cands.append(tok)
|
| 196 |
+
seen = set(); uniq = []
|
| 197 |
for w in cands:
|
| 198 |
if w not in seen:
|
| 199 |
uniq.append(w); seen.add(w)
|
|
|
|
| 201 |
except Exception:
|
| 202 |
return []
|
| 203 |
|
| 204 |
+
def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
    """Pick up to *k* distractors from *pool* using a length heuristic only.

    This is the fallback used when no smarter source yields enough options.
    A pool entry qualifies when, once stripped, it is non-empty, differs
    from the answer, is not in ``AR_STOP``, is not purely
    punctuation/symbols/digits, and its length is within 3 characters of
    the answer's length.  Qualifying entries are shuffled with ``random``
    and the result is padded with "—" up to *k* items.

    Args:
        correct: the correct answer text.
        pool: candidate words/phrases.
        k: number of distractors wanted.

    Returns:
        A list of distinct candidates, padded with "—" placeholders.
    """
    # Compare and measure against the same stripped form of the answer.
    target = correct.strip()
    target_len = len(target)

    seen = set()
    cand = []
    for w in pool:
        w = w.strip()
        # Skip blanks, the answer itself, stop words and repeats: a pool
        # with duplicates must not yield the same distractor twice.
        if not w or w == target or w in AR_STOP or w in seen:
            continue
        if re2.match(r"^[\p{P}\p{S}\d_]+$", w):
            continue
        if abs(len(w) - target_len) <= 3:
            seen.add(w)
            cand.append(w)

    random.shuffle(cand)
    out = cand[:k]
    # Pad with placeholders when the pool could not supply enough options.
    out.extend(["—"] * (k - len(out)))
    return out
|
| 216 |
+
|
| 217 |
+
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3) -> List[str]:
    """Build distractors for *correct*, preferring semantically close phrases.

    Sources are tried in order:
      1. nearest neighbours of *correct* within *phrase_pool*
         (``nearest_terms``);
      2. if that gives fewer than *k*, fill-mask suggestions for the
         sentence with the answer blanked out (``mlm_distractors``);
      3. a light filter (stop words, large length mismatch, same normalised
         form as the answer, repeats);
      4. a top-up from ``legacy_distractors``, then "—" padding up to *k*.

    Args:
        correct: the correct answer text.
        phrase_pool: candidate phrases to draw distractors from.
        sentence: the sentence the answer was taken from.
        k: number of distractors wanted.

    Returns:
        A list of distractor strings, padded with "—" when sources run dry.
    """
    # 1) Semantic neighbours.
    neigh = [w for w, _sim in nearest_terms(correct, phrase_pool, k=12) if w != correct][:k+4]

    # 2) Fill-mask on the sentence (fallback).
    if len(neigh) < k:
        # Blank only the first occurrence: several mask tokens make the
        # fill-mask pipeline return one result list per mask, which
        # mlm_distractors cannot read and silently turns into [].
        blanked = sentence.replace(correct, "_____", 1)
        for w in mlm_distractors(blanked, correct, k=10):
            if w not in neigh and w != correct:
                neigh.append(w)
            if len(neigh) >= k+4:
                break

    # 3) Light filtering.
    out = []
    L = len(correct)
    norm_correct = norm_ar(correct)
    for w in neigh:
        if w in AR_STOP or w in out:
            continue
        if abs(len(w) - L) > max(6, L//2):
            continue
        if norm_ar(w) == norm_correct:
            continue
        out.append(w)
        if len(out) >= k:
            break

    # 4) Fall back to the legacy heuristic if still short.
    if len(out) < k:
        # Hide already-chosen words from the fallback so it cannot return
        # an option that is already in the list.
        taken = {w.strip() for w in out}
        remaining = [p for p in phrase_pool if p.strip() not in taken]
        extra = legacy_distractors(correct, remaining, k=k-len(out))
        out.extend(extra)

    while len(out) < k:
        out.append("—")
    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
+
# ====== (4) مُولِّد أسئلة جديد بمحافظته على نفس الواجهة تمامًا ======
|
| 255 |
+
def make_mcqs(text:str, n:int=6)->List[MCQ]:
    """Generate up to *n* fill-in-the-blank multiple-choice questions from *text*.

    Steps:
      1. split the text into sentences (``split_sents``);
      2. extract key phrases (``yake_keywords``) and keep those accepted by
         ``good_kw`` with a length of 2..40 characters;
      3. bind each phrase to the first sentence of at least 60 characters
         that contains it as a whole word;
      4. if nothing matched, retry with the most frequent plain tokens of
         the text;
      5. for each bound phrase, longest first, blank it in its sentence and
         add three distractors; a sentence is used for one question only.

    Args:
        text: cleaned source text.
        n: maximum number of questions.

    Returns:
        A list of ``MCQ`` objects (at least one).

    Raises:
        ValueError: no usable sentence was found in *text*.
        RuntimeError: no phrase could be bound to a sentence, or no
            question could be built.
    """
    sents = split_sents(text)
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    # Key phrases of 1-3 words.
    keyphrases = [kp for kp in yake_keywords(text, k=160) if good_kw(kp) and 2 <= len(kp) <= 40]
    sent_for = _bind_sentences(sents, keyphrases)

    if not sent_for:
        # Fallback: rank the text's own tokens by frequency.  Counter gives
        # each token once (no duplicate entries leaking into the distractor
        # pool) and counts whole tokens in a single pass instead of running
        # a substring count per token.
        from collections import Counter
        counts = Counter(t for t in re2.findall(r"[\p{L}\p{N}_]+", text) if good_kw(t))
        keyphrases = [w for w, _ in counts.most_common(120)]
        sent_for = _bind_sentences(sents, keyphrases, limit=n*2)

    if not sent_for:
        raise RuntimeError("تعذّر توليد أسئلة من هذا النص.")

    # Longer phrases first: they tend to be more informative.
    items = []
    used_sents = set()
    for kp in sorted(sent_for, key=lambda x: (-len(x), x)):
        if len(items) >= n:
            break
        s = sent_for[kp]
        if s in used_sents:
            continue

        # Build the blanked question (first whole-word occurrence only).
        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", s, count=1)

        # Distractors come from the other key phrases.
        pool = [x for x in keyphrases if x != kp]
        ch = smart_distractors(kp, pool, s, k=3) + [kp]
        random.shuffle(ch)
        ans = ch.index(kp)

        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=ch, answer_index=ans))
        used_sents.add(s)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items


def _bind_sentences(sents, keyphrases, limit=None):
    """Map each key phrase to the first sentence (>= 60 chars) containing it as a whole word."""
    sent_for = {}
    for s in sents:
        if len(s) < 60:
            continue
        for kp in keyphrases:
            if kp in sent_for:
                continue
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                sent_for[kp] = s
        # NOTE(review): the cap is checked once per sentence, after all
        # phrases were tried on it - confirm this matches the intended
        # nesting of the original early exit.
        if limit is not None and len(sent_for) >= limit:
            break
    return sent_for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
def to_records(items:List[MCQ])->List[dict]:
    """Turn MCQ objects into plain dict records for rendering.

    Each record has the keys ``id``, ``question`` (stripped) and
    ``options``.  There are always four options labelled A-D; a missing or
    empty choice is shown as "—".  Latin comma, question mark and semicolon
    in option text are swapped for their Arabic counterparts, and the
    option at ``answer_index`` is flagged ``is_correct``.
    """
    labels = ("A", "B", "C", "D")
    arabic_punct = str.maketrans({",": "،", "?": "؟", ";": "؛"})

    def option(mcq, pos, label):
        # Fall back to a dash when the question has fewer than four choices.
        raw = mcq.choices[pos] if pos < len(mcq.choices) else "—"
        shown = raw.strip().translate(arabic_punct)
        return {"id": label, "text": shown or "—", "is_correct": (pos == mcq.answer_index)}

    return [
        {
            "id": mcq.id,
            "question": mcq.question.strip(),
            "options": [option(mcq, pos, label) for pos, label in enumerate(labels)],
        }
        for mcq in items
    ]
|
| 327 |
|
|
|
|
| 358 |
""")
|
| 359 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 360 |
|
| 361 |
+
# ------------------ توليد الامتحان وتبديل الصفحات ------------------
|
| 362 |
+
def build_quiz(text_area, file_path, n, model_id, zoom):
    """Build the quiz and switch from the input page to the quiz page.

    Pasted text takes precedence over an uploaded file; the file is only
    read (``file_to_text``) when the text box is empty.

    Args:
        text_area: text pasted by the user (may be None or empty).
        file_path: path of the uploaded file, if any.
        n: requested number of questions.
        model_id: OCR model id forwarded to ``file_to_text``.
        zoom: OCR zoom factor forwarded to ``file_to_text``.

    Returns:
        A 4-tuple matching the click handler's outputs: quiz HTML, update
        for page 1, update for page 2, and a warning message.  On missing
        input or a generation failure the input page stays visible and the
        message explains why.
    """
    text_area = (text_area or "").strip()
    if not text_area and not file_path:
        return "", gr.update(visible=True), gr.update(visible=False), "🛈 الصق نصًا أو ارفع ملفًا أولًا."

    if text_area:
        raw = text_area
    else:
        raw, _ = file_to_text(file_path, model_id=model_id, zoom=float(zoom))
    cleaned = postprocess(raw)

    try:
        items = make_mcqs(cleaned, n=int(n))
    except (ValueError, RuntimeError) as err:
        # make_mcqs raises these with a user-readable message when the text
        # is too short or no question can be built; surface it in the
        # warning area instead of letting the handler fail.
        return "", gr.update(visible=True), gr.update(visible=False), f"⚠️ {err}"

    recs = to_records(items)
    return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
# ------------------ CSS ------------------
|
| 376 |
CSS = """
|
|
|
|
| 382 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 383 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 384 |
|
| 385 |
+
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير بعد الرفع */
|
| 386 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 387 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 388 |
.small{opacity:.9;color:#d9dee8}
|
| 389 |
|
| 390 |
+
/* منع لوحة المعاينة الخاصة بالملفات التي تغيّر التخطيط */
|
| 391 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 392 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 393 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
|
|
| 404 |
.q-badge.ok{background:#083a2a;color:#b6f4db;border:1px solid #145b44}
|
| 405 |
.q-badge.err{background:#3a0d14;color:#ffd1d6;border:1px solid #6a1e2b}
|
| 406 |
|
| 407 |
+
.q-text{color:var(--text);font-size:1.06rem;line-height:1.8;margin:8px 0 12px}
|
| 408 |
.opts{display:flex;flex-direction:column;gap:8px}
|
| 409 |
.opt{display:flex;gap:10px;align-items:center;background:#14161c;border:1px solid #2a2d3a;border-radius:12px;padding:10px;transition:background .15s,border-color .15s}
|
| 410 |
.opt input{accent-color:var(--accent2)}
|
|
|
|
| 415 |
|
| 416 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 417 |
.q-actions .q-submit{
|
| 418 |
+
background:#2dd4bf;border:none;color:#0b0d10;font-weight:800;border-radius:10px;padding:8px 14px;cursor:pointer;
|
| 419 |
}
|
| 420 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 421 |
.q-note{color:#ffd1d6}
|
| 422 |
.q-note.warn{color:#ffd1d6}
|
| 423 |
"""
|
| 424 |
|
| 425 |
+
# ------------------ JS: ربط Submit بعد الرندر (مع Output مخفي لضمان التنفيذ) ------------------
|
| 426 |
ATTACH_LISTENERS_JS = """
|
| 427 |
() => {
|
| 428 |
+
// اربط مرة واحدة فقط
|
| 429 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
| 430 |
window.__q_submit_bound_multi2 = true;
|
| 431 |
|
|
|
|
| 448 |
|
| 449 |
const chosenLabel = chosen.closest('.opt');
|
| 450 |
|
| 451 |
+
// حالة صحيحة: لوّن أخضر وأقفل السؤال كاملاً
|
| 452 |
if (chosen.value === correct) {
|
| 453 |
chosenLabel.classList.add('ok');
|
| 454 |
if (badge){ badge.hidden=false; badge.className='q-badge ok'; badge.textContent='Correct!'; }
|
| 455 |
+
// أقفل هذا السؤال فقط بعد الصح
|
| 456 |
card.querySelectorAll('input[type="radio"]').forEach(i => i.disabled = true);
|
| 457 |
e.target.disabled = true;
|
| 458 |
if (note) note.textContent = '';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
return;
|
| 460 |
}
|
| 461 |
|
| 462 |
+
// حالة خاطئة: لوّن أحمر فقط، ولا تعطل أي شيء — ليقدر يجرّب خيار آخر
|
| 463 |
+
chosenLabel.classList.add('err'); // اتركه أحمر
|
| 464 |
if (badge){ badge.hidden=false; badge.className='q-badge err'; badge.textContent='Incorrect.'; }
|
| 465 |
if (note) note.textContent = '';
|
| 466 |
+
// مهم: لا تعطّل الراديو ولا الزر
|
| 467 |
});
|
| 468 |
|
| 469 |
return 'wired-multi2';
|
|
|
|
| 474 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 475 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 476 |
|
| 477 |
+
# الصفحة 1: إدخال ثابت لا تتغير أبعاده
|
| 478 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 479 |
with page1:
|
| 480 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
|
|
| 482 |
file_comp = gr.File(label="أو ارفع ملف (PDF / TXT)", file_count="single",
|
| 483 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 484 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
with gr.Accordion("خيارات PDF المصوّر (اختياري)", open=False):
|
| 486 |
trocr_model = gr.Dropdown(
|
| 487 |
choices=[
|
|
|
|
| 493 |
value=DEFAULT_TROCR_MODEL, label="نموذج TrOCR"
|
| 494 |
)
|
| 495 |
trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="Zoom OCR")
|
|
|
|
| 496 |
btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
|
| 497 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 498 |
|
| 499 |
+
# الصفحة 2: الأسئلة
|
| 500 |
page2 = gr.Group(visible=False)
|
| 501 |
with page2:
|
| 502 |
quiz_html = gr.HTML("")
|
| 503 |
+
js_wired = gr.Textbox(visible=False) # Output مخفي لضمان تنفيذ JS
|
| 504 |
|
| 505 |
+
# بناء الامتحان + تبديل الصفحات + ربط الـJS
|
| 506 |
btn_build.click(
|
| 507 |
build_quiz,
|
| 508 |
+
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom],
|
| 509 |
outputs=[quiz_html, page1, page2, warn]
|
| 510 |
).then(
|
| 511 |
None, inputs=None, outputs=[js_wired], js=ATTACH_LISTENERS_JS
|