Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
# +
|
| 5 |
|
| 6 |
import os, json, uuid, random, unicodedata
|
| 7 |
from dataclasses import dataclass
|
|
@@ -15,7 +15,7 @@ import regex as re2
|
|
| 15 |
import yake
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
-
# ------------------ إعدادات عامة ------------------
|
| 19 |
random.seed(42)
|
| 20 |
DEFAULT_NUM_QUESTIONS = 6
|
| 21 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
|
@@ -23,7 +23,7 @@ DEFAULT_TROCR_ZOOM = 2.6
|
|
| 23 |
QUESTION_MODES = ["فراغ", "فهم مباشر"]
|
| 24 |
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
|
| 25 |
|
| 26 |
-
#
|
| 27 |
try:
|
| 28 |
from rank_bm25 import BM25Okapi
|
| 29 |
_HAS_BM25 = True
|
|
@@ -41,7 +41,7 @@ def get_ocr(model_id: str):
|
|
| 41 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 42 |
return _OCR[model_id]
|
| 43 |
except Exception:
|
| 44 |
-
#
|
| 45 |
return lambda im: [{"generated_text": ""}]
|
| 46 |
|
| 47 |
# ------------------ PDF/TXT → نص ------------------
|
|
@@ -120,47 +120,6 @@ def postprocess(raw:str)->str:
|
|
| 120 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 121 |
return norm_ar(t)
|
| 122 |
|
| 123 |
-
def word_tokens(s: str) -> List[str]:
|
| 124 |
-
s = norm_ar(s)
|
| 125 |
-
return re2.findall(r"\p{L}+", s)
|
| 126 |
-
|
| 127 |
-
def token_set(s: str) -> set:
|
| 128 |
-
return set([t for t in word_tokens(s) if t not in AR_STOP])
|
| 129 |
-
|
| 130 |
-
def jaccard(a: str, b: str) -> float:
|
| 131 |
-
A, B = token_set(a), token_set(b)
|
| 132 |
-
if not A or not B: return 0.0
|
| 133 |
-
return len(A & B) / max(1, len(A | B))
|
| 134 |
-
|
| 135 |
-
def is_sub_or_super(a: str, b: str) -> bool:
|
| 136 |
-
# هل a包含 b أو b包含 a (بعد التطبيع)
|
| 137 |
-
A, B = norm_ar(a), norm_ar(b)
|
| 138 |
-
return (A in B) or (B in A)
|
| 139 |
-
|
| 140 |
-
def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
|
| 141 |
-
# اعتبرها قصاصة مزعجة إذا هي مقطع متصل من نفس الجملة بعدد كلمات ≥ 3
|
| 142 |
-
toks = word_tokens(w)
|
| 143 |
-
if len(toks) < 3:
|
| 144 |
-
return False
|
| 145 |
-
return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None
|
| 146 |
-
|
| 147 |
-
def choice_length_ok(w: str) -> bool:
|
| 148 |
-
n = len(word_tokens(w))
|
| 149 |
-
return 1 <= n <= 6
|
| 150 |
-
|
| 151 |
-
def paragraph_index_map(text: str, sentences: List[str]) -> dict:
|
| 152 |
-
# قسّم النص لفقرات، وأعط كل جملة رقم الفقرة التي تحتويها
|
| 153 |
-
paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
|
| 154 |
-
mapping = {}
|
| 155 |
-
for i, s in enumerate(sentences):
|
| 156 |
-
ns = norm_ar(s)
|
| 157 |
-
pid = None
|
| 158 |
-
for j, p in enumerate(paras):
|
| 159 |
-
if ns and ns in p:
|
| 160 |
-
pid = j; break
|
| 161 |
-
mapping[s] = pid if pid is not None else -1
|
| 162 |
-
return mapping
|
| 163 |
-
|
| 164 |
# ------------------ بنية السؤال ------------------
|
| 165 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 166 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
|
@@ -176,8 +135,8 @@ def split_sents(t:str)->List[str]:
|
|
| 176 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 177 |
return [x for x in s if len(x)>=25]
|
| 178 |
|
| 179 |
-
# ====== (1) عبارات مفتاحية
|
| 180 |
-
def yake_keywords(t: str, k: int =
|
| 181 |
phrases = []
|
| 182 |
seen = set()
|
| 183 |
for n in [3, 2, 1]:
|
|
@@ -200,7 +159,7 @@ def yake_keywords(t: str, k: int = 200) -> List[str]:
|
|
| 200 |
def good_kw(kw:str)->bool:
|
| 201 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 202 |
|
| 203 |
-
# ======
|
| 204 |
_HAS_CAMEL = False
|
| 205 |
try:
|
| 206 |
from camel_tools.morphology.analyzer import Analyzer
|
|
@@ -211,11 +170,10 @@ try:
|
|
| 211 |
except Exception:
|
| 212 |
_HAS_CAMEL = False
|
| 213 |
|
| 214 |
-
NER_TAGS = {"PER","LOC","ORG","MISC"}
|
| 215 |
|
| 216 |
def ar_pos(word: str) -> str:
|
| 217 |
if not _HAS_CAMEL:
|
| 218 |
-
# fallback مبسّط
|
| 219 |
if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
|
| 220 |
if re2.match(r"^[\p{N}]+$", word): return "NUM"
|
| 221 |
if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
|
|
@@ -239,7 +197,7 @@ def is_named_entity(token: str) -> bool:
|
|
| 239 |
return False
|
| 240 |
|
| 241 |
def is_clean_sentence(s: str) -> bool:
|
| 242 |
-
if not (
|
| 243 |
if re2.search(r"https?://|www\.", s): return False
|
| 244 |
if re2.search(r"\d{2,}", s): return False
|
| 245 |
return True
|
|
@@ -250,7 +208,7 @@ def safe_keyword(k: str) -> bool:
|
|
| 250 |
if ar_pos(k) in {"PRON","PART"}: return False
|
| 251 |
return True
|
| 252 |
|
| 253 |
-
# ======
|
| 254 |
_EMB = None
|
| 255 |
def get_embedder():
|
| 256 |
global _EMB
|
|
@@ -272,7 +230,7 @@ def nearest_terms(target: str, pool: List[str], k: int = 24) -> List[Tuple[str,
|
|
| 272 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 273 |
t, C = vecs[0], vecs[1:]
|
| 274 |
import numpy as np
|
| 275 |
-
sims = (C @ t)
|
| 276 |
idx = np.argsort(-sims)[:k]
|
| 277 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 278 |
|
|
@@ -307,19 +265,6 @@ def mlm_distractors(sentence_with_blank: str, correct: str, k: int = 18) -> List
|
|
| 307 |
except Exception:
|
| 308 |
return []
|
| 309 |
|
| 310 |
-
def legacy_distractors(correct:str, pool:List[str], k:int=3)->List[str]:
|
| 311 |
-
L=len(correct.strip()); cand=[]
|
| 312 |
-
for w in pool:
|
| 313 |
-
w=w.strip()
|
| 314 |
-
if not w or w==correct or w in AR_STOP: continue
|
| 315 |
-
if re2.match(r"^[\p{P}\p{S}\d_]+$", w): continue
|
| 316 |
-
if abs(len(w)-L)<=3: cand.append(w)
|
| 317 |
-
random.shuffle(cand)
|
| 318 |
-
out=cand[:k]
|
| 319 |
-
while len(out)<k: out.append("…")
|
| 320 |
-
return out
|
| 321 |
-
|
| 322 |
-
# ====== Cross-Encoder اختياري للترتيب ======
|
| 323 |
_CE = None
|
| 324 |
def get_cross_encoder():
|
| 325 |
global _CE
|
|
@@ -331,15 +276,6 @@ def get_cross_encoder():
|
|
| 331 |
_CE = False
|
| 332 |
return _CE
|
| 333 |
|
| 334 |
-
def pos_compatible(a: str, b: str) -> bool:
|
| 335 |
-
pa, pb = ar_pos(a), ar_pos(b)
|
| 336 |
-
if "X" in (pa, pb):
|
| 337 |
-
return True
|
| 338 |
-
return pa == pb
|
| 339 |
-
|
| 340 |
-
def length_close(a: str, b: str) -> bool:
|
| 341 |
-
return abs(len(a) - len(b)) <= max(6, len(b)//2)
|
| 342 |
-
|
| 343 |
def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
| 344 |
ce = get_cross_encoder()
|
| 345 |
if not ce or not candidates:
|
|
@@ -352,7 +288,89 @@ def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
|
| 352 |
except Exception:
|
| 353 |
return candidates
|
| 354 |
|
| 355 |
-
# --------- أدوات
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
def tokenize_ar(s: str) -> List[str]:
|
| 357 |
s = norm_ar(s)
|
| 358 |
toks = re2.findall(r"\p{L}+", s)
|
|
@@ -377,84 +395,50 @@ def bm25_candidates(correct: str, sentences: List[str], bm, corpus_tokens, top:
|
|
| 377 |
pool.add(tok)
|
| 378 |
return list(pool)
|
| 379 |
|
| 380 |
-
def pick_clean_sentences(sents: List[str], difficulty: str) -> List[str]:
|
| 381 |
-
out = []
|
| 382 |
-
for s in sents:
|
| 383 |
-
if not is_clean_sentence(s):
|
| 384 |
-
continue
|
| 385 |
-
L = len(s)
|
| 386 |
-
if difficulty == "سهل" and not (70 <= L <= 180):
|
| 387 |
-
continue
|
| 388 |
-
if difficulty == "متوسط" and not (70 <= L <= 220):
|
| 389 |
-
continue
|
| 390 |
-
if difficulty == "صعب" and not (60 <= L <= 240):
|
| 391 |
-
continue
|
| 392 |
-
out.append(s)
|
| 393 |
-
return out
|
| 394 |
-
|
| 395 |
-
def similarity_caps(difficulty: str):
|
| 396 |
-
if difficulty == "سهل":
|
| 397 |
-
return 0.88
|
| 398 |
-
if difficulty == "صعب":
|
| 399 |
-
return 0.95
|
| 400 |
-
return 0.92
|
| 401 |
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
|
| 402 |
-
"""مشتّتات شكلية: تع
|
| 403 |
a = norm_ar(answer)
|
| 404 |
vars = set()
|
| 405 |
-
# تعريف/تنكير
|
| 406 |
if a.startswith("ال"):
|
| 407 |
vars.add(a[2:])
|
| 408 |
else:
|
| 409 |
vars.add("ال" + a)
|
| 410 |
-
# ي/ى
|
| 411 |
vars.add(a.replace("ي", "ى"))
|
| 412 |
vars.add(a.replace("ى", "ي"))
|
| 413 |
-
# ة/ه
|
| 414 |
vars.add(a.replace("ة", "ه"))
|
| 415 |
vars.add(a.replace("ه", "ة"))
|
| 416 |
-
# حذف حرف وسطي إن الكلمة طويلة
|
| 417 |
if len(a) > 5:
|
| 418 |
mid = len(a)//2
|
| 419 |
vars.add(a[:mid] + a[mid+1:])
|
| 420 |
-
# تنظيف نهائي
|
| 421 |
out = [v for v in vars if v and norm_ar(v) != norm_ar(a)]
|
| 422 |
return out[:k]
|
| 423 |
|
| 424 |
-
# ====== مشتّتات ذكية
|
| 425 |
-
def
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
for v in typo_like_variants(correct, k=4):
|
| 431 |
-
base.append(v)
|
| 432 |
-
|
| 433 |
-
# (أ) جيران دلاليين من العبارات
|
| 434 |
-
base.extend([w for w, _ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
if w not in base:
|
| 439 |
-
base.append(w)
|
| 440 |
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
| 445 |
-
if w not in base:
|
| 446 |
-
base.append(w)
|
| 447 |
|
|
|
|
|
|
|
| 448 |
|
| 449 |
# (أ) جيران دلاليين
|
| 450 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 451 |
|
| 452 |
# (ب) FILL-MASK
|
| 453 |
-
for w in mlm_distractors(sentence.replace(
|
| 454 |
if w not in base:
|
| 455 |
base.append(w)
|
| 456 |
|
| 457 |
-
# (ج) BM25
|
| 458 |
if all_sentences:
|
| 459 |
bm, corp = bm25_build(all_sentences)
|
| 460 |
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
|
@@ -462,9 +446,9 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 462 |
base.append(w)
|
| 463 |
|
| 464 |
# فلترة صارمة
|
| 465 |
-
clean = []
|
| 466 |
for w in base:
|
| 467 |
-
w = w.strip()
|
| 468 |
if not w or w == correct:
|
| 469 |
continue
|
| 470 |
if not choice_length_ok(w):
|
|
@@ -483,7 +467,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 483 |
continue
|
| 484 |
clean.append(w)
|
| 485 |
|
| 486 |
-
# ترتيب (اختياري) + فلتر قرب دلالي
|
| 487 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*4, k)]
|
| 488 |
cap = similarity_caps(difficulty)
|
| 489 |
try:
|
|
@@ -499,7 +483,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 499 |
except Exception:
|
| 500 |
pass
|
| 501 |
|
| 502 |
-
#
|
| 503 |
out, seen = [], set()
|
| 504 |
for w in clean:
|
| 505 |
if w in seen:
|
|
@@ -508,7 +492,7 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 508 |
if len(out) >= k:
|
| 509 |
break
|
| 510 |
|
| 511 |
-
# تعويض ا
|
| 512 |
if len(out) < k:
|
| 513 |
extras = [w for w in phrase_pool
|
| 514 |
if w not in out and w != correct and choice_length_ok(w)
|
|
@@ -517,185 +501,10 @@ def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: in
|
|
| 517 |
and jaccard(w, correct) < 0.5]
|
| 518 |
out.extend(extras[:(k-len(out))])
|
| 519 |
if len(out) < k:
|
| 520 |
-
out.extend(
|
| 521 |
return out[:k]
|
| 522 |
|
| 523 |
-
|
| 524 |
-
return ":" in s and s.index(":") < max(10, len(s)//6)
|
| 525 |
-
|
| 526 |
-
def is_nouny_phrase(w: str) -> bool:
|
| 527 |
-
# heuristics بسيطة: 1–4 كلمات، بدون أفعال ظاهرة، وتنتهي باسم/صفة شائعة
|
| 528 |
-
toks = word_tokens(w)
|
| 529 |
-
if not (1 <= len(toks) <= 4): return False
|
| 530 |
-
if re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w): return False
|
| 531 |
-
return True
|
| 532 |
-
|
| 533 |
-
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
|
| 534 |
-
if looks_like_title_fragment(sentence):
|
| 535 |
-
# لا تفرّغ من شق العنوان قبل النقطتين
|
| 536 |
-
parts = sentence.split(":", 1)
|
| 537 |
-
sentence = parts[1] if len(parts) > 1 else sentence
|
| 538 |
-
try:
|
| 539 |
-
ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
|
| 540 |
-
pairs = ex.extract_keywords(sentence)
|
| 541 |
-
except Exception:
|
| 542 |
-
pairs = []
|
| 543 |
-
cands = []
|
| 544 |
-
for w, _ in pairs:
|
| 545 |
-
w = re2.sub(r"\s+", " ", w.strip())
|
| 546 |
-
if not w or not good_kw(w) or not safe_keyword(w):
|
| 547 |
-
continue
|
| 548 |
-
if not is_nouny_phrase(w):
|
| 549 |
-
continue
|
| 550 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
|
| 551 |
-
continue
|
| 552 |
-
freq_weight = global_text.count(w)
|
| 553 |
-
cands.append((w, len(w) + 0.7*freq_weight))
|
| 554 |
-
if not cands:
|
| 555 |
-
toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
|
| 556 |
-
toks = [t for t in toks if is_nouny_phrase(t)]
|
| 557 |
-
toks.sort(key=len, reverse=True)
|
| 558 |
-
return toks[0] if toks else None
|
| 559 |
-
cands.sort(key=lambda x: -x[1])
|
| 560 |
-
return cands[0][0]
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
# ====== (4-أ) مُولِّد أسئلة "فراغ" ======
|
| 564 |
-
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 565 |
-
"""
|
| 566 |
-
توليد أسئلة فراغ ذكية مع:
|
| 567 |
-
- تنويع على مستوى الفقرات (حد أقصى لكل فقرة)
|
| 568 |
-
- تعبئة العدد المطلوب إلى n
|
| 569 |
-
- فلترة مشتّتات شكلية/قصاصات/تشابه عالي
|
| 570 |
-
المتطلبات الخارجية: paragraph_index_map, best_keyword_in_sentence, smart_distractors,
|
| 571 |
-
choice_length_ok, appears_as_long_fragment_in_sentence, is_sub_or_super, jaccard,
|
| 572 |
-
split_sents, pick_clean_sentences, yake_keywords, safe_keyword
|
| 573 |
-
"""
|
| 574 |
-
# 1) تجهيز الجمل والعبارات
|
| 575 |
-
all_sents = split_sents(text)
|
| 576 |
-
sents = pick_clean_sentences(all_sents, difficulty) or all_sents[:]
|
| 577 |
-
if not sents:
|
| 578 |
-
raise ValueError("النص قصير أو غير صالح.")
|
| 579 |
-
|
| 580 |
-
keyphrases = yake_keywords(text, k=260)
|
| 581 |
-
keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]
|
| 582 |
-
|
| 583 |
-
# 2) ربط مرن (لا يشترط ظهور وحيد)
|
| 584 |
-
sent_for: dict = {}
|
| 585 |
-
for s in sents:
|
| 586 |
-
for kp in keyphrases:
|
| 587 |
-
if kp in sent_for:
|
| 588 |
-
continue
|
| 589 |
-
if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
|
| 590 |
-
sent_for[kp] = s
|
| 591 |
-
if len(sent_for) >= n * 5:
|
| 592 |
-
break
|
| 593 |
-
|
| 594 |
-
# 3) إعداد التنويع حسب الفقرات
|
| 595 |
-
para_map = paragraph_index_map(text, sents)
|
| 596 |
-
used_sentences: set = set()
|
| 597 |
-
items: List[MCQ] = []
|
| 598 |
-
|
| 599 |
-
MAX_PER_PARA = 2 # حد أعلى للأسئلة من نفس الفقرة
|
| 600 |
-
para_count: dict = {} # pid -> count
|
| 601 |
-
|
| 602 |
-
def add_item_from_pair(sentence: str, kp: str) -> bool:
|
| 603 |
-
"""يبني سؤالاً من (جملة، كلمة مفتاحية) ويضيفه إلى items عند النجاح."""
|
| 604 |
-
nonlocal items, used_sentences, para_count
|
| 605 |
-
|
| 606 |
-
# حد الفقرة
|
| 607 |
-
pid = para_map.get(sentence, -1)
|
| 608 |
-
if para_count.get(pid, 0) >= MAX_PER_PARA:
|
| 609 |
-
return False
|
| 610 |
-
|
| 611 |
-
# لازم الكلمة موجودة نصياً بالجملة
|
| 612 |
-
if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
|
| 613 |
-
return False
|
| 614 |
-
|
| 615 |
-
# ابنِ سؤال الفراغ
|
| 616 |
-
q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
|
| 617 |
-
|
| 618 |
-
# حضّر الخيارات
|
| 619 |
-
pool = [x for x in keyphrases if x != kp] or keyphrases[:]
|
| 620 |
-
ch = smart_distractors(
|
| 621 |
-
kp, pool, sentence, k=3, all_sentences=all_sents, difficulty=difficulty
|
| 622 |
-
) + [kp]
|
| 623 |
-
|
| 624 |
-
# تنظيف وتثبيت 4 خيارات حقيقية
|
| 625 |
-
choices, seen = [], set()
|
| 626 |
-
for c in ch:
|
| 627 |
-
c = (c or "").strip()
|
| 628 |
-
if not c or c in seen:
|
| 629 |
-
continue
|
| 630 |
-
if not choice_length_ok(c):
|
| 631 |
-
continue
|
| 632 |
-
if appears_as_long_fragment_in_sentence(c, sentence):
|
| 633 |
-
continue
|
| 634 |
-
if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
|
| 635 |
-
continue
|
| 636 |
-
seen.add(c)
|
| 637 |
-
choices.append(c)
|
| 638 |
-
|
| 639 |
-
# أضف الجواب إن لم يُدرج بعد
|
| 640 |
-
if kp not in choices:
|
| 641 |
-
choices.append(kp)
|
| 642 |
-
seen.add(kp)
|
| 643 |
-
|
| 644 |
-
# نفضّل حذف السؤال على ملء "…"؛ لازم 4 خيارات فعلية
|
| 645 |
-
if len(choices) < 4:
|
| 646 |
-
return False
|
| 647 |
-
|
| 648 |
-
choices = choices[:4]
|
| 649 |
-
random.shuffle(choices)
|
| 650 |
-
ans = choices.index(kp)
|
| 651 |
-
|
| 652 |
-
items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
|
| 653 |
-
used_sentences.add(sentence)
|
| 654 |
-
para_count[pid] = para_count.get(pid, 0) + 1
|
| 655 |
-
return True
|
| 656 |
-
|
| 657 |
-
# 4) تمريرة أولى: أسئلة من فقرات مختلفة أولاً (ثم نسمح حتى حد MAX_PER_PARA)
|
| 658 |
-
for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
|
| 659 |
-
if len(items) >= n:
|
| 660 |
-
break
|
| 661 |
-
s = sent_for[kp]
|
| 662 |
-
if s in used_sentences:
|
| 663 |
-
continue
|
| 664 |
-
_ = add_item_from_pair(s, kp)
|
| 665 |
-
|
| 666 |
-
# 5) تعبئة النقص: من جمل فقرات لم تبلغ حدّها، ثم أي جمل باقية
|
| 667 |
-
def fill_from_sentences(candidates: List[str]):
|
| 668 |
-
for s in candidates:
|
| 669 |
-
if len(items) >= n:
|
| 670 |
-
break
|
| 671 |
-
if s in used_sentences:
|
| 672 |
-
continue
|
| 673 |
-
# جرّب بكلمة رُبطت بهذه الجملة إن وجدت، وإلا اختَر هدفًا من نفس الجملة
|
| 674 |
-
kp = None
|
| 675 |
-
for kpp, ss in sent_for.items():
|
| 676 |
-
if ss == s:
|
| 677 |
-
kp = kpp
|
| 678 |
-
break
|
| 679 |
-
if kp is None:
|
| 680 |
-
kp = best_keyword_in_sentence(s, text)
|
| 681 |
-
if not kp:
|
| 682 |
-
continue
|
| 683 |
-
_ = add_item_from_pair(s, kp)
|
| 684 |
-
|
| 685 |
-
if len(items) < n:
|
| 686 |
-
remaining_new_para = [s for s in sents if para_count.get(para_map.get(s, -1), 0) < MAX_PER_PARA]
|
| 687 |
-
fill_from_sentences(remaining_new_para)
|
| 688 |
-
|
| 689 |
-
if len(items) < n:
|
| 690 |
-
leftovers = [s for s in sents if s not in used_sentences]
|
| 691 |
-
fill_from_sentences(leftovers)
|
| 692 |
-
|
| 693 |
-
if not items:
|
| 694 |
-
raise RuntimeError("تعذّر توليد أسئلة.")
|
| 695 |
-
return items[:n]
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
# ====== (4-ب) مُولِّد أسئلة "فهم مباشر" (mT5) ======
|
| 699 |
_MT5 = {"tok": None, "model": None, "ok": False}
|
| 700 |
def get_mt5():
|
| 701 |
if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
|
|
@@ -727,7 +536,7 @@ def comp_prompt(sentence: str) -> str:
|
|
| 727 |
"أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
|
| 728 |
"من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
|
| 729 |
"أعد فقط JSON بهذا الشكل:\n"
|
| 730 |
-
"{"
|
| 731 |
"\"question\": \"...\",\n"
|
| 732 |
"\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
|
| 733 |
"\"answer_index\": 0\n"
|
|
@@ -766,18 +575,15 @@ def gen_one_comp_q(sentence: str, tok, model, max_new_tokens=128) -> Optional[MC
|
|
| 766 |
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 767 |
tok, model, ok = get_mt5()
|
| 768 |
if not ok:
|
| 769 |
-
# عدم توفر mT5 → ارجعي لأسئلة الفراغ
|
| 770 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 771 |
|
| 772 |
sents_all = split_sents(text)
|
| 773 |
-
sents =
|
| 774 |
-
if not sents:
|
| 775 |
-
sents = sents_all[:]
|
| 776 |
if not sents:
|
| 777 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 778 |
|
| 779 |
-
# ج
|
| 780 |
-
def make_chunks(sents, max_len=
|
| 781 |
chunks = []
|
| 782 |
i = 0
|
| 783 |
while i < len(sents):
|
|
@@ -818,6 +624,105 @@ def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> Lis
|
|
| 818 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 819 |
return items[:n]
|
| 820 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 821 |
|
| 822 |
# ------------------ تحويل إلى سجلات العرض ------------------
|
| 823 |
def clean_option_text(t: str) -> str:
|
|
@@ -825,6 +730,8 @@ def clean_option_text(t: str) -> str:
|
|
| 825 |
t = re2.sub(AR_DIAC, "", t)
|
| 826 |
t = re2.sub(r"\s+", " ", t)
|
| 827 |
t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
|
|
|
|
|
|
|
| 828 |
return t or "…"
|
| 829 |
|
| 830 |
def to_records(items:List[MCQ])->List[dict]:
|
|
@@ -835,9 +742,8 @@ def to_records(items:List[MCQ])->List[dict]:
|
|
| 835 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 836 |
txt=(it.choices[i] if i<len(it.choices) else "…")
|
| 837 |
txt=clean_option_text(txt.replace(",", "،").replace("?", "؟").replace(";", "؛"))
|
| 838 |
-
txt = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", txt) # قصّ الخيار لـ ~60 حرف
|
| 839 |
if txt in used:
|
| 840 |
-
txt = f"
|
| 841 |
used.add(txt)
|
| 842 |
opts.append({"id":lbl,"text":txt,"is_correct":(i==it.answer_index)})
|
| 843 |
recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
|
|
@@ -876,7 +782,7 @@ def render_quiz_html(records: List[dict]) -> str:
|
|
| 876 |
""")
|
| 877 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 878 |
|
| 879 |
-
# ------------------
|
| 880 |
def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
| 881 |
text_area = (text_area or "").strip()
|
| 882 |
if not text_area and not file_path:
|
|
@@ -903,7 +809,6 @@ def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
|
| 903 |
warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
|
| 904 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
|
| 905 |
|
| 906 |
-
|
| 907 |
# ------------------ CSS ------------------
|
| 908 |
CSS = """
|
| 909 |
:root{
|
|
@@ -914,12 +819,12 @@ body{direction:rtl; font-family:system-ui,'Cairo','IBM Plex Arabic',sans-serif;
|
|
| 914 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 915 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 916 |
|
| 917 |
-
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير أبعاده */
|
| 918 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 919 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 920 |
.small{opacity:.9;color:#d9dee8}
|
| 921 |
|
| 922 |
-
/*
|
| 923 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 924 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 925 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
@@ -947,14 +852,14 @@ textarea{min-height:120px}
|
|
| 947 |
|
| 948 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 949 |
.q-actions .q-submit{
|
| 950 |
-
background:#2dd4bf;border:none;color:#
|
| 951 |
}
|
| 952 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 953 |
.q-note{color:#ffd1d6}
|
| 954 |
.q-note.warn{color:#ffd1d6}
|
| 955 |
"""
|
| 956 |
|
| 957 |
-
# ------------------ JS: ربط Submit
|
| 958 |
ATTACH_LISTENERS_JS = """
|
| 959 |
() => {
|
| 960 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
|
@@ -1013,7 +918,6 @@ ATTACH_LISTENERS_JS = """
|
|
| 1013 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 1014 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 1015 |
|
| 1016 |
-
# الصفحة 1: إدخال ثابت لا تتغير أبعاده
|
| 1017 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 1018 |
with page1:
|
| 1019 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
@@ -1022,7 +926,6 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 1022 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 1023 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 1024 |
|
| 1025 |
-
# خيارات إضافية بدون تغيير البنية العامة
|
| 1026 |
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 1027 |
difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
|
| 1028 |
|
|
@@ -1041,13 +944,11 @@ with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
|
| 1041 |
btn_build = gr.Button("generate quistion", elem_classes=["button-primary"])
|
| 1042 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 1043 |
|
| 1044 |
-
# الصفحة 2: الأسئلة
|
| 1045 |
page2 = gr.Group(visible=False)
|
| 1046 |
with page2:
|
| 1047 |
quiz_html = gr.HTML("")
|
| 1048 |
-
js_wired = gr.Textbox(visible=False)
|
| 1049 |
|
| 1050 |
-
# بناء الامتحان + تبديل الصفحات + ربط الـJS
|
| 1051 |
btn_build.click(
|
| 1052 |
build_quiz,
|
| 1053 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
+
# Question Generator — Final Publishable Build (Lite/Full)
|
| 3 |
+
# صفحات ثابتة + Submit لكل سؤال فعليًا + منع تغيّر أبعاد صفحة الإدخال
|
| 4 |
+
# طور "فراغ" + طور "فهم مباشر" (mT5) مع fallbacks، صعوبة، BM25، فلترة قوية للمشتّتات، وتنويع على مستوى الفقرات.
|
| 5 |
|
| 6 |
import os, json, uuid, random, unicodedata
|
| 7 |
from dataclasses import dataclass
|
|
|
|
| 15 |
import yake
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
+
# ------------------ إعدادات عامّة ------------------
|
| 19 |
random.seed(42)
|
| 20 |
DEFAULT_NUM_QUESTIONS = 6
|
| 21 |
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"
|
|
|
|
| 23 |
QUESTION_MODES = ["فراغ", "فهم مباشر"]
|
| 24 |
DIFFICULTY_MODES = ["سهل", "متوسط", "صعب"]
|
| 25 |
|
| 26 |
+
# BM25 (اختياري)
|
| 27 |
try:
|
| 28 |
from rank_bm25 import BM25Okapi
|
| 29 |
_HAS_BM25 = True
|
|
|
|
| 41 |
_OCR[model_id] = pipeline("image-to-text", model=model_id, device=dev)
|
| 42 |
return _OCR[model_id]
|
| 43 |
except Exception:
|
| 44 |
+
# بديل آمن: دالة تُعيد نصًا فارغًا
|
| 45 |
return lambda im: [{"generated_text": ""}]
|
| 46 |
|
| 47 |
# ------------------ PDF/TXT → نص ------------------
|
|
|
|
| 120 |
t = re2.sub(r"\[\d+\]", " ", t)
|
| 121 |
return norm_ar(t)
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
# ------------------ بنية السؤال ------------------
|
| 124 |
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 125 |
AR_STOP = set("""في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي أو أم إن أن كان تكون كانوا كانت كنت ثم قد لقد ربما بل لكن إلا سوى حتى حيث كما لما ما لماذا متى أين كيف أي هناك هنا هؤلاء أولئك نحن هو هي هم هن أنت أنتم أنتن""".split())
|
|
|
|
| 135 |
s=[x.strip() for x in SENT_SPLIT.split(t) if x.strip()]
|
| 136 |
return [x for x in s if len(x)>=25]
|
| 137 |
|
| 138 |
+
# ====== (1) عبارات مفتاحية (YAKE) ======
|
| 139 |
+
def yake_keywords(t: str, k: int = 260) -> List[str]:
|
| 140 |
phrases = []
|
| 141 |
seen = set()
|
| 142 |
for n in [3, 2, 1]:
|
|
|
|
| 159 |
def good_kw(kw:str)->bool:
|
| 160 |
return kw and len(kw)>=2 and kw not in AR_STOP and not re2.match(r"^[\p{P}\p{S}\d_]+$", kw)
|
| 161 |
|
| 162 |
+
# ====== POS/NER اختياري ======
|
| 163 |
_HAS_CAMEL = False
|
| 164 |
try:
|
| 165 |
from camel_tools.morphology.analyzer import Analyzer
|
|
|
|
| 170 |
except Exception:
|
| 171 |
_HAS_CAMEL = False
|
| 172 |
|
| 173 |
+
NER_TAGS = {"PER","LOC","ORG","MISC"}
|
| 174 |
|
| 175 |
def ar_pos(word: str) -> str:
|
| 176 |
if not _HAS_CAMEL:
|
|
|
|
| 177 |
if re2.match(r"^(في|على|الى|إلى|من|عن|حتى|ثم|بل|لكن|أو|و)$", word): return "PART"
|
| 178 |
if re2.match(r"^[\p{N}]+$", word): return "NUM"
|
| 179 |
if re2.search(r"(ة|ات|ون|ين|ان)$", word): return "NOUN"
|
|
|
|
| 197 |
return False
|
| 198 |
|
| 199 |
def is_clean_sentence(s: str) -> bool:
|
| 200 |
+
if not (60 <= len(s) <= 240): return False
|
| 201 |
if re2.search(r"https?://|www\.", s): return False
|
| 202 |
if re2.search(r"\d{2,}", s): return False
|
| 203 |
return True
|
|
|
|
| 208 |
if ar_pos(k) in {"PRON","PART"}: return False
|
| 209 |
return True
|
| 210 |
|
| 211 |
+
# ====== Embeddings/Masking/Cross-Encoder (اختياري) ======
|
| 212 |
_EMB = None
|
| 213 |
def get_embedder():
|
| 214 |
global _EMB
|
|
|
|
| 230 |
vecs = emb.encode([target] + cand, normalize_embeddings=True)
|
| 231 |
t, C = vecs[0], vecs[1:]
|
| 232 |
import numpy as np
|
| 233 |
+
sims = (C @ t)
|
| 234 |
idx = np.argsort(-sims)[:k]
|
| 235 |
return [(cand[i], float(sims[i])) for i in idx]
|
| 236 |
|
|
|
|
| 265 |
except Exception:
|
| 266 |
return []
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
_CE = None
|
| 269 |
def get_cross_encoder():
|
| 270 |
global _CE
|
|
|
|
| 276 |
_CE = False
|
| 277 |
return _CE
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
def rank_by_ce(sentence_with_blank: str, candidates: List[str]) -> List[str]:
|
| 280 |
ce = get_cross_encoder()
|
| 281 |
if not ce or not candidates:
|
|
|
|
| 288 |
except Exception:
|
| 289 |
return candidates
|
| 290 |
|
| 291 |
+
# --------- أدوات مساعدة للمشتّتات ---------
|
| 292 |
+
def word_tokens(s: str) -> List[str]:
|
| 293 |
+
s = norm_ar(s)
|
| 294 |
+
return re2.findall(r"\p{L}+", s)
|
| 295 |
+
|
| 296 |
+
def token_set(s: str) -> set:
|
| 297 |
+
return set([t for t in word_tokens(s) if t not in AR_STOP])
|
| 298 |
+
|
| 299 |
+
def jaccard(a: str, b: str) -> float:
|
| 300 |
+
A, B = token_set(a), token_set(b)
|
| 301 |
+
if not A or not B: return 0.0
|
| 302 |
+
return len(A & B) / max(1, len(A | B))
|
| 303 |
+
|
| 304 |
+
def is_sub_or_super(a: str, b: str) -> bool:
|
| 305 |
+
A, B = norm_ar(a), norm_ar(b)
|
| 306 |
+
return (A in B) or (B in A)
|
| 307 |
+
|
| 308 |
+
def appears_as_long_fragment_in_sentence(w: str, sentence: str) -> bool:
|
| 309 |
+
toks = word_tokens(w)
|
| 310 |
+
if len(toks) < 3:
|
| 311 |
+
return False
|
| 312 |
+
return re2.search(rf"(?<!\p{{L}}){re2.escape(norm_ar(w))}(?!\p{{L}})", norm_ar(sentence)) is not None
|
| 313 |
+
|
| 314 |
+
def choice_length_ok(w: str) -> bool:
|
| 315 |
+
n = len(word_tokens(w))
|
| 316 |
+
return 1 <= n <= 6
|
| 317 |
+
|
| 318 |
+
def paragraph_index_map(text: str, sentences: List[str]) -> dict:
|
| 319 |
+
paras = [norm_ar(p) for p in re2.split(r"\n{2,}", text) if p.strip()]
|
| 320 |
+
mapping = {}
|
| 321 |
+
for i, s in enumerate(sentences):
|
| 322 |
+
ns = norm_ar(s)
|
| 323 |
+
pid = None
|
| 324 |
+
for j, p in enumerate(paras):
|
| 325 |
+
if ns and ns in p:
|
| 326 |
+
pid = j; break
|
| 327 |
+
mapping[s] = pid if pid is not None else -1
|
| 328 |
+
return mapping
|
| 329 |
+
|
| 330 |
+
def looks_like_title_fragment(s: str) -> bool:
    """Heuristic: a colon appearing very early in *s* suggests a "Title: body" fragment.

    "Early" means before character max(10, len(s)//6).
    """
    pos = s.find(":")
    if pos == -1:
        return False
    return pos < max(10, len(s) // 6)
|
| 332 |
+
|
| 333 |
+
def is_nouny_phrase(w: str) -> bool:
    """Cheap noun-phrase filter: 1-4 tokens and no verb-like/modal ending."""
    n_tokens = len(word_tokens(w))
    if n_tokens < 1 or n_tokens > 4:
        return False
    # Reject phrases ending in common verbal / modal forms.
    return re2.search(r"(يفعل|تفعل|يشهد|تقوم|يمكن|قد|سوف)$", w) is None
|
| 338 |
+
|
| 339 |
+
def best_keyword_in_sentence(sentence: str, global_text: str) -> Optional[str]:
    """Pick the best candidate keyword to blank out of *sentence*.

    Strategy: strip a leading "Title:" prefix if present, run YAKE over the
    sentence, keep nouny/safe candidates that occur whole-word in the
    sentence, and score them by length plus 0.7x their frequency in
    *global_text*. Falls back to the longest single safe token when YAKE
    yields nothing usable. Returns None when no candidate survives.
    """
    # Drop a short title prefix so the keyword comes from the body.
    if looks_like_title_fragment(sentence):
        parts = sentence.split(":", 1)
        sentence = parts[1] if len(parts) > 1 else sentence
    # YAKE may raise on degenerate input; treat that as "no candidates".
    try:
        ex = yake.KeywordExtractor(lan='ar', n=3, top=24)
        pairs = ex.extract_keywords(sentence)
    except Exception:
        pairs = []
    cands = []
    for w, _ in pairs:
        w = re2.sub(r"\s+", " ", w.strip())
        if not w or not good_kw(w) or not safe_keyword(w):
            continue
        if not is_nouny_phrase(w):
            continue
        # Must occur as a whole word in the (possibly trimmed) sentence.
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(w)}(?!\p{{L}})", sentence):
            continue
        # Longer and globally more frequent phrases score higher.
        freq_weight = global_text.count(w)
        cands.append((w, len(w) + 0.7*freq_weight))
    if not cands:
        # Fallback: longest single safe, nouny token from the sentence.
        toks = [t for t in re2.findall(r"\p{L}+", sentence) if good_kw(t) and safe_keyword(t)]
        toks = [t for t in toks if is_nouny_phrase(t)]
        toks.sort(key=len, reverse=True)
        return toks[0] if toks else None
    cands.sort(key=lambda x: -x[1])
    return cands[0][0]
|
| 366 |
+
|
| 367 |
+
def similarity_caps(difficulty: str):
    """Maximum allowed semantic-similarity cap for distractors per difficulty.

    Easy questions cap lower (distractors must be clearly different);
    hard ones allow closer distractors. Unknown levels get the medium cap.
    """
    caps = {"سهل": 0.88, "صعب": 0.95}
    return caps.get(difficulty, 0.92)
|
| 373 |
+
|
| 374 |
def tokenize_ar(s: str) -> List[str]:
|
| 375 |
s = norm_ar(s)
|
| 376 |
toks = re2.findall(r"\p{L}+", s)
|
|
|
|
| 395 |
pool.add(tok)
|
| 396 |
return list(pool)
|
| 397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
def typo_like_variants(answer: str, k: int = 4) -> List[str]:
    """Shape-based distractors for *answer*: toggle the definite article,
    swap ي/ى and ة/ه, and delete one middle character.

    Variants that normalize back to the answer itself are discarded.
    Returns at most *k* variants, in deterministic (sorted) order.
    """
    a = norm_ar(answer)
    variants = set()  # renamed from `vars`, which shadowed the builtin
    # Toggle the definite article "ال".
    if a.startswith("ال"):
        variants.add(a[2:])
    else:
        variants.add("ال" + a)
    # Common orthographic confusions.
    variants.add(a.replace("ي", "ى"))
    variants.add(a.replace("ى", "ي"))
    variants.add(a.replace("ة", "ه"))
    variants.add(a.replace("ه", "ة"))
    # One-character deletion in the middle for longer words.
    if len(a) > 5:
        mid = len(a)//2
        variants.add(a[:mid] + a[mid+1:])
    # Sort for a deterministic result: set iteration order varies across
    # runs, which defeated the file's random.seed(42) reproducibility.
    out = sorted(v for v in variants if v and norm_ar(v) != norm_ar(a))
    return out[:k]
|
| 415 |
|
| 416 |
+
# ====== مشتّتات ذكية ======
|
| 417 |
+
def pos_compatible(a: str, b: str) -> bool:
    """Two terms are POS-compatible when their tags match or either is unknown ("X")."""
    pa, pb = ar_pos(a), ar_pos(b)
    return "X" in (pa, pb) or pa == pb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
+
def length_close(a: str, b: str) -> bool:
    """True when the character-length gap between *a* and *b* is small.

    The tolerance is the larger of 6 characters and half of len(b).
    """
    tolerance = max(6, len(b) // 2)
    return abs(len(a) - len(b)) <= tolerance
|
|
|
|
|
|
|
| 425 |
|
| 426 |
+
def smart_distractors(correct: str, phrase_pool: List[str], sentence: str, k: int = 3,
|
| 427 |
+
all_sentences: Optional[List[str]] = None, difficulty: str = "متوسط") -> List[str]:
|
| 428 |
+
base: List[str] = []
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
+
# (0) مشتّتات شكلية أولاً
|
| 431 |
+
base.extend(typo_like_variants(correct, k=4))
|
| 432 |
|
| 433 |
# (أ) جيران دلاليين
|
| 434 |
base.extend([w for w,_ in nearest_terms(correct, phrase_pool, k=24)])
|
| 435 |
|
| 436 |
# (ب) FILL-MASK
|
| 437 |
+
for w in mlm_distractors(sentence.replace(correct, "_____"), correct, k=18):
|
| 438 |
if w not in base:
|
| 439 |
base.append(w)
|
| 440 |
|
| 441 |
+
# (ج) BM25
|
| 442 |
if all_sentences:
|
| 443 |
bm, corp = bm25_build(all_sentences)
|
| 444 |
for w in bm25_candidates(correct, all_sentences, bm, corp, top=18):
|
|
|
|
| 446 |
base.append(w)
|
| 447 |
|
| 448 |
# فلترة صارمة
|
| 449 |
+
clean: List[str] = []
|
| 450 |
for w in base:
|
| 451 |
+
w = (w or "").strip()
|
| 452 |
if not w or w == correct:
|
| 453 |
continue
|
| 454 |
if not choice_length_ok(w):
|
|
|
|
| 467 |
continue
|
| 468 |
clean.append(w)
|
| 469 |
|
| 470 |
+
# ترتيب (اختياري) + فلتر قرب دلالي
|
| 471 |
clean = rank_by_ce(sentence.replace(correct, "_____"), clean)[:max(k*4, k)]
|
| 472 |
cap = similarity_caps(difficulty)
|
| 473 |
try:
|
|
|
|
| 483 |
except Exception:
|
| 484 |
pass
|
| 485 |
|
| 486 |
+
# تجميع أخير
|
| 487 |
out, seen = [], set()
|
| 488 |
for w in clean:
|
| 489 |
if w in seen:
|
|
|
|
| 492 |
if len(out) >= k:
|
| 493 |
break
|
| 494 |
|
| 495 |
+
# تعويض إذا لزم
|
| 496 |
if len(out) < k:
|
| 497 |
extras = [w for w in phrase_pool
|
| 498 |
if w not in out and w != correct and choice_length_ok(w)
|
|
|
|
| 501 |
and jaccard(w, correct) < 0.5]
|
| 502 |
out.extend(extras[:(k-len(out))])
|
| 503 |
if len(out) < k:
|
| 504 |
+
out.extend([w for w in ["…"]*(k-len(out))]) # لن تُقبل لاحقًا إن لم نكمل 4 خيارات
|
| 505 |
return out[:k]
|
| 506 |
|
| 507 |
+
# ====== mT5 (اختياري) ======
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
_MT5 = {"tok": None, "model": None, "ok": False}
|
| 509 |
def get_mt5():
|
| 510 |
if _MT5["tok"] is not None or _MT5["model"] is not None or _MT5["ok"]:
|
|
|
|
| 536 |
"أنت منشئ أسئلة متعددة الخيارات باللغة العربية.\n"
|
| 537 |
"من الجملة التالية، أنشئ سؤال فهم مباشر واحدًا مع أربع خيارات وإشارة للجواب الصحيح.\n"
|
| 538 |
"أعد فقط JSON بهذا الشكل:\n"
|
| 539 |
+
"{\n"
|
| 540 |
"\"question\": \"...\",\n"
|
| 541 |
"\"choices\": [\"...\",\"...\",\"...\",\"...\"],\n"
|
| 542 |
"\"answer_index\": 0\n"
|
|
|
|
| 575 |
def make_comp_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
|
| 576 |
tok, model, ok = get_mt5()
|
| 577 |
if not ok:
|
|
|
|
| 578 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 579 |
|
| 580 |
sents_all = split_sents(text)
|
| 581 |
+
sents = [s for s in sents_all if is_clean_sentence(s)] or sents_all[:]
|
|
|
|
|
|
|
| 582 |
if not sents:
|
| 583 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 584 |
|
| 585 |
+
# دمج جمل قصيرة لمقاطع مفيدة
|
| 586 |
+
def make_chunks(sents, max_len=220):
|
| 587 |
chunks = []
|
| 588 |
i = 0
|
| 589 |
while i < len(sents):
|
|
|
|
| 624 |
return make_mcqs(text, n, difficulty=difficulty)
|
| 625 |
return items[:n]
|
| 626 |
|
| 627 |
+
# ------------------ مُولّد أسئلة "فراغ" (نهائي) ------------------
|
| 628 |
+
def make_mcqs(text: str, n: int = 6, difficulty: str = "متوسط") -> List[MCQ]:
    """Generate up to *n* fill-in-the-blank MCQs from Arabic *text*.

    Pipeline: split into sentences, extract YAKE keyphrases, pair each
    keyphrase with the first sentence containing it as a whole word, blank
    the keyphrase out, and build distractors via smart_distractors().
    At most MAX_PER_PARA questions are drawn from any single paragraph,
    and each sentence is used at most once.

    Raises:
        ValueError: when no usable sentences were found.
        RuntimeError: when no question could be generated at all.
    """
    all_sents = split_sents(text)
    # Prefer clean sentences; fall back to all of them if the filter rejects everything.
    sents = [s for s in all_sents if is_clean_sentence(s)] or all_sents[:]
    if not sents:
        raise ValueError("النص قصير أو غير صالح.")

    keyphrases = yake_keywords(text, k=260)
    keyphrases = [kp for kp in keyphrases if safe_keyword(kp) and 2 <= len(kp) <= 40]

    # First sentence (in order) containing each keyphrase as a whole word.
    sent_for: dict = {}
    for s in sents:
        for kp in keyphrases:
            if kp in sent_for:
                continue
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", s):
                sent_for[kp] = s
            # Stop collecting once we have ~5x more pairs than needed.
            if len(sent_for) >= n * 5:
                break

    para_map = paragraph_index_map(text, sents)
    used_sentences: set = set()
    items: List[MCQ] = []

    # Quota: at most this many questions from any one paragraph.
    MAX_PER_PARA = 2
    para_count: dict = {}

    def add_item_from_pair(sentence: str, kp: str) -> bool:
        """Try to turn (sentence, keyphrase) into one MCQ; return True on success."""
        nonlocal items, used_sentences, para_count
        pid = para_map.get(sentence, -1)
        if para_count.get(pid, 0) >= MAX_PER_PARA:
            return False
        if not re2.search(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", sentence):
            return False
        # Blank only the first whole-word occurrence.
        q = re2.sub(rf"(?<!\p{{L}}){re2.escape(kp)}(?!\p{{L}})", "_____", sentence, count=1)
        pool = [x for x in keyphrases if x != kp] or keyphrases[:]
        ch = smart_distractors(kp, pool, sentence, k=3,
                              all_sentences=all_sents, difficulty=difficulty) + [kp]

        # Filter distractors: dedupe, length bounds, not a long fragment of the
        # sentence itself, and not too similar to the correct answer.
        choices, seen = [], set()
        for c in ch:
            c = (c or "").strip()
            if not c or c in seen:
                continue
            if not choice_length_ok(c):
                continue
            if appears_as_long_fragment_in_sentence(c, sentence):
                continue
            if is_sub_or_super(c, kp) or jaccard(c, kp) >= 0.5:
                continue
            seen.add(c); choices.append(c)

        # The correct answer always ends up in the choice list.
        if kp not in choices:
            choices.append(kp); seen.add(kp)
        if len(choices) < 4:
            return False

        choices = choices[:4]
        random.shuffle(choices)
        ans = choices.index(kp)

        items.append(MCQ(id=str(uuid.uuid4())[:8], question=q, choices=choices, answer_index=ans))
        used_sentences.add(sentence)
        para_count[pid] = para_count.get(pid, 0) + 1
        return True

    # First pass: longest keyphrases first, diversified across paragraphs.
    for kp in sorted(sent_for.keys(), key=lambda x: (-len(x), x)):
        if len(items) >= n: break
        s = sent_for[kp]
        if s in used_sentences:
            continue
        _ = add_item_from_pair(s, kp)

    def fill_from_sentences(candidates: List[str]):
        """Second-chance pass: derive a keyword per sentence and try to add an item."""
        for s in candidates:
            if len(items) >= n: break
            if s in used_sentences:
                continue
            # Reuse a keyphrase already paired with this sentence if one exists.
            kp = None
            for kpp, ss in sent_for.items():
                if ss == s:
                    kp = kpp; break
            if kp is None:
                kp = best_keyword_in_sentence(s, text)
            if not kp:
                continue
            _ = add_item_from_pair(s, kp)

    # Top up from paragraphs still under quota, then from any unused sentence.
    if len(items) < n:
        remaining_new_para = [s for s in sents if para_count.get(para_map.get(s, -1), 0) < MAX_PER_PARA]
        fill_from_sentences(remaining_new_para)
    if len(items) < n:
        leftovers = [s for s in sents if s not in used_sentences]
        fill_from_sentences(leftovers)

    if not items:
        raise RuntimeError("تعذّر توليد أسئلة.")
    return items[:n]
|
| 726 |
|
| 727 |
# ------------------ تحويل إلى سجلات العرض ------------------
|
| 728 |
def clean_option_text(t: str) -> str:
|
|
|
|
| 730 |
t = re2.sub(AR_DIAC, "", t)
|
| 731 |
t = re2.sub(r"\s+", " ", t)
|
| 732 |
t = re2.sub(r"^[\p{P}\p{S}_-]+|[\p{P}\p{S}_-]+$", "", t)
|
| 733 |
+
# قصّ لطول معقول
|
| 734 |
+
t = re2.sub(r"^(.{,60})(?:\s.*)?$", r"\1", t)
|
| 735 |
return t or "…"
|
| 736 |
|
| 737 |
def to_records(items:List[MCQ])->List[dict]:
|
|
|
|
| 742 |
for i,lbl in enumerate(["A","B","C","D"]):
|
| 743 |
txt=(it.choices[i] if i<len(it.choices) else "…")
|
| 744 |
txt=clean_option_text(txt.replace(",", "،").replace("?", "؟").replace(";", "؛"))
|
|
|
|
| 745 |
if txt in used:
|
| 746 |
+
txt = f"{txt}{i+1}"
|
| 747 |
used.add(txt)
|
| 748 |
opts.append({"id":lbl,"text":txt,"is_correct":(i==it.answer_index)})
|
| 749 |
recs.append({"id":it.id,"question":it.question.strip(),"options":opts})
|
|
|
|
| 782 |
""")
|
| 783 |
return f"""<div id="quiz" class="quiz-wrap">{''.join(parts)}</div>"""
|
| 784 |
|
| 785 |
+
# ------------------ بناء الامتحان وتبديل الصفحات ------------------
|
| 786 |
def build_quiz(text_area, file_path, n, model_id, zoom, mode, difficulty):
|
| 787 |
text_area = (text_area or "").strip()
|
| 788 |
if not text_area and not file_path:
|
|
|
|
| 809 |
warn = f"نمط مُستخدَم: **{used_mode}** — عدد الأسئلة: {len(items)}"
|
| 810 |
return render_quiz_html(recs), gr.update(visible=False), gr.update(visible=True), warn
|
| 811 |
|
|
|
|
| 812 |
# ------------------ CSS ------------------
|
| 813 |
CSS = """
|
| 814 |
:root{
|
|
|
|
| 819 |
.gradio-container{max-width:980px;margin:0 auto;padding:12px 12px 40px;}
|
| 820 |
h2.top{color:#eaeaf2;margin:6px 0 16px}
|
| 821 |
|
| 822 |
+
/* صفحة الإدخال ثابتة الارتفاع ولا تتغير أبعادها */
|
| 823 |
.input-panel{background:var(--panel);border:1px solid var(--border);border-radius:14px;padding:16px;
|
| 824 |
box-shadow:0 16px 38px rgba(0,0,0,.35); min-height:360px; display:flex; flex-direction:column; gap:12px;}
|
| 825 |
.small{opacity:.9;color:#d9dee8}
|
| 826 |
|
| 827 |
+
/* إخفاء معاينة الملف */
|
| 828 |
[data-testid="file"] .file-preview, [data-testid="file"] .file-preview * { display:none !important; }
|
| 829 |
[data-testid="file"] .grid-wrap { display:block !important; }
|
| 830 |
.upload-like{border:2px dashed #3b3f52;background:#121318;border-radius:12px;padding:12px;color:#cfd5e3;min-height:90px}
|
|
|
|
| 852 |
|
| 853 |
.q-actions{display:flex;gap:10px;align-items:center;margin-top:10px}
|
| 854 |
.q-actions .q-submit{
|
| 855 |
+
background:#2dd4bf;border:none;color:#0b0d10;font-weight:800;border-radius:10px;padding:8px 14px;cursor:pointer;
|
| 856 |
}
|
| 857 |
.q-actions .q-submit:disabled{opacity:.5;cursor:not-allowed}
|
| 858 |
.q-note{color:#ffd1d6}
|
| 859 |
.q-note.warn{color:#ffd1d6}
|
| 860 |
"""
|
| 861 |
|
| 862 |
+
# ------------------ JS: ربط Submit + إبراز الصح ------------------
|
| 863 |
ATTACH_LISTENERS_JS = """
|
| 864 |
() => {
|
| 865 |
if (window.__q_submit_bound_multi2) { return 'already'; }
|
|
|
|
| 918 |
with gr.Blocks(title="Question Generator", css=CSS) as demo:
|
| 919 |
gr.Markdown("<h2 class='top'>Question Generator</h2>")
|
| 920 |
|
|
|
|
| 921 |
page1 = gr.Group(visible=True, elem_classes=["input-panel"])
|
| 922 |
with page1:
|
| 923 |
gr.Markdown("اختر **أحد** الخيارين ثم اضغط الزر.", elem_classes=["small"])
|
|
|
|
| 926 |
file_types=[".pdf",".txt"], type="filepath", elem_classes=["upload-like"])
|
| 927 |
num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
|
| 928 |
|
|
|
|
| 929 |
mode_radio = gr.Radio(choices=QUESTION_MODES, value="فراغ", label="نوع السؤال")
|
| 930 |
difficulty_radio = gr.Radio(choices=DIFFICULTY_MODES, value="متوسط", label="درجة الصعوبة")
|
| 931 |
|
|
|
|
| 944 |
btn_build = gr.Button("generate question", elem_classes=["button-primary"])
|
| 945 |
warn = gr.Markdown("", elem_classes=["small"])
|
| 946 |
|
|
|
|
| 947 |
page2 = gr.Group(visible=False)
|
| 948 |
with page2:
|
| 949 |
quiz_html = gr.HTML("")
|
| 950 |
+
js_wired = gr.Textbox(visible=False)
|
| 951 |
|
|
|
|
| 952 |
btn_build.click(
|
| 953 |
build_quiz,
|
| 954 |
inputs=[text_area, file_comp, num_q, trocr_model, trocr_zoom, mode_radio, difficulty_radio],
|