# NOTE(review): the three lines below were scrape residue from the Hugging Face
# Spaces page header ("Spaces: Sleeping"); commented out so the file parses as Python.
# Spaces: Sleeping
import gradio as gr
import io, json, csv, uuid, random, re
import regex as re2, yake  # `regex` (re2) supports \p{...} Unicode classes; yake extracts keywords
random.seed(42)  # deterministic shuffles so generated quizzes are reproducible across runs
# ====== Core text processing ======
# Arabic stopword list used to filter keyword candidates (space-separated literal).
# NOTE(review): the literal appears mojibake-damaged — verify the encoding against the original source.
AR_STOP = set("ูู ุนูู ู ู ุฅูู ุนู ู ุน ูุฏู ุฐูู ูุฐู ูุฐุง ุงูุฐูู ุงูุชู ุงูุฐู ุงูููุงุชู ุงูููุงุชูุง ุฃู ุฃู ุฅู ุฃู ูุงู ุชููู ูุงููุง ูุงูุช ููุช ููุง ูุงูุง ูุงูุชู ุซู ูุฏ ููุฏ ุฑุจู ุง ุจู ููู ููููู ุฅูุง ุณูู ุญุชู ุญูุซ ูู ุง ูู ุง ูู ุงู ูู ุงูู ูู ุงู ู ุง ู ุงุฐุง ูู ุงุฐุง ู ุชู ุฃูู ููู ุฃู ุฃูู ุฃููู ููุงู ููุง ููุงูู ุชูู ุฐููู ุฐููู ุฃููุฆู ูุคูุงุก ูู ุง ูู ูู ุฃูุชู ุฃูุชู ุฃูุชู ุง ุฃูุชู ุฃูุชู ุฃูุง ูุญู ูู ูู ูููู ูู ูู".split())
# Sentence splitter: break after ., ! or ? (the extra character is presumably the
# Arabic question/stop mark, mojibake-damaged — confirm against the original file).
SENT_SPLIT = re2.compile(r"(?<=[\.!ุ\?])\s+")
def clean_text_basic(txt: str) -> str:
    """Normalize whitespace and punctuation spacing in raw extracted text."""
    # Carriage returns and tabs become plain spaces.
    normalized = txt.replace('\r', ' ').replace('\t', ' ')
    # Strip invisible direction marks / BOM left over from PDF extraction.
    normalized = re.sub(r"\u200f|\u200e|\ufeff", " ", normalized)
    # Collapse any whitespace run to a single space.
    normalized = re.sub(r"\s+", " ", normalized)
    # Exactly one space after each punctuation mark, none before it.
    normalized = re.sub(r"\s*([\.\!\?ุุ,:;ุ])\s*", r"\1 ", normalized)
    return normalized.strip()
def extract_text_pdfminer(data: bytes) -> str:
    """Best-effort PDF text extraction via pdfminer; return "" on any failure."""
    try:
        import pdfminer.high_level
        text = pdfminer.high_level.extract_text(io.BytesIO(data))
        return text if text else ""
    except Exception:
        # pdfminer missing or the PDF is unparseable — caller falls back to pypdf.
        return ""
def extract_text_pypdf(data: bytes) -> str:
    """Best-effort PDF text extraction via pypdf; return "" on any failure."""
    try:
        from pypdf import PdfReader
        pages = PdfReader(io.BytesIO(data)).pages
        # Pages with no extractable text contribute an empty line.
        return "\n".join(page.extract_text() or "" for page in pages)
    except Exception:
        return ""
def extract_text_from_pdf(data: bytes) -> str:
    """Extract and clean PDF text, preferring pdfminer and falling back to pypdf."""
    text = extract_text_pdfminer(data)
    # Fall back when pdfminer produced nothing usable (fewer than 10 real chars).
    if not text or len(text.strip()) < 10:
        text = extract_text_pypdf(data)
    return clean_text_basic(text)
def split_sentences(text: str):
    """Split on sentence-final punctuation; keep only sentences of 25+ chars."""
    stripped = (chunk.strip() for chunk in SENT_SPLIT.split(text))
    # The length filter subsumes the non-empty check.
    return [sent for sent in stripped if len(sent) >= 25]
def top_keywords_yake(text: str, max_k=120, lan='ar'):
    """Rank single-word keywords with YAKE, dropping stopwords and duplicates."""
    extractor = yake.KeywordExtractor(lan=lan, n=1, top=max_k)
    ranked = [kw for kw, _score in extractor.extract_keywords(text)]
    result = []
    seen = set()
    for word in ranked:
        word = word.strip()
        # Skip empties/one-char tokens, repeats, and (for Arabic) stopwords.
        if len(word) < 2 or word in seen:
            continue
        if lan == "ar" and word in AR_STOP:
            continue
        seen.add(word)
        result.append(word)
    return result
def build_distractors(correct, pool, k=3):
    """Pick k wrong answers from *pool* (shuffled); pad with dashes if short."""
    candidates = [w for w in pool if w != correct and len(w) > 1]
    random.shuffle(candidates)
    picked = []
    for cand in candidates:
        if len(picked) == k:
            break
        stripped = cand.strip()
        if stripped and stripped != correct.strip():
            picked.append(stripped)
    # Placeholder dashes keep the choice count fixed when the pool is too small.
    # NOTE(review): these literals look mojibake-damaged (likely em-dashes) — verify.
    fillers = ["โ", "-", "โ-"]
    while len(picked) < k:
        picked.append(random.choice(fillers))
    return picked
def make_mcqs_from_text(text: str, n=8, lang='ar'):
    """Build up to *n* fill-in-the-blank multiple-choice items from *text*.

    Each item blanks the first occurrence of one keyword in a sentence and
    offers the keyword plus three distractors as shuffled choices.

    Args:
        text: source text (cleaned internally).
        n: maximum number of questions to generate.
        lang: language code passed to YAKE ('ar' or 'en').

    Returns:
        List of dicts with keys id/question/choices/answer_index/explanation.

    Raises:
        ValueError: if no usable sentence is found.
        RuntimeError: if no question could be generated.
    """
    text = clean_text_basic(text)
    sents = split_sentences(text)
    if not sents:
        raise ValueError("ุงููุต ูุตูุฑ ุฌุฏูุง.")
    keywords = top_keywords_yake(text, 120, lang)
    if not keywords:
        # Frequency-count fallback when YAKE returns nothing.
        # BUG FIX: \p{...} classes are only supported by the `regex` module
        # (imported as re2); stdlib `re` raises "bad escape \p" on this pattern.
        toks = re2.findall(r"[\p{L}\p{N}_]+", text)
        toks = [t for t in toks if not (lang == "ar" and t in AR_STOP)]
        from collections import Counter
        keywords = [w for w, _ in Counter(toks).most_common(80)]
    # Map each keyword to the first sentence that contains it.
    sent_for_kw = {}
    for s in sents:
        for kw in keywords:
            if kw in s and kw not in sent_for_kw:
                sent_for_kw[kw] = s
    items = []
    used = set()  # sentences already turned into a question (one item per sentence)
    pool = [kw for kw in keywords if kw in sent_for_kw]
    for kw in pool:
        if len(items) >= n:
            break
        s = sent_for_kw[kw]
        if s in used:
            continue
        blanked = s.replace(kw, "_____", 1)
        choices = build_distractors(kw, [x for x in keywords if x != kw], 3) + [kw]
        random.shuffle(choices)
        ans = choices.index(kw)  # distractors never equal kw, so index is the answer
        exp = f"ู ูุชุจุณ ู ู ุงูุฌู ูุฉ: {s[:220]}" + ("..." if len(s) > 220 else "")
        items.append({
            "id": str(uuid.uuid4())[:8],
            "question": blanked,
            "choices": choices,
            "answer_index": ans,
            "explanation": exp,
        })
        used.add(s)
    if not items:
        raise RuntimeError("ุชุนุฐุฑ ุงูุชูููุฏ.")
    return items
def render_cards(items):
    """Render MCQ items as HTML cards with a collapsible answer section."""
    letters = ["A", "B", "C", "D"]
    cards = []
    for num, item in enumerate(items, 1):
        options = "".join(f"<li>{choice}</li>" for choice in item["choices"])
        letter = letters[item["answer_index"]]
        cards.append(f"""
<article class="card">
<header><span class="badge">ุณ {num}</span><h3>{item['question']}</h3></header>
<ol type="A" class="choices">{options}</ol>
<details><summary>ุงูุฅุฌุงุจุฉ</summary>
<div class="answer"><b>ุงูุฅุฌุงุจุฉ:</b> {letter}</div></details>
</article>""")
    return "\n".join(cards)
def to_files(items):
    """Serialize items to in-memory JSON and CSV files for Gradio downloads."""
    payload = json.dumps(items, ensure_ascii=False, indent=2).encode("utf-8")
    json_file = io.BytesIO(payload)
    json_file.name = "mcqs.json"  # Gradio uses .name as the download filename
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["id", "question", "A", "B", "C", "D", "answer_index", "explanation"])
    for item in items:
        choices = item["choices"]
        padded = choices + [''] * (4 - len(choices))  # always emit 4 choice columns
        writer.writerow([item["id"], item["question"], *padded, item["answer_index"], item["explanation"]])
    csv_file = io.BytesIO(buffer.getvalue().encode("utf-8"))
    csv_file.name = "mcqs.csv"
    return json_file, csv_file
def pipeline(text, file, n, lang):
    """Turn pasted text or an uploaded PDF/TXT file into MCQ HTML plus downloads.

    Returns:
        (html, json_file, csv_file) on success; (warning_message, None, None)
        on user error (no input, or an unsupported file type).
    """
    src = (text or "").strip()
    if file is not None:
        # BUG FIX: modern Gradio passes gr.File values as a filepath string
        # (and older versions pass a possibly-closed tempfile wrapper), so
        # calling file.read() directly can fail. Prefer opening by path.
        path = file if isinstance(file, str) else getattr(file, "name", None)
        if path is not None:
            name = path.lower()
            with open(path, "rb") as fh:
                b = fh.read()
        else:
            # Legacy fallback: a file-like object with no path.
            name = ""
            b = file.read()
        if name.endswith(".pdf"):
            src = extract_text_from_pdf(b)
        elif name.endswith(".txt"):
            src = clean_text_basic(b.decode("utf-8", "ignore"))
        else:
            return "โ ๏ธ ุงุฑูุนู PDF ุฃู TXT ููุท.", None, None
    if not src:
        return "โ ๏ธ ุฃุฏุฎูู ูุตูุง ุฃู ู ูููุง.", None, None
    items = make_mcqs_from_text(src, n, lang)
    html = render_cards(items)
    j, c = to_files(items)
    return html, j, c
# ====== Gradio UI ======
theme = gr.themes.Soft()
# Load an optional custom stylesheet; a missing file just means no extra CSS.
try:
    with open("styles.css", "r", encoding="utf-8") as f:
        css = f.read()
except FileNotFoundError:
    css = ""
with gr.Blocks(theme=theme, css=css) as demo:
    # Force right-to-left layout for the Arabic interface text.
    gr.HTML("<style>body{direction:rtl}</style>")
    gr.Markdown("## ๐ง ู ูููุฏ ุฃุณุฆูุฉ ุงุฎุชูุงุฑ ู ู ู ุชุนุฏุฏ (PDF / TXT / ูุต)")
    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: free text or an uploaded file, question count, language.
            t = gr.Textbox(label="ุงููุต", lines=8, placeholder="ุฃูุตูู ุงููุต ููุง ุฃู ุงุฑูุนู ู ูู")
            f = gr.File(label="ู ูู PDF ุฃู TXT", file_types=[".pdf", ".txt"])
            n = gr.Slider(1, 50, value=10, step=1, label="ุนุฏุฏ ุงูุฃุณุฆูุฉ")
            lang = gr.Dropdown(["ar", "en"], value="ar", label="ุงููุบุฉ")
            b = gr.Button("ุชูููุฏ")
            # Download targets filled in by pipeline().
            j = gr.File(label="ุชุญู ูู JSON")
            c = gr.File(label="ุชุญู ูู CSV")
        with gr.Column(scale=2):
            out = gr.HTML(label="ุงููุชุงุฆุฌ")
    b.click(pipeline, [t, f, n, lang], [out, j, c])
if __name__ == "__main__":
    demo.launch()