motaseeem committed on
Commit
18cc7b7
·
verified ·
1 Parent(s): 1280b90

Create app.py

Files changed (1)
  1. app.py +321 -0
app.py ADDED
@@ -0,0 +1,321 @@
import re
import difflib
import numpy as np
import torch
import gradio as gr
import pyarabic.araby as araby

import stanza
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer as HFTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import arabert.preprocess
import yake
from bert_score import score as bertscore

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_grad_enabled(False)

# ---- Models and tools ----
ARAELECTRA_NAME = "aubmindlab/araelectra-base-discriminator"
SBERT_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
QG_MODEL = "Mihakram/AraT5-base-question-generation"

# Stanza (the first run may download and cache the Arabic package)
stanza.download("ar", verbose=False)
nlp = stanza.Pipeline(lang="ar", processors="tokenize,pos,lemma,depparse", tokenize_no_ssplit=False, verbose=False)

# AraBERT preprocessor
arabert_prep = arabert.preprocess.ArabertPreprocessor(ARAELECTRA_NAME)

# AraELECTRA (for character offsets and contextual representations)
tokenizer_electra = AutoTokenizer.from_pretrained(ARAELECTRA_NAME)
model_electra = AutoModel.from_pretrained(ARAELECTRA_NAME).to(DEVICE)

# sBERT
sbert = SentenceTransformer(SBERT_MODEL, device=DEVICE)

# AraT5 (question generation)
qg_tokenizer = HFTokenizer.from_pretrained(QG_MODEL)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL).to(DEVICE)

# ---- Helper utilities ----
def normalize(s: str) -> str:
    # Strip diacritics, unify alef/ya variants, drop tatweel, collapse whitespace
    t = araby.strip_tashkeel(s)
    t = t.replace("آ", "ا").replace("أ", "ا").replace("إ", "ا").replace("ى", "ي")
    t = t.replace("ـ", "")
    t = " ".join(t.split())
    return t

def build_char_map(src: str, tgt: str):
    # Character-level alignment: src2tgt[i] is the index in tgt of the character
    # aligned with src[i].
    sm = difflib.SequenceMatcher(a=src, b=tgt)
    src2tgt = [-1] * len(src)
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            for k in range(i2 - i1):
                src2tgt[i1 + k] = j1 + k
        elif tag in ("replace", "delete"):
            for k in range(i2 - i1):
                src2tgt[i1 + k] = j1
    # Unmapped positions inherit the last seen mapping
    last = 0
    for i in range(len(src2tgt)):
        if src2tgt[i] == -1:
            src2tgt[i] = last
        else:
            last = src2tgt[i]
    return src2tgt

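# A minimal illustration (hypothetical strings) of how the alignment is used:
# the normalized text is aligned to its AraBERT-preprocessed form, so character
# spans found on the former can be carried over to the latter.
#   src = normalize("ولد جديد")
#   tgt = arabert_prep.preprocess(src)   # may rewrite characters, depending on the model
#   src2tgt = build_char_map(src, tgt)   # src2tgt[i] -> aligned index in tgt
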
def map_span_src_to_tgt(src2tgt, start, end, tgt_len):
    # Map a half-open [start, end) span in the source string to a clamped,
    # non-empty span in the target string.
    if start >= len(src2tgt): start = max(0, len(src2tgt) - 1)
    if end == 0: end = 1
    if end - 1 >= len(src2tgt): end = len(src2tgt)
    ts = src2tgt[start]; te = src2tgt[end - 1] + 1
    ts = max(0, min(ts, max(0, tgt_len - 1)))
    te = max(ts + 1, min(te, tgt_len))
    return ts, te

def token_indices_overlapping_span(offsets, span_start, span_end):
    idxs = []
    for i, (s, e) in enumerate(offsets):
        if e > span_start and s < span_end:
            idxs.append(i)
    return idxs

def electra_hidden_states(prep_text):
    # One forward pass: return per-token character offsets and hidden states
    encoded = tokenizer_electra(prep_text, return_tensors="pt", return_offsets_mapping=True, padding=False, truncation=True).to(DEVICE)
    offsets = encoded.pop("offset_mapping")[0].tolist()
    with torch.no_grad():
        out = model_electra(**encoded)
    H = out.last_hidden_state.squeeze(0)
    return offsets, H

def electra_phrase_vec_via_offsets(span_start, span_end, src2tgt, prep_text, offsets, H):
    # Mean-pool the hidden states of all subword tokens overlapping the mapped span
    ts, te = map_span_src_to_tgt(src2tgt, span_start, span_end, len(prep_text))
    tok_ids = token_indices_overlapping_span(offsets, ts, te)
    if not tok_ids:
        return None
    vecs = [H[i] for i in tok_ids]
    return torch.stack(vecs, dim=0).mean(dim=0)

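# Sketch of the span-to-vector path (names as defined above):
#   offsets, H = electra_hidden_states(text_prep)
#   v = electra_phrase_vec_via_offsets(start, end, src2tgt, text_prep, offsets, H)
# The phrase vector is the mean of the hidden states of every subword token whose
# character offsets overlap the mapped span; None means no token matched.
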
# Noun-phrase extraction
def build_noun_phrases(doc, text_norm):
    noun_phrases = []
    for si, sent in enumerate(doc.sentences):
        words_info = []
        for ti, tok in enumerate(sent.tokens):
            for w in tok.words:
                words_info.append({
                    "id": w.id, "text": w.text, "upos": w.upos, "deprel": w.deprel,
                    "head": w.head, "start": tok.start_char, "end": tok.end_char, "tok_idx": ti
                })
        for wi in words_info:
            if wi["upos"] not in {"NOUN", "PROPN"}:  # nominal heads only
                continue
            head = wi
            left_mods, right_mods = [], []
            for cj in words_info:
                if cj["head"] == head["id"] and cj["deprel"] in {"amod", "compound", "nmod"}:
                    (left_mods if cj["start"] <= head["start"] else right_mods).append(cj)
            left_mods = sorted(left_mods, key=lambda x: x["start"])
            right_mods = sorted(right_mods, key=lambda x: x["start"])
            phrase_tokens = left_mods + [head] + right_mods
            if len(phrase_tokens) < 2 and head["upos"] != "PROPN":  # keep single-word candidates only for proper nouns
                continue
            span_start = min(t["start"] for t in phrase_tokens); span_end = max(t["end"] for t in phrase_tokens)
            phrase_text = re.sub(r"\s+", " ", text_norm[span_start:span_end].strip())
            if len(phrase_text) >= 2:
                noun_phrases.append({"text": phrase_text, "start": span_start, "end": span_end})
    # Deduplicate, keeping the longest character span per phrase text
    uniq = {}
    for np_item in noun_phrases:
        key = np_item["text"]
        if key not in uniq or (np_item["end"] - np_item["start"]) > (uniq[key]["end"] - uniq[key]["start"]):
            uniq[key] = np_item
    return list(uniq.values())

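# The heuristic above builds one candidate per NOUN/PROPN head by attaching its
# direct amod/compound/nmod dependents in character order; single-word candidates
# survive only when the head is a proper noun.
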
# Ranking: sBERT + ELECTRA + MMR
def mmr_select(doc_emb, cand_embs, candidates, k=10, lam=0.7):
    if not candidates: return []
    chosen, rest = [], list(range(len(candidates)))
    sim_doc = util.cos_sim(doc_emb, cand_embs)[0].cpu().numpy()
    first = int(np.argmax(sim_doc)); chosen.append(first); rest.remove(first)
    sim_between = util.cos_sim(cand_embs, cand_embs).cpu().numpy()
    while len(chosen) < min(k, len(candidates)) and rest:
        best_i, best_score = None, -1e9
        for i in rest:
            redundancy = max(sim_between[i, j] for j in chosen) if chosen else 0.0
            score = lam * sim_doc[i] - (1 - lam) * redundancy  # relevance vs. redundancy trade-off
            if score > best_score: best_score, best_i = score, i
        chosen.append(best_i); rest.remove(best_i)
    return [candidates[i] for i in chosen]

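# Maximal Marginal Relevance trades document relevance against redundancy with
# the already-chosen set:
#   score(i) = lam * sim(doc, cand_i) - (1 - lam) * max_{j in chosen} sim(cand_i, cand_j)
# With the default lam = 0.7 this gives a 0.7 / 0.3 relevance-redundancy split.
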
def rank_keyphrases(text_norm, nps, alpha=0.8):
    phrases = [p["text"] for p in nps]
    if not phrases: return [], []
    text_prep = arabert_prep.preprocess(text_norm)
    src2tgt = build_char_map(text_norm, text_prep)
    # sBERT
    doc_emb = sbert.encode([text_prep], convert_to_tensor=True)
    phr_embs = sbert.encode(phrases, convert_to_tensor=True)
    sims_sbert = util.cos_sim(doc_emb, phr_embs).cpu().numpy()[0]
    # ELECTRA
    offsets, H = electra_hidden_states(text_prep)
    doc_vec_electra = H.mean(dim=0)
    sims_electra = []
    for p in nps:
        v = electra_phrase_vec_via_offsets(p["start"], p["end"], src2tgt, text_prep, offsets, H)
        if v is None: sims_electra.append(0.0)
        else:
            num = torch.dot(doc_vec_electra, v).item()
            den = float(doc_vec_electra.norm().item() * v.norm().item() + 1e-9)
            sims_electra.append(num / den)
    sims_electra = np.array(sims_electra)
    blended = alpha * sims_sbert + (1 - alpha) * sims_electra
    order = np.argsort(-blended)
    # Each entry: (phrase, blended score, sBERT sim, ELECTRA sim)
    ranked = [(phrases[i], float(blended[i]), float(sims_sbert[i]), float(sims_electra[i])) for i in order]
    diverse = mmr_select(doc_emb, phr_embs, phrases, k=min(12, len(phrases)), lam=0.7)
    return ranked, diverse

# YAKE
def yake_scores_for_phrases(text_norm, phrases, max_ngram_size=5, lan="ar"):
    kw_extractor = yake.KeywordExtractor(lan=lan, n=max_ngram_size, dedupLim=0.9, top=1000)
    scored = kw_extractor.extract_keywords(text_norm)
    norm = lambda s: re.sub(r"\s+", " ", s).strip().lower()
    scored_norm = {norm(k): v for k, v in scored}
    res = {}
    for p in phrases:
        res[p] = scored_norm.get(norm(p))
    return res

def invert_and_minmax_yake(score_map):
    # YAKE scores are lower-is-better; invert to higher-is-better, then min-max scale
    vals = [None if v is None else 1 / (1 + v) for v in score_map.values()]
    finite = [x for x in vals if x is not None]
    if not finite: return {k: 0.0 for k in score_map.keys()}
    vmin, vmax = min(finite), max(finite); rng = (vmax - vmin) if vmax > vmin else 1.0
    out = {}
    for k, pos in zip(score_map.keys(), vals):
        out[k] = 0.0 if pos is None else (pos - vmin) / rng
    return out

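# A tiny worked example with made-up YAKE scores:
#   raw = 0.02 -> 1/1.02 ≈ 0.980;  raw = 0.50 -> 1/1.50 ≈ 0.667
#   min-max over {0.980, 0.667} yields 1.0 and 0.0; phrases YAKE never matched
#   stay at 0.0.
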
def blend_semantic_with_yake(ranked_sem, yake_norm, w_sem=0.7, w_yake=0.3):
    merged = []
    for phr, sem_sc, sb, el in ranked_sem:
        y = yake_norm.get(phr, 0.0)
        final = w_sem * sem_sc + w_yake * y
        merged.append((phr, final, sem_sc, y, sb, el))
    merged.sort(key=lambda x: -x[1])
    return merged

209
+ # تقسيم بالنقطة + اختيار جملة داعمة لكل عبارة
210
+ def split_by_dots(text: str):
211
+ parts = re.split(r"\.{1,}\s*", text)
212
+ return [p.strip() for p in parts if p.strip()]
213
+
def sentence_kind_from_root(stanza_sentence):
    # Verbal sentence if the dependency root is a VERB, otherwise nominal
    root = next((w for w in stanza_sentence.words if w.deprel == "root"), None)
    if not root: return "unknown"
    return "verbal" if root.upos == "VERB" else "nominal"

def split_and_tag_nominal_verbal_by_dots(text_norm):
    sents = split_by_dots(text_norm)
    tagged = []
    for s in sents:
        doc_s = nlp(s)
        if not doc_s.sentences:
            tagged.append({"text": s, "kind": "unknown"})
        else:
            tagged.append({"text": s, "kind": sentence_kind_from_root(doc_s.sentences[0])})
    return tagged

def best_support_sentence_by_dots(text_norm, phrase):
    sentences_tagged = split_and_tag_nominal_verbal_by_dots(text_norm)
    if not sentences_tagged: return "", "unknown"  # keep the (text, kind) return contract
    sent_texts = [m["text"] for m in sentences_tagged]
    sent_embs = sbert.encode(sent_texts, convert_to_tensor=True)
    p_emb = sbert.encode([phrase], convert_to_tensor=True)
    sims = util.cos_sim(p_emb, sent_embs)[0].cpu().numpy()
    best_idx = int(np.argmax(sims))
    return sent_texts[best_idx], sentences_tagged[best_idx]["kind"]

# Generate a unified question (no hints)
def gen_unified_question_freeform(phrases, supports, context_text, max_len=96, num_beams=5):
    context_short = context_text.strip()[:600]
    items_block = "\n".join([f"- العبارة: {p}\n جملة داعمة: {s}" for p, s in zip(phrases, supports)])
    prompt = (
        "حوّل العبارات التالية إلى سؤال واحد شامل بالعربية يعتمد على السياق. "
        "يجب أن يغطي جميع العبارات بشكل موجز وواضح.\n"
        f"{items_block}\n"
        f"سياق: {context_short}\n"
        "السؤال الموحد:"
    )
    inputs = qg_tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)
    outputs = qg_model.generate(
        **inputs, max_length=max_len, num_beams=num_beams,
        early_stopping=True, no_repeat_ngram_size=3
    )
    q = qg_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    q = q.rstrip("?.؟")
    if q and not q.endswith("؟"): q += "؟"
    return q

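# A minimal usage sketch (hypothetical inputs): the model receives one prompt
# listing each phrase with its supporting sentence, followed by the truncated
# context.
#   q = gen_unified_question_freeform(["..."], ["..."], text_norm)
# AraT5-base-question-generation is a question-generation fine-tune rather than
# an instruction-following model, so output quality for a free-form instruction
# prompt like this may vary.
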
# UI: a single step that runs everything and displays the results
def run_pipeline(user_text):
    if not user_text or len(user_text.strip()) < 5:
        return "رجاءً أدخل نصًا عربيًا أطول.", "", "", "", ""

    text_norm = normalize(user_text)
    doc = nlp(text_norm)

    # 1) Noun phrases
    nps = build_noun_phrases(doc, text_norm)
    if not nps:
        return "لم تُستخرج عبارات اسمية.", "", "", "", ""

    # 2) Semantic ranking
    ranked_sem, diverse = rank_keyphrases(text_norm, nps, alpha=0.8)

    # 3) YAKE + blending
    phrases = [r[0] for r in ranked_sem]
    yake_raw = yake_scores_for_phrases(text_norm, phrases, max_ngram_size=5, lan="ar")
    yake_norm = invert_and_minmax_yake(yake_raw)
    ranked_blended = blend_semantic_with_yake(ranked_sem, yake_norm, w_sem=0.7, w_yake=0.3)

    # 4) Best supporting sentence for the top 5 phrases
    top_n = min(5, len(ranked_blended))
    top_phrases = [ranked_blended[i][0] for i in range(top_n)]
    supports = []
    kinds = []
    for p in top_phrases:
        s, kind = best_support_sentence_by_dots(text_norm, p)
        supports.append(s); kinds.append(kind)

    # 5) Unified question from the top phrases
    unified_q = gen_unified_question_freeform(top_phrases, supports, text_norm)

    # Formatted output
    nps_str = "\n".join(f"- {p['text']}" for p in nps[:20])
    ranked_str = "\n".join(f"{i+1:>2}. {t[0]} (score={t[1]:.3f})" for i, t in enumerate(ranked_blended[:15]))
    support_str = "\n".join(f"{i+1:>2}. [{kinds[i]}] {top_phrases[i]} → {supports[i]}" for i in range(top_n))
    diverse_str = "\n".join(f"- {d}" for d in diverse[:10])

    return unified_q, ranked_str, support_str, diverse_str, nps_str

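# run_pipeline returns five strings in a fixed order; the Gradio wiring below
# maps them positionally to (unified question, blended ranking, supporting
# sentences, MMR selection, extracted noun phrases).
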
title = "Arabic Main Question Generation (Hybrid Pipeline)"
desc = "أدخل نصًا عربيًا؛ سنستخرج العبارات الاسمية، نرتّبها (sBERT + ELECTRA + YAKE + MMR)، نختار جملًا داعمة، ونولّد سؤالًا موحّدًا بـ AraT5."

with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}\n{desc}")

    with gr.Row():
        inp = gr.Textbox(lines=12, label="النص العربي")
        btn = gr.Button("تشغيل الـPipeline")

    out_unified = gr.Textbox(label="السؤال الموحد (AraT5)")
    out_ranked = gr.Textbox(label="Top Noun Phrases (Blended Ranking)")
    out_support = gr.Textbox(label="أفضل الجمل الداعمة لأول 5 عبارات")
    out_diverse = gr.Textbox(label="MMR Diverse Selection")
    out_nps = gr.Textbox(label="العبارات الاسمية المستخرجة (أول 20)")

    btn.click(run_pipeline, inputs=inp, outputs=[out_unified, out_ranked, out_support, out_diverse, out_nps])

demo.launch()