Alshargi committed on
Commit
d1c4815
·
verified ·
1 Parent(s): 5e4085a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +378 -138
app.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import re
5
  import time
6
  from functools import lru_cache
7
- from typing import List, Dict, Any, Tuple
8
 
9
  import numpy as np
10
  import pandas as pd
@@ -24,6 +24,11 @@ MODEL_NAME = os.getenv("HADITH_MODEL_NAME", "intfloat/multilingual-e5-base")
24
  DEFAULT_TOP_K = 10
25
  MAX_TOP_K = 50
26
 
 
 
 
 
 
27
  DEFAULT_HL_TOPN = 6 # 0 = disable highlighting (FAST)
28
  MAX_HL_TOPN = 25
29
 
@@ -69,7 +74,42 @@ def escape_html(s: str) -> str:
69
 
70
 
71
  # =========================
72
- # Segmenting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  # =========================
74
  def split_ar_segments(text: str, max_len: int) -> List[str]:
75
  if not text:
@@ -93,11 +133,24 @@ def split_ar_segments(text: str, max_len: int) -> List[str]:
93
  if buf:
94
  segs.append(buf)
95
 
96
- # fallback chunking
97
  if len(segs) <= 1 and len(t) > max_len:
98
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
99
  return segs
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # =========================
103
  # Load model + index + meta (once)
@@ -123,113 +176,231 @@ if "arabic_clean" not in meta.columns:
123
  # =========================
124
  # Embedding helpers (cached)
125
  # =========================
126
- @lru_cache(maxsize=1024)
127
  def cached_query_emb(query_norm: str) -> bytes:
128
- """Cache query embedding (normalized, float32). Return as bytes for caching."""
129
  emb = model.encode(["query: " + query_norm], normalize_embeddings=True).astype("float32")[0]
130
  return emb.tobytes()
131
 
132
  def get_query_emb(query_norm: str) -> np.ndarray:
133
  return np.frombuffer(cached_query_emb(query_norm), dtype=np.float32)
134
 
 
 
 
 
 
 
 
 
135
 
136
- def semantic_search_df(query: str, top_k: int) -> pd.DataFrame:
137
- q = str(query or "").strip()
138
- if not q:
139
- return meta.iloc[0:0].copy()
140
-
141
- top_k = max(1, min(int(top_k), MAX_TOP_K))
142
- q_norm = normalize_ar(q)
143
 
144
- q_emb = get_query_emb(q_norm).reshape(1, -1)
145
- scores, idx = index.search(q_emb, top_k)
 
 
 
 
146
 
147
- res = meta.iloc[idx[0]].copy()
148
- res["score"] = scores[0]
149
- res = res.sort_values("score", ascending=False)
150
 
151
  res["arabic"] = res["arabic"].fillna("").astype(str)
152
  res = res[res["arabic"].str.strip() != ""]
153
  return res
154
 
 
 
 
 
 
 
 
 
155
 
156
- # =========================
157
- # Batch semantic highlight (FAST)
158
- # =========================
159
- def build_highlight_html_batch(
160
- query_norm: str,
161
- arabic_clean_list: List[str],
162
- hl_topn: int,
163
- seg_maxlen: int,
164
- ) -> Tuple[List[str], Dict[str, Any]]:
165
- """
166
- Return list of HTML strings (one per hadith), highlighted by segment similarity.
167
- Uses ONE encode() call for all segments across all hadith results (fast).
168
- """
169
- # If disabled:
170
- if hl_topn <= 0:
171
- return [escape_html(t) for t in arabic_clean_list], {"mode": "disabled"}
172
-
173
- # Split into segments per hadith
174
- per_segments: List[List[str]] = [split_ar_segments(t, seg_maxlen) for t in arabic_clean_list]
175
-
176
- # Flatten segments
177
- all_segments: List[str] = []
178
- offsets: List[Tuple[int,int]] = [] # (start, end) in flattened array
179
- cur = 0
180
- for segs in per_segments:
181
- start = cur
182
- all_segments.extend(segs)
183
- cur += len(segs)
184
- offsets.append((start, cur))
185
-
186
- # Edge cases
187
- if len(all_segments) == 0:
188
- return [escape_html(t) for t in arabic_clean_list], {"mode": "empty"}
189
-
190
- # Encode query once + encode all segments once
191
  q_emb = get_query_emb(query_norm) # (d,)
192
- seg_emb = model.encode(
193
- ["passage: " + s for s in all_segments],
194
- normalize_embeddings=True
195
- ).astype("float32") # (N, d)
196
 
197
- sims_all = (seg_emb @ q_emb).astype(np.float32) # (N,)
 
 
 
 
 
 
 
 
 
 
198
 
199
- # Build HTML per hadith
200
- html_out: List[str] = []
201
- for (start, end), segs in zip(offsets, per_segments):
202
- if start == end or len(segs) == 0:
203
- html_out.append("")
204
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
- sims = sims_all[start:end]
207
- s_min = float(np.min(sims))
208
- s_max = float(np.max(sims))
209
- denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
210
-
211
- order = np.argsort(-sims)
212
- keep = set(order[:min(hl_topn, len(segs))])
213
-
214
- parts: List[str] = []
215
- for i, seg in enumerate(segs):
216
- w = (float(sims[i]) - s_min) / denom
217
- alpha = (0.18 + 0.62 * w) if i in keep else (0.06 + 0.20 * w)
218
- alpha = max(0.05, min(alpha, 0.82))
219
- border_alpha = max(0.10, min(alpha * 0.8, 0.65))
220
-
221
- style = (
222
- f"background: rgba(255, 230, 120, {alpha:.3f});"
223
- f"border: 1px solid rgba(234, 179, 8, {border_alpha:.3f});"
224
- "border-radius: 12px;"
225
- "padding: 3px 8px;"
226
- "margin: 0 4px 6px 0;"
227
- "display: inline;"
228
- )
229
- parts.append(f'<span style="{style}">{escape_html(seg)}</span> ')
230
- html_out.append("".join(parts).strip())
231
 
232
- return html_out, {"mode": "batch", "segments_total": len(all_segments)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
 
235
  # =========================
@@ -237,6 +408,7 @@ def build_highlight_html_batch(
237
  # =========================
238
  app = Flask(__name__)
239
 
 
240
  UI_HTML = r"""
241
  <!doctype html>
242
  <html lang="ar" dir="rtl">
@@ -251,6 +423,7 @@ UI_HTML = r"""
251
  :root{
252
  --bg:#f6f7fb; --card:#ffffff; --text:#0f172a; --muted:#475569;
253
  --line:#e5e7eb; --accent:#2563eb; --shadow: 0 10px 30px rgba(15, 23, 42, .08);
 
254
  }
255
  body{
256
  margin:0; background: linear-gradient(180deg, #ffffff, var(--bg)); color: var(--text);
@@ -296,26 +469,45 @@ UI_HTML = r"""
296
  background: var(--card); border: 1px solid var(--line); border-radius:18px;
297
  padding: 16px; box-shadow: var(--shadow);
298
  }
299
- .row{ display:grid; grid-template-columns: 210px 1fr; gap:14px; }
300
  @media (max-width: 900px){ .row{ grid-template-columns: 1fr; } }
301
 
302
  .left{ color: var(--muted); font-size:14px; direction:ltr; text-align:left; }
303
- .score{ font-weight:800; color: var(--accent); font-size:16px; }
 
 
 
 
 
 
 
 
304
 
305
  .arabic{
306
  direction: rtl; text-align:right; font-family: Amiri, serif; font-size:22px;
307
  line-height: 2.05; background:#fbfcff; border:1px solid var(--line);
308
  border-radius:16px; padding:14px; white-space: pre-wrap;
309
  }
 
 
 
 
 
 
 
 
 
 
310
  .english{
311
  direction:ltr; text-align:left; font-size:16px; line-height:1.8; color:#111827;
312
  background:#fbfcff; border:1px solid var(--line); border-radius:16px; padding:14px; white-space: pre-wrap;
313
  }
314
  details summary{
315
  cursor:pointer; color: var(--accent); margin-top:12px; user-select:none;
316
- direction:ltr; text-align:left; font-weight:700;
317
  }
318
  .empty{ margin-top: 14px; color: var(--muted); font-size: 15px; direction:ltr; text-align:left; }
 
319
  </style>
320
  </head>
321
  <body>
@@ -332,15 +524,14 @@ UI_HTML = r"""
332
  </form>
333
 
334
  <div class="controls">
335
- <label>
336
- Highlight Top Segments:
337
- <input id="hl" type="range" min="0" max="25" value="6">
338
- <b id="hlv">6</b>
 
339
  </label>
340
- <label>
341
- Segment Size:
342
- <input id="seg" type="range" min="120" max="420" step="20" value="220">
343
- <b id="segv">220</b>
344
  </label>
345
  </div>
346
 
@@ -365,7 +556,7 @@ UI_HTML = r"""
365
  l.textContent = r.value;
366
  r.addEventListener("input", ()=> l.textContent = r.value);
367
  }
368
- sync("hl","hlv"); sync("seg","segv");
369
 
370
  $("f").addEventListener("submit", async (e)=>{
371
  e.preventDefault();
@@ -373,11 +564,11 @@ UI_HTML = r"""
373
  const k = parseInt($("k").value||"10",10);
374
  const hl = parseInt($("hl").value||"6",10);
375
  const seg = parseInt($("seg").value||"220",10);
 
376
 
377
  $("msg").style.display="none";
378
  $("grid").innerHTML = "";
379
  $("meta").style.display="none";
380
- $("meta").innerHTML = pill("Query", q) + pill("TopK", k) + pill("Highlight", hl) + pill("SegLen", seg);
381
 
382
  if(!q){
383
  $("msg").textContent="اكتب نص البحث أولًا.";
@@ -388,15 +579,14 @@ UI_HTML = r"""
388
  $("msg").textContent="... جاري البحث";
389
  $("msg").style.display="block";
390
 
391
- const url = `/search?q=${encodeURIComponent(q)}&k=${encodeURIComponent(k)}&hl_topn=${encodeURIComponent(hl)}&seg_maxlen=${encodeURIComponent(seg)}&format=html`;
392
- const t0 = performance.now();
393
  const res = await fetch(url);
394
  const js = await res.json();
395
- const ms = Math.round(performance.now()-t0);
396
 
397
  $("meta").style.display="flex";
398
  $("meta").innerHTML =
399
- pill("Rows", js.rows) + pill("Results", js.n) + pill("Time", js.took_ms ?? ms) + pill("TopK", js.k) + pill("Query", js.query);
 
400
 
401
  if(!js.ok || !js.results || js.results.length===0){
402
  $("msg").textContent="لا توجد نتائج. جرّب كلمات مختلفة.";
@@ -406,25 +596,42 @@ UI_HTML = r"""
406
  $("msg").style.display="none";
407
 
408
  const cards = js.results.map(r=>{
409
- const ar = js.format==="html" ? (r.arabic_clean_html||esc(r.arabic_clean||"")) : esc(r.arabic_clean||"");
 
 
410
  const ar_tashkeel = esc(r.arabic||"");
411
  const en = esc(r.english||"");
 
412
  return `
413
  <div class="card">
414
  <div class="row">
415
  <div class="left">
416
- <div><span class="score">${Number(r.score||0).toFixed(4)}</span> score</div>
 
 
 
 
 
 
 
 
 
417
  <div style="margin-top:12px;">HadithID: <b>${esc(r.hadithID)}</b></div>
418
  <div>Collection: <b>${esc(r.collection)}</b></div>
419
  <div>No: <b>${esc(r.hadith_number)}</b></div>
420
  </div>
 
421
  <div>
422
- <div class="arabic">${ar}</div>
 
 
 
423
  <details>
424
  <summary>Show Arabic with tashkeel</summary>
425
  <div style="height:10px;"></div>
426
  <div class="arabic">${ar_tashkeel}</div>
427
  </details>
 
428
  <details>
429
  <summary>Show English</summary>
430
  <div style="height:10px;"></div>
@@ -476,6 +683,15 @@ def search():
476
  k = DEFAULT_TOP_K
477
  k = max(1, min(k, MAX_TOP_K))
478
 
 
 
 
 
 
 
 
 
 
479
  # Highlight controls
480
  hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
481
  seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
@@ -500,6 +716,7 @@ def search():
500
  "query": "",
501
  "query_norm": "",
502
  "k": k,
 
503
  "n": 0,
504
  "rows": int(len(meta)),
505
  "took_ms": 0,
@@ -508,49 +725,73 @@ def search():
508
  })
509
 
510
  t0 = time.time()
511
- df = semantic_search_df(q, top_k=k)
 
 
 
 
 
 
 
 
 
512
  took_ms = int((time.time() - t0) * 1000)
513
 
514
- q_norm = normalize_ar(q)
 
 
 
 
 
 
 
515
 
516
- # Build clean arabic list (fallback derive if missing)
517
- arabic_list: List[str] = []
518
- for _, row in df.iterrows():
519
- ar = str(row.get("arabic", "") or "")
520
  ar_clean = row.get("arabic_clean", "")
521
  if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
522
  ar_clean = ""
523
  ar_clean = str(ar_clean).strip()
524
  if not ar_clean:
525
- ar_clean = normalize_ar(ar)
526
- arabic_list.append(ar_clean)
527
-
528
- # Highlight (batch)
529
- ar_html_list: List[str] = ["" for _ in arabic_list]
530
- dbg: Dict[str, Any] = {}
531
- if want_html:
532
- ar_html_list, dbg = build_highlight_html_batch(
533
- query_norm=q_norm,
534
- arabic_clean_list=arabic_list,
535
- hl_topn=hl_topn,
536
- seg_maxlen=seg_maxlen,
537
- )
538
 
539
- results: List[Dict[str, Any]] = []
540
- for i, (_, row) in enumerate(df.iterrows()):
541
- arabic = str(row.get("arabic", "") or "")
542
- english = str(row.get("english", "") or "")
543
- r = {
 
 
 
 
 
544
  "hadithID": int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None,
545
  "collection": str(row.get("collection", "") or ""),
546
  "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
547
- "score": float(row.get("score")) if pd.notna(row.get("score")) else 0.0,
 
 
 
 
 
 
 
 
 
 
548
  "arabic": arabic,
549
- "arabic_clean": arabic_list[i],
550
  "english": english,
 
 
551
  }
 
552
  if want_html:
553
- r["arabic_clean_html"] = ar_html_list[i] if ar_html_list[i] else escape_html(arabic_list[i])
 
 
 
 
 
 
554
  results.append(r)
555
 
556
  return jsonify({
@@ -558,17 +799,16 @@ def search():
558
  "query": q,
559
  "query_norm": q_norm,
560
  "k": k,
 
561
  "n": len(results),
562
  "rows": int(len(meta)),
563
  "took_ms": took_ms,
564
  "format": "html" if want_html else "json",
565
  "hl_topn": hl_topn,
566
  "seg_maxlen": seg_maxlen,
567
- "debug": dbg if want_html else {},
568
  "results": results,
569
  })
570
 
571
 
572
  if __name__ == "__main__":
573
- # local run only
574
  app.run(host="127.0.0.1", port=5000, debug=True)
 
4
  import re
5
  import time
6
  from functools import lru_cache
7
+ from typing import List, Dict, Any, Tuple, Optional
8
 
9
  import numpy as np
10
  import pandas as pd
 
24
  DEFAULT_TOP_K = 10
25
  MAX_TOP_K = 50
26
 
27
+ # pull more from FAISS then rerank by evidence
28
+ DEFAULT_RERANK_K = 35
29
+ MAX_RERANK_K = 120
30
+ MIN_RERANK_K = 20
31
+
32
  DEFAULT_HL_TOPN = 6 # 0 = disable highlighting (FAST)
33
  MAX_HL_TOPN = 25
34
 
 
74
 
75
 
76
  # =========================
77
+ # Lexical match helpers
78
+ # =========================
79
# Stop list of high-frequency Arabic function words (plus common hadith
# formulae) that carry no lexical-match signal.
AR_STOPWORDS = {
    "من","الى","إلى","عن","على","في","و","ثم","أو","او","كما","كان","كانت","يكون","تكون",
    "هذا","هذه","ذلك","تلك","هناك","هنا","هو","هي","هم","هن","أنا","انت","أنت","نحن",
    "ما","ماذا","هل","لماذا","لم","لن","لا","إن","أن","إنه","أنه","إلا","الا","حتى","قد",
    "كل","أي","أيّ","اي","ايًّا","أيضا","أيضًا","مع","عند","بين","بعد","قبل","إذا","اذ","إذ",
    "قال","وقالت","يقول","يقولون","رسول","الله","صلى","عليه","وسلم"
}

def ar_tokens(text_norm: str) -> List[str]:
    """Tokenize normalized Arabic text into unique content words.

    Keeps only Arabic-block characters and digits, drops stopwords and
    single-character tokens, and de-duplicates while preserving first
    occurrence order.
    """
    if not text_norm:
        return []
    cleaned = re.sub(r"[^\u0600-\u06FF0-9\s]", " ", text_norm)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    candidates = (
        tok for tok in cleaned.split(" ")
        if tok and tok not in AR_STOPWORDS and len(tok) >= 2
    )
    # dict preserves insertion order (3.7+), so this de-duplicates cheaply.
    return list(dict.fromkeys(candidates))
100
+
101
def lexical_match_ratio(query_norm: str, doc_norm: str) -> Tuple[float, List[str]]:
    """Return the fraction of query tokens found (space-delimited) in the document.

    Returns ``(ratio, matched_tokens)``.  NOTE(review): a token only matches
    when surrounded by single spaces in ``doc_norm`` — presumably
    ``normalize_ar`` strips punctuation and collapses whitespace; confirm.
    """
    query_tokens = ar_tokens(query_norm)
    if not query_tokens:
        return 0.0, []

    padded_doc = " " + (doc_norm or "") + " "
    hits: List[str] = []
    for token in query_tokens:
        if f" {token} " in padded_doc:
            hits.append(token)

    return float(len(hits) / max(1, len(query_tokens))), hits
109
+
110
+
111
+ # =========================
112
+ # Segmenting + isnad detection
113
  # =========================
114
  def split_ar_segments(text: str, max_len: int) -> List[str]:
115
  if not text:
 
133
  if buf:
134
  segs.append(buf)
135
 
 
136
  if len(segs) <= 1 and len(t) > max_len:
137
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
138
  return segs
139
 
140
# Segments opening with a transmission verb ("narrated to us", "on the
# authority of", ...) are likely isnad (narrator chain), not matn.
ISNAD_START = re.compile(r"^\s*(حدثنا|أخبرنا|أنبأنا|سمعت|حدثني|أخبرني|قال|عن)\b")
# Rough narrator-name detector: a patronymic/kunya particle followed by a word.
NAME_HEURISTIC = re.compile(r"(بن|ابن|أبو|أبي|بنت)\s+\S+")

def is_isnad_segment(seg: str) -> bool:
    """Heuristically decide whether a text segment is an isnad (narrator chain).

    A segment counts as isnad when it starts with a transmission marker AND
    either contains a name-like pattern, at least two chain markers, or is
    short (< 120 chars).
    """
    if not seg:
        return False
    stripped = seg.strip()
    if not ISNAD_START.search(stripped):
        return False
    name_hits = len(NAME_HEURISTIC.findall(stripped))
    markers = (" عن ", " قال ", " حدثنا ", " أخبرنا ", " سمعت ")
    chain_markers = sum(stripped.count(m) for m in markers)
    return name_hits >= 1 or chain_markers >= 2 or len(stripped) < 120
153
+
154
 
155
  # =========================
156
  # Load model + index + meta (once)
 
176
  # =========================
177
  # Embedding helpers (cached)
178
  # =========================
179
@lru_cache(maxsize=2048)
def cached_query_emb(query_norm: str) -> bytes:
    """Encode a normalized query once and memoize it.

    Returned as raw float32 bytes because bytes are immutable/hashable and
    cache-friendly.
    """
    vecs = model.encode(["query: " + query_norm], normalize_embeddings=True)
    return vecs.astype("float32")[0].tobytes()

def get_query_emb(query_norm: str) -> np.ndarray:
    """Return the cached query embedding decoded back into a float32 vector."""
    raw = cached_query_emb(query_norm)
    return np.frombuffer(raw, dtype=np.float32)
186
 
187
+ def compute_segment_sims(query_emb: np.ndarray, segments: List[str]) -> np.ndarray:
188
+ if not segments:
189
+ return np.array([], dtype=np.float32)
190
+ seg_emb = model.encode(
191
+ ["passage: " + s for s in segments],
192
+ normalize_embeddings=True
193
+ ).astype("float32")
194
+ return (seg_emb @ query_emb).astype(np.float32)
195
 
 
 
 
 
 
 
 
196
 
197
# =========================
# Core search: candidates -> rerank by best MATN segment
# =========================
def search_candidates_df(query_norm: str, rerank_k: int) -> pd.DataFrame:
    """Return the top ``rerank_k`` FAISS candidates for a normalized query.

    Adds ``faiss_score`` and ``faiss_rank`` columns and drops rows whose
    Arabic text is empty.
    """
    q_emb = get_query_emb(query_norm).reshape(1, -1)
    scores, idxs = index.search(q_emb, rerank_k)

    # FAISS pads the result with -1 when the index holds fewer than
    # rerank_k vectors; drop those slots, otherwise .iloc with -1 silently
    # selects the LAST metadata row.
    valid = idxs[0] >= 0
    ids = idxs[0][valid]

    res = meta.iloc[ids].copy()
    res["faiss_score"] = scores[0][valid]
    res["faiss_rank"] = np.arange(len(res), dtype=np.int32)

    res["arabic"] = res["arabic"].fillna("").astype(str)
    res = res[res["arabic"].str.strip() != ""]
    return res
211
 
212
def rerank_rows(query_norm: str, cand: pd.DataFrame, seg_maxlen: int) -> pd.DataFrame:
    """Re-rank FAISS candidates by their best matn (body-text) segment.

    Each candidate's clean Arabic text is split into segments; every segment
    across all candidates is embedded in ONE batched encode call, and a
    candidate's evidence ``score`` is the similarity of its best segment,
    with isnad-like segments (narrator chains) down-weighted by 30%.
    Also attaches ``best_seg``/``best_seg_idx`` and lexical-overlap columns
    (``lex_ratio``, ``lex_terms``), then sorts by evidence score, FAISS
    score, and original FAISS rank.
    """
    if cand.empty:
        # Keep the output schema stable even with no candidates.
        out = cand.copy()
        out["score"] = np.nan
        out["best_seg"] = ""
        out["lex_ratio"] = 0.0
        out["lex_terms"] = ""
        return out

    q_emb = get_query_emb(query_norm)  # (d,)

    # Build per-row clean text (fall back to normalizing the raw Arabic
    # when the precomputed arabic_clean column is missing/NaN/empty).
    arabic_clean_list: List[str] = []
    for _, row in cand.iterrows():
        ar = str(row.get("arabic", "") or "")
        ar_clean = row.get("arabic_clean", "")
        if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
            ar_clean = ""
        ar_clean = str(ar_clean).strip()
        if not ar_clean:
            ar_clean = normalize_ar(ar)
        arabic_clean_list.append(ar_clean)

    # Prepare segments for rerank (batch over all segments of all rows so
    # the model is invoked only once below).
    per_segments: List[List[str]] = []
    per_deemph: List[np.ndarray] = []
    all_segments: List[str] = []
    seg_map: List[Tuple[int, int]] = []  # (row_i, seg_i_local)

    for i, txt in enumerate(arabic_clean_list):
        segs = split_ar_segments(txt, seg_maxlen)
        if not segs:
            segs = [txt] if txt else []
        per_segments.append(segs)

        # 1.0 = matn (keep full weight), 0.0 = isnad-like (de-emphasize).
        deemph_mask = np.array([1.0 if not is_isnad_segment(s) else 0.0 for s in segs], dtype=np.float32)
        per_deemph.append(deemph_mask)

        for j, s in enumerate(segs):
            all_segments.append(s)
            seg_map.append((i, j))

    if not all_segments:
        # No text to embed anywhere: fall back to the raw FAISS ordering.
        out = cand.copy()
        out["score"] = out["faiss_score"].astype(float)
        out["best_seg"] = ""
        out["lex_ratio"] = 0.0
        out["lex_terms"] = ""
        return out

    # sims for all segments once (single batched encode)
    sims_all = compute_segment_sims(q_emb, all_segments)

    # best segment per row (downweight isnad)
    n_rows = len(per_segments)
    best_sim = np.full((n_rows,), -1.0, dtype=np.float32)
    best_local = np.full((n_rows,), -1, dtype=np.int32)

    for k, (ri, sj) in enumerate(seg_map):
        sim = float(sims_all[k])
        deemph = float(per_deemph[ri][sj])  # 1 matn, 0 isnad-ish
        sim_adj = sim * (0.70 + 0.30 * deemph)  # isnad gets downweighted to 70%
        if sim_adj > best_sim[ri]:
            best_sim[ri] = sim_adj
            best_local[ri] = sj

    # lexical match (token overlap between query and full clean text)
    lex_ratios: List[float] = []
    lex_terms: List[str] = []
    for txt in arabic_clean_list:
        r, matched = lexical_match_ratio(query_norm, txt)
        lex_ratios.append(r)
        # Show at most 10 matched terms, Arabic-comma separated.
        lex_terms.append("، ".join(matched[:10]) if matched else "")

    out = cand.copy()
    out["score"] = best_sim.astype(float)
    out["best_seg_idx"] = best_local.astype(int)

    # compute best_seg text (guard against the -1 "never set" sentinel)
    best_segs = []
    for i, segs in enumerate(per_segments):
        j = int(best_local[i])
        best_segs.append(segs[j] if (0 <= j < len(segs)) else (segs[0] if segs else ""))
    out["best_seg"] = best_segs

    out["lex_ratio"] = np.array(lex_ratios, dtype=np.float32)
    out["lex_terms"] = lex_terms

    # Sort by evidence score, then faiss score, then original rank
    out = out.sort_values(["score", "faiss_score", "faiss_rank"], ascending=[False, False, True])
    return out
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
+ # =========================
306
+ # UI helpers for html mode
307
+ # =========================
308
def confidence_badge(score1: float, score2: Optional[float]) -> Tuple[str, str]:
    """Map a top score (and optional runner-up) to a confidence label + CSS class.

    Confidence rises either with an absolutely high top score or with a wide
    margin over the runner-up; when no runner-up exists the top score itself
    serves as the margin.
    """
    top = float(score1) if score1 is not None else 0.0
    runner_up = float(score2) if score2 is not None else None
    gap = top if runner_up is None else top - runner_up

    if top >= 0.78 or gap >= 0.08:
        return "High", "bHigh"
    if top >= 0.68 or gap >= 0.04:
        return "Medium", "bMed"
    return "Low", "bLow"
318
+
319
def heatmap_html(sims: np.ndarray, bins: int = 16) -> str:
    """Render per-segment similarities as a row of small colored cells.

    At most ``bins`` cells are emitted (evenly sampled when there are more
    segments); similarities are min/max-scaled into an opacity so hotter
    segments render darker. Returns "" for an empty input.
    """
    if sims.size == 0:
        return ""

    lo = float(np.min(sims))
    hi = float(np.max(sims))
    # Guard against division by ~0 when all segments score alike.
    scale = (hi - lo) if (hi - lo) > 1e-6 else 1.0

    total = sims.size
    if total <= bins:
        chosen = list(range(total))
    else:
        chosen = [int(round(pos)) for pos in np.linspace(0, total - 1, bins)]

    cells = []
    for i in chosen:
        weight = (float(sims[i]) - lo) / scale
        alpha = max(0.08, min(0.10 + 0.75 * weight, 0.90))
        cells.append(
            f'<span title="seg {i+1}" style="display:inline-block;width:10px;height:10px;'
            f'margin:0 2px;border-radius:3px;background:rgba(37,99,235,{alpha:.3f});"></span>'
        )
    return '<div style="margin:10px 0 8px;direction:ltr;text-align:left;">' + "".join(cells) + "</div>"
342
+
343
def highlight_segments_html(segs: List[str], sims: np.ndarray, strong_topn: int, deemph_mask: np.ndarray) -> str:
    """Wrap each segment in a <span> whose tint strength tracks its similarity.

    The ``strong_topn`` most similar segments get a stronger base tint; the
    ``deemph_mask`` (1.0 = matn, 0.0 = isnad-like) further fades narrator-
    chain segments. Returns "" when there is nothing to highlight.
    """
    if not segs or sims.size == 0:
        return ""

    lo = float(np.min(sims))
    hi = float(np.max(sims))
    # Guard against division by ~0 when all segments score alike.
    scale = (hi - lo) if (hi - lo) > 1e-6 else 1.0

    ranked = np.argsort(-sims)
    strong = set(ranked[:min(strong_topn, len(segs))])

    pieces: List[str] = []
    for idx, segment in enumerate(segs):
        weight = (float(sims[idx]) - lo) / scale
        matn_factor = float(deemph_mask[idx])  # 1 matn, 0 isnad-like

        if idx in strong:
            tint = 0.18 + 0.62 * weight
        else:
            tint = 0.06 + 0.20 * weight
        tint = tint * (0.45 + 0.55 * matn_factor)
        tint = max(0.04, min(tint, 0.82))
        edge = max(0.08, min(tint * 0.75, 0.60))

        css = (
            f"background: rgba(255, 230, 120, {tint:.3f});"
            f"border: 1px solid rgba(234, 179, 8, {edge:.3f});"
            "border-radius: 12px;"
            "padding: 3px 8px;"
            "margin: 0 4px 6px 0;"
            "display: inline;"
        )
        pieces.append(f'<span style="{css}">{escape_html(segment)}</span> ')
    return "".join(pieces).strip()
374
+
375
def build_html_extras_for_row(query_norm: str, arabic_clean_text: str, hl_topn: int, seg_maxlen: int) -> Dict[str, str]:
    """Build the per-result HTML extras: heatmap, highlighted text, best snippet.

    Re-splits and re-embeds this row's text (separate from the rerank pass)
    so highlighting reflects the same segmentation parameters the caller
    requested. When ``hl_topn`` <= 0 the heatmap is omitted and the text is
    returned escaped but unhighlighted (the FAST path).
    """
    segs = split_ar_segments(arabic_clean_text, seg_maxlen)
    if not segs:
        # Ensure at least one segment when there is any text at all.
        segs = [arabic_clean_text] if arabic_clean_text else []

    # 1.0 = matn (full weight), 0.0 = isnad-like (faded in the highlight).
    deemph_mask = np.array([1.0 if not is_isnad_segment(s) else 0.0 for s in segs], dtype=np.float32)
    q_emb = get_query_emb(query_norm)
    sims = compute_segment_sims(q_emb, segs)

    hm = heatmap_html(sims, bins=16) if hl_topn > 0 else ""
    highlighted = highlight_segments_html(segs, sims, strong_topn=max(1, hl_topn), deemph_mask=deemph_mask) if hl_topn > 0 else escape_html(arabic_clean_text)

    # Best seg: the single most similar segment, shown as the evidence snippet.
    best_seg = ""
    if sims.size > 0:
        best_i = int(np.argmax(sims))
        best_seg = segs[best_i]

    best_seg_html = (
        f'<span style="background:rgba(255,230,120,.55);border:1px solid rgba(234,179,8,.40);'
        f'border-radius:12px;padding:4px 10px;display:inline;">{escape_html(best_seg)}</span>'
        if best_seg else ""
    )

    return {
        "heatmap_html": hm,
        "arabic_clean_html": highlighted if highlighted else escape_html(arabic_clean_text),
        "best_seg_html": best_seg_html or "",
    }
404
 
405
 
406
  # =========================
 
408
  # =========================
409
  app = Flask(__name__)
410
 
411
+ # (UI is optional for HF; keep it for quick testing)
412
  UI_HTML = r"""
413
  <!doctype html>
414
  <html lang="ar" dir="rtl">
 
423
  :root{
424
  --bg:#f6f7fb; --card:#ffffff; --text:#0f172a; --muted:#475569;
425
  --line:#e5e7eb; --accent:#2563eb; --shadow: 0 10px 30px rgba(15, 23, 42, .08);
426
+ --good:#16a34a; --warn:#f59e0b; --bad:#ef4444;
427
  }
428
  body{
429
  margin:0; background: linear-gradient(180deg, #ffffff, var(--bg)); color: var(--text);
 
469
  background: var(--card); border: 1px solid var(--line); border-radius:18px;
470
  padding: 16px; box-shadow: var(--shadow);
471
  }
472
+ .row{ display:grid; grid-template-columns: 240px 1fr; gap:14px; }
473
  @media (max-width: 900px){ .row{ grid-template-columns: 1fr; } }
474
 
475
  .left{ color: var(--muted); font-size:14px; direction:ltr; text-align:left; }
476
+ .score{ font-weight:900; color: var(--accent); font-size:18px; }
477
+ .badge{
478
+ display:inline-flex; align-items:center; gap:6px; border-radius:999px;
479
+ padding:5px 10px; font-weight:900; font-size:12px; margin-top:10px;
480
+ border:1px solid var(--line); background:#fff;
481
+ }
482
+ .bHigh{ color: var(--good); border-color: rgba(22,163,74,.35); background: rgba(22,163,74,.08); }
483
+ .bMed{ color: var(--warn); border-color: rgba(245,158,11,.35); background: rgba(245,158,11,.10); }
484
+ .bLow{ color: var(--bad); border-color: rgba(239,68,68,.35); background: rgba(239,68,68,.08); }
485
 
486
  .arabic{
487
  direction: rtl; text-align:right; font-family: Amiri, serif; font-size:22px;
488
  line-height: 2.05; background:#fbfcff; border:1px solid var(--line);
489
  border-radius:16px; padding:14px; white-space: pre-wrap;
490
  }
491
+ .evidence{
492
+ margin-top: 10px; border: 1px dashed rgba(37,99,235,.25);
493
+ background: rgba(37,99,235,.05); border-radius: 14px;
494
+ padding: 10px 12px; direction: rtl; text-align: right;
495
+ font-family: Amiri, serif; font-size: 18px; line-height: 1.95;
496
+ }
497
+ .evidence small{
498
+ display:block; margin-bottom:6px; font-family: Tajawal, sans-serif;
499
+ color: var(--muted); direction:ltr; text-align:left; font-size:12px;
500
+ }
501
  .english{
502
  direction:ltr; text-align:left; font-size:16px; line-height:1.8; color:#111827;
503
  background:#fbfcff; border:1px solid var(--line); border-radius:16px; padding:14px; white-space: pre-wrap;
504
  }
505
  details summary{
506
  cursor:pointer; color: var(--accent); margin-top:12px; user-select:none;
507
+ direction:ltr; text-align:left; font-weight:800;
508
  }
509
  .empty{ margin-top: 14px; color: var(--muted); font-size: 15px; direction:ltr; text-align:left; }
510
+ .tiny{ margin-top:8px; font-size:12px; color: var(--muted); direction:ltr; text-align:left; }
511
  </style>
512
  </head>
513
  <body>
 
524
  </form>
525
 
526
  <div class="controls">
527
+ <label>Highlight Top Segments:
528
+ <input id="hl" type="range" min="0" max="25" value="6"><b id="hlv">6</b>
529
+ </label>
530
+ <label>Segment Size:
531
+ <input id="seg" type="range" min="120" max="420" step="20" value="220"><b id="segv">220</b>
532
  </label>
533
+ <label>Re-rank pool:
534
+ <input id="rk" type="range" min="20" max="120" step="5" value="35"><b id="rkv">35</b>
 
 
535
  </label>
536
  </div>
537
 
 
556
  l.textContent = r.value;
557
  r.addEventListener("input", ()=> l.textContent = r.value);
558
  }
559
+ sync("hl","hlv"); sync("seg","segv"); sync("rk","rkv");
560
 
561
  $("f").addEventListener("submit", async (e)=>{
562
  e.preventDefault();
 
564
  const k = parseInt($("k").value||"10",10);
565
  const hl = parseInt($("hl").value||"6",10);
566
  const seg = parseInt($("seg").value||"220",10);
567
+ const rk = parseInt($("rk").value||"35",10);
568
 
569
  $("msg").style.display="none";
570
  $("grid").innerHTML = "";
571
  $("meta").style.display="none";
 
572
 
573
  if(!q){
574
  $("msg").textContent="اكتب نص البحث أولًا.";
 
579
  $("msg").textContent="... جاري البحث";
580
  $("msg").style.display="block";
581
 
582
+ const url = `/search?q=${encodeURIComponent(q)}&k=${encodeURIComponent(k)}&rerank_k=${encodeURIComponent(rk)}&hl_topn=${encodeURIComponent(hl)}&seg_maxlen=${encodeURIComponent(seg)}&format=html`;
 
583
  const res = await fetch(url);
584
  const js = await res.json();
 
585
 
586
  $("meta").style.display="flex";
587
  $("meta").innerHTML =
588
+ pill("Rows", js.rows) + pill("Results", js.n) + pill("Time(ms)", js.took_ms) +
589
+ pill("TopK", js.k) + pill("ReRank", js.rerank_k) + pill("Query", js.query);
590
 
591
  if(!js.ok || !js.results || js.results.length===0){
592
  $("msg").textContent="لا توجد نتائج. جرّب كلمات مختلفة.";
 
596
  $("msg").style.display="none";
597
 
598
  const cards = js.results.map(r=>{
599
+ const hm = r.heatmap_html || "";
600
+ const best = r.best_seg_html || "";
601
+ const ar = r.arabic_clean_html || esc(r.arabic_clean||"");
602
  const ar_tashkeel = esc(r.arabic||"");
603
  const en = esc(r.english||"");
604
+
605
  return `
606
  <div class="card">
607
  <div class="row">
608
  <div class="left">
609
+ <div><span class="score">${Number(r.score||0).toFixed(4)}</span> evidence</div>
610
+ <div class="tiny">FAISS: <b>${Number(r.faiss_score||0).toFixed(4)}</b></div>
611
+
612
+ <div class="badge ${esc(r.conf_class||"")}">Confidence: <b>${esc(r.conf_label||"")}</b></div>
613
+
614
+ <div class="tiny" style="margin-top:10px;">
615
+ Lexical match: <b>${Math.round((r.lex_ratio||0)*100)}%</b>
616
+ ${r.lex_terms ? `<div style="margin-top:6px;">Matched: <b>${esc(r.lex_terms)}</b></div>` : ``}
617
+ </div>
618
+
619
  <div style="margin-top:12px;">HadithID: <b>${esc(r.hadithID)}</b></div>
620
  <div>Collection: <b>${esc(r.collection)}</b></div>
621
  <div>No: <b>${esc(r.hadith_number)}</b></div>
622
  </div>
623
+
624
  <div>
625
+ ${hm}
626
+ <div class="evidence"><small>Top evidence snippet</small>${best}</div>
627
+ <div class="arabic" style="margin-top:10px;">${ar}</div>
628
+
629
  <details>
630
  <summary>Show Arabic with tashkeel</summary>
631
  <div style="height:10px;"></div>
632
  <div class="arabic">${ar_tashkeel}</div>
633
  </details>
634
+
635
  <details>
636
  <summary>Show English</summary>
637
  <div style="height:10px;"></div>
 
683
  k = DEFAULT_TOP_K
684
  k = max(1, min(k, MAX_TOP_K))
685
 
686
+ # rerank pool
687
+ rk_raw = request.args.get("rerank_k", str(DEFAULT_RERANK_K)).strip()
688
+ try:
689
+ rerank_k = int(rk_raw) if rk_raw else DEFAULT_RERANK_K
690
+ except Exception:
691
+ rerank_k = DEFAULT_RERANK_K
692
+ rerank_k = max(MIN_RERANK_K, min(rerank_k, MAX_RERANK_K))
693
+ rerank_k = max(rerank_k, k) # must be >= k
694
+
695
  # Highlight controls
696
  hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
697
  seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
 
716
  "query": "",
717
  "query_norm": "",
718
  "k": k,
719
+ "rerank_k": rerank_k,
720
  "n": 0,
721
  "rows": int(len(meta)),
722
  "took_ms": 0,
 
725
  })
726
 
727
  t0 = time.time()
728
+ q_norm = normalize_ar(q)
729
+
730
+ # 1) candidates from FAISS
731
+ cand = search_candidates_df(q_norm, rerank_k=rerank_k)
732
+
733
+ # 2) rerank by MATN evidence
734
+ reranked = rerank_rows(q_norm, cand, seg_maxlen=seg_maxlen)
735
+
736
+ # 3) take top k
737
+ reranked = reranked.head(k).copy()
738
  took_ms = int((time.time() - t0) * 1000)
739
 
740
+ # confidence uses margin between first and second
741
+ scores_final = reranked["score"].astype(float).tolist()
742
+ top2 = scores_final[1] if len(scores_final) > 1 else None
743
+
744
+ results: List[Dict[str, Any]] = []
745
+ for pos, (_, row) in enumerate(reranked.iterrows()):
746
+ arabic = str(row.get("arabic", "") or "")
747
+ english = str(row.get("english", "") or "")
748
 
 
 
 
 
749
  ar_clean = row.get("arabic_clean", "")
750
  if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
751
  ar_clean = ""
752
  ar_clean = str(ar_clean).strip()
753
  if not ar_clean:
754
+ ar_clean = normalize_ar(arabic)
 
 
 
 
 
 
 
 
 
 
 
 
755
 
756
+ score = float(row.get("score")) if pd.notna(row.get("score")) else 0.0
757
+
758
+ # confidence
759
+ if pos == 0:
760
+ conf_label, conf_class = confidence_badge(score, top2)
761
+ else:
762
+ # compare against top1 as rough indicator
763
+ conf_label, conf_class = confidence_badge(score, scores_final[0] if scores_final else None)
764
+
765
+ r: Dict[str, Any] = {
766
  "hadithID": int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None,
767
  "collection": str(row.get("collection", "") or ""),
768
  "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
769
+
770
+ "score": score, # evidence score (reranked)
771
+ "faiss_score": float(row.get("faiss_score") or 0.0),
772
+ "faiss_rank": int(row.get("faiss_rank") or 0),
773
+
774
+ "lex_ratio": float(row.get("lex_ratio") or 0.0),
775
+ "lex_terms": str(row.get("lex_terms", "") or ""),
776
+
777
+ "conf_label": conf_label,
778
+ "conf_class": conf_class,
779
+
780
  "arabic": arabic,
781
+ "arabic_clean": ar_clean,
782
  "english": english,
783
+
784
+ "best_seg": str(row.get("best_seg", "") or ""),
785
  }
786
+
787
  if want_html:
788
+ extras = build_html_extras_for_row(
789
+ query_norm=q_norm,
790
+ arabic_clean_text=ar_clean,
791
+ hl_topn=hl_topn,
792
+ seg_maxlen=seg_maxlen,
793
+ )
794
+ r.update(extras)
795
  results.append(r)
796
 
797
  return jsonify({
 
799
  "query": q,
800
  "query_norm": q_norm,
801
  "k": k,
802
+ "rerank_k": rerank_k,
803
  "n": len(results),
804
  "rows": int(len(meta)),
805
  "took_ms": took_ms,
806
  "format": "html" if want_html else "json",
807
  "hl_topn": hl_topn,
808
  "seg_maxlen": seg_maxlen,
 
809
  "results": results,
810
  })
811
 
812
 
813
  if __name__ == "__main__":
 
814
  app.run(host="127.0.0.1", port=5000, debug=True)