Alshargi committed on
Commit
90c65a7
·
verified ·
1 Parent(s): d1c4815

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +415 -518
app.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
  import os
4
  import re
5
  import time
 
6
  from functools import lru_cache
7
  from typing import List, Dict, Any, Tuple, Optional
8
 
@@ -24,18 +25,26 @@ MODEL_NAME = os.getenv("HADITH_MODEL_NAME", "intfloat/multilingual-e5-base")
24
  DEFAULT_TOP_K = 10
25
  MAX_TOP_K = 50
26
 
27
- # pull more from FAISS then rerank by evidence
28
  DEFAULT_RERANK_K = 35
29
  MAX_RERANK_K = 120
30
- MIN_RERANK_K = 20
31
 
32
- DEFAULT_HL_TOPN = 6 # 0 = disable highlighting (FAST)
33
  MAX_HL_TOPN = 25
34
 
35
- DEFAULT_SEG_MAXLEN = 220 # segment size
36
  MAX_SEG_MAXLEN = 420
37
  MIN_SEG_MAXLEN = 120
38
 
 
 
 
 
 
 
 
 
 
39
 
40
  # =========================
41
  # Arabic normalization
@@ -47,6 +56,8 @@ _AR_DIACRITICS = re.compile(r"""
47
  | [\u06D6-\u06ED]
48
  """, re.VERBOSE)
49
 
 
 
50
  def normalize_ar(text: str) -> str:
51
  if text is None:
52
  return ""
@@ -60,6 +71,14 @@ def normalize_ar(text: str) -> str:
60
  text = re.sub(r"\s+", " ", text).strip()
61
  return text
62
 
 
 
 
 
 
 
 
 
63
  def escape_html(s: str) -> str:
64
  if s is None:
65
  return ""
@@ -74,42 +93,7 @@ def escape_html(s: str) -> str:
74
 
75
 
76
  # =========================
77
- # Lexical match helpers
78
- # =========================
79
# Arabic stopwords dropped during tokenization (includes common narration formulae).
AR_STOPWORDS = {
    "من","الى","إلى","عن","على","في","و","ثم","أو","او","كما","كان","كانت","يكون","تكون",
    "هذا","هذه","ذلك","تلك","هناك","هنا","هو","هي","هم","هن","أنا","انت","أنت","نحن",
    "ما","ماذا","هل","لماذا","لم","لن","لا","إن","أن","إنه","أنه","إلا","الا","حتى","قد",
    "كل","أي","أيّ","اي","ايًّا","أيضا","أيضًا","مع","عند","بين","بعد","قبل","إذا","اذ","إذ",
    "قال","وقالت","يقول","يقولون","رسول","الله","صلى","عليه","وسلم"
}

def ar_tokens(text_norm: str) -> List[str]:
    """Tokenize already-normalized Arabic text.

    Keeps only Arabic/digit characters, drops stopwords and tokens shorter
    than two characters, and de-duplicates while preserving first-seen order.
    """
    if not text_norm:
        return []
    cleaned = re.sub(r"[^\u0600-\u06FF0-9\s]", " ", text_norm)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    kept = (
        tok for tok in cleaned.split(" ")
        if tok and tok not in AR_STOPWORDS and len(tok) >= 2
    )
    # dict.fromkeys preserves insertion order, giving an ordered de-dup.
    return list(dict.fromkeys(kept))

def lexical_match_ratio(query_norm: str, doc_norm: str) -> Tuple[float, List[str]]:
    """Return (fraction of query tokens present as whole words in doc, matched tokens)."""
    query_tokens = ar_tokens(query_norm)
    if not query_tokens:
        return 0.0, []
    # Pad with spaces so whole-word containment checks work at both ends.
    padded_doc = " " + (doc_norm or "") + " "
    matched = [tok for tok in query_tokens if f" {tok} " in padded_doc]
    return float(len(matched) / max(1, len(query_tokens))), matched
109
-
110
-
111
- # =========================
112
- # Segmenting + isnad detection
113
  # =========================
114
  def split_ar_segments(text: str, max_len: int) -> List[str]:
115
  if not text:
@@ -133,44 +117,26 @@ def split_ar_segments(text: str, max_len: int) -> List[str]:
133
  if buf:
134
  segs.append(buf)
135
 
 
136
  if len(segs) <= 1 and len(t) > max_len:
137
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
138
  return segs
139
 
140
# A segment that opens with one of these narration verbs likely begins an isnad chain.
ISNAD_START = re.compile(r"^\s*(حدثنا|أخبرنا|أنبأنا|سمعت|حدثني|أخبرني|قال|عن)\b")
# Rough detector for Arabic name patterns ("ibn X", "abu X", ...).
NAME_HEURISTIC = re.compile(r"(بن|ابن|أبو|أبي|بنت)\s+\S+")

def is_isnad_segment(seg: str) -> bool:
    """Heuristically decide whether a segment is isnad (narration chain) rather than matn.

    A segment counts as isnad when it starts with a narration formula AND
    either contains a name-like pattern, has at least two chain markers,
    or is short (< 120 chars).
    """
    if not seg:
        return False
    stripped = seg.strip()
    if not ISNAD_START.search(stripped):
        return False
    name_hits = len(NAME_HEURISTIC.findall(stripped))
    chain_count = sum(
        stripped.count(marker)
        for marker in (" عن ", " قال ", " حدثنا ", " أخبرنا ", " سمعت ")
    )
    return name_hits >= 1 or chain_count >= 2 or len(stripped) < 120
153
-
154
-
155
# =========================
# Load model + index + meta (once)
# =========================
# Import-time startup: fail fast if the search artifacts are missing.
if not os.path.exists(INDEX_PATH):
    raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
if not os.path.exists(META_PATH):
    raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")

# Load the embedding model, the FAISS index, and the hadith metadata exactly once
# at module import; all request handlers share these globals.
model = SentenceTransformer(MODEL_NAME)
index = faiss.read_index(INDEX_PATH)
meta = pd.read_parquet(META_PATH)

# Validate the metadata schema up front so requests never hit a missing column.
required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
missing = required_cols - set(meta.columns)
if missing:
    raise ValueError(f"Meta is missing required columns: {missing}")

# Optional column: default to empty strings so downstream code can rely on it existing.
if "arabic_clean" not in meta.columns:
    meta["arabic_clean"] = ""
174
 
175
 
176
  # =========================
@@ -184,45 +150,103 @@ def cached_query_emb(query_norm: str) -> bytes:
184
  def get_query_emb(query_norm: str) -> np.ndarray:
185
  return np.frombuffer(cached_query_emb(query_norm), dtype=np.float32)
186
 
187
def compute_segment_sims(query_emb: np.ndarray, segments: List[str]) -> np.ndarray:
    """Similarity of every segment against the query embedding.

    Segments are embedded with the E5 "passage: " prefix and normalized, so the
    dot product against the (normalized) query vector is a cosine similarity.
    Returns an empty float32 array when there are no segments.
    """
    if not segments:
        return np.array([], dtype=np.float32)
    prefixed = ["passage: " + seg for seg in segments]
    seg_emb = model.encode(
        prefixed,
        normalize_embeddings=True
    ).astype("float32")
    return (seg_emb @ query_emb).astype(np.float32)
195
-
196
 
197
  # =========================
198
- # Core search: candidates -> rerank by best MATN segment
199
  # =========================
200
def search_candidates_df(query_norm: str, rerank_k: int) -> pd.DataFrame:
    """Fetch the top-`rerank_k` FAISS candidates as a metadata DataFrame.

    Adds `faiss_score` and `faiss_rank` columns and drops rows whose Arabic
    text is empty/whitespace-only.
    """
    query_vec = get_query_emb(query_norm).reshape(1, -1)
    scores, indices = index.search(query_vec, rerank_k)

    candidates = meta.iloc[indices[0]].copy()
    candidates["faiss_score"] = scores[0]
    candidates["faiss_rank"] = np.arange(len(candidates), dtype=np.int32)

    candidates["arabic"] = candidates["arabic"].fillna("").astype(str)
    return candidates[candidates["arabic"].str.strip() != ""]
211
 
212
- def rerank_rows(query_norm: str, cand: pd.DataFrame, seg_maxlen: int) -> pd.DataFrame:
213
- if cand.empty:
214
- out = cand.copy()
215
- out["score"] = np.nan
216
- out["best_seg"] = ""
217
- out["lex_ratio"] = 0.0
218
- out["lex_terms"] = ""
219
- return out
 
 
 
 
 
 
 
220
 
221
- q_emb = get_query_emb(query_norm) # (d,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- # Build per-row clean text
224
- arabic_clean_list: List[str] = []
225
- for _, row in cand.iterrows():
226
  ar = str(row.get("arabic", "") or "")
227
  ar_clean = row.get("arabic_clean", "")
228
  if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
@@ -230,136 +254,114 @@ def rerank_rows(query_norm: str, cand: pd.DataFrame, seg_maxlen: int) -> pd.Data
230
  ar_clean = str(ar_clean).strip()
231
  if not ar_clean:
232
  ar_clean = normalize_ar(ar)
233
- arabic_clean_list.append(ar_clean)
234
-
235
- # Prepare segments for rerank (batch over all segments)
236
- per_segments: List[List[str]] = []
237
- per_deemph: List[np.ndarray] = []
238
- all_segments: List[str] = []
239
- seg_map: List[Tuple[int, int]] = [] # (row_i, seg_i_local)
240
 
241
- for i, txt in enumerate(arabic_clean_list):
242
- segs = split_ar_segments(txt, seg_maxlen)
243
  if not segs:
244
- segs = [txt] if txt else []
245
- per_segments.append(segs)
246
-
247
- deemph_mask = np.array([1.0 if not is_isnad_segment(s) else 0.0 for s in segs], dtype=np.float32)
248
- per_deemph.append(deemph_mask)
249
-
250
- for j, s in enumerate(segs):
251
- all_segments.append(s)
252
- seg_map.append((i, j))
253
-
254
- if not all_segments:
255
- out = cand.copy()
256
- out["score"] = out["faiss_score"].astype(float)
257
- out["best_seg"] = ""
258
- out["lex_ratio"] = 0.0
259
- out["lex_terms"] = ""
260
- return out
261
-
262
- # sims for all segments once
263
- sims_all = compute_segment_sims(q_emb, all_segments)
264
-
265
- # best segment per row (downweight isnad)
266
- n_rows = len(per_segments)
267
- best_sim = np.full((n_rows,), -1.0, dtype=np.float32)
268
- best_local = np.full((n_rows,), -1, dtype=np.int32)
269
-
270
- for k, (ri, sj) in enumerate(seg_map):
271
- sim = float(sims_all[k])
272
- deemph = float(per_deemph[ri][sj]) # 1 matn, 0 isnad-ish
273
- sim_adj = sim * (0.70 + 0.30 * deemph) # isnad gets downweighted
274
- if sim_adj > best_sim[ri]:
275
- best_sim[ri] = sim_adj
276
- best_local[ri] = sj
277
-
278
- # lexical match
279
- lex_ratios: List[float] = []
280
- lex_terms: List[str] = []
281
- for txt in arabic_clean_list:
282
- r, matched = lexical_match_ratio(query_norm, txt)
283
- lex_ratios.append(r)
284
- lex_terms.append( ".join(matched[:10]) if matched else "")
285
-
286
- out = cand.copy()
287
- out["score"] = best_sim.astype(float)
288
- out["best_seg_idx"] = best_local.astype(int)
289
-
290
- # compute best_seg text
291
- best_segs = []
292
- for i, segs in enumerate(per_segments):
293
- j = int(best_local[i])
294
- best_segs.append(segs[j] if (0 <= j < len(segs)) else (segs[0] if segs else ""))
295
- out["best_seg"] = best_segs
296
-
297
- out["lex_ratio"] = np.array(lex_ratios, dtype=np.float32)
298
- out["lex_terms"] = lex_terms
299
-
300
- # Sort by evidence score, then faiss score, then original rank
301
- out = out.sort_values(["score", "faiss_score", "faiss_rank"], ascending=[False, False, True])
302
- return out
 
 
 
 
 
303
 
304
 
305
  # =========================
306
- # UI helpers for html mode
307
  # =========================
308
def confidence_badge(score1: float, score2: Optional[float]) -> Tuple[str, str]:
    """Map the top score and its margin over the runner-up to a (label, css class).

    When no runner-up score is given, the margin is the top score itself,
    matching the original behavior.
    """
    top = float(score1) if score1 is not None else 0.0
    runner_up = float(score2) if score2 is not None else None
    margin = top if runner_up is None else top - runner_up

    if top >= 0.78 or margin >= 0.08:
        return "High", "bHigh"
    if top >= 0.68 or margin >= 0.04:
        return "Medium", "bMed"
    return "Low", "bLow"
318
-
319
def heatmap_html(sims: np.ndarray, bins: int = 16) -> str:
    """Render segment similarities as a row of small colored squares.

    Shows at most `bins` squares, sampled evenly across the segments; square
    opacity scales with the (min-max normalized) similarity. Returns "" for
    an empty similarity array.
    """
    if sims.size == 0:
        return ""
    lo = float(np.min(sims))
    hi = float(np.max(sims))
    # Avoid division by ~zero when all similarities are (nearly) equal.
    spread = (hi - lo) if (hi - lo) > 1e-6 else 1.0

    count = sims.size
    if count <= bins:
        chosen = list(range(count))
    else:
        chosen = [int(round(x)) for x in np.linspace(0, count - 1, bins)]

    cells = []
    for idx in chosen:
        weight = (float(sims[idx]) - lo) / spread
        alpha = max(0.08, min(0.10 + 0.75 * weight, 0.90))
        cells.append(
            f'<span title="seg {idx+1}" style="display:inline-block;width:10px;height:10px;'
            f'margin:0 2px;border-radius:3px;background:rgba(37,99,235,{alpha:.3f});"></span>'
        )
    return '<div style="margin:10px 0 8px;direction:ltr;text-align:left;">' + "".join(cells) + "</div>"
342
 
343
- def highlight_segments_html(segs: List[str], sims: np.ndarray, strong_topn: int, deemph_mask: np.ndarray) -> str:
344
- if not segs or sims.size == 0:
345
- return ""
346
 
347
  s_min = float(np.min(sims))
348
  s_max = float(np.max(sims))
349
  denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
350
 
351
  order = np.argsort(-sims)
352
- keep = set(order[:min(strong_topn, len(segs))])
353
 
354
  parts: List[str] = []
355
  for i, seg in enumerate(segs):
356
  w = (float(sims[i]) - s_min) / denom
357
-
358
- deemph = float(deemph_mask[i]) # 1 matn, 0 isnad-like
359
  alpha = (0.18 + 0.62 * w) if i in keep else (0.06 + 0.20 * w)
360
- alpha = alpha * (0.45 + 0.55 * deemph)
361
- alpha = max(0.04, min(alpha, 0.82))
362
- border_alpha = max(0.08, min(alpha * 0.75, 0.60))
363
 
364
  style = (
365
  f"background: rgba(255, 230, 120, {alpha:.3f});"
@@ -370,285 +372,80 @@ def highlight_segments_html(segs: List[str], sims: np.ndarray, strong_topn: int,
370
  "display: inline;"
371
  )
372
  parts.append(f'<span style="{style}">{escape_html(seg)}</span> ')
373
- return "".join(parts).strip()
374
 
375
def build_html_extras_for_row(query_norm: str, arabic_clean_text: str, hl_topn: int, seg_maxlen: int) -> Dict[str, str]:
    """Build the HTML extras (heatmap, highlighted text, best snippet) for one result row.

    With hl_topn <= 0 highlighting is disabled: no heatmap, and the Arabic text
    is returned escaped but un-highlighted.
    """
    segments = split_ar_segments(arabic_clean_text, seg_maxlen)
    if not segments:
        segments = [arabic_clean_text] if arabic_clean_text else []

    # Mask is 1.0 for matn-like segments, 0.0 for isnad-like ones.
    deemph_mask = np.array(
        [0.0 if is_isnad_segment(s) else 1.0 for s in segments],
        dtype=np.float32,
    )
    sims = compute_segment_sims(get_query_emb(query_norm), segments)

    if hl_topn > 0:
        heat = heatmap_html(sims, bins=16)
        highlighted = highlight_segments_html(
            segments, sims, strong_topn=max(1, hl_topn), deemph_mask=deemph_mask
        )
    else:
        heat = ""
        highlighted = escape_html(arabic_clean_text)

    # Segment with the highest raw similarity becomes the "top evidence" snippet.
    best_seg = segments[int(np.argmax(sims))] if sims.size > 0 else ""
    snippet_html = ""
    if best_seg:
        snippet_html = (
            f'<span style="background:rgba(255,230,120,.55);border:1px solid rgba(234,179,8,.40);'
            f'border-radius:12px;padding:4px 10px;display:inline;">{escape_html(best_seg)}</span>'
        )

    return {
        "heatmap_html": heat,
        "arabic_clean_html": highlighted if highlighted else escape_html(arabic_clean_text),
        "best_seg_html": snippet_html or "",
    }
404
 
405
 
406
  # =========================
407
- # Flask app
408
  # =========================
409
- app = Flask(__name__)
 
 
 
410
 
411
- # (UI is optional for HF; keep it for quick testing)
412
- UI_HTML = r"""
413
- <!doctype html>
414
- <html lang="ar" dir="rtl">
415
- <head>
416
- <meta charset="utf-8">
417
- <meta name="viewport" content="width=device-width,initial-scale=1">
418
- <title>البحث الدلالي في الأحاديث</title>
419
- <link rel="preconnect" href="https://fonts.googleapis.com">
420
- <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
421
- <link href="https://fonts.googleapis.com/css2?family=Amiri:wght@400;700&family=Tajawal:wght@400;700&display=swap" rel="stylesheet">
422
- <style>
423
- :root{
424
- --bg:#f6f7fb; --card:#ffffff; --text:#0f172a; --muted:#475569;
425
- --line:#e5e7eb; --accent:#2563eb; --shadow: 0 10px 30px rgba(15, 23, 42, .08);
426
- --good:#16a34a; --warn:#f59e0b; --bad:#ef4444;
427
- }
428
- body{
429
- margin:0; background: linear-gradient(180deg, #ffffff, var(--bg)); color: var(--text);
430
- font-family: Tajawal, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial;
431
- }
432
- .wrap{ max-width: 1100px; margin:0 auto; padding: 26px 16px 44px; }
433
- .header{
434
- background: var(--card); border: 1px solid var(--line); border-radius: 18px;
435
- padding: 18px; box-shadow: var(--shadow);
436
- }
437
- .title{ font-family: Amiri, serif; font-size: 36px; font-weight: 700; margin:0; line-height:1.2; }
438
- .sub{ margin: 6px 0 0; font-size: 18px; color: var(--muted); direction:ltr; text-align:left; }
439
- .credit{ margin: 6px 0 0; font-size: 14px; color: var(--muted); direction:ltr; text-align:left; }
440
-
441
- form{ display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-top: 14px; }
442
- input[type="text"]{
443
- flex: 1 1 620px; background:#fff; border:1px solid var(--line);
444
- border-radius:14px; padding:14px; font-size:18px; outline:none;
445
- }
446
- input[type="number"]{
447
- width: 92px; background:#fff; border:1px solid var(--line);
448
- border-radius:14px; padding:14px 10px; font-size:16px; direction:ltr; text-align:left; outline:none;
449
- }
450
- button{
451
- background: linear-gradient(180deg, #3b82f6, #2563eb);
452
- border: 1px solid #1d4ed8; color:#fff; border-radius:14px;
453
- padding:14px 16px; cursor:pointer; font-weight:700; font-size:16px;
454
- }
455
- button:hover{ filter: brightness(1.05); }
456
 
457
- .controls{
458
- margin-top: 12px; display:flex; gap:10px; flex-wrap:wrap; align-items:center;
459
- direction:ltr; text-align:left; color: var(--muted); font-size: 13px;
460
- }
461
- .controls label{ display:flex; gap:8px; align-items:center; }
462
- .controls input[type="range"]{ width: 200px; }
463
 
464
- .meta{ display:flex; gap:10px; flex-wrap:wrap; margin-top: 10px; color: var(--muted); font-size: 13px; direction:ltr; }
465
- .pill{ border:1px solid var(--line); background:#fff; padding:6px 10px; border-radius:999px; }
 
 
 
 
 
 
466
 
467
- .grid{ display:grid; grid-template-columns: 1fr; gap:14px; margin-top:14px; }
468
- .card{
469
- background: var(--card); border: 1px solid var(--line); border-radius:18px;
470
- padding: 16px; box-shadow: var(--shadow);
471
- }
472
- .row{ display:grid; grid-template-columns: 240px 1fr; gap:14px; }
473
- @media (max-width: 900px){ .row{ grid-template-columns: 1fr; } }
474
-
475
- .left{ color: var(--muted); font-size:14px; direction:ltr; text-align:left; }
476
- .score{ font-weight:900; color: var(--accent); font-size:18px; }
477
- .badge{
478
- display:inline-flex; align-items:center; gap:6px; border-radius:999px;
479
- padding:5px 10px; font-weight:900; font-size:12px; margin-top:10px;
480
- border:1px solid var(--line); background:#fff;
481
- }
482
- .bHigh{ color: var(--good); border-color: rgba(22,163,74,.35); background: rgba(22,163,74,.08); }
483
- .bMed{ color: var(--warn); border-color: rgba(245,158,11,.35); background: rgba(245,158,11,.10); }
484
- .bLow{ color: var(--bad); border-color: rgba(239,68,68,.35); background: rgba(239,68,68,.08); }
485
-
486
- .arabic{
487
- direction: rtl; text-align:right; font-family: Amiri, serif; font-size:22px;
488
- line-height: 2.05; background:#fbfcff; border:1px solid var(--line);
489
- border-radius:16px; padding:14px; white-space: pre-wrap;
490
- }
491
- .evidence{
492
- margin-top: 10px; border: 1px dashed rgba(37,99,235,.25);
493
- background: rgba(37,99,235,.05); border-radius: 14px;
494
- padding: 10px 12px; direction: rtl; text-align: right;
495
- font-family: Amiri, serif; font-size: 18px; line-height: 1.95;
496
- }
497
- .evidence small{
498
- display:block; margin-bottom:6px; font-family: Tajawal, sans-serif;
499
- color: var(--muted); direction:ltr; text-align:left; font-size:12px;
500
- }
501
- .english{
502
- direction:ltr; text-align:left; font-size:16px; line-height:1.8; color:#111827;
503
- background:#fbfcff; border:1px solid var(--line); border-radius:16px; padding:14px; white-space: pre-wrap;
504
- }
505
- details summary{
506
- cursor:pointer; color: var(--accent); margin-top:12px; user-select:none;
507
- direction:ltr; text-align:left; font-weight:800;
508
- }
509
- .empty{ margin-top: 14px; color: var(--muted); font-size: 15px; direction:ltr; text-align:left; }
510
- .tiny{ margin-top:8px; font-size:12px; color: var(--muted); direction:ltr; text-align:left; }
511
- </style>
512
- </head>
513
- <body>
514
- <div class="wrap">
515
- <div class="header">
516
- <h1 class="title">البحث الدلالي في الأحاديث</h1>
517
- <div class="sub">search in hadeeth (API + UI)</div>
518
- <div class="credit">Done by Dr Faisal Alshargi</div>
519
-
520
- <form id="f">
521
- <input id="q" type="text" placeholder="اكتب سؤالك هنا… مثال: الاستغفار بعد الذنب وركعتين">
522
- <input id="k" type="number" min="1" max="50" value="10">
523
- <button type="submit">Search</button>
524
- </form>
525
-
526
- <div class="controls">
527
- <label>Highlight Top Segments:
528
- <input id="hl" type="range" min="0" max="25" value="6"><b id="hlv">6</b>
529
- </label>
530
- <label>Segment Size:
531
- <input id="seg" type="range" min="120" max="420" step="20" value="220"><b id="segv">220</b>
532
- </label>
533
- <label>Re-rank pool:
534
- <input id="rk" type="range" min="20" max="120" step="5" value="35"><b id="rkv">35</b>
535
- </label>
536
- </div>
537
-
538
- <div id="meta" class="meta" style="display:none;"></div>
539
- <div id="msg" class="empty" style="display:none;"></div>
540
- </div>
541
-
542
- <div id="grid" class="grid"></div>
543
- </div>
544
-
545
- <script>
546
- const $ = (id)=>document.getElementById(id);
547
- function esc(s){
548
- return String(s??"")
549
- .replaceAll("&","&amp;").replaceAll("<","&lt;").replaceAll(">","&gt;")
550
- .replaceAll('"',"&quot;").replaceAll("'","&#39;");
551
- }
552
- function pill(k,v){ return `<div class="pill">${esc(k)}: <b>${esc(v)}</b></div>`; }
553
-
554
- function sync(rangeId, labelId){
555
- const r=$(rangeId), l=$(labelId);
556
- l.textContent = r.value;
557
- r.addEventListener("input", ()=> l.textContent = r.value);
558
- }
559
- sync("hl","hlv"); sync("seg","segv"); sync("rk","rkv");
560
-
561
- $("f").addEventListener("submit", async (e)=>{
562
- e.preventDefault();
563
- const q = $("q").value.trim();
564
- const k = parseInt($("k").value||"10",10);
565
- const hl = parseInt($("hl").value||"6",10);
566
- const seg = parseInt($("seg").value||"220",10);
567
- const rk = parseInt($("rk").value||"35",10);
568
-
569
- $("msg").style.display="none";
570
- $("grid").innerHTML = "";
571
- $("meta").style.display="none";
572
-
573
- if(!q){
574
- $("msg").textContent="اكتب نص البحث أولًا.";
575
- $("msg").style.display="block";
576
- return;
577
- }
578
 
579
- $("msg").textContent="... جاري البحث";
580
- $("msg").style.display="block";
 
 
581
 
582
- const url = `/search?q=${encodeURIComponent(q)}&k=${encodeURIComponent(k)}&rerank_k=${encodeURIComponent(rk)}&hl_topn=${encodeURIComponent(hl)}&seg_maxlen=${encodeURIComponent(seg)}&format=html`;
583
- const res = await fetch(url);
584
- const js = await res.json();
 
 
 
585
 
586
- $("meta").style.display="flex";
587
- $("meta").innerHTML =
588
- pill("Rows", js.rows) + pill("Results", js.n) + pill("Time(ms)", js.took_ms) +
589
- pill("TopK", js.k) + pill("ReRank", js.rerank_k) + pill("Query", js.query);
590
 
591
- if(!js.ok || !js.results || js.results.length===0){
592
- $("msg").textContent="لا توجد نتائج. جرّب كلمات مختلفة.";
593
- $("msg").style.display="block";
594
- return;
595
- }
596
- $("msg").style.display="none";
597
-
598
- const cards = js.results.map(r=>{
599
- const hm = r.heatmap_html || "";
600
- const best = r.best_seg_html || "";
601
- const ar = r.arabic_clean_html || esc(r.arabic_clean||"");
602
- const ar_tashkeel = esc(r.arabic||"");
603
- const en = esc(r.english||"");
604
-
605
- return `
606
- <div class="card">
607
- <div class="row">
608
- <div class="left">
609
- <div><span class="score">${Number(r.score||0).toFixed(4)}</span> evidence</div>
610
- <div class="tiny">FAISS: <b>${Number(r.faiss_score||0).toFixed(4)}</b></div>
611
-
612
- <div class="badge ${esc(r.conf_class||"")}">Confidence: <b>${esc(r.conf_label||"")}</b></div>
613
-
614
- <div class="tiny" style="margin-top:10px;">
615
- Lexical match: <b>${Math.round((r.lex_ratio||0)*100)}%</b>
616
- ${r.lex_terms ? `<div style="margin-top:6px;">Matched: <b>${esc(r.lex_terms)}</b></div>` : ``}
617
- </div>
618
-
619
- <div style="margin-top:12px;">HadithID: <b>${esc(r.hadithID)}</b></div>
620
- <div>Collection: <b>${esc(r.collection)}</b></div>
621
- <div>No: <b>${esc(r.hadith_number)}</b></div>
622
- </div>
623
-
624
- <div>
625
- ${hm}
626
- <div class="evidence"><small>Top evidence snippet</small>${best}</div>
627
- <div class="arabic" style="margin-top:10px;">${ar}</div>
628
-
629
- <details>
630
- <summary>Show Arabic with tashkeel</summary>
631
- <div style="height:10px;"></div>
632
- <div class="arabic">${ar_tashkeel}</div>
633
- </details>
634
-
635
- <details>
636
- <summary>Show English</summary>
637
- <div style="height:10px;"></div>
638
- <div class="english">${en}</div>
639
- </details>
640
- </div>
641
- </div>
642
- </div>
643
- `;
644
- }).join("");
645
-
646
- $("grid").innerHTML = cards;
647
- });
648
- </script>
649
- </body>
650
- </html>
651
- """
652
 
653
 
654
  @app.get("/")
@@ -658,24 +455,25 @@ def health():
658
  "model": MODEL_NAME,
659
  "index_ntotal": int(getattr(index, "ntotal", -1)),
660
  "rows": int(len(meta)),
 
 
 
 
 
 
661
  "endpoints": {
662
- "ui": "/ui",
663
- "search_json": "/search?q=...&k=10",
664
- "search_html": "/search?q=...&k=10&format=html",
665
  }
666
  })
667
 
668
 
669
- @app.get("/ui")
670
- def ui():
671
- return Response(UI_HTML, mimetype="text/html; charset=utf-8")
672
-
673
-
674
  @app.get("/search")
675
  def search():
676
  q = request.args.get("q", "").strip()
677
 
678
- # TopK
679
  k_raw = request.args.get("k", str(DEFAULT_TOP_K)).strip()
680
  try:
681
  k = int(k_raw) if k_raw else DEFAULT_TOP_K
@@ -690,9 +488,9 @@ def search():
690
  except Exception:
691
  rerank_k = DEFAULT_RERANK_K
692
  rerank_k = max(MIN_RERANK_K, min(rerank_k, MAX_RERANK_K))
693
- rerank_k = max(rerank_k, k) # must be >= k
694
 
695
- # Highlight controls
696
  hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
697
  seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
698
  try:
@@ -721,28 +519,26 @@ def search():
721
  "rows": int(len(meta)),
722
  "took_ms": 0,
723
  "format": "html" if want_html else "json",
 
 
724
  "results": [],
725
  })
726
 
727
  t0 = time.time()
728
- q_norm = normalize_ar(q)
729
 
730
- # 1) candidates from FAISS
731
- cand = search_candidates_df(q_norm, rerank_k=rerank_k)
 
732
 
733
- # 2) rerank by MATN evidence
734
- reranked = rerank_rows(q_norm, cand, seg_maxlen=seg_maxlen)
735
 
736
- # 3) take top k
737
- reranked = reranked.head(k).copy()
738
  took_ms = int((time.time() - t0) * 1000)
739
 
740
- # confidence uses margin between first and second
741
- scores_final = reranked["score"].astype(float).tolist()
742
- top2 = scores_final[1] if len(scores_final) > 1 else None
743
-
744
  results: List[Dict[str, Any]] = []
745
- for pos, (_, row) in enumerate(reranked.iterrows()):
 
746
  arabic = str(row.get("arabic", "") or "")
747
  english = str(row.get("english", "") or "")
748
 
@@ -753,45 +549,61 @@ def search():
753
  if not ar_clean:
754
  ar_clean = normalize_ar(arabic)
755
 
756
- score = float(row.get("score")) if pd.notna(row.get("score")) else 0.0
 
757
 
758
- # confidence
759
- if pos == 0:
760
- conf_label, conf_class = confidence_badge(score, top2)
761
- else:
762
- # compare against top1 as rough indicator
763
- conf_label, conf_class = confidence_badge(score, scores_final[0] if scores_final else None)
764
 
765
- r: Dict[str, Any] = {
766
- "hadithID": int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None,
 
 
 
 
767
  "collection": str(row.get("collection", "") or ""),
768
  "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
769
 
770
- "score": score, # evidence score (reranked)
771
- "faiss_score": float(row.get("faiss_score") or 0.0),
772
- "faiss_rank": int(row.get("faiss_rank") or 0),
773
 
774
- "lex_ratio": float(row.get("lex_ratio") or 0.0),
775
- "lex_terms": str(row.get("lex_terms", "") or ""),
 
776
 
777
  "conf_label": conf_label,
778
  "conf_class": conf_class,
779
 
 
 
 
780
  "arabic": arabic,
781
  "arabic_clean": ar_clean,
782
  "english": english,
783
 
784
- "best_seg": str(row.get("best_seg", "") or ""),
 
 
785
  }
786
 
787
- if want_html:
788
- extras = build_html_extras_for_row(
 
 
789
  query_norm=q_norm,
790
  arabic_clean_text=ar_clean,
791
  hl_topn=hl_topn,
792
  seg_maxlen=seg_maxlen,
793
  )
794
- r.update(extras)
 
 
 
 
795
  results.append(r)
796
 
797
  return jsonify({
@@ -810,5 +622,90 @@ def search():
810
  })
811
 
812
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
  if __name__ == "__main__":
 
814
  app.run(host="127.0.0.1", port=5000, debug=True)
 
3
  import os
4
  import re
5
  import time
6
+ import math
7
  from functools import lru_cache
8
  from typing import List, Dict, Any, Tuple, Optional
9
 
 
25
  DEFAULT_TOP_K = 10
26
  MAX_TOP_K = 50
27
 
 
28
  DEFAULT_RERANK_K = 35
29
  MAX_RERANK_K = 120
30
+ MIN_RERANK_K = 10
31
 
32
+ DEFAULT_HL_TOPN = 6 # for /highlight and html responses
33
  MAX_HL_TOPN = 25
34
 
35
+ DEFAULT_SEG_MAXLEN = 220
36
  MAX_SEG_MAXLEN = 420
37
  MIN_SEG_MAXLEN = 120
38
 
39
+ # Rerank speed/quality knobs (safe defaults)
40
+ RERANK_MAX_SEGS_PER_DOC = int(os.getenv("RERANK_MAX_SEGS_PER_DOC", "10")) # keep it small for speed
41
+ RERANK_SEG_MAXLEN = int(os.getenv("RERANK_SEG_MAXLEN", "240")) # segment length during rerank
42
+ RERANK_WEIGHT = float(os.getenv("RERANK_WEIGHT", "0.65")) # 0..1 combine rerank with faiss
43
+ RERANK_ENABLE = os.getenv("RERANK_ENABLE", "1").strip() != "0"
44
+
45
+ # CORS
46
+ CORS_ALLOW_ORIGIN = os.getenv("CORS_ALLOW_ORIGIN", "*") # set to your domain if you want strict
47
+
48
 
49
  # =========================
50
  # Arabic normalization
 
56
  | [\u06D6-\u06ED]
57
  """, re.VERBOSE)
58
 
59
+ _AR_PUNCT = re.compile(r"[^\w\u0600-\u06FF]+", re.UNICODE)
60
+
61
  def normalize_ar(text: str) -> str:
62
  if text is None:
63
  return ""
 
71
  text = re.sub(r"\s+", " ", text).strip()
72
  return text
73
 
74
def ar_tokens(text: str) -> List[str]:
    """Normalize Arabic text, strip punctuation, and return tokens of length >= 2."""
    cleaned = _AR_PUNCT.sub(" ", normalize_ar(text))
    # split() drops empty pieces; the length filter removes one-char tokens.
    return [tok for tok in (piece.strip() for piece in cleaned.split()) if len(tok) >= 2]
81
+
82
  def escape_html(s: str) -> str:
83
  if s is None:
84
  return ""
 
93
 
94
 
95
  # =========================
96
+ # Segmenting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # =========================
98
  def split_ar_segments(text: str, max_len: int) -> List[str]:
99
  if not text:
 
117
  if buf:
118
  segs.append(buf)
119
 
120
+ # fallback chunking
121
  if len(segs) <= 1 and len(t) > max_len:
122
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
123
  return segs
124
 
125
def pick_segs_for_rerank(segs: List[str], max_keep: int) -> List[str]:
    """Pick up to max_keep segments spread evenly across the list (for speed).

    Short lists are returned unchanged; longer lists are sampled at evenly
    spaced indices with duplicate indices (from rounding) dropped.
    """
    if len(segs) <= max_keep:
        return segs
    positions = (int(round(p)) for p in np.linspace(0, len(segs) - 1, num=max_keep))
    chosen: List[str] = []
    used = set()
    for pos in positions:
        if pos not in used:
            used.add(pos)
            chosen.append(segs[pos])
    return chosen[:max_keep]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
 
142
  # =========================
 
150
  def get_query_emb(query_norm: str) -> np.ndarray:
151
  return np.frombuffer(cached_query_emb(query_norm), dtype=np.float32)
152
 
 
 
 
 
 
 
 
 
 
153
 
154
  # =========================
155
+ # Rerank + evidence HTML (no extra encode)
156
  # =========================
157
def build_heatmap_html(segs: List[str], sims: np.ndarray, top_n: int = 6) -> str:
    """Small bar-like heatmap using segment similarity (already computed).

    The top_n strongest segments get a darker square; every square's tooltip
    carries the (escaped) segment text. Returns "" when there is nothing to draw.
    """
    if not segs or sims.size == 0:
        return ""

    count = len(segs)
    top_n = max(1, min(top_n, count))

    lo = float(np.min(sims))
    hi = float(np.max(sims))
    # Guard against a near-zero spread when similarities are all equal.
    spread = (hi - lo) if (hi - lo) > 1e-6 else 1.0

    strongest = set(np.argsort(-sims)[:top_n])

    cells = []
    for i in range(count):
        weight = (float(sims[i]) - lo) / spread  # 0..1
        # Top segments get a visibly stronger alpha than the rest.
        alpha = (0.20 + 0.60 * weight) if i in strongest else (0.08 + 0.18 * weight)
        alpha = max(0.06, min(alpha, 0.85))
        cells.append(
            f'<span title="{escape_html(segs[i])}" '
            f'style="display:inline-block;width:10px;height:10px;margin:0 3px 0 0;'
            f'border-radius:4px;background:rgba(37,99,235,{alpha:.3f});border:1px solid rgba(37,99,235,0.20);"></span>'
        )

    return (
        '<div style="margin:10px 0 0;direction:ltr;text-align:left;">'
        '<div style="font-size:12px;color:#475569;margin-bottom:6px;">Evidence heatmap</div>'
        + "".join(cells) +
        '</div>'
    )
191
+
192
def best_seg_html(segs: List[str], sims: np.ndarray) -> str:
    """Wrap the highest-similarity segment in a highlighted <span>; "" when empty."""
    if not segs or sims.size == 0:
        return ""
    top = int(np.argmax(sims))
    return f'<span style="background:rgba(255,230,120,0.55);border:1px solid rgba(234,179,8,0.35);border-radius:12px;padding:3px 8px;display:inline;">{escape_html(segs[top])}</span>'
197
+
198
def lexical_ratio(query_norm: str, doc_norm: str, max_terms: int = 10) -> Tuple[float, str]:
    """Share of query tokens found in the document, plus up to max_terms matched tokens.

    The denominator is the number of DISTINCT query tokens while hits keep
    duplicates, matching the original computation exactly.
    """
    query_tokens = ar_tokens(query_norm)
    if not query_tokens:
        return 0.0, ""
    doc_vocab = set(ar_tokens(doc_norm))
    hits = [tok for tok in query_tokens if tok in doc_vocab]
    ratio = len(hits) / max(1, len(set(query_tokens)))
    return float(ratio), " ".join(hits[:max_terms])
207
+
208
def confidence_label(score: float) -> Tuple[str, str]:
    """Map a blended cosine-like score (~[0..1]) onto a (label, css-class) pair.

    Bands: score >= 0.78 -> HIGH, score >= 0.62 -> MED, otherwise LOW.
    """
    bands = (
        (0.78, ("HIGH", "bHigh")),
        (0.62, ("MED", "bMed")),
    )
    for threshold, tag in bands:
        if score >= threshold:
            return tag
    return "LOW", "bLow"
218
+
219
def rerank_rows(
    query_norm: str,
    df: pd.DataFrame,
    k_final: int,
) -> Tuple[pd.DataFrame, Dict[int, Dict[str, Any]]]:
    """Rerank FAISS candidates by max segment-vs-query similarity.

    Pipeline:
      - Split each candidate's Arabic text into short segments.
      - Keep a bounded number of segments per doc (speed cap).
      - Encode ALL kept segments in ONE model.encode call, plus the
        (cached) query embedding — no per-doc encodes.
      - Per-doc rerank score = max segment similarity; final score is a
        blend of the FAISS score and the rerank score (RERANK_WEIGHT).

    Returns:
        (reranked_df, evidence) where reranked_df is the top `k_final`
        rows sorted by `final_score` (with `rerank_score` / `final_score`
        columns added), and evidence maps hadithID -> dict with prebuilt
        heatmap/best-segment HTML.

    NOTE(review): evidence is keyed by hadithID, so duplicate IDs in `df`
    would overwrite each other — presumably IDs are unique; verify upstream.
    """
    evidence: Dict[int, Dict[str, Any]] = {}

    # Fast path: reranking disabled or nothing to rank — pass FAISS order
    # through, but still populate evidence so callers can .get() safely.
    if (not RERANK_ENABLE) or df.empty:
        # still fill basic fields
        for _, row in df.iterrows():
            hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
            evidence[hid] = {"mode": "disabled"}
        return df.head(k_final), evidence

    # Collect segments for each candidate
    cand_rows = df.copy()

    per_doc_segs: List[List[str]] = []
    doc_hids: List[int] = []

    for _, row in cand_rows.iterrows():
        hid = int(row["hadithID"]) if pd.notna(row.get("hadithID")) else -1
        doc_hids.append(hid)

        ar = str(row.get("arabic", "") or "")
        # Prefer precomputed arabic_clean; fall back to normalizing the
        # raw text when the column value is None/NaN/blank.
        ar_clean = row.get("arabic_clean", "")
        if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
            ar_clean = ""
        ar_clean = str(ar_clean).strip()
        if not ar_clean:
            ar_clean = normalize_ar(ar)

        segs = split_ar_segments(ar_clean, max_len=RERANK_SEG_MAXLEN)
        segs = pick_segs_for_rerank(segs, max_keep=RERANK_MAX_SEGS_PER_DOC)
        if not segs:
            # degenerate doc: use a single truncated segment (or none)
            segs = [ar_clean[:RERANK_SEG_MAXLEN]] if ar_clean else []
        per_doc_segs.append(segs)

    # Flatten all docs' segments into one list, remembering each doc's
    # [start, end) slice so sims can be split back out after encoding.
    all_segs: List[str] = []
    offsets: List[Tuple[int, int]] = []
    cur = 0
    for segs in per_doc_segs:
        start = cur
        all_segs.extend(segs)
        cur += len(segs)
        offsets.append((start, cur))

    if not all_segs:
        # fallback: no rerank
        for hid in doc_hids:
            evidence[hid] = {"mode": "empty"}
        return cand_rows.head(k_final), evidence

    # Encode query once + all segments once
    q_emb = get_query_emb(query_norm)  # (d,)
    seg_emb = model.encode(
        ["passage: " + s for s in all_segs],
        normalize_embeddings=True
    ).astype("float32")  # (N, d)

    # Normalized embeddings => dot product is cosine similarity.
    sims_all = (seg_emb @ q_emb).astype(np.float32)  # (N,)

    # Compute per-doc rerank score = max(sim)
    rr_scores: List[float] = []
    for hid, (start, end), segs in zip(doc_hids, offsets, per_doc_segs):
        if start == end:
            # doc had no segments: sentinel score so it sorts last
            rr = -1.0
            sims = np.array([], dtype=np.float32)
        else:
            sims = sims_all[start:end]
            rr = float(np.max(sims))
        rr_scores.append(rr)

        # Build evidence HTML now (no extra encode)
        hm = build_heatmap_html(segs, sims, top_n=min(6, len(segs))) if sims.size else ""
        best = best_seg_html(segs, sims) if sims.size else ""
        evidence[hid] = {
            "mode": "rerank",
            "rerank_score": rr,
            "heatmap_html": hm,
            "best_seg_html": best,
            "rerank_segs": segs,  # keep for debugging (can omit if you want)
            "rerank_sims": None,  # don't ship full sims to client
        }

    cand_rows["rerank_score"] = rr_scores

    # Blend: score_final = (1-w)*faiss + w*rerank
    # Both are cosine-ish in [0,1] in your setup (normalize embeddings + IP index)
    faiss_scores = cand_rows["score"].astype(float).to_numpy()
    rr = cand_rows["rerank_score"].astype(float).to_numpy()

    # Clamp the configured weight into [0, 1] defensively.
    w = float(max(0.0, min(1.0, RERANK_WEIGHT)))
    blended = (1.0 - w) * faiss_scores + w * rr
    cand_rows["final_score"] = blended

    cand_rows = cand_rows.sort_values("final_score", ascending=False).head(k_final)
    return cand_rows, evidence
325
 
326
 
327
  # =========================
328
+ # Full highlight for ONE hadith (on click)
329
  # =========================
330
+ def full_highlight_html(
331
+ query_norm: str,
332
+ arabic_clean_text: str,
333
+ hl_topn: int,
334
+ seg_maxlen: int,
335
+ ) -> Dict[str, str]:
336
+ segs = split_ar_segments(arabic_clean_text, max_len=seg_maxlen)
337
+ if not segs:
338
+ return {
339
+ "arabic_clean_html": escape_html(arabic_clean_text),
340
+ "heatmap_html": "",
341
+ "best_seg_html": "",
342
+ }
 
 
 
 
 
 
 
 
 
 
343
 
344
+ q_emb = get_query_emb(query_norm)
345
+ seg_emb = model.encode(
346
+ ["passage: " + s for s in segs],
347
+ normalize_embeddings=True
348
+ ).astype("float32")
 
 
 
 
 
349
 
350
+ sims = (seg_emb @ q_emb).astype(np.float32)
 
 
351
 
352
  s_min = float(np.min(sims))
353
  s_max = float(np.max(sims))
354
  denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
355
 
356
  order = np.argsort(-sims)
357
+ keep = set(order[:max(0, min(hl_topn, len(segs)))])
358
 
359
  parts: List[str] = []
360
  for i, seg in enumerate(segs):
361
  w = (float(sims[i]) - s_min) / denom
 
 
362
  alpha = (0.18 + 0.62 * w) if i in keep else (0.06 + 0.20 * w)
363
+ alpha = max(0.05, min(alpha, 0.82))
364
+ border_alpha = max(0.10, min(alpha * 0.8, 0.65))
 
365
 
366
  style = (
367
  f"background: rgba(255, 230, 120, {alpha:.3f});"
 
372
  "display: inline;"
373
  )
374
  parts.append(f'<span style="{style}">{escape_html(seg)}</span> ')
 
375
 
376
+ return {
377
+ "arabic_clean_html": "".join(parts).strip() or escape_html(arabic_clean_text),
378
+ "heatmap_html": build_heatmap_html(segs, sims, top_n=min(6, len(segs))),
379
+ "best_seg_html": best_seg_html(segs, sims),
380
+ }
381
 
 
 
 
382
 
383
# =========================
# Load model + index + meta (once)
# =========================
# Fail fast at import time: the service cannot answer anything without
# the FAISS index and its row-aligned metadata parquet.
if not os.path.exists(INDEX_PATH):
    raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
if not os.path.exists(META_PATH):
    raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")

# Loaded once at module import and shared (read-only) by all handlers.
model = SentenceTransformer(MODEL_NAME)
index = faiss.read_index(INDEX_PATH)
meta = pd.read_parquet(META_PATH)

# The handlers below read these columns unconditionally; validate early.
required_cols = {"hadithID", "collection", "hadith_number", "arabic", "english"}
missing = required_cols - set(meta.columns)
if missing:
    raise ValueError(f"Meta is missing required columns: {missing}")

# arabic_clean is optional: downstream code falls back to
# normalize_ar(arabic) whenever this column is empty/NaN.
if "arabic_clean" not in meta.columns:
    meta["arabic_clean"] = ""
 
 
 
402
 
403
 
404
  # =========================
405
+ # FAISS Search
406
  # =========================
407
def semantic_search_df(query: str, top_k: int) -> pd.DataFrame:
    """Run a FAISS nearest-neighbour search and return matching meta rows.

    Returns a copy of the matched `meta` rows with an added `score` column
    (inner-product similarity against the normalized query embedding),
    sorted best-first; rows whose Arabic text is missing or blank are
    dropped. A blank query yields an empty frame with the same schema.
    """
    cleaned = str(query or "").strip()
    if not cleaned:
        # zero-row slice keeps the meta column schema intact
        return meta.iloc[0:0].copy()

    # clamp the requested k into [1, MAX_TOP_K]
    k = min(max(int(top_k), 1), MAX_TOP_K)

    query_vec = get_query_emb(normalize_ar(cleaned)).reshape(1, -1)
    scores, row_ids = index.search(query_vec, k)

    hits = meta.iloc[row_ids[0]].copy()
    hits["score"] = scores[0]
    hits = hits.sort_values("score", ascending=False)

    # ensure arabic: drop rows with missing/blank Arabic text
    hits["arabic"] = hits["arabic"].fillna("").astype(str)
    return hits[hits["arabic"].str.strip() != ""]
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
# =========================
# Flask app
# =========================
# Single module-level app object; CORS is handled manually via the
# helpers below rather than pulling in an extra dependency.
app = Flask(__name__)
432
 
433
def add_cors(resp):
    """Attach permissive CORS headers (origin from CORS_ALLOW_ORIGIN) and return resp."""
    cors_headers = {
        "Access-Control-Allow-Origin": CORS_ALLOW_ORIGIN,
        "Access-Control-Allow-Methods": "GET, OPTIONS",
        "Access-Control-Allow-Headers": "Content-Type, Authorization",
        "Access-Control-Max-Age": "86400",
    }
    for name, value in cors_headers.items():
        resp.headers[name] = value
    return resp
439
 
440
@app.after_request
def _after(resp):
    # Ensure every response (including errors and 404s) carries CORS headers.
    return add_cors(resp)
 
443
 
444
+ @app.route("/search", methods=["OPTIONS"])
445
+ @app.route("/highlight", methods=["OPTIONS"])
446
+ @app.route("/", methods=["OPTIONS"])
447
+ def options():
448
+ return add_cors(Response("", status=204))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
 
451
  @app.get("/")
 
455
  "model": MODEL_NAME,
456
  "index_ntotal": int(getattr(index, "ntotal", -1)),
457
  "rows": int(len(meta)),
458
+ "rerank": {
459
+ "enabled": bool(RERANK_ENABLE),
460
+ "weight": RERANK_WEIGHT,
461
+ "max_segs_per_doc": RERANK_MAX_SEGS_PER_DOC,
462
+ "seg_maxlen": RERANK_SEG_MAXLEN,
463
+ },
464
  "endpoints": {
465
+ "search": "/search?q=...&k=10&rerank_k=35&format=json",
466
+ "search_html": "/search?q=...&k=10&rerank_k=35&format=html",
467
+ "highlight": "/highlight?q=...&hadithID=123&format=html&hl_topn=6&seg_maxlen=220",
468
  }
469
  })
470
 
471
 
 
 
 
 
 
472
  @app.get("/search")
473
  def search():
474
  q = request.args.get("q", "").strip()
475
 
476
+ # TopK final
477
  k_raw = request.args.get("k", str(DEFAULT_TOP_K)).strip()
478
  try:
479
  k = int(k_raw) if k_raw else DEFAULT_TOP_K
 
488
  except Exception:
489
  rerank_k = DEFAULT_RERANK_K
490
  rerank_k = max(MIN_RERANK_K, min(rerank_k, MAX_RERANK_K))
491
+ rerank_k = max(rerank_k, k)
492
 
493
+ # Highlight controls (only used for format=html; for fast mode you can still send hl_topn=0)
494
  hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
495
  seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
496
  try:
 
519
  "rows": int(len(meta)),
520
  "took_ms": 0,
521
  "format": "html" if want_html else "json",
522
+ "hl_topn": hl_topn,
523
+ "seg_maxlen": seg_maxlen,
524
  "results": [],
525
  })
526
 
527
  t0 = time.time()
 
528
 
529
+ # 1) FAISS retrieve pool (rerank_k)
530
+ df_pool = semantic_search_df(q, top_k=rerank_k)
531
+ q_norm = normalize_ar(q)
532
 
533
+ # 2) rerank to final k + evidence (no extra encode)
534
+ df_final, ev = rerank_rows(query_norm=q_norm, df=df_pool, k_final=k)
535
 
 
 
536
  took_ms = int((time.time() - t0) * 1000)
537
 
538
+ # Build results
 
 
 
539
  results: List[Dict[str, Any]] = []
540
+ for _, row in df_final.iterrows():
541
+ hid = int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None
542
  arabic = str(row.get("arabic", "") or "")
543
  english = str(row.get("english", "") or "")
544
 
 
549
  if not ar_clean:
550
  ar_clean = normalize_ar(arabic)
551
 
552
+ # lexical
553
+ lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
554
 
555
+ # scores
556
+ faiss_score = float(row.get("score")) if pd.notna(row.get("score")) else 0.0
557
+ rerank_score = float(row.get("rerank_score")) if pd.notna(row.get("rerank_score")) else faiss_score
558
+ final_score = float(row.get("final_score")) if pd.notna(row.get("final_score")) else faiss_score
559
+
560
+ conf_label, conf_class = confidence_label(final_score)
561
 
562
+ e = ev.get(hid or -1, {})
563
+ heatmap_html = e.get("heatmap_html", "") if isinstance(e, dict) else ""
564
+ best_html = e.get("best_seg_html", "") if isinstance(e, dict) else ""
565
+
566
+ r = {
567
+ "hadithID": hid,
568
  "collection": str(row.get("collection", "") or ""),
569
  "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
570
 
571
+ # unified score the UI should use
572
+ "score": final_score,
 
573
 
574
+ # diagnostics
575
+ "faiss_score": faiss_score,
576
+ "rerank_score": rerank_score,
577
 
578
  "conf_label": conf_label,
579
  "conf_class": conf_class,
580
 
581
+ "lex_ratio": float(lex_r),
582
+ "lex_terms": lex_terms,
583
+
584
  "arabic": arabic,
585
  "arabic_clean": ar_clean,
586
  "english": english,
587
 
588
+ # Provide evidence html even in json (cheap: already computed in rerank)
589
+ "heatmap_html": heatmap_html,
590
+ "best_seg_html": best_html,
591
  }
592
 
593
+ # If the caller asked for html AND did not disable highlight, also compute full highlight for each result.
594
+ # This is heavier. Recommended: keep hl_topn=0 for fast mode and use /highlight on click.
595
+ if want_html and hl_topn > 0:
596
+ extras = full_highlight_html(
597
  query_norm=q_norm,
598
  arabic_clean_text=ar_clean,
599
  hl_topn=hl_topn,
600
  seg_maxlen=seg_maxlen,
601
  )
602
+ r["arabic_clean_html"] = extras["arabic_clean_html"]
603
+ # You can overwrite with full-doc ones (optional):
604
+ r["heatmap_html"] = extras["heatmap_html"] or r["heatmap_html"]
605
+ r["best_seg_html"] = extras["best_seg_html"] or r["best_seg_html"]
606
+
607
  results.append(r)
608
 
609
  return jsonify({
 
622
  })
623
 
624
 
625
+ @app.get("/highlight")
626
+ def highlight():
627
+ """
628
+ Highlight a single hadith on-demand (for fast UI).
629
+ GET /highlight?q=...&hadithID=123&format=html&hl_topn=6&seg_maxlen=220
630
+ """
631
+ q = request.args.get("q", "").strip()
632
+ hid_raw = request.args.get("hadithID", "").strip()
633
+
634
+ hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
635
+ seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
636
+ try:
637
+ hl_topn = int(hl_raw) if hl_raw else DEFAULT_HL_TOPN
638
+ except Exception:
639
+ hl_topn = DEFAULT_HL_TOPN
640
+ try:
641
+ seg_maxlen = int(seg_raw) if seg_raw else DEFAULT_SEG_MAXLEN
642
+ except Exception:
643
+ seg_maxlen = DEFAULT_SEG_MAXLEN
644
+
645
+ hl_topn = max(0, min(hl_topn, MAX_HL_TOPN))
646
+ seg_maxlen = max(MIN_SEG_MAXLEN, min(seg_maxlen, MAX_SEG_MAXLEN))
647
+
648
+ fmt = (request.args.get("format", "html") or "html").lower()
649
+ want_html = (fmt == "html")
650
+
651
+ if not q or not hid_raw:
652
+ return jsonify({"ok": False, "error": "q and hadithID are required"}), 400
653
+
654
+ try:
655
+ hid = int(hid_raw)
656
+ except Exception:
657
+ return jsonify({"ok": False, "error": "hadithID must be int"}), 400
658
+
659
+ row_df = meta[meta["hadithID"] == hid]
660
+ if row_df.empty:
661
+ return jsonify({"ok": False, "error": "hadithID not found"}), 404
662
+ row = row_df.iloc[0]
663
+
664
+ q_norm = normalize_ar(q)
665
+
666
+ arabic = str(row.get("arabic", "") or "")
667
+ english = str(row.get("english", "") or "")
668
+
669
+ ar_clean = row.get("arabic_clean", "")
670
+ if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
671
+ ar_clean = ""
672
+ ar_clean = str(ar_clean).strip()
673
+ if not ar_clean:
674
+ ar_clean = normalize_ar(arabic)
675
+
676
+ # Always produce evidence + highlight here (one doc only)
677
+ extras = full_highlight_html(
678
+ query_norm=q_norm,
679
+ arabic_clean_text=ar_clean,
680
+ hl_topn=hl_topn if want_html else 0,
681
+ seg_maxlen=seg_maxlen,
682
+ )
683
+
684
+ # lexical
685
+ lex_r, lex_terms = lexical_ratio(q_norm, ar_clean)
686
+
687
+ return jsonify({
688
+ "ok": True,
689
+ "query": q,
690
+ "query_norm": q_norm,
691
+ "hadithID": hid,
692
+ "format": "html" if want_html else "json",
693
+ "hl_topn": hl_topn,
694
+ "seg_maxlen": seg_maxlen,
695
+
696
+ "lex_ratio": float(lex_r),
697
+ "lex_terms": lex_terms,
698
+
699
+ "arabic": arabic,
700
+ "arabic_clean": ar_clean,
701
+ "english": english,
702
+
703
+ "arabic_clean_html": extras.get("arabic_clean_html", "") if want_html else "",
704
+ "heatmap_html": extras.get("heatmap_html", ""),
705
+ "best_seg_html": extras.get("best_seg_html", ""),
706
+ })
707
+
708
+
709
if __name__ == "__main__":
    # local run only — debug=True must not be used in production; the
    # deployed environment is expected to run the app via a WSGI server.
    app.run(host="127.0.0.1", port=5000, debug=True)