Alshargi committed on
Commit
5e4085a
·
verified ·
1 Parent(s): 42a6a19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +387 -117
app.py CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
3
  import os
4
  import re
5
  import time
 
6
  from typing import List, Dict, Any, Tuple
7
 
8
  import numpy as np
9
  import pandas as pd
10
  import faiss
11
- from flask import Flask, request, jsonify
12
- from flask_cors import CORS
13
  from sentence_transformers import SentenceTransformer
14
 
15
 
@@ -23,11 +24,12 @@ MODEL_NAME = os.getenv("HADITH_MODEL_NAME", "intfloat/multilingual-e5-base")
23
  DEFAULT_TOP_K = 10
24
  MAX_TOP_K = 50
25
 
26
- DEFAULT_HL_TOPN = 6
27
  MAX_HL_TOPN = 25
28
 
29
- DEFAULT_SEG_MAXLEN = 220
30
  MAX_SEG_MAXLEN = 420
 
31
 
32
 
33
  # =========================
@@ -67,7 +69,7 @@ def escape_html(s: str) -> str:
67
 
68
 
69
  # =========================
70
- # Segmenting + semantic highlight
71
  # =========================
72
  def split_ar_segments(text: str, max_len: int) -> List[str]:
73
  if not text:
@@ -91,74 +93,17 @@ def split_ar_segments(text: str, max_len: int) -> List[str]:
91
  if buf:
92
  segs.append(buf)
93
 
 
94
  if len(segs) <= 1 and len(t) > max_len:
95
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
96
-
97
  return segs
98
 
99
- def semantic_highlight_segments_html(
100
- model: SentenceTransformer,
101
- query_norm: str,
102
- arabic_clean: str,
103
- top_n: int,
104
- seg_max_len: int
105
- ) -> Tuple[str, List[Dict[str, Any]]]:
106
- """
107
- Returns:
108
- - HTML string with highlighted segments
109
- - segments_debug: list of {seg, sim, strong}
110
- """
111
- segs = split_ar_segments(arabic_clean, max_len=seg_max_len)
112
- if not segs:
113
- return escape_html(arabic_clean), []
114
-
115
- q_emb = model.encode(["query: " + query_norm], normalize_embeddings=True).astype("float32")
116
- seg_emb = model.encode(["passage: " + s for s in segs], normalize_embeddings=True).astype("float32")
117
-
118
- sims = (seg_emb @ q_emb[0]).astype(np.float32)
119
- s_min = float(np.min(sims))
120
- s_max = float(np.max(sims))
121
- denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
122
-
123
- order = np.argsort(-sims)
124
- keep = set(order[:min(top_n, len(segs))])
125
-
126
- html_parts: List[str] = []
127
- dbg: List[Dict[str, Any]] = []
128
-
129
- for i, seg in enumerate(segs):
130
- w = (float(sims[i]) - s_min) / denom # 0..1
131
- strong = i in keep
132
-
133
- # Strong highlight for top segments, softer for others
134
- alpha = (0.18 + 0.62 * w) if strong else (0.06 + 0.20 * w)
135
- alpha = max(0.05, min(alpha, 0.82))
136
- border_alpha = max(0.10, min(alpha * 0.8, 0.65))
137
-
138
- style = (
139
- f"background: rgba(255, 230, 120, {alpha:.3f});"
140
- f"border: 1px solid rgba(234, 179, 8, {border_alpha:.3f});"
141
- "border-radius: 12px;"
142
- "padding: 3px 8px;"
143
- "margin: 0 4px 6px 0;"
144
- "display: inline;"
145
- )
146
- html_parts.append(f'<span style="{style}">{escape_html(seg)}</span> ')
147
- dbg.append({"seg": seg, "sim": float(sims[i]), "strong": bool(strong)})
148
-
149
- html = "".join(html_parts).strip()
150
- if not html:
151
- html = escape_html(arabic_clean)
152
-
153
- return html, dbg
154
-
155
 
156
  # =========================
157
  # Load model + index + meta (once)
158
  # =========================
159
  if not os.path.exists(INDEX_PATH):
160
  raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
161
-
162
  if not os.path.exists(META_PATH):
163
  raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")
164
 
@@ -175,16 +120,28 @@ if "arabic_clean" not in meta.columns:
175
  meta["arabic_clean"] = ""
176
 
177
 
178
- def semantic_search(query: str, top_k: int) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  q = str(query or "").strip()
180
  if not q:
181
  return meta.iloc[0:0].copy()
182
 
183
  top_k = max(1, min(int(top_k), MAX_TOP_K))
184
-
185
  q_norm = normalize_ar(q)
186
- q_emb = model.encode(["query: " + q_norm], normalize_embeddings=True).astype("float32")
187
 
 
188
  scores, idx = index.search(q_emb, top_k)
189
 
190
  res = meta.iloc[idx[0]].copy()
@@ -197,43 +154,345 @@ def semantic_search(query: str, top_k: int) -> pd.DataFrame:
197
 
198
 
199
  # =========================
200
- # Flask app (JSON API)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  # =========================
202
  app = Flask(__name__)
203
- CORS(app) # مهم عشان تقدر تناديه من أي هوست (HTML خارجي)
204
 
205
- @app.get("/health")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def health():
207
  return jsonify({
208
  "ok": True,
209
  "model": MODEL_NAME,
 
210
  "rows": int(len(meta)),
211
- "index_ntotal": int(index.ntotal),
 
 
 
 
212
  })
213
 
 
 
 
 
 
 
214
  @app.get("/search")
215
  def search():
216
  q = request.args.get("q", "").strip()
217
 
218
- # k
 
219
  try:
220
- k = int(request.args.get("k", str(DEFAULT_TOP_K)))
221
  except Exception:
222
  k = DEFAULT_TOP_K
223
- k = min(max(1, k), MAX_TOP_K)
224
 
225
- # highlight controls
 
 
226
  try:
227
- hl_topn = int(request.args.get("hl_topn", str(DEFAULT_HL_TOPN)))
228
  except Exception:
229
  hl_topn = DEFAULT_HL_TOPN
230
- hl_topn = min(max(1, hl_topn), MAX_HL_TOPN)
231
-
232
  try:
233
- seg_maxlen = int(request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)))
234
  except Exception:
235
  seg_maxlen = DEFAULT_SEG_MAXLEN
236
- seg_maxlen = min(max(120, seg_maxlen), MAX_SEG_MAXLEN)
 
 
 
 
 
237
 
238
  if not q:
239
  return jsonify({
@@ -241,64 +500,75 @@ def search():
241
  "query": "",
242
  "query_norm": "",
243
  "k": k,
244
- "hl_topn": hl_topn,
245
- "seg_maxlen": seg_maxlen,
246
  "took_ms": 0,
247
- "results": []
 
248
  })
249
 
250
  t0 = time.time()
251
- res_df = semantic_search(q, top_k=k)
252
  took_ms = int((time.time() - t0) * 1000)
253
 
254
  q_norm = normalize_ar(q)
255
 
256
- out: List[Dict[str, Any]] = []
257
- for _, row in res_df.iterrows():
258
- r = row.to_dict()
259
-
260
- arabic_text = str(r.get("arabic", "") or "")
261
- english_text = str(r.get("english", "") or "")
262
-
263
- arabic_clean_val = r.get("arabic_clean", "")
264
- if arabic_clean_val is None or (isinstance(arabic_clean_val, float) and np.isnan(arabic_clean_val)):
265
- arabic_clean_val = ""
266
- arabic_clean = str(arabic_clean_val).strip()
267
- if not arabic_clean:
268
- arabic_clean = normalize_ar(arabic_text)
269
-
270
- # هنا الهايلايت الدلالي مثل كودك
271
- arabic_highlight_html, _dbg = semantic_highlight_segments_html(
272
- model=model,
273
  query_norm=q_norm,
274
- arabic_clean=arabic_clean,
275
- top_n=hl_topn,
276
- seg_max_len=seg_maxlen,
277
  )
278
 
279
- out.append({
280
- "score": float(r.get("score", 0.0)),
281
- "hadithID": int(r.get("hadithID")),
282
- "collection": str(r.get("collection", "")),
283
- "hadith_number": int(r.get("hadith_number")),
284
- "arabic": arabic_text,
285
- "arabic_clean": arabic_clean,
286
- "arabic_highlight_html": arabic_highlight_html, # أهم شيء
287
- "english": english_text,
288
- })
 
 
 
 
 
 
289
 
290
  return jsonify({
291
  "ok": True,
292
  "query": q,
293
  "query_norm": q_norm,
294
  "k": k,
 
 
 
 
295
  "hl_topn": hl_topn,
296
  "seg_maxlen": seg_maxlen,
297
- "took_ms": took_ms,
298
- "results": out
299
  })
300
 
301
 
302
- # HF Spaces runs with gunicorn; locally:
303
  if __name__ == "__main__":
304
- app.run(host="0.0.0.0", port=7860, debug=True)
 
 
3
  import os
4
  import re
5
  import time
6
+ from functools import lru_cache
7
  from typing import List, Dict, Any, Tuple
8
 
9
  import numpy as np
10
  import pandas as pd
11
  import faiss
12
+
13
+ from flask import Flask, request, jsonify, Response
14
  from sentence_transformers import SentenceTransformer
15
 
16
 
 
24
  DEFAULT_TOP_K = 10
25
  MAX_TOP_K = 50
26
 
27
+ DEFAULT_HL_TOPN = 6 # 0 = disable highlighting (FAST)
28
  MAX_HL_TOPN = 25
29
 
30
+ DEFAULT_SEG_MAXLEN = 220 # segment size
31
  MAX_SEG_MAXLEN = 420
32
+ MIN_SEG_MAXLEN = 120
33
 
34
 
35
  # =========================
 
69
 
70
 
71
  # =========================
72
+ # Segmenting
73
  # =========================
74
  def split_ar_segments(text: str, max_len: int) -> List[str]:
75
  if not text:
 
93
  if buf:
94
  segs.append(buf)
95
 
96
+ # fallback chunking
97
  if len(segs) <= 1 and len(t) > max_len:
98
  segs = [t[i:i+max_len].strip() for i in range(0, len(t), max_len) if t[i:i+max_len].strip()]
 
99
  return segs
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # =========================
103
  # Load model + index + meta (once)
104
  # =========================
105
  if not os.path.exists(INDEX_PATH):
106
  raise FileNotFoundError(f"FAISS index not found: {INDEX_PATH}")
 
107
  if not os.path.exists(META_PATH):
108
  raise FileNotFoundError(f"Meta parquet not found: {META_PATH}")
109
 
 
120
  meta["arabic_clean"] = ""
121
 
122
 
123
+ # =========================
124
+ # Embedding helpers (cached)
125
+ # =========================
126
@lru_cache(maxsize=1024)
def cached_query_emb(query_norm: str) -> bytes:
    """Embed a normalized query and return the vector as raw float32 bytes.

    The text is prefixed with ``"query: "`` before encoding and the model is
    asked for normalized embeddings. The first (only) row is serialized with
    ``tobytes()`` so the value held by the LRU cache (up to 1024 distinct
    queries) is an immutable ``bytes`` object.
    """
    prompts = ["query: " + query_norm]
    vectors = model.encode(prompts, normalize_embeddings=True)
    first_row = vectors.astype("float32")[0]
    return first_row.tobytes()
131
+
132
def get_query_emb(query_norm: str) -> np.ndarray:
    """Return the cached query embedding as a 1-D float32 array.

    The bytes produced by ``cached_query_emb`` are reinterpreted with
    ``np.frombuffer``, so no copy of the vector data is made.
    """
    raw = cached_query_emb(query_norm)
    return np.frombuffer(raw, dtype=np.float32)
134
+
135
+
136
+ def semantic_search_df(query: str, top_k: int) -> pd.DataFrame:
137
  q = str(query or "").strip()
138
  if not q:
139
  return meta.iloc[0:0].copy()
140
 
141
  top_k = max(1, min(int(top_k), MAX_TOP_K))
 
142
  q_norm = normalize_ar(q)
 
143
 
144
+ q_emb = get_query_emb(q_norm).reshape(1, -1)
145
  scores, idx = index.search(q_emb, top_k)
146
 
147
  res = meta.iloc[idx[0]].copy()
 
154
 
155
 
156
  # =========================
157
+ # Batch semantic highlight (FAST)
158
+ # =========================
159
+ def build_highlight_html_batch(
160
+ query_norm: str,
161
+ arabic_clean_list: List[str],
162
+ hl_topn: int,
163
+ seg_maxlen: int,
164
+ ) -> Tuple[List[str], Dict[str, Any]]:
165
+ """
166
+ Return list of HTML strings (one per hadith), highlighted by segment similarity.
167
+ Uses ONE encode() call for all segments across all hadith results (fast).
168
+ """
169
+ # If disabled:
170
+ if hl_topn <= 0:
171
+ return [escape_html(t) for t in arabic_clean_list], {"mode": "disabled"}
172
+
173
+ # Split into segments per hadith
174
+ per_segments: List[List[str]] = [split_ar_segments(t, seg_maxlen) for t in arabic_clean_list]
175
+
176
+ # Flatten segments
177
+ all_segments: List[str] = []
178
+ offsets: List[Tuple[int,int]] = [] # (start, end) in flattened array
179
+ cur = 0
180
+ for segs in per_segments:
181
+ start = cur
182
+ all_segments.extend(segs)
183
+ cur += len(segs)
184
+ offsets.append((start, cur))
185
+
186
+ # Edge cases
187
+ if len(all_segments) == 0:
188
+ return [escape_html(t) for t in arabic_clean_list], {"mode": "empty"}
189
+
190
+ # Encode query once + encode all segments once
191
+ q_emb = get_query_emb(query_norm) # (d,)
192
+ seg_emb = model.encode(
193
+ ["passage: " + s for s in all_segments],
194
+ normalize_embeddings=True
195
+ ).astype("float32") # (N, d)
196
+
197
+ sims_all = (seg_emb @ q_emb).astype(np.float32) # (N,)
198
+
199
+ # Build HTML per hadith
200
+ html_out: List[str] = []
201
+ for (start, end), segs in zip(offsets, per_segments):
202
+ if start == end or len(segs) == 0:
203
+ html_out.append("")
204
+ continue
205
+
206
+ sims = sims_all[start:end]
207
+ s_min = float(np.min(sims))
208
+ s_max = float(np.max(sims))
209
+ denom = (s_max - s_min) if (s_max - s_min) > 1e-6 else 1.0
210
+
211
+ order = np.argsort(-sims)
212
+ keep = set(order[:min(hl_topn, len(segs))])
213
+
214
+ parts: List[str] = []
215
+ for i, seg in enumerate(segs):
216
+ w = (float(sims[i]) - s_min) / denom
217
+ alpha = (0.18 + 0.62 * w) if i in keep else (0.06 + 0.20 * w)
218
+ alpha = max(0.05, min(alpha, 0.82))
219
+ border_alpha = max(0.10, min(alpha * 0.8, 0.65))
220
+
221
+ style = (
222
+ f"background: rgba(255, 230, 120, {alpha:.3f});"
223
+ f"border: 1px solid rgba(234, 179, 8, {border_alpha:.3f});"
224
+ "border-radius: 12px;"
225
+ "padding: 3px 8px;"
226
+ "margin: 0 4px 6px 0;"
227
+ "display: inline;"
228
+ )
229
+ parts.append(f'<span style="{style}">{escape_html(seg)}</span> ')
230
+ html_out.append("".join(parts).strip())
231
+
232
+ return html_out, {"mode": "batch", "segments_total": len(all_segments)}
233
+
234
+
235
+ # =========================
236
+ # Flask app
237
  # =========================
238
  app = Flask(__name__)
 
239
 
240
+ UI_HTML = r"""
241
+ <!doctype html>
242
+ <html lang="ar" dir="rtl">
243
+ <head>
244
+ <meta charset="utf-8">
245
+ <meta name="viewport" content="width=device-width,initial-scale=1">
246
+ <title>البحث الدلالي في الأحاديث</title>
247
+ <link rel="preconnect" href="https://fonts.googleapis.com">
248
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
249
+ <link href="https://fonts.googleapis.com/css2?family=Amiri:wght@400;700&family=Tajawal:wght@400;700&display=swap" rel="stylesheet">
250
+ <style>
251
+ :root{
252
+ --bg:#f6f7fb; --card:#ffffff; --text:#0f172a; --muted:#475569;
253
+ --line:#e5e7eb; --accent:#2563eb; --shadow: 0 10px 30px rgba(15, 23, 42, .08);
254
+ }
255
+ body{
256
+ margin:0; background: linear-gradient(180deg, #ffffff, var(--bg)); color: var(--text);
257
+ font-family: Tajawal, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial;
258
+ }
259
+ .wrap{ max-width: 1100px; margin:0 auto; padding: 26px 16px 44px; }
260
+ .header{
261
+ background: var(--card); border: 1px solid var(--line); border-radius: 18px;
262
+ padding: 18px; box-shadow: var(--shadow);
263
+ }
264
+ .title{ font-family: Amiri, serif; font-size: 36px; font-weight: 700; margin:0; line-height:1.2; }
265
+ .sub{ margin: 6px 0 0; font-size: 18px; color: var(--muted); direction:ltr; text-align:left; }
266
+ .credit{ margin: 6px 0 0; font-size: 14px; color: var(--muted); direction:ltr; text-align:left; }
267
+
268
+ form{ display:flex; flex-wrap:wrap; gap:10px; align-items:center; margin-top: 14px; }
269
+ input[type="text"]{
270
+ flex: 1 1 620px; background:#fff; border:1px solid var(--line);
271
+ border-radius:14px; padding:14px; font-size:18px; outline:none;
272
+ }
273
+ input[type="number"]{
274
+ width: 92px; background:#fff; border:1px solid var(--line);
275
+ border-radius:14px; padding:14px 10px; font-size:16px; direction:ltr; text-align:left; outline:none;
276
+ }
277
+ button{
278
+ background: linear-gradient(180deg, #3b82f6, #2563eb);
279
+ border: 1px solid #1d4ed8; color:#fff; border-radius:14px;
280
+ padding:14px 16px; cursor:pointer; font-weight:700; font-size:16px;
281
+ }
282
+ button:hover{ filter: brightness(1.05); }
283
+
284
+ .controls{
285
+ margin-top: 12px; display:flex; gap:10px; flex-wrap:wrap; align-items:center;
286
+ direction:ltr; text-align:left; color: var(--muted); font-size: 13px;
287
+ }
288
+ .controls label{ display:flex; gap:8px; align-items:center; }
289
+ .controls input[type="range"]{ width: 200px; }
290
+
291
+ .meta{ display:flex; gap:10px; flex-wrap:wrap; margin-top: 10px; color: var(--muted); font-size: 13px; direction:ltr; }
292
+ .pill{ border:1px solid var(--line); background:#fff; padding:6px 10px; border-radius:999px; }
293
+
294
+ .grid{ display:grid; grid-template-columns: 1fr; gap:14px; margin-top:14px; }
295
+ .card{
296
+ background: var(--card); border: 1px solid var(--line); border-radius:18px;
297
+ padding: 16px; box-shadow: var(--shadow);
298
+ }
299
+ .row{ display:grid; grid-template-columns: 210px 1fr; gap:14px; }
300
+ @media (max-width: 900px){ .row{ grid-template-columns: 1fr; } }
301
+
302
+ .left{ color: var(--muted); font-size:14px; direction:ltr; text-align:left; }
303
+ .score{ font-weight:800; color: var(--accent); font-size:16px; }
304
+
305
+ .arabic{
306
+ direction: rtl; text-align:right; font-family: Amiri, serif; font-size:22px;
307
+ line-height: 2.05; background:#fbfcff; border:1px solid var(--line);
308
+ border-radius:16px; padding:14px; white-space: pre-wrap;
309
+ }
310
+ .english{
311
+ direction:ltr; text-align:left; font-size:16px; line-height:1.8; color:#111827;
312
+ background:#fbfcff; border:1px solid var(--line); border-radius:16px; padding:14px; white-space: pre-wrap;
313
+ }
314
+ details summary{
315
+ cursor:pointer; color: var(--accent); margin-top:12px; user-select:none;
316
+ direction:ltr; text-align:left; font-weight:700;
317
+ }
318
+ .empty{ margin-top: 14px; color: var(--muted); font-size: 15px; direction:ltr; text-align:left; }
319
+ </style>
320
+ </head>
321
+ <body>
322
+ <div class="wrap">
323
+ <div class="header">
324
+ <h1 class="title">البحث الدلالي في الأحاديث</h1>
325
+ <div class="sub">search in hadeeth (API + UI)</div>
326
+ <div class="credit">Done by Dr Faisal Alshargi</div>
327
+
328
+ <form id="f">
329
+ <input id="q" type="text" placeholder="اكتب سؤالك هنا… مثال: الاستغفار بعد الذنب وركعتين">
330
+ <input id="k" type="number" min="1" max="50" value="10">
331
+ <button type="submit">Search</button>
332
+ </form>
333
+
334
+ <div class="controls">
335
+ <label>
336
+ Highlight Top Segments:
337
+ <input id="hl" type="range" min="0" max="25" value="6">
338
+ <b id="hlv">6</b>
339
+ </label>
340
+ <label>
341
+ Segment Size:
342
+ <input id="seg" type="range" min="120" max="420" step="20" value="220">
343
+ <b id="segv">220</b>
344
+ </label>
345
+ </div>
346
+
347
+ <div id="meta" class="meta" style="display:none;"></div>
348
+ <div id="msg" class="empty" style="display:none;"></div>
349
+ </div>
350
+
351
+ <div id="grid" class="grid"></div>
352
+ </div>
353
+
354
+ <script>
355
+ const $ = (id)=>document.getElementById(id);
356
+ function esc(s){
357
+ return String(s??"")
358
+ .replaceAll("&","&amp;").replaceAll("<","&lt;").replaceAll(">","&gt;")
359
+ .replaceAll('"',"&quot;").replaceAll("'","&#39;");
360
+ }
361
+ function pill(k,v){ return `<div class="pill">${esc(k)}: <b>${esc(v)}</b></div>`; }
362
+
363
+ function sync(rangeId, labelId){
364
+ const r=$(rangeId), l=$(labelId);
365
+ l.textContent = r.value;
366
+ r.addEventListener("input", ()=> l.textContent = r.value);
367
+ }
368
+ sync("hl","hlv"); sync("seg","segv");
369
+
370
// Run a search when the form is submitted and render one card per result.
// Network errors, non-2xx responses and invalid JSON are reported in #msg
// so the "searching" placeholder never stays on screen after a failure.
$("f").addEventListener("submit", async (e)=>{
  e.preventDefault();
  const q = $("q").value.trim();
  const k = parseInt($("k").value||"10",10);
  const hl = parseInt($("hl").value||"6",10);
  const seg = parseInt($("seg").value||"220",10);

  $("msg").style.display="none";
  $("grid").innerHTML = "";
  $("meta").style.display="none";
  $("meta").innerHTML = pill("Query", q) + pill("TopK", k) + pill("Highlight", hl) + pill("SegLen", seg);

  if(!q){
    $("msg").textContent="اكتب نص البحث أولًا.";
    $("msg").style.display="block";
    return;
  }

  $("msg").textContent="... جاري البحث";
  $("msg").style.display="block";

  const url = `/search?q=${encodeURIComponent(q)}&k=${encodeURIComponent(k)}&hl_topn=${encodeURIComponent(hl)}&seg_maxlen=${encodeURIComponent(seg)}&format=html`;
  const t0 = performance.now();

  let js;
  try{
    const res = await fetch(url);
    // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
    if(!res.ok){ throw new Error(`HTTP ${res.status}`); }
    js = await res.json();
  }catch(err){
    $("msg").textContent = `Search failed: ${err && err.message ? err.message : err}`;
    $("msg").style.display="block";
    return;
  }
  const ms = Math.round(performance.now()-t0);

  $("meta").style.display="flex";
  $("meta").innerHTML =
    pill("Rows", js.rows) + pill("Results", js.n) + pill("Time", js.took_ms ?? ms) + pill("TopK", js.k) + pill("Query", js.query);

  if(!js.ok || !js.results || js.results.length===0){
    $("msg").textContent="لا توجد نتائج. جرّب كلمات مختلفة.";
    $("msg").style.display="block";
    return;
  }
  $("msg").style.display="none";

  const cards = js.results.map(r=>{
    // arabic_clean_html is built and escaped by the server; everything else is escaped here.
    const ar = js.format==="html" ? (r.arabic_clean_html||esc(r.arabic_clean||"")) : esc(r.arabic_clean||"");
    const ar_tashkeel = esc(r.arabic||"");
    const en = esc(r.english||"");
    return `
      <div class="card">
        <div class="row">
          <div class="left">
            <div><span class="score">${Number(r.score||0).toFixed(4)}</span> score</div>
            <div style="margin-top:12px;">HadithID: <b>${esc(r.hadithID)}</b></div>
            <div>Collection: <b>${esc(r.collection)}</b></div>
            <div>No: <b>${esc(r.hadith_number)}</b></div>
          </div>
          <div>
            <div class="arabic">${ar}</div>
            <details>
              <summary>Show Arabic with tashkeel</summary>
              <div style="height:10px;"></div>
              <div class="arabic">${ar_tashkeel}</div>
            </details>
            <details>
              <summary>Show English</summary>
              <div style="height:10px;"></div>
              <div class="english">${en}</div>
            </details>
          </div>
        </div>
      </div>
    `;
  }).join("");

  $("grid").innerHTML = cards;
});
441
+ </script>
442
+ </body>
443
+ </html>
444
+ """
445
+
446
+
447
@app.get("/")
def health():
    """Return a JSON status document for the service.

    The payload reports the model name, the index's ``ntotal`` (``-1`` when
    the index object has no such attribute), the number of metadata rows, and
    the URLs of the UI and search endpoints.
    """
    endpoints = {
        "ui": "/ui",
        "search_json": "/search?q=...&k=10",
        "search_html": "/search?q=...&k=10&format=html",
    }
    payload = {
        "ok": True,
        "model": MODEL_NAME,
        "index_ntotal": int(getattr(index, "ntotal", -1)),
        "rows": int(len(meta)),
        "endpoints": endpoints,
    }
    return jsonify(payload)
460
 
461
+
462
@app.get("/ui")
def ui():
    """Serve the embedded single-page search UI as HTML.

    ``mimetype`` is passed without a charset parameter: Werkzeug appends
    ``; charset=utf-8`` itself for ``text/*`` mimetypes, so spelling the
    charset out here would duplicate it in the Content-Type header.
    """
    return Response(UI_HTML, mimetype="text/html")
465
+
466
+
467
  @app.get("/search")
468
  def search():
469
  q = request.args.get("q", "").strip()
470
 
471
+ # TopK
472
+ k_raw = request.args.get("k", str(DEFAULT_TOP_K)).strip()
473
  try:
474
+ k = int(k_raw) if k_raw else DEFAULT_TOP_K
475
  except Exception:
476
  k = DEFAULT_TOP_K
477
+ k = max(1, min(k, MAX_TOP_K))
478
 
479
+ # Highlight controls
480
+ hl_raw = request.args.get("hl_topn", str(DEFAULT_HL_TOPN)).strip()
481
+ seg_raw = request.args.get("seg_maxlen", str(DEFAULT_SEG_MAXLEN)).strip()
482
  try:
483
+ hl_topn = int(hl_raw) if hl_raw else DEFAULT_HL_TOPN
484
  except Exception:
485
  hl_topn = DEFAULT_HL_TOPN
 
 
486
  try:
487
+ seg_maxlen = int(seg_raw) if seg_raw else DEFAULT_SEG_MAXLEN
488
  except Exception:
489
  seg_maxlen = DEFAULT_SEG_MAXLEN
490
+
491
+ hl_topn = max(0, min(hl_topn, MAX_HL_TOPN))
492
+ seg_maxlen = max(MIN_SEG_MAXLEN, min(seg_maxlen, MAX_SEG_MAXLEN))
493
+
494
+ fmt = (request.args.get("format", "json") or "json").lower()
495
+ want_html = (fmt == "html")
496
 
497
  if not q:
498
  return jsonify({
 
500
  "query": "",
501
  "query_norm": "",
502
  "k": k,
503
+ "n": 0,
504
+ "rows": int(len(meta)),
505
  "took_ms": 0,
506
+ "format": "html" if want_html else "json",
507
+ "results": [],
508
  })
509
 
510
  t0 = time.time()
511
+ df = semantic_search_df(q, top_k=k)
512
  took_ms = int((time.time() - t0) * 1000)
513
 
514
  q_norm = normalize_ar(q)
515
 
516
+ # Build clean arabic list (fallback derive if missing)
517
+ arabic_list: List[str] = []
518
+ for _, row in df.iterrows():
519
+ ar = str(row.get("arabic", "") or "")
520
+ ar_clean = row.get("arabic_clean", "")
521
+ if ar_clean is None or (isinstance(ar_clean, float) and np.isnan(ar_clean)):
522
+ ar_clean = ""
523
+ ar_clean = str(ar_clean).strip()
524
+ if not ar_clean:
525
+ ar_clean = normalize_ar(ar)
526
+ arabic_list.append(ar_clean)
527
+
528
+ # Highlight (batch)
529
+ ar_html_list: List[str] = ["" for _ in arabic_list]
530
+ dbg: Dict[str, Any] = {}
531
+ if want_html:
532
+ ar_html_list, dbg = build_highlight_html_batch(
533
  query_norm=q_norm,
534
+ arabic_clean_list=arabic_list,
535
+ hl_topn=hl_topn,
536
+ seg_maxlen=seg_maxlen,
537
  )
538
 
539
+ results: List[Dict[str, Any]] = []
540
+ for i, (_, row) in enumerate(df.iterrows()):
541
+ arabic = str(row.get("arabic", "") or "")
542
+ english = str(row.get("english", "") or "")
543
+ r = {
544
+ "hadithID": int(row.get("hadithID")) if pd.notna(row.get("hadithID")) else None,
545
+ "collection": str(row.get("collection", "") or ""),
546
+ "hadith_number": int(row.get("hadith_number")) if pd.notna(row.get("hadith_number")) else None,
547
+ "score": float(row.get("score")) if pd.notna(row.get("score")) else 0.0,
548
+ "arabic": arabic,
549
+ "arabic_clean": arabic_list[i],
550
+ "english": english,
551
+ }
552
+ if want_html:
553
+ r["arabic_clean_html"] = ar_html_list[i] if ar_html_list[i] else escape_html(arabic_list[i])
554
+ results.append(r)
555
 
556
  return jsonify({
557
  "ok": True,
558
  "query": q,
559
  "query_norm": q_norm,
560
  "k": k,
561
+ "n": len(results),
562
+ "rows": int(len(meta)),
563
+ "took_ms": took_ms,
564
+ "format": "html" if want_html else "json",
565
  "hl_topn": hl_topn,
566
  "seg_maxlen": seg_maxlen,
567
+ "debug": dbg if want_html else {},
568
+ "results": results,
569
  })
570
 
571
 
 
572
if __name__ == "__main__":
    # Development entry point for local runs only: loopback interface,
    # port 5000, Flask debug mode on.
    local_server_options = {"host": "127.0.0.1", "port": 5000, "debug": True}
    app.run(**local_server_options)