kdallash committed on
Commit
1e05409
·
verified ·
1 Parent(s): 3428666

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -180
app.py CHANGED
@@ -3,96 +3,63 @@ import pandas as pd
3
  import numpy as np
4
  import faiss
5
  import pickle
6
- import html
7
  from sentence_transformers import SentenceTransformer
8
 
9
- # ===============================
10
- # Load data & models (ONCE)
11
- # ===============================
12
 
 
13
  df = pd.read_csv("data/hadith.csv")
 
 
14
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
15
  print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")
16
 
 
17
  with open("data/bm25.pkl", "rb") as f:
18
  bm25 = pickle.load(f)
19
 
 
20
  anchor_index = faiss.read_index("data/faiss_anchor.index")
21
  print(f"Anchor index dimension: {anchor_index.d}")
22
 
 
23
  with open("data/anchor_dict.pkl", "rb") as f:
24
  anchor_dict = pickle.load(f)
25
 
26
  with open("data/unique_anchor_texts.pkl", "rb") as f:
27
  unique_anchor_texts = pickle.load(f)
28
 
29
- model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 
 
 
30
  model.max_seq_length = 512
31
 
32
- # Quick dimension check
33
  test_emb = model.encode("test", normalize_embeddings=True)
34
  print(f"Model embedding dimension: {test_emb.shape}")
 
 
35
  if test_emb.shape[0] != anchor_index.d:
36
  raise ValueError(
37
  f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
38
- f"anchor_index expects {anchor_index.d}D. Rebuild your anchor_index with the same model."
 
39
  )
40
 
41
- # Import retrieval
42
  from retrieval import hybrid_search_fixed
43
  from utils import preprocess_query
44
-
45
  # ===============================
46
- # Helper: safe getter & short preview (robust version)
47
  # ===============================
48
- def safe_get(row, col):
49
- """
50
- Robust getter that works if `row` is a pandas Series or a dict-like object.
51
- Returns empty string if column missing or value is NaN/empty.
52
- """
53
- try:
54
- # try Series.get first (works for pandas Series)
55
- v = None
56
- if hasattr(row, "get"):
57
- v = row.get(col, "")
58
- else:
59
- # fallback for other mapping-like rows
60
- v = row[col] if col in row else ""
61
- if v is None:
62
- return ""
63
- if isinstance(v, float) and np.isnan(v):
64
- return ""
65
- v_str = str(v).strip()
66
- return v_str
67
- except Exception:
68
- return ""
69
-
70
- def first_nonempty(row, cols):
71
- """
72
- Return the first non-empty value found in `cols` for `row`.
73
- Example: first_nonempty(row, ["isnad_text","hadith_title","main_subj"])
74
- """
75
- for c in cols:
76
- val = safe_get(row, c)
77
- if val:
78
- return val
79
- return ""
80
 
81
- def short_preview(text, length=300):
82
- text = (text or "").strip()
83
- if len(text) <= length:
84
- return text
85
- # safe break
86
- return text[:length].rsplit(' ', 1)[0] + "..."
87
-
88
- # ===============================
89
- # Search Function (updated)
90
- # ===============================
91
  def search_hadith(query, top_k):
92
- if not query or not str(query).strip():
93
  return "<p class='empty'>Please enter a search query.</p>"
94
 
95
- results_df, debug = hybrid_search_fixed(
96
  query=query,
97
  df=df,
98
  bm25=bm25,
@@ -105,174 +72,121 @@ def search_hadith(query, top_k):
105
  top_k=int(top_k)
106
  )
107
 
108
- final_scores = debug.get("final_scores") if isinstance(debug, dict) else None
109
-
110
- html_parts = []
111
- html_parts.append("<div class='results'>")
112
-
113
- for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
114
- hadith_idx = row.name if hasattr(row, "name") else ""
115
-
116
- # Try multiple candidate columns for the title; isnad_text is first priority
117
- title_candidates = ["isnad_text", "hadith_title", "title", "main_subj"]
118
- title_val = first_nonempty(row, title_candidates)
119
- if not title_val:
120
- # fallback to a short generated title from matn_text
121
- title_val = short_preview(safe_get(row, "matn_text"), 60) or f"Hadith #{hadith_idx}"
122
-
123
- # matn_text (main hadith body) - preserve Arabic text, escape HTML
124
- matn_text = safe_get(row, "matn_text")
125
- matn_text_escaped = html.escape(matn_text).replace("\n", "<br>")
126
 
127
- topic = html.escape(safe_get(row, "main_subj"))
128
- source_url = safe_get(row, "url")
129
-
130
- # score (if available)
131
- score_display = ""
132
- if isinstance(final_scores, (list, np.ndarray)):
133
- try:
134
- s = float(final_scores[int(hadith_idx)])
135
- score_display = f"{s:.3f}"
136
- except Exception:
137
- score_display = ""
138
-
139
- preview = short_preview(matn_text, length=360)
140
- preview_escaped = html.escape(preview).replace("\n", "<br>")
141
-
142
- card_html = f"""
143
  <div class="card">
144
  <div class="card-header">
145
- <div class="left">
146
- <div class="index">#{rank}</div>
147
- <div class="title">{html.escape(title_val)}</div>
148
- <div class="topic">Topic: {topic}</div>
149
- </div>
150
- <div class="right-meta">
151
- <div class="meta">ID: <span class="meta-val">{hadith_idx}</span></div>
152
- {f'<div class="meta">Score: <span class="meta-val">{score_display}</span></div>' if score_display else ''}
153
- </div>
154
  </div>
155
- <div class="text-rtl">
156
- <details>
157
- <summary class="summary">{preview_escaped}</summary>
158
- <div class="full-text">{matn_text_escaped}</div>
159
- </details>
160
  </div>
161
  <div class="footer">
162
- {"<a href='" + html.escape(source_url) + "' target='_blank' rel='noopener noreferrer'>📖 View source</a>" if source_url else ""}
 
 
163
  </div>
164
  </div>
165
  """
166
- html_parts.append(card_html)
167
 
168
- html_parts.append("</div>")
169
- return "\n".join(html_parts)
170
 
171
  # ===============================
172
- # Custom CSS (polished, professional)
173
  # ===============================
174
- custom_css = """
175
- /* Page */
176
- body { background-color: #f5f7fb; font-family: Inter, "Segoe UI", Tahoma, Arial, sans-serif; }
177
-
178
- /* Container (results) keeps RTL for hadith text blocks but LTR for metadata */
179
- .results { margin-top: 10px; }
180
 
181
- /* Card */
 
 
 
 
 
 
 
182
  .card {
183
- background: #ffffff;
184
- border-radius: 12px;
185
- padding: 18px;
186
- margin-bottom: 14px;
187
- box-shadow: 0 6px 20px rgba(18, 38, 63, 0.06);
188
- border: 1px solid rgba(20, 43, 74, 0.03);
189
  }
190
-
191
- /* Header layout */
192
  .card-header {
193
- display: flex;
194
- justify-content: space-between;
195
- align-items: flex-start;
196
- gap: 12px;
197
- margin-bottom: 10px;
198
  }
199
-
200
- .card-header .left { display:flex; flex-direction: column; gap:6px; }
201
  .index {
202
- display:inline-block;
203
- background: linear-gradient(90deg,#2563eb,#06b6d4);
204
- color: white;
205
- padding: 6px 10px;
206
- border-radius: 20px;
207
- font-weight: 600;
208
- font-size: 13px;
209
- width: fit-content;
210
  }
211
-
212
- .title {
213
- font-size: 18px;
214
- font-weight: 700;
215
- color: #0f172a;
216
- }
217
-
218
  .topic {
219
- font-size: 13px;
220
- color: #475569;
221
- margin-top: 4px;
 
 
 
 
 
 
222
  }
223
-
224
- /* Right meta */
225
- .right-meta { text-align: right; min-width: 120px; }
226
- .meta { font-size: 12px; color: #6b7280; }
227
- .meta-val { color: #0f172a; font-weight: 600; margin-left: 6px; }
228
-
229
- /* Hadith text: keep RTL and readable */
230
- .text-rtl { direction: rtl; margin-top: 6px; margin-bottom: 10px; }
231
- .text-rtl .summary { cursor: pointer; font-size: 16px; color: #111827; line-height: 1.9; }
232
- .text-rtl .full-text { margin-top: 8px; font-size: 16px; color: #111827; line-height: 1.9; }
233
-
234
- /* Footer link */
235
- .footer { margin-top: 12px; }
236
  .footer a {
237
- text-decoration: none;
238
- color: #2563eb; font-weight: 600;
 
239
  }
240
- .footer a:hover { text-decoration: underline; }
241
-
242
- /* Empty state */
243
- .empty { font-size: 15px; color: #6b7280; text-align:center; padding: 12px; }
244
-
245
- /* Responsive */
246
- @media (max-width: 700px) {
247
- .card-header { flex-direction: column; align-items: flex-start; }
248
- .right-meta { text-align: left; }
249
  }
250
  """
251
 
252
  # ===============================
253
  # Gradio Interface
254
  # ===============================
 
255
  interface = gr.Interface(
256
  fn=search_hadith,
257
  inputs=[
258
  gr.Textbox(
259
- label="🔍 Search Query",
260
- placeholder="e.g. Importance of intention in Islam / أهمية النية",
 
261
  lines=2
262
  ),
263
- gr.Slider(minimum=1, maximum=20, value=5, step=1, label="📌 Number of Results")
 
 
 
 
 
 
264
  ],
265
  outputs=gr.HTML(),
266
  title="📚 Intelligent Hadith Search Engine",
267
  description="""
268
  An AI-powered semantic search engine for Hadith retrieval.
269
- Combines lexical (BM25), semantic embeddings, and topic-aware anchors.
 
270
  """,
271
- examples=[
272
  ["أهمية النية وأثرها في قبول الأعمال", 5],
273
- ["فضل الصدقة", 5],
274
- ["حقوق الوالدين", 5]
275
- ],
276
  css=custom_css,
277
  allow_flagging="never"
278
  )
@@ -280,5 +194,6 @@ interface = gr.Interface(
280
  # ===============================
281
  # Launch
282
  # ===============================
 
283
  if __name__ == "__main__":
284
- interface.launch()
 
3
  import numpy as np
4
  import faiss
5
  import pickle
6
+
7
  from sentence_transformers import SentenceTransformer
8
 
9
+ # Load data & models ONCE
 
 
10
 
11
+ # Load dataset
12
  df = pd.read_csv("data/hadith.csv")
13
+
14
+ # Load embeddings
15
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
16
  print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")
17
 
18
+ # Load BM25
19
  with open("data/bm25.pkl", "rb") as f:
20
  bm25 = pickle.load(f)
21
 
22
+ # Load anchor FAISS index
23
  anchor_index = faiss.read_index("data/faiss_anchor.index")
24
  print(f"Anchor index dimension: {anchor_index.d}")
25
 
26
+ # Load anchor mapping
27
  with open("data/anchor_dict.pkl", "rb") as f:
28
  anchor_dict = pickle.load(f)
29
 
30
  with open("data/unique_anchor_texts.pkl", "rb") as f:
31
  unique_anchor_texts = pickle.load(f)
32
 
33
+ # Load embedding model
34
+ model = SentenceTransformer(
35
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
36
+ )
37
  model.max_seq_length = 512
38
 
39
+ # Test embedding dimension
40
  test_emb = model.encode("test", normalize_embeddings=True)
41
  print(f"Model embedding dimension: {test_emb.shape}")
42
+
43
+ # Verify dimensions match
44
  if test_emb.shape[0] != anchor_index.d:
45
  raise ValueError(
46
  f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
47
+ f"anchor_index expects {anchor_index.d}D. "
48
+ f"Rebuild your anchor_index with the same model."
49
  )
50
 
51
+ # Import retrieval logic
52
  from retrieval import hybrid_search_fixed
53
  from utils import preprocess_query
 
54
  # ===============================
55
+ # Search Function
56
  # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
 
 
 
 
 
 
 
 
58
  def search_hadith(query, top_k):
59
+ if not query.strip():
60
  return "<p class='empty'>Please enter a search query.</p>"
61
 
62
+ results_df, _ = hybrid_search_fixed(
63
  query=query,
64
  df=df,
65
  bm25=bm25,
 
72
  top_k=int(top_k)
73
  )
74
 
75
+ html = "<div class='results'>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
+ for i, row in results_df.iterrows():
78
+ html += f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  <div class="card">
80
  <div class="card-header">
81
+ <span class="index">#{i+1}</span>
82
+ <span class="topic">{row['main_subj']}</span>
 
 
 
 
 
 
 
83
  </div>
84
+ <div class="text">
85
+ {row['isnad_text']}
86
+ {row['matn_text']}
 
 
87
  </div>
88
  <div class="footer">
89
+ <a href="{row['url']}" target="_blank">
90
+ 📖 View Hadith Source
91
+ </a>
92
  </div>
93
  </div>
94
  """
 
95
 
96
+ html += "</div>"
97
+ return html
98
 
99
  # ===============================
100
+ # Custom CSS
101
  # ===============================
 
 
 
 
 
 
102
 
103
+ custom_css = """
104
+ body {
105
+ background-color: #f7f9fc;
106
+ }
107
+ .results {
108
+ direction: rtl;
109
+ font-family: "Tahoma", "Arial", sans-serif;
110
+ }
111
  .card {
112
+ background: white;
113
+ border-radius: 12px;
114
+ padding: 18px 20px;
115
+ margin-bottom: 16px;
116
+ box-shadow: 0 4px 14px rgba(0,0,0,0.06);
 
117
  }
 
 
118
  .card-header {
119
+ display: flex;
120
+ align-items: center;
121
+ gap: 10px;
122
+ margin-bottom: 12px;
 
123
  }
 
 
124
  .index {
125
+ background: #2563eb;
126
+ color: white;
127
+ padding: 4px 10px;
128
+ border-radius: 20px;
129
+ font-size: 14px;
 
 
 
130
  }
 
 
 
 
 
 
 
131
  .topic {
132
+ font-weight: bold;
133
+ color: #1f2937;
134
+ font-size: 16px;
135
+ }
136
+ .text {
137
+ line-height: 1.9;
138
+ font-size: 16px;
139
+ color: #111827;
140
+ margin-bottom: 14px;
141
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  .footer a {
143
+ text-decoration: none;
144
+ color: #2563eb;
145
+ font-weight: bold;
146
  }
147
+ .footer a:hover {
148
+ text-decoration: underline;
149
+ }
150
+ .empty {
151
+ direction: ltr;
152
+ font-size: 16px;
153
+ color: #6b7280;
154
+ text-align: center;
 
155
  }
156
  """
157
 
158
  # ===============================
159
  # Gradio Interface
160
  # ===============================
161
+
162
  interface = gr.Interface(
163
  fn=search_hadith,
164
  inputs=[
165
  gr.Textbox(
166
+ label="أدخل موضوع البحث أو السؤال",
167
+ placeholder="مثال: أهمية النية وأثرها في قبول الأعمال",
168
+
169
  lines=2
170
  ),
171
+ gr.Slider(
172
+ minimum=1,
173
+ maximum=20,
174
+ value=5,
175
+ step=1,
176
+ label="📌 Number of Results"
177
+ )
178
  ],
179
  outputs=gr.HTML(),
180
  title="📚 Intelligent Hadith Search Engine",
181
  description="""
182
  An AI-powered semantic search engine for Hadith retrieval.
183
+ The system combines lexical, semantic, and topic-aware retrieval
184
+ to return results based on meaning — not just keyword matching.
185
  """,
186
+ examples=[
187
  ["أهمية النية وأثرها في قبول الأعمال", 5],
188
+ ["فضل الصلاة", 5],
189
+ ["حقوق الجار", 5]],
 
190
  css=custom_css,
191
  allow_flagging="never"
192
  )
 
194
  # ===============================
195
  # Launch
196
  # ===============================
197
+
198
  if __name__ == "__main__":
199
+ interface.launch()