kdallash committed on
Commit
61df2a9
·
verified ·
1 Parent(s): affbb12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -95
app.py CHANGED
@@ -3,63 +3,96 @@ import pandas as pd
3
  import numpy as np
4
  import faiss
5
  import pickle
6
-
7
  from sentence_transformers import SentenceTransformer
8
 
9
- # Load data & models ONCE
 
 
10
 
11
- # Load dataset
12
  df = pd.read_csv("data/hadith.csv")
13
-
14
- # Load embeddings
15
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
16
  print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")
17
 
18
- # Load BM25
19
  with open("data/bm25.pkl", "rb") as f:
20
  bm25 = pickle.load(f)
21
 
22
- # Load anchor FAISS index
23
  anchor_index = faiss.read_index("data/faiss_anchor.index")
24
  print(f"Anchor index dimension: {anchor_index.d}")
25
 
26
- # Load anchor mapping
27
  with open("data/anchor_dict.pkl", "rb") as f:
28
  anchor_dict = pickle.load(f)
29
 
30
  with open("data/unique_anchor_texts.pkl", "rb") as f:
31
  unique_anchor_texts = pickle.load(f)
32
 
33
- # Load embedding model
34
- model = SentenceTransformer(
35
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
36
- )
37
  model.max_seq_length = 512
38
 
39
- # Test embedding dimension
40
  test_emb = model.encode("test", normalize_embeddings=True)
41
  print(f"Model embedding dimension: {test_emb.shape}")
42
-
43
- # Verify dimensions match
44
  if test_emb.shape[0] != anchor_index.d:
45
  raise ValueError(
46
  f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
47
- f"anchor_index expects {anchor_index.d}D. "
48
- f"Rebuild your anchor_index with the same model."
49
  )
50
 
51
- # Import retrieval logic
52
  from retrieval import hybrid_search_fixed
53
  from utils import preprocess_query
 
54
  # ===============================
55
- # Search Function
56
  # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
 
 
 
 
 
 
 
 
58
  def search_hadith(query, top_k):
59
- if not query.strip():
60
  return "<p class='empty'>Please enter a search query.</p>"
61
 
62
- results_df, _ = hybrid_search_fixed(
63
  query=query,
64
  df=df,
65
  bm25=bm25,
@@ -72,121 +105,174 @@ def search_hadith(query, top_k):
72
  top_k=int(top_k)
73
  )
74
 
75
- html = "<div class='results'>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- for i, row in results_df.iterrows():
78
- html += f"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  <div class="card">
80
  <div class="card-header">
81
- <span class="index">#{i+1}</span>
82
- <span class="topic">{row['main_subj']}</span>
 
 
 
 
 
 
 
83
  </div>
84
- <div class="text">
85
- {row['isnad_text']}
86
- {row['matn_text']}
 
 
87
  </div>
88
  <div class="footer">
89
- <a href="{row['url']}" target="_blank">
90
- ๐Ÿ“– View Hadith Source
91
- </a>
92
  </div>
93
  </div>
94
  """
 
95
 
96
- html += "</div>"
97
- return html
98
 
99
  # ===============================
100
- # Custom CSS
101
  # ===============================
102
-
103
  custom_css = """
104
- body {
105
- background-color: #f7f9fc;
106
- }
107
- .results {
108
- direction: rtl;
109
- font-family: "Tahoma", "Arial", sans-serif;
110
- }
111
  .card {
112
- background: white;
113
- border-radius: 12px;
114
- padding: 18px 20px;
115
- margin-bottom: 16px;
116
- box-shadow: 0 4px 14px rgba(0,0,0,0.06);
 
117
  }
 
 
118
  .card-header {
119
- display: flex;
120
- align-items: center;
121
- gap: 10px;
122
- margin-bottom: 12px;
 
123
  }
 
 
124
  .index {
125
- background: #2563eb;
126
- color: white;
127
- padding: 4px 10px;
128
- border-radius: 20px;
129
- font-size: 14px;
 
 
 
130
  }
131
- .topic {
132
- font-weight: bold;
133
- color: #1f2937;
134
- font-size: 16px;
 
135
  }
136
- .text {
137
- line-height: 1.9;
138
- font-size: 16px;
139
- color: #111827;
140
- margin-bottom: 14px;
141
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  .footer a {
143
- text-decoration: none;
144
- color: #2563eb;
145
- font-weight: bold;
146
- }
147
- .footer a:hover {
148
- text-decoration: underline;
149
  }
150
- .empty {
151
- direction: ltr;
152
- font-size: 16px;
153
- color: #6b7280;
154
- text-align: center;
 
 
 
 
155
  }
156
  """
157
 
158
  # ===============================
159
  # Gradio Interface
160
  # ===============================
161
-
162
  interface = gr.Interface(
163
  fn=search_hadith,
164
  inputs=[
165
  gr.Textbox(
166
- label="ุฃุฏุฎู„ ู…ูˆุถูˆุน ุงู„ุจุญุซ ุฃูˆ ุงู„ุณุคุงู„",
167
- placeholder="ู…ุซุงู„: ุฃู‡ู…ูŠุฉ ุงู„ู†ูŠุฉ ูˆุฃุซุฑู‡ุง ููŠ ู‚ุจูˆู„ ุงู„ุฃุนู…ุงู„",
168
-
169
  lines=2
170
  ),
171
- gr.Slider(
172
- minimum=1,
173
- maximum=20,
174
- value=5,
175
- step=1,
176
- label="๐Ÿ“Œ Number of Results"
177
- )
178
  ],
179
  outputs=gr.HTML(),
180
  title="๐Ÿ“š Intelligent Hadith Search Engine",
181
  description="""
182
  An AI-powered semantic search engine for Hadith retrieval.
183
- The system combines lexical, semantic, and topic-aware retrieval
184
- to return results based on meaning โ€” not just keyword matching.
185
  """,
186
- examples=[
187
  ["ุฃู‡ู…ูŠุฉ ุงู„ู†ูŠุฉ ูˆุฃุซุฑู‡ุง ููŠ ู‚ุจูˆู„ ุงู„ุฃุนู…ุงู„", 5],
188
- ["ูุถู„ ุงู„ุตู„ุงุฉ", 5],
189
- ["ุญู‚ูˆู‚ ุงู„ุฌุงุฑ", 5]],
 
190
  css=custom_css,
191
  allow_flagging="never"
192
  )
@@ -194,6 +280,5 @@ interface = gr.Interface(
194
  # ===============================
195
  # Launch
196
  # ===============================
197
-
198
  if __name__ == "__main__":
199
- interface.launch()
 
3
  import numpy as np
4
  import faiss
5
  import pickle
6
+ import html
7
  from sentence_transformers import SentenceTransformer
8
 
9
+ # ===============================
10
+ # Load data & models (ONCE)
11
+ # ===============================
12
 
 
13
  df = pd.read_csv("data/hadith.csv")
 
 
14
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
15
  print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")
16
 
 
17
  with open("data/bm25.pkl", "rb") as f:
18
  bm25 = pickle.load(f)
19
 
 
20
  anchor_index = faiss.read_index("data/faiss_anchor.index")
21
  print(f"Anchor index dimension: {anchor_index.d}")
22
 
 
23
  with open("data/anchor_dict.pkl", "rb") as f:
24
  anchor_dict = pickle.load(f)
25
 
26
  with open("data/unique_anchor_texts.pkl", "rb") as f:
27
  unique_anchor_texts = pickle.load(f)
28
 
29
+ model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
 
 
 
30
  model.max_seq_length = 512
31
 
32
+ # Quick dimension check
33
  test_emb = model.encode("test", normalize_embeddings=True)
34
  print(f"Model embedding dimension: {test_emb.shape}")
 
 
35
  if test_emb.shape[0] != anchor_index.d:
36
  raise ValueError(
37
  f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
38
+ f"anchor_index expects {anchor_index.d}D. Rebuild your anchor_index with the same model."
 
39
  )
40
 
41
+ # Import retrieval
42
  from retrieval import hybrid_search_fixed
43
  from utils import preprocess_query
44
+
45
  # ===============================
46
+ # Helper: safe getter & short preview (robust version)
47
  # ===============================
48
def safe_get(row, col):
    """Return the value stored under ``col`` in ``row`` as a stripped string.

    Works with any object exposing ``.get(key, default)`` (for example a
    pandas Series or a dict) and falls back to plain ``row[col]`` indexing
    for other containers.

    Args:
        row: Series- or mapping-like record to read from.
        col: Column name / key to look up.

    Returns:
        ``str(value).strip()``, or ``""`` when the key is missing, the lookup
        itself fails, or the value is ``None`` or a floating-point NaN.
    """
    # Only the lookup is guarded: a malformed row should degrade to an empty
    # field, while unrelated errors are no longer silently hidden.
    try:
        if hasattr(row, "get"):
            value = row.get(col, "")
        else:
            value = row[col] if col in row else ""
    except (KeyError, IndexError, TypeError):
        return ""

    if value is None:
        return ""
    # np.floating covers every numpy float width; a bare `float` check misses
    # e.g. np.float32 NaN, which would otherwise be rendered as "nan".
    if isinstance(value, (float, np.floating)) and np.isnan(value):
        return ""
    return str(value).strip()
69
+
70
def first_nonempty(row, cols):
    """Return the first non-empty value among ``cols`` for ``row``.

    Columns are probed in the given order through ``safe_get``; probing stops
    at the first column that yields a non-empty string. Returns ``""`` when
    no column does.

    Example: first_nonempty(row, ["isnad_text", "hadith_title", "main_subj"])
    """
    # Lazy generator: later columns are not read once a value is found.
    candidates = (safe_get(row, name) for name in cols)
    return next((value for value in candidates if value), "")
80
 
81
def short_preview(text, length=300):
    """Return ``text`` shortened to at most ``length`` characters plus "...".

    Args:
        text: String to shorten; ``None`` or empty is treated as "".
        length: Maximum number of characters kept before the ellipsis.

    Returns:
        The stripped text unchanged when it already fits. Otherwise the
        leading part of the text, cut back to the previous space when the
        limit falls in the middle of a word, followed by "...".
    """
    text = (text or "").strip()
    if len(text) <= length:
        return text

    head = text[:length]
    # Back up to the previous space only when the cut lands mid-word; a cut
    # that falls exactly on a word boundary keeps the last complete word.
    if not text[length].isspace() and " " in head:
        head = head.rsplit(" ", 1)[0]
    # Avoid "word ..." when the words were separated by several spaces.
    return head.rstrip() + "..."
87
+
88
+ # ===============================
89
+ # Search Function (updated)
90
+ # ===============================
91
  def search_hadith(query, top_k):
92
+ if not query or not str(query).strip():
93
  return "<p class='empty'>Please enter a search query.</p>"
94
 
95
+ results_df, debug = hybrid_search_fixed(
96
  query=query,
97
  df=df,
98
  bm25=bm25,
 
105
  top_k=int(top_k)
106
  )
107
 
108
+ final_scores = debug.get("final_scores") if isinstance(debug, dict) else None
109
+
110
+ html_parts = []
111
+ html_parts.append("<div class='results'>")
112
+
113
+ for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
114
+ hadith_idx = row.name if hasattr(row, "name") else ""
115
+
116
+ # Try multiple candidate columns for the title; isnad_text is first priority
117
+ title_candidates = ["isnad_text", "hadith_title", "title", "main_subj"]
118
+ title_val = first_nonempty(row, title_candidates)
119
+ if not title_val:
120
+ # fallback to a short generated title from matn_text
121
+ title_val = short_preview(safe_get(row, "matn_text"), 60) or f"Hadith #{hadith_idx}"
122
+
123
+ # matn_text (main hadith body) - preserve Arabic text, escape HTML
124
+ matn_text = safe_get(row, "matn_text")
125
+ matn_text_escaped = html.escape(matn_text).replace("\n", "<br>")
126
 
127
+ topic = html.escape(safe_get(row, "main_subj"))
128
+ source_url = safe_get(row, "url")
129
+
130
+ # score (if available)
131
+ score_display = ""
132
+ if isinstance(final_scores, (list, np.ndarray)):
133
+ try:
134
+ s = float(final_scores[int(hadith_idx)])
135
+ score_display = f"{s:.3f}"
136
+ except Exception:
137
+ score_display = ""
138
+
139
+ preview = short_preview(matn_text, length=360)
140
+ preview_escaped = html.escape(preview).replace("\n", "<br>")
141
+
142
+ card_html = f"""
143
  <div class="card">
144
  <div class="card-header">
145
+ <div class="left">
146
+ <div class="index">#{rank}</div>
147
+ <div class="title">{html.escape(title_val)}</div>
148
+ <div class="topic">Topic: {topic}</div>
149
+ </div>
150
+ <div class="right-meta">
151
+ <div class="meta">ID: <span class="meta-val">{hadith_idx}</span></div>
152
+ {f'<div class="meta">Score: <span class="meta-val">{score_display}</span></div>' if score_display else ''}
153
+ </div>
154
  </div>
155
+ <div class="text-rtl">
156
+ <details>
157
+ <summary class="summary">{preview_escaped}</summary>
158
+ <div class="full-text">{matn_text_escaped}</div>
159
+ </details>
160
  </div>
161
  <div class="footer">
162
+ {"<a href='" + html.escape(source_url) + "' target='_blank' rel='noopener noreferrer'>๐Ÿ“– View source</a>" if source_url else ""}
 
 
163
  </div>
164
  </div>
165
  """
166
+ html_parts.append(card_html)
167
 
168
+ html_parts.append("</div>")
169
+ return "\n".join(html_parts)
170
 
171
  # ===============================
172
+ # Custom CSS (polished, professional)
173
  # ===============================
 
174
  custom_css = """
175
+ /* Page */
176
+ body { background-color: #f5f7fb; font-family: Inter, "Segoe UI", Tahoma, Arial, sans-serif; }
177
+
178
+ /* Container (results) keeps RTL for hadith text blocks but LTR for metadata */
179
+ .results { margin-top: 10px; }
180
+
181
+ /* Card */
182
  .card {
183
+ background: #ffffff;
184
+ border-radius: 12px;
185
+ padding: 18px;
186
+ margin-bottom: 14px;
187
+ box-shadow: 0 6px 20px rgba(18, 38, 63, 0.06);
188
+ border: 1px solid rgba(20, 43, 74, 0.03);
189
  }
190
+
191
+ /* Header layout */
192
  .card-header {
193
+ display: flex;
194
+ justify-content: space-between;
195
+ align-items: flex-start;
196
+ gap: 12px;
197
+ margin-bottom: 10px;
198
  }
199
+
200
+ .card-header .left { display:flex; flex-direction: column; gap:6px; }
201
  .index {
202
+ display:inline-block;
203
+ background: linear-gradient(90deg,#2563eb,#06b6d4);
204
+ color: white;
205
+ padding: 6px 10px;
206
+ border-radius: 20px;
207
+ font-weight: 600;
208
+ font-size: 13px;
209
+ width: fit-content;
210
  }
211
+
212
+ .title {
213
+ font-size: 18px;
214
+ font-weight: 700;
215
+ color: #0f172a;
216
  }
217
+
218
+ .topic {
219
+ font-size: 13px;
220
+ color: #475569;
221
+ margin-top: 4px;
222
  }
223
+
224
+ /* Right meta */
225
+ .right-meta { text-align: right; min-width: 120px; }
226
+ .meta { font-size: 12px; color: #6b7280; }
227
+ .meta-val { color: #0f172a; font-weight: 600; margin-left: 6px; }
228
+
229
+ /* Hadith text: keep RTL and readable */
230
+ .text-rtl { direction: rtl; margin-top: 6px; margin-bottom: 10px; }
231
+ .text-rtl .summary { cursor: pointer; font-size: 16px; color: #111827; line-height: 1.9; }
232
+ .text-rtl .full-text { margin-top: 8px; font-size: 16px; color: #111827; line-height: 1.9; }
233
+
234
+ /* Footer link */
235
+ .footer { margin-top: 12px; }
236
  .footer a {
237
+ text-decoration: none;
238
+ color: #2563eb; font-weight: 600;
 
 
 
 
239
  }
240
+ .footer a:hover { text-decoration: underline; }
241
+
242
+ /* Empty state */
243
+ .empty { font-size: 15px; color: #6b7280; text-align:center; padding: 12px; }
244
+
245
+ /* Responsive */
246
+ @media (max-width: 700px) {
247
+ .card-header { flex-direction: column; align-items: flex-start; }
248
+ .right-meta { text-align: left; }
249
  }
250
  """
251
 
252
  # ===============================
253
  # Gradio Interface
254
  # ===============================
 
255
  interface = gr.Interface(
256
  fn=search_hadith,
257
  inputs=[
258
  gr.Textbox(
259
+ label="๐Ÿ” Search Query",
260
+ placeholder="e.g. Importance of intention in Islam / ุฃู‡ู…ูŠุฉ ุงู„ู†ูŠุฉ",
 
261
  lines=2
262
  ),
263
+ gr.Slider(minimum=1, maximum=20, value=5, step=1, label="๐Ÿ“Œ Number of Results")
 
 
 
 
 
 
264
  ],
265
  outputs=gr.HTML(),
266
  title="๐Ÿ“š Intelligent Hadith Search Engine",
267
  description="""
268
  An AI-powered semantic search engine for Hadith retrieval.
269
+ Combines lexical (BM25), semantic embeddings, and topic-aware anchors.
 
270
  """,
271
+ examples=[
272
  ["ุฃู‡ู…ูŠุฉ ุงู„ู†ูŠุฉ ูˆุฃุซุฑู‡ุง ููŠ ู‚ุจูˆู„ ุงู„ุฃุนู…ุงู„", 5],
273
+ ["ูุถู„ ุงู„ุตุฏู‚ุฉ", 5],
274
+ ["ุญู‚ูˆู‚ ุงู„ูˆุงู„ุฏูŠู†", 5]
275
+ ],
276
  css=custom_css,
277
  allow_flagging="never"
278
  )
 
280
  # ===============================
281
  # Launch
282
  # ===============================
 
283
  if __name__ == "__main__":
284
+ interface.launch()