Upload 3 files

- app.py +98 -0
- retrieval.py +105 -0
- utils.py +10 -0
app.py
ADDED
@@ -0,0 +1,98 @@
import gradio as gr
import pandas as pd
import numpy as np
import faiss
import pickle

from sentence_transformers import SentenceTransformer

# Load data & models ONCE

# Load dataset
df = pd.read_csv("data/hadith.csv")

# Load precomputed hadith embeddings
hadith_embeddings = np.load("data/hadith_embeddings.npy")

# Load BM25 index
with open("data/bm25.pkl", "rb") as f:
    bm25 = pickle.load(f)

# Load anchor FAISS index
anchor_index = faiss.read_index("data/faiss_anchor.index")

# Load anchor mapping (anchor text -> hadith row indices)
with open("data/anchor_dict.pkl", "rb") as f:
    anchor_dict = pickle.load(f)

with open("data/unique_anchor_texts.pkl", "rb") as f:
    unique_anchor_texts = pickle.load(f)

# Load embedding model
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Import retrieval logic and the BM25 query tokenizer
from retrieval import hybrid_search_fixed
from utils import bm25_tokenize


# -----------------------------
# Search function (UI entry)
# -----------------------------
def search_hadith(query, top_k):
    if query.strip() == "":
        return pd.DataFrame(
            columns=["الموضوع", "نص الحديث", "hadith page on Islamweb.net"]
        )

    results_df, _ = hybrid_search_fixed(
        query=query,
        df=df,
        bm25=bm25,
        preprocess_query=bm25_tokenize,  # assumed to match the tokenization used to build bm25.pkl
        model=model,
        hadith_embeddings=hadith_embeddings,
        anchor_index=anchor_index,
        anchor_dict=anchor_dict,
        unique_anchor_texts=unique_anchor_texts,
        top_k=int(top_k),
    )

    return results_df[["main_subj", "clean_text", "url"]].rename(columns={
        "main_subj": "الموضوع",          # topic
        "clean_text": "نص الحديث",       # hadith text
        "url": "hadith page on Islamweb.net",
    })


# Gradio Interface (Arabic UI: search query textbox, slider for number of results)
interface = gr.Interface(
    fn=search_hadith,
    inputs=[
        gr.Textbox(
            label="أدخل موضوع البحث أو السؤال",          # enter the search topic or question
            placeholder="مثال: أهمية النية وأثرها في قبول الأعمال"
        ),
        gr.Slider(
            minimum=1,
            maximum=20,
            value=5,
            step=1,
            label="عدد الأحاديث المعروضة"                 # number of hadiths displayed
        )
    ],
    outputs=gr.Dataframe(
        label="نتائج البحث",                              # search results
        wrap=True
    ),
    title="محرك بحث ذكي في الأحاديث النبوية",             # smart search engine for Prophetic hadiths
    description=(
        "يعتمد هذا النظام على البحث الدلالي والموضوعي "
        "لاسترجاع الأحاديث ذات الصلة بالمعنى وليس بالكلمات فقط."
        # "This system uses semantic and topical search to retrieve hadiths
        #  related by meaning, not only by keywords."
    ),
    allow_flagging="never"
)

# Launch app
if __name__ == "__main__":
    interface.launch()
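app.py expects several precomputed artifacts under data/ that are not part of this upload. The snippet below is a minimal, hypothetical sketch of how they could be produced offline; it assumes the pickled BM25 object is a rank_bm25.BM25Okapi built with bm25_tokenize from utils.py, that the anchor index is an inner-product FAISS index over normalized anchor embeddings, that the anchors correspond to the unique main_subj values, and that hadith.csv uses a default integer index. The real preparation script may differ.

# Hypothetical offline preparation script (assumptions noted above).
import pickle

import faiss
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

from utils import bm25_tokenize

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
df = pd.read_csv("data/hadith.csv")

# Dense hadith embeddings (normalized so a dot product equals cosine similarity)
hadith_embeddings = model.encode(df["clean_text"].tolist(), normalize_embeddings=True)
np.save("data/hadith_embeddings.npy", hadith_embeddings)

# Lexical BM25 index over the tokenized hadith texts
bm25 = BM25Okapi([bm25_tokenize(t) for t in df["clean_text"]])
with open("data/bm25.pkl", "wb") as f:
    pickle.dump(bm25, f)

# Anchor (topic) index: one embedding per unique subject, plus a mapping
# from each anchor text to the row indices of the hadiths it covers
unique_anchor_texts = sorted(df["main_subj"].dropna().unique().tolist())
anchor_dict = {a: df.index[df["main_subj"] == a].tolist() for a in unique_anchor_texts}
anchor_emb = model.encode(unique_anchor_texts, normalize_embeddings=True)

anchor_index = faiss.IndexFlatIP(anchor_emb.shape[1])
anchor_index.add(anchor_emb)
faiss.write_index(anchor_index, "data/faiss_anchor.index")

with open("data/anchor_dict.pkl", "wb") as f:
    pickle.dump(anchor_dict, f)
with open("data/unique_anchor_texts.pkl", "wb") as f:
    pickle.dump(unique_anchor_texts, f)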
retrieval.py
ADDED
@@ -0,0 +1,105 @@
import numpy as np


def query_anchor_scores(query, model, anchor_index, top_k=10):
    q_emb = model.encode(query, normalize_embeddings=True)
    scores, indices = anchor_index.search(q_emb.reshape(1, -1), top_k)
    return np.array(indices[0], dtype=int), np.array(scores[0], dtype=float)


def bm25_retrieve(query, bm25, preprocess_query, top_k=50):
    tokenized_query = preprocess_query(query)
    scores = bm25.get_scores(tokenized_query)
    top_idx = np.argsort(scores)[::-1][:top_k]
    return top_idx, scores[top_idx]


def compute_anchor_scores_for_hadiths(
    n_hadiths,
    anchor_indices,
    anchor_scores,
    anchor_dict,
    unique_anchor_texts
):
    anchor_score_vec = np.zeros(n_hadiths, dtype=float)

    for a_idx, a_score in zip(anchor_indices, anchor_scores):
        if 0 <= a_idx < len(unique_anchor_texts):
            anchor_text = unique_anchor_texts[a_idx]
            for h_idx in anchor_dict.get(anchor_text, []):
                anchor_score_vec[h_idx] = a_score

    return anchor_score_vec


def hybrid_search_fixed(
    query,
    df,
    bm25,
    preprocess_query,
    model,
    hadith_embeddings,
    anchor_index,
    anchor_dict,
    unique_anchor_texts,
    top_k=5,
    top_bm25=50,
    top_anchors=10,
    alpha_anchor=0.40,
    alpha_semantic=0.35,
    alpha_bm25=0.25,
):
    n = len(df)
    eps = 1e-8

    # --- BM25 (lexical) scores over the top_bm25 candidates ---
    bm25_idx, bm25_scores = bm25_retrieve(
        query, bm25, preprocess_query, top_k=top_bm25
    )

    bm25_vec = np.zeros(n)
    if bm25_scores.size > 0:
        bm25_scores = bm25_scores / (bm25_scores.max() + eps)
        bm25_vec[bm25_idx] = bm25_scores

    # --- Anchor (topic) scores, propagated to hadiths via anchor_dict ---
    anchor_idx, anchor_scores = query_anchor_scores(
        query, model, anchor_index, top_k=top_anchors
    )

    anchor_vec = compute_anchor_scores_for_hadiths(
        n,
        anchor_idx,
        anchor_scores,
        anchor_dict,
        unique_anchor_texts
    )

    if anchor_scores.size > 0:
        anchor_vec /= (anchor_scores.max() + eps)

    # --- Semantic scores for the union of BM25 and anchor candidates ---
    union_idx = np.unique(
        np.concatenate([
            bm25_idx,
            np.where(anchor_vec > 0)[0]
        ])
    )

    semantic_vec = np.zeros(n)
    if len(union_idx) > 0:
        q_emb = model.encode(query, normalize_embeddings=True)
        semantic_vals = hadith_embeddings[union_idx] @ q_emb
        semantic_vals /= (semantic_vals.max() + eps)
        semantic_vec[union_idx] = semantic_vals

    # --- Final fusion: weighted sum of the three normalized signals ---
    final_scores = (
        alpha_anchor * anchor_vec +
        alpha_semantic * semantic_vec +
        alpha_bm25 * bm25_vec
    )

    top_indices = np.argsort(final_scores)[::-1][:top_k]

    return df.iloc[top_indices].copy(), final_scores
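Because hybrid_search_fixed only depends on its arguments, it can be exercised without the real Space artifacts. The following self-contained smoke test uses toy data (the two example hadiths, anchors, and the query are illustrative, not from the dataset) and again assumes rank_bm25 and an inner-product FAISS index, as in the preparation sketch after app.py above.

# Hedged smoke test for hybrid_search_fixed on toy data.
import faiss
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

from retrieval import hybrid_search_fixed
from utils import bm25_tokenize

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Two toy hadith rows with the columns app.py expects
df = pd.DataFrame({
    "main_subj": ["النية", "الصلاة"],
    "clean_text": ["إنما الأعمال بالنيات", "الصلاة عماد الدين"],
    "url": ["https://example.org/1", "https://example.org/2"],
})

bm25 = BM25Okapi([bm25_tokenize(t) for t in df["clean_text"]])
hadith_embeddings = model.encode(df["clean_text"].tolist(), normalize_embeddings=True)

# Toy anchors: one per subject, mapped to the rows they cover
unique_anchor_texts = ["النية", "الصلاة"]
anchor_dict = {"النية": [0], "الصلاة": [1]}
anchor_emb = model.encode(unique_anchor_texts, normalize_embeddings=True)
anchor_index = faiss.IndexFlatIP(anchor_emb.shape[1])
anchor_index.add(anchor_emb)

results, scores = hybrid_search_fixed(
    query="أهمية النية في الأعمال",
    df=df,
    bm25=bm25,
    preprocess_query=bm25_tokenize,
    model=model,
    hadith_embeddings=hadith_embeddings,
    anchor_index=anchor_index,
    anchor_dict=anchor_dict,
    unique_anchor_texts=unique_anchor_texts,
    top_k=2,
)
print(results[["main_subj", "clean_text"]])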
utils.py
ADDED
@@ -0,0 +1,10 @@
import re

def preprocess_arabic(text):
    # Strip tatweel and harakat (Arabic diacritics), then punctuation, then collapse whitespace
    text = re.sub(r"[\u0640\u064B-\u0652]", "", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def bm25_tokenize(text):
    return preprocess_arabic(text).split()
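For reference, a quick check of the tokenizer on a diacritized string (the expected output assumes the diacritic-stripping character class above):

from utils import bm25_tokenize

# Diacritics and punctuation are removed before whitespace splitting
print(bm25_tokenize("إنَّما الأعمالُ بالنِّيَّات!"))
# ['إنما', 'الأعمال', 'بالنيات']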