kdallash committed on
Commit
baeedb1
·
verified ·
1 Parent(s): 8011197

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +98 -0
  2. retrieval.py +105 -0
  3. utils.py +10 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pickle

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

from utils import bm25_tokenize
8
+
9
# -----------------------------------------------------------------
# One-time loading of data, indexes, and models at module import.
# -----------------------------------------------------------------

def _load_pickle(path):
    """Deserialize one pickle artifact.

    NOTE(review): ``pickle.load`` executes arbitrary code — these files
    must come from a trusted source (they ship with the app).
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Hadith corpus, one row per hadith.
df = pd.read_csv("data/hadith.csv")

# Dense embeddings — presumably aligned row-for-row with ``df``.
hadith_embeddings = np.load("data/hadith_embeddings.npy")

# Pre-fitted BM25 scorer over the tokenized corpus.
bm25 = _load_pickle("data/bm25.pkl")

# FAISS index over the anchor (topic) embeddings.
anchor_index = faiss.read_index("data/faiss_anchor.index")

# anchor text -> hadith row indices it covers.
anchor_dict = _load_pickle("data/anchor_dict.pkl")

# Distinct anchor texts — presumably aligned with the FAISS index rows.
unique_anchor_texts = _load_pickle("data/unique_anchor_texts.pkl")

# Multilingual sentence encoder shared by query and anchor embedding.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Retrieval logic (imported after the heavy globals above are ready).
from retrieval import hybrid_search_fixed
38
+
39
+
40
# -----------------------------
# Search function (UI entry)
# -----------------------------

def search_hadith(query, top_k=5):
    """Run the hybrid retrieval pipeline and shape results for the UI.

    Args:
        query: free-text search string typed by the user.
        top_k: number of hadiths to display (bound to the Gradio slider;
            fix: the original signature took only ``query`` even though the
            interface wires two inputs, and referenced an undefined
            ``top_k`` name).

    Returns:
        A pandas DataFrame with Arabic-labelled columns (subject, hadith
        text, source URL); empty when the query is blank.
    """
    if query.strip() == "":
        # Blank query: return an empty table with the expected headers.
        return pd.DataFrame(columns=["ุงู„ู…ูˆุถูˆุน", "ู†ุต ุงู„ุญุฏูŠุซ"])

    # Fix: ``hybrid_search_fixed`` requires ``preprocess_query``; the
    # original call omitted it. ``bm25_tokenize`` matches the tokenization
    # the BM25 model was fitted with (see utils.py).
    results_df, _ = hybrid_search_fixed(
        query=query,
        df=df,
        bm25=bm25,
        preprocess_query=bm25_tokenize,
        model=model,
        hadith_embeddings=hadith_embeddings,
        anchor_index=anchor_index,
        anchor_dict=anchor_dict,
        unique_anchor_texts=unique_anchor_texts,
        top_k=int(top_k),
    )

    # Rename internal column names to the user-facing Arabic labels.
    return results_df[["main_subj", "clean_text", "url"]].rename(
        columns={
            "main_subj": "ุงู„ู…ูˆุถูˆุน",
            "clean_text": "ู†ุต ุงู„ุญุฏูŠุซ",
            "url": "hadith page on Islamweb.net",
        }
    )
66
+
67
+
68
# Gradio Interface — components are built as named variables first so the
# wiring in ``gr.Interface`` stays easy to read.
_query_box = gr.Textbox(
    label="ุฃุฏุฎู„ ู…ูˆุถูˆุน ุงู„ุจุญุซ ุฃูˆ ุงู„ุณุคุงู„",
    placeholder="ู…ุซุงู„: ุฃู‡ู…ูŠุฉ ุงู„ู†ูŠุฉ ูˆุฃุซุฑู‡ุง ููŠ ู‚ุจูˆู„ ุงู„ุฃุนู…ุงู„",
)
_top_k_slider = gr.Slider(
    minimum=1,
    maximum=20,
    value=5,
    step=1,
    label="ุนุฏุฏ ุงู„ุฃุญุงุฏูŠุซ ุงู„ู…ุนุฑูˆุถุฉ",
)
_results_table = gr.Dataframe(
    label="ู†ุชุงุฆุฌ ุงู„ุจุญุซ",
    wrap=True,
)

interface = gr.Interface(
    fn=search_hadith,
    inputs=[_query_box, _top_k_slider],
    outputs=_results_table,
    title="ู…ุญุฑูƒ ุจุญุซ ุฐูƒูŠ ููŠ ุงู„ุฃุญุงุฏูŠุซ ุงู„ู†ุจูˆูŠุฉ",
    description=(
        "ูŠุนุชู…ุฏ ู‡ุฐุง ุงู„ู†ุธุงู… ุนู„ู‰ ุงู„ุจุญุซ ุงู„ุฏู„ุงู„ูŠ ูˆุงู„ู…ูˆุถูˆุนูŠ "
        "ู„ุงุณุชุฑุฌุงุน ุงู„ุฃุญุงุฏูŠุซ ุฐุงุช ุงู„ุตู„ุฉ ุจุงู„ู…ุนู†ู‰ ูˆู„ูŠุณ ุจุงู„ูƒู„ู…ุงุช ูู‚ุท."
    ),
    allow_flagging="never",
)

# Launch the app only when executed as a script (not when imported).
if __name__ == "__main__":
    interface.launch()
retrieval.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def query_anchor_scores(query, model, anchor_index, top_k=10):
    """Embed *query* and fetch its *top_k* nearest anchors.

    Returns a pair ``(indices, scores)`` of 1-D arrays: the row ids of the
    best-matching anchors in *anchor_index* and their similarity scores.
    """
    embedded = model.encode(query, normalize_embeddings=True).reshape(1, -1)
    scores, indices = anchor_index.search(embedded, top_k)
    best_ids = np.array(indices[0], dtype=int)
    best_scores = np.array(scores[0], dtype=float)
    return best_ids, best_scores
8
+
9
+
10
def bm25_retrieve(query, bm25, preprocess_query, top_k=50):
    """Score every document with BM25 and keep the *top_k* best.

    Returns ``(indices, scores)`` where *indices* are document positions
    sorted by descending BM25 score.
    """
    tokens = preprocess_query(query)
    all_scores = bm25.get_scores(tokens)
    ranked = np.argsort(all_scores)[::-1]
    best = ranked[:top_k]
    return best, all_scores[best]
15
+
16
+
17
def compute_anchor_scores_for_hadiths(
    n_hadiths,
    anchor_indices,
    anchor_scores,
    anchor_dict,
    unique_anchor_texts,
):
    """Spread anchor-level similarity scores onto individual hadiths.

    Args:
        n_hadiths: total number of hadiths (length of the output vector).
        anchor_indices: row ids of the retrieved anchors.
        anchor_scores: similarity score for each retrieved anchor.
        anchor_dict: mapping of anchor text -> list of hadith indices.
        unique_anchor_texts: anchor texts indexed by anchor row id.

    Returns:
        A float vector of length *n_hadiths* with each hadith's best
        anchor score (0.0 for hadiths not covered by any retrieved anchor).

    Fix: a hadith can be reachable from several retrieved anchors; the
    original plain assignment let a later (lower-scoring) anchor overwrite
    an earlier higher score, so we keep the maximum per hadith instead.
    """
    anchor_score_vec = np.zeros(n_hadiths, dtype=float)

    for a_idx, a_score in zip(anchor_indices, anchor_scores):
        # Skip out-of-range ids (e.g. FAISS pads misses with -1).
        if 0 <= a_idx < len(unique_anchor_texts):
            anchor_text = unique_anchor_texts[a_idx]
            for h_idx in anchor_dict.get(anchor_text, []):
                anchor_score_vec[h_idx] = max(anchor_score_vec[h_idx], a_score)

    return anchor_score_vec
33
+
34
+
35
def hybrid_search_fixed(
    query,
    df,
    bm25,
    preprocess_query,
    model,
    hadith_embeddings,
    anchor_index,
    anchor_dict,
    unique_anchor_texts,
    top_k=5,
    top_bm25=50,
    top_anchors=10,
    alpha_anchor=0.40,
    alpha_semantic=0.35,
    alpha_bm25=0.25,
):
    """Hybrid retrieval fusing three channels: BM25, anchor, semantic.

    Each channel produces a corpus-length score vector, max-normalised,
    then the three are combined with the ``alpha_*`` weights.

    Returns:
        ``(results, scores)`` — the *top_k* rows of *df* (a copy, ranked
        by fused score) and the full fused score vector.
    """
    n_docs = len(df)
    eps = 1e-8  # avoids division by zero when a channel's max is 0

    # --- Lexical channel: BM25 over the tokenized query ---
    lex_idx, lex_scores = bm25_retrieve(
        query, bm25, preprocess_query, top_k=top_bm25
    )
    lex_vec = np.zeros(n_docs)
    if lex_scores.size > 0:
        lex_vec[lex_idx] = lex_scores / (lex_scores.max() + eps)

    # --- Anchor channel: topic-level similarity spread onto hadiths ---
    a_idx, a_scores = query_anchor_scores(
        query, model, anchor_index, top_k=top_anchors
    )
    anchor_vec = compute_anchor_scores_for_hadiths(
        n_docs, a_idx, a_scores, anchor_dict, unique_anchor_texts
    )
    if a_scores.size > 0:
        anchor_vec /= (a_scores.max() + eps)

    # --- Semantic channel: dense similarity, restricted to candidates
    # already surfaced by BM25 or the anchors (cheap re-ranking) ---
    candidates = np.unique(
        np.concatenate([lex_idx, np.where(anchor_vec > 0)[0]])
    )
    sem_vec = np.zeros(n_docs)
    if len(candidates) > 0:
        q_emb = model.encode(query, normalize_embeddings=True)
        sims = hadith_embeddings[candidates] @ q_emb
        sem_vec[candidates] = sims / (sims.max() + eps)

    # --- Weighted fusion and final ranking ---
    fused = (
        alpha_anchor * anchor_vec
        + alpha_semantic * sem_vec
        + alpha_bm25 * lex_vec
    )
    best = np.argsort(fused)[::-1][:top_k]

    return df.iloc[best].copy(), fused
utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def preprocess_arabic(text):
    """Normalise Arabic text for BM25 indexing.

    Strips diacritics (tashkeel) and the tatweel character, replaces
    punctuation with spaces, and collapses runs of whitespace.
    """
    no_diacritics = re.sub(r"[ู‘ูŽู‹ููŒููู’ู€]", "", text)
    spaced = re.sub(r"[^\w\s]", " ", no_diacritics)
    return re.sub(r"\s+", " ", spaced).strip()
8
+
9
+ def bm25_tokenize(text):
10
+ return preprocess_arabic(text).split()