kdallash committed on
Commit
16c341e
·
verified ·
1 Parent(s): cee4429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -18
app.py CHANGED
@@ -13,6 +13,7 @@ df = pd.read_csv("data/hadith.csv")
13
 
14
  # Load embeddings
15
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
 
16
 
17
  # Load BM25
18
  with open("data/bm25.pkl", "rb") as f:
@@ -20,6 +21,7 @@ with open("data/bm25.pkl", "rb") as f:
20
 
21
  # Load anchor FAISS index
22
  anchor_index = faiss.read_index("data/faiss_anchor.index")
 
23
 
24
  # Load anchor mapping
25
  with open("data/anchor_dict.pkl", "rb") as f:
@@ -32,7 +34,20 @@ with open("data/unique_anchor_texts.pkl", "rb") as f:
32
  model = SentenceTransformer(
33
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
34
  )
35
- model.max_seq_length = 512
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  # Import retrieval logic
37
  from retrieval import hybrid_search_fixed
38
  from utils import preprocess_query
@@ -44,24 +59,33 @@ from utils import preprocess_query
44
  def search_hadith(query, top_k):
45
  if query.strip() == "":
46
  return pd.DataFrame(columns=["الموضوع", "نص الحديث", "hadith page on Islamweb.net"])
47
- results_df, _ = hybrid_search_fixed(
48
- query=query,
49
- df=df,
50
- bm25=bm25,
51
- model=model,
52
- preprocess_query=preprocess_query,
53
- hadith_embeddings=hadith_embeddings,
54
- anchor_index=anchor_index,
55
- anchor_dict=anchor_dict,
56
- unique_anchor_texts=unique_anchor_texts,
57
- top_k=int(top_k)
58
- )
 
 
59
 
60
- return results_df[["main_subj", "clean_text", "url"]] \
61
- .rename(columns={
62
- "main_subj": "الموضوع",
63
- "clean_text": "نص الحديث",
64
- "url": "hadith page on Islamweb.net"
 
 
 
 
 
 
 
65
  })
66
 
67
 
@@ -87,6 +111,11 @@ interface = gr.Interface(
87
  ),
88
  title="Using NLP to search hadith in sahih bukhari",
89
  description=("AI-powered search engine that understands the **meaning** of queries, not just keyword matches."),
 
 
 
 
 
90
  )
91
 
92
  # Launch app
 
13
 
14
  # Load embeddings
15
  hadith_embeddings = np.load("data/hadith_embeddings.npy")
16
+ print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")
17
 
18
  # Load BM25
19
  with open("data/bm25.pkl", "rb") as f:
 
21
 
22
  # Load anchor FAISS index
23
  anchor_index = faiss.read_index("data/faiss_anchor.index")
24
+ print(f"Anchor index dimension: {anchor_index.d}")
25
 
26
  # Load anchor mapping
27
  with open("data/anchor_dict.pkl", "rb") as f:
 
34
  model = SentenceTransformer(
35
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
36
  )
37
+ model.max_seq_length = 512
38
+
39
+ # Test embedding dimension
40
+ test_emb = model.encode("test", normalize_embeddings=True)
41
+ print(f"Model embedding dimension: {test_emb.shape}")
42
+
43
+ # Verify dimensions match
44
+ if test_emb.shape[0] != anchor_index.d:
45
+ raise ValueError(
46
+ f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
47
+ f"anchor_index expects {anchor_index.d}D. "
48
+ f"Rebuild your anchor_index with the same model."
49
+ )
50
+
51
  # Import retrieval logic
52
  from retrieval import hybrid_search_fixed
53
  from utils import preprocess_query
 
59
  def search_hadith(query, top_k):
60
  if query.strip() == "":
61
  return pd.DataFrame(columns=["الموضوع", "نص الحديث", "hadith page on Islamweb.net"])
62
+
63
+ try:
64
+ results_df, _ = hybrid_search_fixed(
65
+ query=query,
66
+ df=df,
67
+ bm25=bm25,
68
+ model=model,
69
+ preprocess_query=preprocess_query,
70
+ hadith_embeddings=hadith_embeddings,
71
+ anchor_index=anchor_index,
72
+ anchor_dict=anchor_dict,
73
+ unique_anchor_texts=unique_anchor_texts,
74
+ top_k=int(top_k)
75
+ )
76
 
77
+ return results_df[["main_subj", "clean_text", "url"]] \
78
+ .rename(columns={
79
+ "main_subj": "الموضوع",
80
+ "clean_text": "نص الحديث",
81
+ "url": "hadith page on Islamweb.net"
82
+ })
83
+ except Exception as e:
84
+ print(f"Error in search: {e}")
85
+ return pd.DataFrame({
86
+ "الموضوع": ["Error"],
87
+ "نص الحديث": [str(e)],
88
+ "hadith page on Islamweb.net": [""]
89
  })
90
 
91
 
 
111
  ),
112
  title="Using NLP to search hadith in sahih bukhari",
113
  description=("AI-powered search engine that understands the **meaning** of queries, not just keyword matches."),
114
+ examples=[
115
+ ["أهمية النية وأثرها في قبول الأعمال", 5],
116
+ ["فضل الصلاة", 5],
117
+ ["حقوق الجار", 5],
118
+ ]
119
  )
120
 
121
  # Launch app