import numpy as np


def query_anchor_scores(query, model, anchor_index, top_k=10):
    """Encode *query* and retrieve the top_k nearest anchors.

    Returns (anchor_indices, anchor_scores) as 1-D numpy arrays.
    NOTE(review): FAISS-style ``search`` returns (scores, indices); the
    score convention (inner product vs. L2 distance) depends on the index
    type — downstream normalization assumes higher-is-better.
    """
    q_emb = model.encode(query, normalize_embeddings=True)
    scores, indices = anchor_index.search(q_emb.reshape(1, -1), top_k)
    return np.array(indices[0], dtype=int), np.array(scores[0], dtype=float)


def bm25_retrieve(query, bm25, preprocess_query, top_k=50):
    """Return (top_idx, scores) for the top_k BM25-ranked documents.

    ``preprocess_query`` tokenizes the raw query string for BM25.
    """
    tokenized_query = preprocess_query(query)
    scores = bm25.get_scores(tokenized_query)
    top_idx = np.argsort(scores)[::-1][:top_k]
    return top_idx, scores[top_idx]


def compute_anchor_scores_for_hadiths(
    n_hadiths, anchor_indices, anchor_scores, anchor_dict, unique_anchor_texts
):
    """Project anchor-level retrieval scores onto hadith-level scores.

    Every hadith linked (via ``anchor_dict``) to a retrieved anchor gets
    that anchor's score; unlinked hadiths stay at 0.

    Bug fix: when several retrieved anchors map to the same hadith we now
    keep the *maximum* score.  The previous last-write-wins assignment
    silently downgraded a hadith to its weakest matching anchor, since
    retrieval results arrive in descending score order.
    """
    anchor_score_vec = np.zeros(n_hadiths, dtype=float)
    for a_idx, a_score in zip(anchor_indices, anchor_scores):
        # Skip out-of-range ids (FAISS pads missing results with -1).
        if 0 <= a_idx < len(unique_anchor_texts):
            anchor_text = unique_anchor_texts[a_idx]
            for h_idx in anchor_dict.get(anchor_text, []):
                # Keep the strongest anchor signal per hadith.
                anchor_score_vec[h_idx] = max(anchor_score_vec[h_idx], a_score)
    return anchor_score_vec


def hybrid_search_fixed(query, df, bm25, preprocess_query, model,
                        hadith_embeddings, anchor_index, anchor_dict,
                        unique_anchor_texts, top_k=5, top_bm25=50,
                        top_anchors=10, alpha_anchor=0.40,
                        alpha_semantic=0.35, alpha_bm25=0.25,
                        full_semantic=False):
    """Hybrid search with correct signal alignment.

    Signals:
      - BM25: top_bm25 hadiths + scores (max-normalized over candidates);
      - anchors: top_anchors anchors mapped to hadith-level scores
        (max-normalized by the best returned anchor score);
      - semantic: dot product of the query embedding with either the full
        corpus (``full_semantic=True``) or just the union of BM25
        candidates and anchor-linked hadiths (max-normalized).
    Missing-signal entries are zero.

    Returns ``(df.iloc[top_indices].copy(), diagnostics_dict)`` where the
    dict holds the per-hadith score vectors and the semantic union size.
    """
    n = len(df)
    eps = 1e-8  # guards divide-by-zero in the max normalizations

    # 1) BM25 candidates and scores -> {hadith_idx: raw_score}.
    bm25_indices, bm25_scores = bm25_retrieve(
        query, bm25, preprocess_query, top_k=top_bm25
    )
    bm25_map = {int(idx): float(score)
                for idx, score in zip(bm25_indices, bm25_scores)}

    # 2) Anchor retrieval -> anchor indices + scores.
    anchor_idx, anchor_scores = query_anchor_scores(
        query, model, anchor_index, top_k=top_anchors
    )

    # 3) Per-hadith anchor score for the entire corpus (zeros by default).
    anchor_score_vec = compute_anchor_scores_for_hadiths(
        n_hadiths=n,
        anchor_indices=anchor_idx,
        anchor_scores=anchor_scores,
        anchor_dict=anchor_dict,
        unique_anchor_texts=unique_anchor_texts,
    )

    # 4) Hadith indices to score semantically: union of BM25 candidates
    #    and every hadith linked to a returned anchor.
    anchor_linked_indices = []
    for a_idx in anchor_idx:
        if 0 <= a_idx < len(unique_anchor_texts):  # safe check
            anchor_text = unique_anchor_texts[int(a_idx)]
            anchor_linked_indices.extend(anchor_dict.get(anchor_text, []))
    anchor_linked_indices = (
        np.unique(np.array(anchor_linked_indices, dtype=int))
        if anchor_linked_indices else np.array([], dtype=int)
    )

    query_emb = model.encode(query, normalize_embeddings=True)
    if full_semantic:
        # Score the whole corpus (slower but exhaustive).
        semantic_score_vec = np.asarray(hadith_embeddings @ query_emb,
                                        dtype=float)
    else:
        # Score only the candidate union (faster); rest stays zero.
        union_indices = np.unique(
            np.concatenate([np.asarray(bm25_indices, dtype=int),
                            anchor_linked_indices])
        ).astype(int)
        semantic_score_vec = np.zeros(n, dtype=float)
        if len(union_indices) > 0:
            semantic_score_vec[union_indices] = (
                hadith_embeddings[union_indices] @ query_emb
            )

    # 5) BM25 vector for the full corpus (zeros except candidates),
    #    max-normalized across candidates for stability.
    bm25_score_vec = np.zeros(n, dtype=float)
    if bm25_map:
        bm25_max = max(bm25_map.values())
        if bm25_max > 0:
            for idx, val in bm25_map.items():
                bm25_score_vec[idx] = val / (bm25_max + eps)
    # Non-positive or absent BM25 scores leave the vector at zeros.

    # 6) Anchor normalization into ~[0, 1] by the best returned score.
    #    Bug fix: guard the empty case — np.max([]) raises ValueError.
    if anchor_scores.size > 0:
        anchor_max = float(np.max(anchor_scores))
        if anchor_max > 0:
            anchor_score_vec = anchor_score_vec / (anchor_max + eps)
        # else: scores are non-positive; leave the vector as-is.

    # 7) Semantic normalization by the max over the scored entries.
    sem_max = semantic_score_vec.max() if semantic_score_vec.size > 0 else 0.0
    if sem_max > 0:
        semantic_score_vec = semantic_score_vec / (sem_max + eps)

    # 8) Weighted fusion of the three normalized signals.
    final_scores = (
        alpha_anchor * anchor_score_vec
        + alpha_semantic * semantic_score_vec
        + alpha_bm25 * bm25_score_vec
    )

    # 9) Rank descending and return the top_k rows plus diagnostics.
    top_indices = np.argsort(final_scores)[::-1][:top_k]
    return df.iloc[top_indices].copy(), {
        "final_scores": final_scores,
        "anchor_scores": anchor_score_vec,
        "semantic_scores": semantic_score_vec,
        "bm25_scores": bm25_score_vec,
        "union_size": n if full_semantic else len(union_indices),
    }