import html
import importlib.util
import os
import pickle

# huggingface_hub evaluates HF_HUB_ENABLE_HF_TRANSFER when it is first imported,
# and the third-party imports below are expected to pull it in, so the flag
# has to be in the environment *before* those imports to have any effect.
# It is only enabled when the optional `hf_transfer` package is installed,
# because huggingface_hub refuses to download when the flag is set without it.
# `setdefault` keeps an explicit value chosen by the operator.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

import faiss  # noqa: E402
import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from sentence_transformers import SentenceTransformer  # noqa: E402

# ===============================
# Load data & models (ONCE)
# ===============================
df = pd.read_csv("data/hadith.csv")
hadith_embeddings = np.load("data/hadith_embeddings.npy")
print(f"Loaded hadith embeddings: {hadith_embeddings.shape}")

# NOTE(security): pickle.load executes arbitrary code embedded in the file.
# These artefacts must only ever come from a trusted build step.
with open("data/bm25.pkl", "rb") as f:
    bm25 = pickle.load(f)

anchor_index = faiss.read_index("data/faiss_anchor.index")
print(f"Anchor index dimension: {anchor_index.d}")

with open("data/anchor_dict.pkl", "rb") as f:
    anchor_dict = pickle.load(f)

with open("data/unique_anchor_texts.pkl", "rb") as f:
    unique_anchor_texts = pickle.load(f)

model = SentenceTransformer("omarelshehy/arabic-english-sts-matryoshka-v2.0")
model.max_seq_length = 512

# Quick dimension check: fail at start-up rather than on the first query if
# the encoder and the prebuilt anchor index disagree on vector size.
test_emb = model.encode("test", normalize_embeddings=True)
print(f"Model embedding dimension: {test_emb.shape}")
if test_emb.shape[0] != anchor_index.d:
    raise ValueError(
        f"Dimension mismatch! Model outputs {test_emb.shape[0]}D but "
        f"anchor_index expects {anchor_index.d}D. Rebuild your anchor_index."
    )

# Project-local imports are kept after the start-up checks, in the same
# position the original script used, so the import order is unchanged.
from retrieval import hybrid_search_fixed  # noqa: E402
from utils import preprocess_query  # noqa: E402


# ===============================
# Helper functions
# ===============================
def safe_get(row, col):
    """Return ``row[col]`` as a stripped string, or ``""`` when unavailable.

    Args:
        row: A mapping-like record (anything with ``.get``) or a container
            supporting ``in`` and item access.
        col: The key / column name to read.

    Returns:
        The value converted with ``str`` and stripped of surrounding
        whitespace. Missing keys, ``None`` and pandas missing-value markers
        (NaN, ``pd.NA``, ``NaT``) all yield ``""``.
    """
    try:
        if hasattr(row, "get"):
            value = row.get(col, "")
        elif col in row:
            value = row[col]
        else:
            return ""
        # is_scalar guards pd.isna, which returns an array for list-likes.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value).strip()
    except Exception:
        # Deliberate best-effort: this feeds HTML rendering, where a blank
        # field is preferable to failing the whole search response.
        return ""


def first_nonempty(row, cols):
    """Return the first non-empty ``safe_get(row, c)`` for ``c`` in ``cols``.

    Args:
        row: Record passed through to ``safe_get``.
        cols: Column names to try, in priority order.

    Returns:
        The first non-empty string found, or ``""`` if every column is empty.
    """
    for col in cols:
        value = safe_get(row, col)
        if value:
            return value
    return ""


def short_preview(text, length=300):
    """Shorten ``text`` to at most ``length`` characters plus an ellipsis.

    Args:
        text: The text to shorten; ``None`` and ``""`` are treated as empty.
        length: Maximum number of characters kept before the ellipsis.

    Returns:
        The stripped text unchanged when it already fits. Otherwise the text
        cut at ``length`` characters, trimmed back to the last space so a word
        is not split, followed by ``"..."``.
    """
    text = (text or "").strip()
    if len(text) <= length:
        return text
    # rstrip: consecutive spaces would otherwise leave a gap before "...".
    return text[:length].rsplit(" ", 1)[0].rstrip() + "..."
# ===============================
# Search Function
# ===============================
def _format_score(final_scores, hadith_idx):
    """Return the score stored at ``hadith_idx`` as a 3-decimal string.

    Returns ``""`` when ``final_scores`` is not a list / ndarray, or when the
    index cannot be converted or looked up, so the caller can simply omit the
    score line.
    """
    if not isinstance(final_scores, (list, np.ndarray)):
        return ""
    try:
        return f"{float(final_scores[int(hadith_idx)]):.3f}"
    except (TypeError, ValueError, IndexError, OverflowError):
        # Non-numeric index label, out-of-range position or non-scalar entry.
        return ""


def _render_card(rank, row, final_scores):
    """Render one result row as an HTML card.

    Every value taken from the data is HTML-escaped. The class names match
    the selectors declared in ``custom_css`` below.
    """
    hadith_idx = row.name if hasattr(row, "name") else ""
    full_text = safe_get(row, "matn_text")

    # Title falls back from descriptive columns, to a short excerpt of the
    # text, to a generic "Hadith #<index>" label.
    title = (
        first_nonempty(row, ["isnad_text", "hadith_title", "title", "main_subj"])
        or short_preview(full_text, 60)
        or f"حديث #{hadith_idx}"
    )
    topic = html.escape(safe_get(row, "main_subj"))
    idx_escaped = html.escape(str(hadith_idx))

    # Newlines are insignificant in HTML, so they become explicit breaks.
    full_escaped = html.escape(full_text).replace("\n", "<br>")
    preview_escaped = html.escape(short_preview(full_text, 360)).replace("\n", "<br>")

    score = _format_score(final_scores, hadith_idx)
    score_html = f"<div class='topic'>الدرجة: {score}</div>" if score else ""

    return f"""
<div class='card' dir='rtl'>
  <div class='card-header' style='display:flex;align-items:center;gap:12px;'>
    <span class='index'>#{rank}</span>
    <div class='title'>{html.escape(title)}</div>
  </div>
  <div class='topic'>الموضوع: {topic}</div>
  <div class='topic'>الرقم: {idx_escaped}</div>
  {score_html}
  <details>
    <summary class='summary'>{preview_escaped}</summary>
    <div class='text-rtl full-text'>{full_escaped}</div>
  </details>
</div>
"""


def search_hadith(query, top_k):
    """Run the hybrid hadith search and return the results as an HTML string.

    Args:
        query: Free-text query from the UI (Arabic or English).
        top_k: Number of results requested; converted with ``int``.

    Returns:
        An HTML fragment: a notice when the query is blank or nothing
        matched, otherwise one card per result wrapped in a ``results``
        container.
    """
    if not query or not str(query).strip():
        return "<div class='empty'>الرجاء إدخال استعلام بحث</div>"

    results_df, debug = hybrid_search_fixed(
        query=query,
        df=df,
        bm25=bm25,
        preprocess_query=preprocess_query,
        model=model,
        hadith_embeddings=hadith_embeddings,
        anchor_index=anchor_index,
        anchor_dict=anchor_dict,
        unique_anchor_texts=unique_anchor_texts,
        top_k=int(top_k),
    )

    final_scores = debug.get("final_scores") if isinstance(debug, dict) else None

    cards = [
        _render_card(rank, row, final_scores)
        for rank, (_, row) in enumerate(results_df.iterrows(), start=1)
    ]
    if not cards:
        # "No matching results" - shown in place of an empty wrapper.
        return "<div class='empty'>لا توجد نتائج مطابقة</div>"

    return "\n".join(["<div class='results'>", *cards, "</div>"])


# ===============================
# PROFESSIONAL DARK MODE + PERFECT ARABIC TYPOGRAPHY + HIDDEN SCROLL ARROWS
# ===============================
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Cairo:wght@400;500;600;700&family=Noto+Sans+Arabic:wght@400;500;700&display=swap');

:root {
    --body-background-fill: #020617 !important;
    --background-fill-primary: #020617 !important;
    --background-fill-secondary: #0f172a !important;
    --border-color-primary: #334155 !important;
    --color-text-primary: #f1f5f9 !important;
    --color-text-secondary: #cbd5e1 !important;
    --button-primary-background-fill: #3b82f6 !important;
    --button-primary-background-fill-hover: #2563eb !important;
}

/* Global dark + Arabic font */
body, .gradio-container, .gr-panel, .gr-box, .gr-form, .wrap, .panel, .block, footer {
    background-color: #020617 !important;
    color: #f1f5f9 !important;
    font-family: 'Cairo', 'Noto Sans Arabic', system-ui, sans-serif !important;
}

.gradio-container { max-width: 960px !important; margin: 0 auto !important; }

/* Arabic text perfection */
.text-rtl, .full-text, .summary, .title, .topic {
    font-family: 'Cairo', 'Noto Sans Arabic', sans-serif !important;
    line-height: 2.05 !important;
}

/* Inputs & Buttons */
label span, input, textarea, .gr-input, .gr-textarea {
    color: #f1f5f9 !important;
    background: #1e2937 !important;
    border-color: #475569 !important;
}
button.primary {
    background: linear-gradient(90deg, #3b82f6, #60a5fa) !important;
    font-weight: 600 !important;
}

/* Premium result cards */
.card {
    background: #1e2937 !important;
    border-radius: 18px !important;
    padding: 26px !important;
    margin-bottom: 24px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.4) !important;
    border: 1px solid #334155 !important;
    transition: all 0.35s ease !important;
}
.card:hover {
    transform: translateY(-8px);
    box-shadow: 0 25px 50px rgba(59, 130, 246, 0.2) !important;
}
.index {
    background: linear-gradient(90deg, #3b82f6, #60a5fa) !important;
    color: #fff !important;
    padding: 7px 16px !important;
    border-radius: 9999px !important;
    font-weight: 700 !important;
}
.title { font-size: 19px !important; font-weight: 700 !important; }
.topic { font-size: 14px !important; color: #94a3b8 !important; }
.text-rtl {
    background: #0f172a !important;
    border-radius: 12px !important;
    padding: 18px !important;
    border-right: 6px solid #3b82f6 !important;
    font-size: 17px !important;
    color: #e2e8f0 !important;
}

/* Hide the unwanted white scroll arrow boxes (the ones in your 2nd screenshot) */
.gradio-container button[aria-label*="Scroll"],
.gradio-container .gr-button.scroll,
.gr-scrollbar-button,
::-webkit-scrollbar-button,
[data-testid*="scroll"],
button[class*="scroll"],
.scroll-button,
.gr-scroll-button {
    display: none !important;
    visibility: hidden !important;
    opacity: 0 !important;
    pointer-events: none !important;
}

/* Clean up any remaining floating arrows */
.gradio-container > div > button:last-child { display: none !important; }

.results { margin-top: 24px; }
.footer a { color: #60a5fa !important; font-weight: 600 !important; }
.empty { color: #94a3b8 !important; text-align: center; padding: 50px 20px; font-size: 17px; }

@media (max-width: 768px) {
    .card-header { flex-direction: column !important; }
}
"""

# ===============================
# Gradio Interface – ENGLISH HEADER ONLY
# ===============================
interface = gr.Interface(
    fn=search_hadith,
    inputs=[
        gr.Textbox(
            label="🔍 استعلام البحث",
            placeholder="مثال: أهمية النية في الإسلام / Importance of intention in Islam",
            lines=2,
            max_lines=3,
        ),
        gr.Slider(
            minimum=1,
            maximum=20,
            value=5,
            step=1,
            label="📌 عدد النتائج",
        ),
    ],
    outputs=gr.HTML(),
    title="📖 Intelligent Hadith Search Engine",
    description="""
    Advanced AI-powered semantic search engine for the Noble Prophetic Hadith.
    Combines lexical search (BM25), semantic embeddings, and topic-aware Anchors.
    """,
    examples=[
        ["أهمية النية وأثرها في قبول الأعمال", 5],
        ["حقوق الوالدين في الإسلام", 5],
        ["فضل الصلاة على النبي ﷺ", 5],
    ],
    flagging_mode="never",
    theme=gr.themes.Soft(),
    css=custom_css,
)

if __name__ == "__main__":
    interface.launch()