DevnilMaster1's picture
Replace 127K-item dropdown with server-side search-as-you-type
a964075 verified
"""
Bangla Book Recommender — Hugging Face Space
============================================
Cold-start recommendation interface for the RokomariBG dataset using two
benchmarked recommender models from the paper "Towards Personalized Bangla
Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
(https://arxiv.org/abs/2602.12129).
Performance note
----------------
The catalogue has 127K books. Loading all of them into a Dropdown component
freezes the browser. Instead, this app uses a search-as-you-type pattern:
the user types a query, the backend filters titles/authors and returns up
to ~50 matches, and only those matches are rendered in a Dropdown. This
keeps the UI responsive even on free CPU tiers.
"""
from __future__ import annotations
from pathlib import Path
from typing import Dict, List
import gradio as gr
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
# Directory holding this script; data files are looked up relative to it.
ROOT = Path(__file__).parent

# Bounds and default for the "number of recommendations" slider.
TOP_K_MIN = 5
TOP_K_DEFAULT = 10
TOP_K_MAX = 30

# Upper bound on how many books the search dropdown lists at any time.
SEARCH_RESULT_LIMIT = 50

# UI label -> settings for each model: "key" is the file-name prefix of its
# .npy files, "dim" the embedding width used for the synthetic fallback, and
# "color" the accent colour used when rendering its results.
MODEL_CONFIG: Dict[str, Dict] = {
    "Neural Two-Tower (best)": {
        "key": "two_tower",
        "dim": 256,
        "color": "#7c3aed",
    },
    "LightGCN": {
        "key": "lightgcn",
        "dim": 64,
        "color": "#0891b2",
    },
}
# ─────────────────────────────────────────────────────────────────────────────
# Loading
# ─────────────────────────────────────────────────────────────────────────────
def load_metadata() -> pd.DataFrame:
    """Return the book catalogue as a cleaned DataFrame.

    Reads ``books_metadata.parquet`` from ROOT when it exists; otherwise
    builds a 500-row synthetic catalogue from a fixed-seed generator so the
    app can still start.

    Post-processing applied in both cases:
      * the text columns (book_id, title, author, category, summary,
        book_url) are created if missing, nulls become "" and values are
        cast to ``str``;
      * ``rating`` is coerced to numeric (unparseable values become NaN)
        and clipped to the range [1.0, 5.0];
      * a lowercased "title author" column named ``_search`` is added for
        substring search;
      * the index is reset to 0..n-1.
    """
    parquet_file = ROOT / "books_metadata.parquet"
    if not parquet_file.exists():
        print(f" ⚠️ {parquet_file.name} not found β€” using synthetic fallback")
        generator = np.random.default_rng(42)
        count = 500
        # Dict entries are evaluated top to bottom, so "category" draws from
        # the generator before "rating" does.
        frame = pd.DataFrame({
            "book_id": [f"demo_{i:05d}" for i in range(count)],
            "title": [f"Demo Book #{i}" for i in range(count)],
            "author": [f"Demo Author {i % 30}" for i in range(count)],
            "category": generator.choice(["Fiction", "History", "Science"], size=count),
            "rating": np.round(3 + generator.random(count) * 2, 1),
            "summary": ["Synthetic placeholder summary."] * count,
            "book_url": [""] * count,
        })
    else:
        frame = pd.read_parquet(parquet_file)
        print(f" β†’ metadata loaded from {parquet_file.name}")

    # Every text column shares the same default: the empty string.
    text_columns = ("book_id", "title", "author", "category", "summary", "book_url")
    for name in text_columns:
        if name not in frame.columns:
            frame[name] = ""
        frame[name] = frame[name].fillna("").astype(str)

    if "rating" not in frame.columns:
        frame["rating"] = np.nan
    numeric_rating = pd.to_numeric(frame["rating"], errors="coerce")
    frame["rating"] = numeric_rating.clip(lower=1.0, upper=5.0)

    # Lowercased haystack so each search is a plain substring test.
    lowered_title = frame["title"].str.lower()
    lowered_author = frame["author"].str.lower()
    frame["_search"] = lowered_title + " " + lowered_author
    return frame.reset_index(drop=True)
def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int):
    """Load, or synthesise, L2-normalised book embeddings for one model.

    Looks under ROOT for ``<model_key>_book_emb.npy`` (the vectors) and
    ``<model_key>_book_ids.npy`` (the matching book ids). If either file is
    missing, random vectors are generated for the first ``fallback_n``
    catalogue books so the UI remains usable.

    Args:
        model_key: File-name prefix identifying the model.
        fallback_n: Maximum number of catalogue books to cover in the
            synthetic fallback.
        fallback_dim: Embedding width of the synthetic fallback.

    Returns:
        A tuple ``(emb, ids, id_to_row, source)``: the row-normalised float32
        matrix, the array of book ids as strings, a dict mapping book id to
        row index (the last row wins for duplicate ids), and ``"real"`` or
        ``"synthetic"``.
    """
    import zlib

    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"
    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(security): allow_pickle=True will unpickle arbitrary objects.
        # Only acceptable because the file ships with the app; never point
        # this at user-supplied data.
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        source = "real"
    else:
        # The built-in hash() of a str is salted per process, which made the
        # fallback embeddings change on every restart. CRC32 gives a stable
        # 32-bit seed derived from the model key instead.
        seed = zlib.crc32(model_key.encode("utf-8"))
        rng = np.random.default_rng(seed=seed)
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        # Size the matrix from the ids actually available so rows and ids
        # stay aligned even if fallback_n exceeds the catalogue size.
        emb = rng.standard_normal((len(ids), fallback_dim)).astype(np.float32)
        source = "synthetic"

    # Unit-normalise rows so a dot product equals cosine similarity; all-zero
    # rows are left as zeros instead of dividing by zero.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms
    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source
# Startup: print a banner, then load the catalogue and every configured
# model's embeddings once at import time.
print("─" * 60, " Bangla Book Recommender β€” startup", "─" * 60, sep="\n")

print("Loading metadata…")
METADATA = load_metadata()
print(f" β†’ {len(METADATA):,} books in catalogue")

print("Loading model embeddings…")
# UI label -> loaded embedding matrix plus its lookup tables and display info.
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    EMBEDDINGS[label] = dict(
        emb=emb,
        ids=ids,
        id_to_row=id_to_row,
        dim=cfg["dim"],
        color=cfg["color"],
        source=source,
    )
    print(f" β†’ {label:30s} {str(emb.shape):16s} [{source}]")
def _display_label(row) -> str:
    """Return the dropdown label for a catalogue row: title, plus author if any."""
    if row.author:
        return f"{row.title} β€” {row.author}"
    return row.title


# Display label for every catalogue row, in METADATA order.
ALL_LABELS = [_display_label(record) for record in METADATA.itertuples(index=False)]

# Label -> book id. Rows that share a label collapse onto the last such row.
LABEL_TO_BOOKID = {
    label_text: book_id
    for label_text, book_id in zip(ALL_LABELS, METADATA["book_id"].astype(str).values)
}

# Book id -> full metadata row (a namedtuple produced by itertuples).
BOOKID_TO_META = {
    record.book_id: record for record in METADATA.itertuples(index=False)
}

# The highest-rated books are offered by default so the dropdown has
# something to show before the user types a query.
_DEFAULT_SAMPLE = METADATA.nlargest(n=SEARCH_RESULT_LIMIT, columns="rating", keep="first")
DEFAULT_LABELS = [
    _display_label(record) for record in _DEFAULT_SAMPLE.itertuples(index=False)
]
# ─────────────────────────────────────────────────────────────────────────────
# Search — the core UX fix
# ─────────────────────────────────────────────────────────────────────────────
def search_books(query: str, currently_selected: List[str]):
    """Refresh the seed dropdown's options for a search query.

    A blank (empty or whitespace-only) query offers DEFAULT_LABELS. Any other
    query is stripped and lowercased, then matched as a literal substring
    against the precomputed ``_search`` column; the first SEARCH_RESULT_LIMIT
    matching rows become the options.

    Labels that are already selected are placed first in the option list and
    returned unchanged as the dropdown value, so earlier picks stay selected
    even when they do not match the new query.

    Returns:
        A ``gr.update`` carrying the merged ``choices`` and the preserved
        ``value``.
    """
    picked = currently_selected or []
    needle = (query or "").strip().lower()

    if needle:
        is_hit = METADATA["_search"].str.contains(needle, regex=False, na=False)
        hits = METADATA[is_hit].head(SEARCH_RESULT_LIMIT)
        options = []
        for row in hits.itertuples(index=False):
            if row.author:
                options.append(f"{row.title} β€” {row.author}")
            else:
                options.append(row.title)
    else:
        options = DEFAULT_LABELS

    # dict.fromkeys de-duplicates while keeping first-seen order, which puts
    # the current selection ahead of the fresh search results.
    combined = list(dict.fromkeys(picked + options))
    return gr.update(choices=combined, value=picked)
# ─────────────────────────────────────────────────────────────────────────────
# Recommendation
# ─────────────────────────────────────────────────────────────────────────────
def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """Recommend the books closest to the mean embedding of the seed books.

    Args:
        seed_labels: Dropdown labels of the books the user selected.
        model_label: Key into EMBEDDINGS naming the model to use.
        top_k: Requested number of recommendations.

    Returns:
        A pair ``(html, refine_update)``: the rendered result (or an
        empty-state message) and a ``gr.update`` that fills the refine
        dropdown with the recommended labels and clears its selection.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    # A missing or unknown model label gets a message instead of a KeyError.
    model = EMBEDDINGS.get(model_label)
    if model is None:
        return (_empty_state_html("Unknown recommendation model selected."),
                gr.update(choices=[], value=[]))

    emb, id_to_row = model["emb"], model["id_to_row"]
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "<b>Neural Two-Tower</b> which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    # Taste vector = normalised mean of the seed embeddings; the epsilon
    # guards against a zero-length mean.
    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)
    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend a seed back to the user

    # Cap k at the number of non-seed rows. Without the cap, a small
    # embedding table let the masked seed rows (score -inf) back into the
    # results whenever fewer than top_k candidates remained.
    n_candidates = len(scores) - len(set(seed_rows))
    k = max(0, min(int(top_k), n_candidates))
    if k > 0:
        # argpartition picks the k best in O(n); only those k are sorted.
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]
    else:
        top_idx = np.empty(0, dtype=np.intp)

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # The embedding row has no catalogue entry; skip it.
            continue
        recs.append({
            "rank": len(recs) + 1,
            "title": meta.title,
            "author": meta.author,
            "category": meta.category,
            "rating": meta.rating,
            "summary": (meta.summary or "")[:240],
            "book_url": meta.book_url,
            "score": float(scores[row_i]),
        })

    html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    rec_choices = [
        f"{r['title']} β€” {r['author']}" if r["author"] else r["title"]
        for r in recs
    ]
    return html, gr.update(choices=rec_choices, value=[])
def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """Append the picked recommendations to the taste-profile dropdown.

    With nothing to add, the current selection is returned as-is (None
    becomes an empty list). Otherwise the two lists are concatenated and
    de-duplicated in first-seen order, and the result is set as both the
    dropdown's choices and its value.
    """
    existing = current_favs or []
    if to_add:
        combined = list(dict.fromkeys(existing + to_add))
        return gr.update(choices=combined, value=combined)
    return gr.update(value=existing)
# ─────────────────────────────────────────────────────────────────────────────
# Rendering
# ─────────────────────────────────────────────────────────────────────────────
def _empty_state_html(message: str = "Search for books you have enjoyed, "
                      "select a few, then click <b>Get Recommendations</b>.") -> str:
    """Return the centred placeholder panel shown when there are no results.

    ``message`` is inserted verbatim, so it may contain inline HTML such as
    the ``<b>`` tag in the default text.
    """
    wrapper_style = "text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);"
    icon_style = "font-size:2.6rem;margin-bottom:.5rem;"
    text_style = "font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;"
    # The empty first and last entries yield the leading and trailing newline.
    pieces = [
        "",
        f'<div style="{wrapper_style}">',
        f'<div style="{icon_style}">πŸ“š</div>',
        f'<div style="{text_style}">{message}</div>',
        "</div>",
        "",
    ]
    return "\n".join(pieces)
def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
    """Render recommendations as an HTML card grid with optional notices.

    Args:
        recs: List of dicts with the keys rank, title, author, category,
            rating, summary, book_url and score.
        model_label: Key into EMBEDDINGS; selects the accent colour and is
            shown in the header.
        seeds_in_model: Number of seed books found in the model's embeddings.
        total_seeds: Number of seed books the user selected.

    Returns:
        An HTML string: any warning banners, then a header line, then the
        card grid. If ``recs`` is empty, an empty-state panel is returned.
    """
    from html import escape

    color = EMBEDDINGS[model_label]["color"]
    source = EMBEDDINGS[model_label]["source"]
    if not recs:
        return _empty_state_html("No recommendations could be produced.")

    # Catalogue text comes from scraped data, so every value is HTML-escaped
    # before interpolation; otherwise a stray "<", "&" or quote in a title or
    # URL could break the markup or inject script.
    safe_model = escape(str(model_label))

    cards = []
    for r in recs:
        title = escape(str(r["title"]))
        author = escape(str(r["author"]))
        rating_html = (
            f"<span style='color:#f59e0b;font-weight:600;'>β˜… {r['rating']:.1f}</span>"
            if r["rating"] is not None and not pd.isna(r["rating"]) else ""
        )
        category_html = (
            f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{escape(str(r['category']))}</span>"
            if r["category"] else ""
        )
        summary_html = (
            f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{escape(str(r['summary']))}…</div>"
            if r["summary"] else ""
        )
        # quote=True also escapes quotes, so the URL cannot break out of the
        # single-quoted href attribute.
        url_html = (
            f"<div style='margin-top:.7rem;'><a href='{escape(str(r['book_url']), quote=True)}' target='_blank' rel='noopener' "
            f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari β†’</a></div>"
            if r["book_url"] else ""
        )
        cards.append(f"""
<div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
<div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
<div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
<div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {title}</div>
<div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
</div>
<div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{author}</div>
<div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
{summary_html}{url_html}
</div>
""")
    grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
            + "".join(cards) + "</div>")

    warnings = []
    if source == "synthetic":
        # Results from random vectors are meaningless; say so prominently.
        warnings.append(
            "<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            "⚠️ This model is using <b>synthetic random embeddings</b>.</div>"
        )
    if seeds_in_model < total_seeds:
        skipped = total_seeds - seeds_in_model
        warnings.append(
            f"<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            f"ℹ️ {skipped} of your {total_seeds} seed book(s) are not in the "
            f"<b>{safe_model}</b> embedding space and were skipped.</div>"
        )

    header = f"""
<div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
font-size:.9rem;color:var(--body-text-color-subdued,#666);">
<span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
<span><strong>{len(recs)}</strong> recommendations from <strong>{safe_model}</strong></span>
</div>
"""
    return "".join(warnings) + header + grid
# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Markdown shown at the top of the page. It is an f-string, so the catalogue
# size is interpolated once, when the module is imported.
INTRO_MD = f"""
# πŸ“š Bangla Book Recommender
### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).
Type a book name or author in the search box below, pick a few you have enjoyed,
choose a model, and get nearest-neighbour recommendations from the same embedding
space the model was trained in.
"""
# Markdown shown inside the collapsed "About this Space" accordion.
ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste vector*
is computed as the mean of those books' embeddings, and the books with the
highest cosine similarity to your taste vector are surfaced.
**Models.**
- **Neural Two-Tower** β€” best benchmarked model. Item tower fuses ID, content
(title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** β€” pure graph collaborative filtering with 4 GCN layers.
**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""
# Build the Gradio interface. Components are created in layout order inside
# nested context managers, and the event listeners are attached at the end.
# NOTE(review): the nesting below was reconstructed from the call order;
# confirm it against the intended layout.
with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)
    with gr.Accordion("ℹ️ About this Space", open=False):
        gr.Markdown(ABOUT_MD)
    with gr.Row():
        # Controls column: search, seed selection, model choice, refinement.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")
            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or English…",
                show_label=True,
            )
            # Client-side filtering is off (filterable=False); the option
            # list is replaced by search_books whenever the search text
            # changes.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3–5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )
            gr.Markdown("### 2. Choose a model")
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN, maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT, step=1,
                label="Number of recommendations",
            )
            run_btn = gr.Button("πŸ” Get Recommendations", variant="primary", size="lg")
            gr.Markdown("### 3. Refine (optional)")
            # Starts empty; recommend() fills it with the recommended labels.
            refine_picker = gr.Dropdown(
                choices=[], value=[], multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("βž• Add to my favourites", size="sm")
        # Results column.
        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            output_html = gr.HTML(value=_empty_state_html())
    # Footer with dataset, code and paper links.
    gr.Markdown(
        """
---
<div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
Built on the
<a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
Β·
<a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
Β·
<a href='https://arxiv.org/abs/2602.12129'>Paper</a>
Β·
CC BY-NC 4.0
</div>
"""
    )
    # Changing the search text refreshes the seed dropdown's options; the
    # progress indicator is hidden for this event.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    # Run the selected model and populate both the results panel and the
    # refine dropdown.
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    # Merge the picked recommendations into the seed selection.
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )
if __name__ == "__main__":
    # Serve on all interfaces at port 7860 with server-side rendering off.
    # NOTE(review): share=True requests a public share link; confirm that is
    # wanted wherever this script is run.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)