"""
Bangla Book Recommender — Hugging Face Space
============================================
Cold-start recommendation interface for the RokomariBG dataset using two
benchmarked recommender models from the paper "Towards Personalized Bangla
Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
(https://arxiv.org/abs/2602.12129).
Performance note
----------------
The catalogue has 127K books. Loading all of them into a Dropdown component
freezes the browser. Instead, this app uses a search-as-you-type pattern:
the user types a query, the backend filters titles/authors and returns up
to ~50 matches, and only those matches are rendered in a Dropdown. This
keeps the UI responsive even on free CPU tiers.
"""
from __future__ import annotations
from pathlib import Path
from typing import Dict, List
import gradio as gr
import numpy as np
import pandas as pd
# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
# Directory containing this script; data files are resolved relative to it.
ROOT = Path(__file__).parent

# Lower bound, default and upper bound for the "number of recommendations" slider.
TOP_K_MIN = 5
TOP_K_DEFAULT = 10
TOP_K_MAX = 30

# Maximum number of books listed in the search dropdown at any time.
SEARCH_RESULT_LIMIT = 50

# Settings per model, keyed by the label shown in the UI:
#   "key"   - file-name prefix of the model's embedding / id arrays
#   "dim"   - embedding width (used when synthetic embeddings are generated)
#   "color" - accent colour used when rendering that model's results
MODEL_CONFIG: Dict[str, Dict] = dict([
    (
        "Neural Two-Tower (best)",
        dict(key="two_tower", dim=256, color="#7c3aed"),
    ),
    (
        "LightGCN",
        dict(key="lightgcn", dim=64, color="#0891b2"),
    ),
])
# ─────────────────────────────────────────────────────────────────────────────
# Loading
# ─────────────────────────────────────────────────────────────────────────────
def load_metadata() -> pd.DataFrame:
    """Return the book catalogue as a cleaned DataFrame.

    Reads ``books_metadata.parquet`` from ``ROOT`` when it exists; otherwise
    builds a 500-row synthetic catalogue from a fixed random seed. In both
    cases the text columns are guaranteed to exist as strings without missing
    values, ``rating`` is numeric and clipped to [1.0, 5.0], and a lowercase
    ``_search`` column (title + " " + author) is added for substring search.

    Returns:
        The catalogue with a fresh 0..n-1 index.
    """
    parquet_file = ROOT / "books_metadata.parquet"

    if not parquet_file.exists():
        print(f" β οΈ {parquet_file.name} not found β using synthetic fallback")
        generator = np.random.default_rng(42)
        count = 500
        # Categories are drawn before ratings so the random stream is always
        # consumed in the same order.
        categories = generator.choice(["Fiction", "History", "Science"], size=count)
        ratings = np.round(3 + generator.random(count) * 2, 1)
        books = pd.DataFrame({
            "book_id": [f"demo_{i:05d}" for i in range(count)],
            "title": [f"Demo Book #{i}" for i in range(count)],
            "author": [f"Demo Author {i % 30}" for i in range(count)],
            "category": categories,
            "rating": ratings,
            "summary": ["Synthetic placeholder summary."] * count,
            "book_url": [""] * count,
        })
    else:
        books = pd.read_parquet(parquet_file)
        print(f" β metadata loaded from {parquet_file.name}")

    # Every text column must exist and hold plain strings (missing -> "").
    text_columns = ("book_id", "title", "author", "category", "summary", "book_url")
    for name in text_columns:
        if name not in books.columns:
            books[name] = ""
        filled = books[name].fillna("")
        books[name] = filled.astype(str)

    if "rating" not in books.columns:
        books["rating"] = np.nan
    numeric_rating = pd.to_numeric(books["rating"], errors="coerce")
    books["rating"] = numeric_rating.clip(lower=1.0, upper=5.0)

    # Lowercased "title author" text, computed once so searches only need a
    # substring test.
    lowered_title = books["title"].str.lower()
    lowered_author = books["author"].str.lower()
    books["_search"] = lowered_title + " " + lowered_author

    return books.reset_index(drop=True)
def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int):
    """Load, or synthesise, the row-normalised book embeddings of one model.

    Looks for ``<model_key>_book_emb.npy`` and ``<model_key>_book_ids.npy``
    under ``ROOT``. When either file is missing, random embeddings are
    generated for the first ``fallback_n`` catalogue books instead.

    Args:
        model_key: File-name prefix identifying the model.
        fallback_n: Maximum number of catalogue books to cover when
            synthesising embeddings.
        fallback_dim: Width of the synthetic embeddings.

    Returns:
        A tuple ``(emb, ids, id_to_row, source)``: the float32 embedding
        matrix with unit-length rows (all-zero rows are left as zeros), the
        array of book ids as strings, a dict mapping each id to its row, and
        ``"real"`` or ``"synthetic"``.

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of ids.
    """
    import zlib  # function-scope import keeps this block self-contained

    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"

    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(security): allow_pickle=True can execute code embedded in the
        # file; only ship id files from a trusted source.
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        source = "real"
    else:
        # Take the ids first so the matrix is sized to the ids actually
        # available, even when fallback_n exceeds the catalogue length.
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        # The built-in hash() of a str is salted per process, which made the
        # synthetic embeddings change on every restart. CRC32 is stable and
        # already lies within the 32-bit seed range.
        seed = zlib.crc32(model_key.encode("utf-8"))
        rng = np.random.default_rng(seed=seed)
        emb = rng.standard_normal((len(ids), fallback_dim)).astype(np.float32)
        source = "synthetic"

    # Fail fast: a mismatch would otherwise surface later as wrong lookups or
    # an IndexError while recommending.
    if emb.shape[0] != len(ids):
        raise ValueError(
            f"{model_key}: {emb.shape[0]} embedding rows but {len(ids)} book ids"
        )

    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing all-zero rows by zero
    emb = emb / norms

    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source
# Startup: everything below runs once at import time and fills the
# module-level tables that the callbacks read.
print("β" * 60)
print(" Bangla Book Recommender β startup")
print("β" * 60)

print("Loading metadataβ¦")
METADATA = load_metadata()
print(f" β {len(METADATA):,} books in catalogue")

print("Loading model embeddingsβ¦")
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    # One record per UI model label: the arrays plus display settings.
    EMBEDDINGS[label] = dict(
        emb=emb,
        ids=ids,
        id_to_row=id_to_row,
        dim=cfg["dim"],
        color=cfg["color"],
        source=source,
    )
    print(f" β {label:30s} {str(emb.shape):16s} [{source}]")

# Display label for every catalogue row, in catalogue order; the author part
# is omitted when the author string is empty.
ALL_LABELS = [
    (f"{title} β {author}" if author else title)
    for title, author in zip(METADATA["title"], METADATA["author"])
]

# Display label -> book id. When two rows share a label the later row wins.
LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).values))

# Book id -> full metadata row (a named tuple with one field per column).
BOOKID_TO_META = {}
for row in METADATA.itertuples(index=False):
    BOOKID_TO_META[row.book_id] = row

# Labels offered before the user has typed anything: the highest-rated books,
# at most SEARCH_RESULT_LIMIT of them.
_DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first")
DEFAULT_LABELS = [
    (f"{title} β {author}" if author else title)
    for title, author in zip(_DEFAULT_SAMPLE["title"], _DEFAULT_SAMPLE["author"])
]
# ─────────────────────────────────────────────────────────────────────────────
# Search — the core UX fix
# ─────────────────────────────────────────────────────────────────────────────
def search_books(query: str, currently_selected: List[str]):
    """Refresh the seed dropdown's choices for a search query.

    A blank query yields ``DEFAULT_LABELS``; otherwise the first
    ``SEARCH_RESULT_LIMIT`` catalogue rows whose lowercased title/author text
    contains the lowercased, stripped query are used. Labels that are already
    selected are placed ahead of the results so earlier picks stay available
    even when they do not match the new query.

    Args:
        query: Text typed into the search box (may be empty or None).
        currently_selected: Labels currently selected in the dropdown.

    Returns:
        A ``gr.update`` that sets the dropdown's choices and leaves its value
        equal to the current selection.
    """
    picked = currently_selected or []
    needle = query.strip().lower() if query else ""

    if needle:
        # Plain substring match (no regex) against the precomputed index.
        is_hit = METADATA["_search"].str.contains(needle, regex=False, na=False)
        hits = METADATA.loc[is_hit].head(SEARCH_RESULT_LIMIT)
        results = [
            (f"{title} β {author}" if author else title)
            for title, author in zip(hits["title"], hits["author"])
        ]
    else:
        results = DEFAULT_LABELS

    # Selected labels first, then results, dropping repeats but keeping order.
    combined = []
    seen = set()
    for label in picked + results:
        if label in seen:
            continue
        seen.add(label)
        combined.append(label)

    return gr.update(choices=combined, value=picked)
# ─────────────────────────────────────────────────────────────────────────────
# Recommendation
# ─────────────────────────────────────────────────────────────────────────────
def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """Recommend books similar to the mean embedding of the seed books.

    The seed labels are mapped to book ids and then to rows of the chosen
    model's embedding matrix. The mean of those rows, re-normalised, is the
    taste vector; every book is scored by its dot product with that vector
    and the best-scoring non-seed books are returned.

    Args:
        seed_labels: Display labels of the books the user selected.
        model_label: Key into ``EMBEDDINGS`` naming the model to use.
        top_k: Requested number of recommendations.

    Returns:
        A tuple ``(html, picker_update)``: the HTML for the results panel and
        a ``gr.update`` for the refine dropdown listing the recommended
        labels with nothing selected. When no recommendation can be made the
        HTML is an explanatory empty state and the dropdown is cleared.

    Raises:
        KeyError: If ``model_label`` is not a key of ``EMBEDDINGS``.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    model = EMBEDDINGS[model_label]
    emb, id_to_row = model["emb"], model["id_to_row"]
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "<b>Neural Two-Tower</b> which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)  # epsilon guards a zero mean
    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend a book the user already picked

    # Cap k at the number of non-seed rows. The previous cap of
    # len(scores) - 1 let the -inf seed rows into the result whenever fewer
    # than top_k other books existed.
    n_candidates = len(scores) - len(set(seed_rows))
    k = max(0, min(int(top_k), n_candidates))
    if k == 0:
        top_idx = np.empty(0, dtype=np.intp)
    else:
        # Partial selection of the k best scores, then an exact sort of those.
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # The embedding has no matching catalogue row: nothing to display.
            continue
        recs.append({
            "rank": len(recs) + 1,
            "title": meta.title,
            "author": meta.author,
            "category": meta.category,
            "rating": meta.rating,
            "summary": (meta.summary or "")[:240],
            "book_url": meta.book_url,
            "score": float(scores[row_i]),
        })

    cards_html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    # Same label format as the seed dropdown, so picks can be promoted into it.
    rec_choices = [
        f"{r['title']} β {r['author']}" if r["author"] else r["title"]
        for r in recs
    ]
    return cards_html, gr.update(choices=rec_choices, value=[])
def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """Merge newly chosen labels into the seed dropdown's selection.

    Args:
        current_favs: Labels currently selected in the seed dropdown.
        to_add: Labels chosen in the refine dropdown.

    Returns:
        A ``gr.update`` for the seed dropdown. With nothing to add, only the
        value is set (to the current selection). Otherwise both choices and
        value become the existing selection followed by the new labels, with
        repeats removed and order preserved.
    """
    existing = current_favs or []
    if not to_add:
        return gr.update(value=existing)

    combined = []
    seen = set()
    for label in existing + to_add:
        if label not in seen:
            seen.add(label)
            combined.append(label)
    return gr.update(choices=combined, value=combined)
# ─────────────────────────────────────────────────────────────────────────────
# Rendering
# ─────────────────────────────────────────────────────────────────────────────
def _empty_state_html(message: str = "Search for books you have enjoyed, "
"select a few, then click <b>Get Recommendations</b>.") -> str:
return f"""
<div style="text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);">
<div style="font-size:2.6rem;margin-bottom:.5rem;">π</div>
<div style="font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;">{message}</div>
</div>
"""
def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
color = EMBEDDINGS[model_label]["color"]
source = EMBEDDINGS[model_label]["source"]
if not recs:
return _empty_state_html("No recommendations could be produced.")
cards = []
for r in recs:
rating_html = (
f"<span style='color:#f59e0b;font-weight:600;'>β
{r['rating']:.1f}</span>"
if r["rating"] is not None and not pd.isna(r["rating"]) else ""
)
category_html = (
f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{r['category']}</span>"
if r["category"] else ""
)
summary_html = (
f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{r['summary']}β¦</div>"
if r["summary"] else ""
)
url_html = (
f"<div style='margin-top:.7rem;'><a href='{r['book_url']}' target='_blank' rel='noopener' "
f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari β</a></div>"
if r["book_url"] else ""
)
cards.append(f"""
<div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
<div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
<div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
<div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {r['title']}</div>
<div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
</div>
<div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{r['author']}</div>
<div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
{summary_html}{url_html}
</div>
""")
grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
+ "".join(cards) + "</div>")
warnings = []
if source == "synthetic":
warnings.append(
"<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
"padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
"β οΈ This model is using <b>synthetic random embeddings</b>.</div>"
)
if seeds_in_model < total_seeds:
skipped = total_seeds - seeds_in_model
warnings.append(
f"<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
"padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
f"βΉοΈ {skipped} of your {total_seeds} seed book(s) are not in the "
f"<b>{model_label}</b> embedding space and were skipped.</div>"
)
header = f"""
<div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
font-size:.9rem;color:var(--body-text-color-subdued,#666);">
<span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
<span><strong>{len(recs)}</strong> recommendations from <strong>{model_label}</strong></span>
</div>
"""
return "".join(warnings) + header + grid
# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Markdown rendered at the top of the page. It is an f-string so that the
# subtitle can state the number of rows in METADATA (with thousands separators).
INTRO_MD = f"""
# π Bangla Book Recommender
### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).
Type a book name or author in the search box below, pick a few you have enjoyed,
choose a model, and get nearest-neighbour recommendations from the same embedding
space the model was trained in.
"""
# Markdown rendered inside the "About this Space" accordion: how the
# recommendations are computed, the two models, and the paper link.
ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste vector*
is computed as the mean of those books' embeddings, and the books with the
highest cosine similarity to your taste vector are surfaced.
**Models.**
- **Neural Two-Tower** β best benchmarked model. Item tower fuses ID, content
(title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** β pure graph collaborative filtering with 4 GCN layers.
**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""
# Page layout and event wiring. Components are created in display order, so
# the statement order below is significant.
# NOTE(review): the nesting shown here was reconstructed from the statement
# sequence and the two `scale=` columns - confirm against the deployed layout.
with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)
    with gr.Accordion("βΉοΈ About this Space", open=False):
        gr.Markdown(ABOUT_MD)
    with gr.Row():
        # Left column: inputs (search, seed selection, model, top-k, refine).
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")
            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or Englishβ¦",
                show_label=True,
            )
            # Starts with the default labels; search_books swaps the choices
            # on each query while keeping the current selection.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3β5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )
            gr.Markdown("### 2. Choose a model")
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN, maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT, step=1,
                label="Number of recommendations",
            )
            run_btn = gr.Button("π Get Recommendations", variant="primary", size="lg")
            gr.Markdown("### 3. Refine (optional)")
            # Filled by recommend() with the labels of the latest results.
            refine_picker = gr.Dropdown(
                choices=[], value=[], multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("β Add to my favourites", size="sm")
        # Right column: the rendered recommendation cards.
        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            output_html = gr.HTML(value=_empty_state_html())
    # Footer with dataset, code, paper and licence links.
    gr.Markdown(
        """
---
<div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
Built on the
<a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
Β·
<a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
Β·
<a href='https://arxiv.org/abs/2602.12129'>Paper</a>
Β·
CC BY-NC 4.0
</div>
"""
    )
    # Typing in the search box refreshes the seed dropdown's choices.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    # The main button renders the cards and fills the refine dropdown.
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    # Promote the labels picked in the refine dropdown into the seed selection.
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )
if __name__ == "__main__":
    # Start the web server only when this file is run as a script.
    # It listens on every network interface at port 7860.
    # NOTE(review): share=True also requests a public share link; this is
    # presumably unnecessary when the app is hosted as a Space - confirm.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        ssr_mode=False,
    )