"""
Bangla Book Recommender — Hugging Face Space
============================================

Cold-start recommendation interface for the RokomariBG dataset using two
benchmarked recommender models from the paper "Towards Personalized Bangla
Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
(https://arxiv.org/abs/2602.12129).

Performance note
----------------
The catalogue has 127K books. Loading all of them into a Dropdown component
freezes the browser. Instead, this app uses a search-as-you-type pattern: the
user types a query, the backend filters titles/authors and returns up to ~50
matches, and only those matches are rendered in a Dropdown. This keeps the UI
responsive even on free CPU tiers.
"""

from __future__ import annotations

import zlib
from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

import gradio as gr
import numpy as np
import pandas as pd

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────

ROOT = Path(__file__).parent
TOP_K_MIN, TOP_K_DEFAULT, TOP_K_MAX = 5, 10, 30
SEARCH_RESULT_LIMIT = 50  # max books shown in the search dropdown at any time
SUMMARY_MAX_CHARS = 240   # summaries longer than this are truncated on cards

# UI label -> file-name key, embedding dimension (used for the synthetic
# fallback only) and accent colour used when rendering cards.
MODEL_CONFIG: Dict[str, Dict] = {
    "Neural Two-Tower (best)": {"key": "two_tower", "dim": 256, "color": "#7c3aed"},
    "LightGCN": {"key": "lightgcn", "dim": 64, "color": "#0891b2"},
}


# ─────────────────────────────────────────────────────────────────────────────
# Loading
# ─────────────────────────────────────────────────────────────────────────────

def load_metadata() -> pd.DataFrame:
    """Load the book catalogue, or build a small synthetic one if it is missing.

    Reads ``books_metadata.parquet`` next to this file. When the file does not
    exist, a 500-row placeholder catalogue is generated so the app still
    starts.

    Returns:
        A DataFrame with a fresh 0..n-1 index in which the text columns
        (``book_id``, ``title``, ``author``, ``category``, ``summary``,
        ``book_url``) exist and are NaN-free strings, ``rating`` is numeric
        (NaN where unparseable) clipped to [1, 5], and ``_search`` holds the
        lower-cased "title author" text used by ``search_books``.
    """
    path = ROOT / "books_metadata.parquet"
    if path.exists():
        df = pd.read_parquet(path)
        print(f" → metadata loaded from {path.name}")
    else:
        print(f" ⚠️ {path.name} not found — using synthetic fallback")
        rng = np.random.default_rng(42)
        n = 500
        df = pd.DataFrame({
            "book_id": [f"demo_{i:05d}" for i in range(n)],
            "title": [f"Demo Book #{i}" for i in range(n)],
            "author": [f"Demo Author {i % 30}" for i in range(n)],
            "category": rng.choice(["Fiction", "History", "Science"], size=n),
            "rating": np.round(3 + rng.random(n) * 2, 1),
            "summary": ["Synthetic placeholder summary."] * n,
            "book_url": [""] * n,
        })

    # Guarantee every text column exists and contains plain strings.
    for col, default in [("book_id", ""), ("title", ""), ("author", ""),
                         ("category", ""), ("summary", ""), ("book_url", "")]:
        if col not in df.columns:
            df[col] = default
        df[col] = df[col].fillna(default).astype(str)

    if "rating" not in df.columns:
        df["rating"] = np.nan
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce").clip(lower=1.0, upper=5.0)

    # Precompute lowercased search index for fast filtering
    df["_search"] = (df["title"].str.lower() + " " + df["author"].str.lower())
    return df.reset_index(drop=True)


def load_embedding_pair(
    model_key: str, fallback_n: int, fallback_dim: int
) -> Tuple[np.ndarray, np.ndarray, Dict[str, int], str]:
    """Load one model's book embeddings and ids, L2-normalised row-wise.

    Looks for ``{model_key}_book_emb.npy`` and ``{model_key}_book_ids.npy``
    next to this file. If either is missing, random embeddings are generated
    for the first ``fallback_n`` books of the module-level ``METADATA`` (which
    must therefore already be loaded when this is called).

    Args:
        model_key: File-name prefix of the model (e.g. ``"two_tower"``).
        fallback_n: Number of rows to synthesise when files are missing.
        fallback_dim: Embedding width to synthesise when files are missing.

    Returns:
        ``(emb, ids, id_to_row, source)`` where ``emb`` is a float32 matrix
        with unit-norm rows (all-zero rows are left as zeros), ``ids`` is the
        matching array of string book ids, ``id_to_row`` maps id -> row index,
        and ``source`` is ``"real"`` or ``"synthetic"``.
    """
    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"

    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(security): allow_pickle=True executes pickled code on load.
        # Acceptable only because these files ship with the app; never point
        # this at untrusted uploads.
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        source = "real"
    else:
        # Built-in hash() of a str is salted per process, so it would give a
        # different seed on every restart; CRC32 is stable and fits in 32 bits.
        seed = zlib.crc32(model_key.encode("utf-8"))
        rng = np.random.default_rng(seed=seed)
        emb = rng.standard_normal((fallback_n, fallback_dim)).astype(np.float32)
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        source = "synthetic"

    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    emb = emb / norms

    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source


def _make_label(title: str, author: str) -> str:
    """Return the dropdown label for a book: "title — author", or just title."""
    return f"{title} — {author}" if author else title


print("─" * 60)
print(" Bangla Book Recommender — startup")
print("─" * 60)

print("Loading metadata…")
METADATA = load_metadata()
print(f" → {len(METADATA):,} books in catalogue")

print("Loading model embeddings…")
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    EMBEDDINGS[label] = {
        "emb": emb,
        "ids": ids,
        "id_to_row": id_to_row,
        "dim": cfg["dim"],
        "color": cfg["color"],
        "source": source,
    }
    print(f" → {label:30s} {str(emb.shape):16s} [{source}]")

# Pre-computed display labels for ALL books (used for showing selected books)
ALL_LABELS = [
    _make_label(row.title, row.author)
    for row in METADATA.itertuples(index=False)
]
# NOTE: books sharing the same title and author produce the same label; the
# last such book wins in this mapping.
LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).values))
BOOKID_TO_META = {row.book_id: row for row in METADATA.itertuples(index=False)}

# Surface a small set of "popular" labels by default (highest-rated)
# This gives the dropdown something to show before the user types.
_DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first")
DEFAULT_LABELS = [
    _make_label(row.title, row.author)
    for row in _DEFAULT_SAMPLE.itertuples(index=False)
]


# ─────────────────────────────────────────────────────────────────────────────
# Search — the core UX fix
# ─────────────────────────────────────────────────────────────────────────────

def search_books(query: str, currently_selected: List[str]):
    """
    Return up to SEARCH_RESULT_LIMIT book labels matching the query.

    The dropdown's `choices` is updated, but `value` (currently selected
    items) is preserved exactly. This means a user can search → pick →
    search something else → pick, and earlier picks remain selected even
    though they're no longer in the search results.

    Matching is a case-insensitive plain substring test (no regex) against
    the precomputed "title author" text. A blank query falls back to
    DEFAULT_LABELS.
    """
    selected = currently_selected or []

    if not query or not query.strip():
        choices = DEFAULT_LABELS
    else:
        q = query.strip().lower()
        mask = METADATA["_search"].str.contains(q, regex=False, na=False)
        matched = METADATA.loc[mask].head(SEARCH_RESULT_LIMIT)
        choices = [
            _make_label(row.title, row.author)
            for row in matched.itertuples(index=False)
        ]

    # Always include the currently-selected items so they remain visible
    # even if they don't match the new query. dict.fromkeys de-duplicates
    # while keeping order.
    merged = list(dict.fromkeys(selected + choices))
    return gr.update(choices=merged, value=selected)


# ─────────────────────────────────────────────────────────────────────────────
# Recommendation
# ─────────────────────────────────────────────────────────────────────────────

def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """Recommend the books closest to the mean embedding of the seed books.

    Args:
        seed_labels: Dropdown labels of the books the user picked.
        model_label: Key into ``EMBEDDINGS`` / ``MODEL_CONFIG``.
        top_k: Requested number of recommendations.

    Returns:
        ``(html, dropdown_update)``: the rendered cards (or an empty-state
        message) and a ``gr.update`` that fills the "refine" dropdown with the
        recommended labels and clears its selection.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    model = EMBEDDINGS[model_label]
    emb, id_to_row = model["emb"], model["id_to_row"]

    # Seeds unknown to this model are dropped; the renderer reports how many.
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "Neural Two-Tower which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    # Rows are unit-norm, so the dot product with the normalised mean vector
    # is the cosine similarity.
    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)
    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend a seed back to the user

    # Cap k at the number of non-seed rows; otherwise the -inf seeds would be
    # pulled into the result when the catalogue is small.
    n_candidates = len(scores) - len(set(seed_rows))
    k = max(0, min(int(top_k), n_candidates))
    if k == 0:
        top_idx = np.empty(0, dtype=np.intp)
    else:
        # argpartition selects the k best in O(n); only those k get sorted.
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # Embedding exists but the catalogue has no metadata row for it.
            continue
        summary = meta.summary or ""
        if len(summary) > SUMMARY_MAX_CHARS:
            # Ellipsis only when text was actually cut off.
            summary = summary[:SUMMARY_MAX_CHARS] + "…"
        recs.append({
            "rank": len(recs) + 1,
            "title": meta.title,
            "author": meta.author,
            "category": meta.category,
            "rating": meta.rating,
            "summary": summary,
            "book_url": meta.book_url,
            "score": float(scores[row_i]),
        })

    cards_html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    rec_choices = [_make_label(r["title"], r["author"]) for r in recs]
    return cards_html, gr.update(choices=rec_choices, value=[])


def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """Append the picked recommendations to the taste-profile dropdown.

    Args:
        current_favs: Labels currently selected in the taste profile.
        to_add: Labels picked in the "refine" dropdown.

    Returns:
        A ``gr.update`` for the taste-profile dropdown. With nothing to add
        only the value is re-sent; otherwise both choices and value become
        the order-preserving, de-duplicated union of the two lists.
    """
    if not to_add:
        return gr.update(value=current_favs or [])
    merged = list(dict.fromkeys((current_favs or []) + to_add))
    return gr.update(choices=merged, value=merged)


# ─────────────────────────────────────────────────────────────────────────────
# Rendering
# ─────────────────────────────────────────────────────────────────────────────

def _empty_state_html(message: str = "Search for books you have enjoyed, "
                      "select a few, then click Get Recommendations.") -> str:
    """Return the placeholder panel shown when there is nothing to display.

    ``message`` is treated as plain text and HTML-escaped.
    """
    return f"""
<div style="text-align:center; padding:48px 16px; color:#6b7280;">
  <div style="font-size:48px;">📚</div>
  <div style="margin-top:8px;">{escape(message, quote=False)}</div>
</div>
"""


def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
    """Render recommendations as HTML cards, preceded by warnings and a header.

    Args:
        recs: List of dicts as built by ``recommend`` (keys ``rank``,
            ``title``, ``author``, ``category``, ``rating``, ``summary``,
            ``book_url``, ``score``).
        model_label: Key into ``EMBEDDINGS``; selects accent colour and
            whether the synthetic-embedding warning is shown.
        seeds_in_model: Number of seed books found in the model's embeddings.
        total_seeds: Number of valid seed books the user selected.

    Returns:
        An HTML string. All catalogue-derived text is escaped because the
        result is injected verbatim into a ``gr.HTML`` component.
    """
    color = EMBEDDINGS[model_label]["color"]
    source = EMBEDDINGS[model_label]["source"]
    safe_model = escape(model_label, quote=False)

    if not recs:
        return _empty_state_html("No recommendations could be produced.")

    cards = []
    for r in recs:
        title = escape(r["title"], quote=False)
        author = escape(r["author"], quote=False)

        rating_html = (
            f"<span style=\"color:#f59e0b;\">★ {r['rating']:.1f}</span>"
            if r["rating"] is not None and not pd.isna(r["rating"]) else ""
        )
        category_html = (
            "<span style=\"margin-left:8px; color:#6b7280;\">"
            f"{escape(r['category'], quote=False)}</span>"
            if r["category"] else ""
        )
        summary_html = (
            "<p style=\"margin:8px 0 0 0; font-size:13px;\">"
            f"{escape(r['summary'], quote=False)}</p>"
            if r["summary"] else ""
        )
        url_html = (
            "<p style=\"margin:8px 0 0 0; font-size:13px;\">"
            # quote=True: the URL sits inside a double-quoted attribute.
            f"<a href=\"{escape(r['book_url'], quote=True)}\" target=\"_blank\" "
            f"rel=\"noopener noreferrer\" style=\"color:{color};\">"
            "View on Rokomari →</a></p>"
            if r["book_url"] else ""
        )
        cards.append(f"""
<div style="border:1px solid #e5e7eb; border-left:4px solid {color}; border-radius:8px; padding:12px 16px; margin-bottom:12px;">
  <div style="display:flex; justify-content:space-between; gap:8px;">
    <div style="font-weight:600;">#{r['rank']}. {title}</div>
    <div style="color:{color}; font-size:12px; white-space:nowrap;">sim {r['score']:.3f}</div>
  </div>
  <div style="color:#6b7280; font-size:13px;">{author}</div>
  <div style="margin-top:4px; font-size:12px;">{rating_html}{category_html}</div>
  {summary_html}{url_html}
</div>
""")

    grid = ("<div>" + "".join(cards) + "</div>")

    warnings = []
    if source == "synthetic":
        warnings.append(
            "<div style=\"padding:8px 12px; margin-bottom:8px; border-radius:6px; "
            "background:#fef3c7; color:#92400e;\">"
            "⚠️ This model is using synthetic random embeddings.</div>"
        )
    if seeds_in_model < total_seeds:
        skipped = total_seeds - seeds_in_model
        warnings.append(
            "<div style=\"padding:8px 12px; margin-bottom:8px; border-radius:6px; "
            "background:#e0f2fe; color:#075985;\">"
            f"ℹ️ {skipped} of your {total_seeds} seed book(s) are not in the "
            f"{safe_model} embedding space and were skipped.</div>"
        )

    header = f"""
<div style="margin-bottom:12px; font-weight:600;">
{len(recs)} recommendations from <span style="color:{color};">{safe_model}</span>
</div>
"""
    return "".join(warnings) + header + grid


# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────

INTRO_MD = f"""
# 📚 Bangla Book Recommender

### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).

Type a book name or author in the search box below, pick a few you have
enjoyed, choose a model, and get nearest-neighbour recommendations from the
same embedding space the model was trained in.
"""

ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste
vector* is computed as the mean of those books' embeddings, and the books with
the highest cosine similarity to your taste vector are surfaced.

**Models.**

- **Neural Two-Tower** — best benchmarked model. Item tower fuses ID, content (title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** — pure graph collaborative filtering with 4 GCN layers.

**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""

# Kept at module level (column 0) so Markdown never sees indented text.
FOOTER_MD = """
---

Built on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset) · Code · [Paper](https://arxiv.org/abs/2602.12129) · CC BY-NC 4.0
"""

with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)

    with gr.Accordion("ℹ️ About this Space", open=False):
        gr.Markdown(ABOUT_MD)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")
            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or English…",
                show_label=True,
            )
            # filterable=False: filtering happens server-side in search_books,
            # the dropdown only ever holds the current result page.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3–5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )

            gr.Markdown("### 2. Choose a model")
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN,
                maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT,
                step=1,
                label="Number of recommendations",
            )
            run_btn = gr.Button("🔍 Get Recommendations", variant="primary", size="lg")

            gr.Markdown("### 3. Refine (optional)")
            refine_picker = gr.Dropdown(
                choices=[],
                value=[],
                multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("➕ Add to my favourites", size="sm")

        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            output_html = gr.HTML(value=_empty_state_html())

    gr.Markdown(FOOTER_MD)

    # Event wiring must stay inside the Blocks context.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        ssr_mode=False,
    )