| """ |
| Bangla Book Recommender — Hugging Face Space |
| ============================================ |
| |
| Cold-start recommendation interface for the RokomariBG dataset using two |
| benchmarked recommender models from the paper "Towards Personalized Bangla |
| Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset" |
| (https://arxiv.org/abs/2602.12129). |
| |
| Performance note |
| ---------------- |
| The catalogue has 127K books. Loading all of them into a Dropdown component |
| freezes the browser. Instead, this app uses a search-as-you-type pattern: |
| the user types a query, the backend filters titles/authors and returns up |
| to ~50 matches, and only those matches are rendered in a Dropdown. This |
| keeps the UI responsive even on free CPU tiers. |
| """ |
|
|
from __future__ import annotations

import html
import zlib
from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd
|
|
| |
| |
| |
# Directory holding this script; data files are looked up relative to it.
ROOT = Path(__file__).parent

# Lower bound, initial value and upper bound of the "number of
# recommendations" slider.
TOP_K_MIN = 5
TOP_K_DEFAULT = 10
TOP_K_MAX = 30

# Maximum number of book labels returned by a single search.
SEARCH_RESULT_LIMIT = 50

# Display label -> per-model settings:
#   key   - file-name prefix of the model's .npy artefacts
#   dim   - embedding width used when synthetic embeddings are generated
#   color - accent colour used when rendering this model's results
MODEL_CONFIG: Dict[str, Dict] = {
    "Neural Two-Tower (best)": dict(key="two_tower", dim=256, color="#7c3aed"),
    "LightGCN": dict(key="lightgcn", dim=64, color="#0891b2"),
}
|
|
|
|
| |
| |
| |
def load_metadata() -> pd.DataFrame:
    """Return the book catalogue as a cleaned DataFrame.

    Reads ``books_metadata.parquet`` from ``ROOT``. When that file does not
    exist, a 500-row synthetic catalogue is generated from a fixed seed.

    The returned frame always contains:

    * string columns ``book_id``, ``title``, ``author``, ``category``,
      ``summary`` and ``book_url`` (missing values become ``""``);
    * a numeric ``rating`` column clipped to ``[1.0, 5.0]`` (values that
      cannot be parsed become NaN);
    * a lower-cased ``_search`` column of the form ``"<title> <author>"``;
    * a fresh ``0..n-1`` index.
    """
    parquet_file = ROOT / "books_metadata.parquet"

    if not parquet_file.exists():
        print(f" β οΈ {parquet_file.name} not found β using synthetic fallback")
        rng = np.random.default_rng(42)
        demo_rows = 500
        numbers = range(demo_rows)
        frame = pd.DataFrame({
            "book_id": [f"demo_{i:05d}" for i in numbers],
            "title": [f"Demo Book #{i}" for i in numbers],
            "author": [f"Demo Author {i % 30}" for i in numbers],
            "category": rng.choice(["Fiction", "History", "Science"], size=demo_rows),
            "rating": np.round(3 + rng.random(demo_rows) * 2, 1),
            "summary": ["Synthetic placeholder summary."] * demo_rows,
            "book_url": [""] * demo_rows,
        })
    else:
        frame = pd.read_parquet(parquet_file)
        print(f" β metadata loaded from {parquet_file.name}")

    # Text columns: create any that are absent, then coerce everything to str.
    for name in ("book_id", "title", "author", "category", "summary", "book_url"):
        if name not in frame:
            frame[name] = ""
        frame[name] = frame[name].fillna("").astype(str)

    # Rating stays numeric so it can be ranked; bad values turn into NaN.
    if "rating" not in frame:
        frame["rating"] = np.nan
    numeric_rating = pd.to_numeric(frame["rating"], errors="coerce")
    frame["rating"] = numeric_rating.clip(lower=1.0, upper=5.0)

    # Lower-cased haystack used for substring search.
    title_lc = frame["title"].str.lower()
    author_lc = frame["author"].str.lower()
    frame["_search"] = title_lc + " " + author_lc

    return frame.reset_index(drop=True)
|
|
|
|
def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int):
    """Load one model's book embeddings and ids, or synthesise them.

    Looks for ``<model_key>_book_emb.npy`` and ``<model_key>_book_ids.npy``
    in ``ROOT``. If either file is missing, random embeddings are generated
    for the first ``fallback_n`` books of ``METADATA``.

    Args:
        model_key: File-name prefix of the model's artefacts.
        fallback_n: Number of books to synthesise embeddings for; capped at
            the number of rows in ``METADATA``.
        fallback_dim: Width of the synthetic embeddings.

    Returns:
        ``(emb, ids, id_to_row, source)`` where ``emb`` is a float32 matrix
        with L2-normalised rows, ``ids`` holds the book id of each row,
        ``id_to_row`` maps a book id to its row number, and ``source`` is
        ``"real"`` or ``"synthetic"``.
    """
    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"

    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(review): allow_pickle=True unpickles the file, which can run
        # arbitrary code. Only acceptable while the .npy files are trusted
        # artefacts shipped with the app.
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        source = "real"
    else:
        # hash() of a str is randomised per process, so it cannot serve as a
        # reproducible seed; CRC32 yields the same 32-bit value on every run.
        seed = zlib.crc32(model_key.encode("utf-8"))
        rng = np.random.default_rng(seed)
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        # Size the matrix from the ids actually available so every embedding
        # row has a matching id even when fallback_n exceeds the catalogue.
        emb = rng.standard_normal((len(ids), fallback_dim)).astype(np.float32)
        source = "synthetic"

    # Unit-length rows make a dot product equal to cosine similarity.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero rows untouched instead of dividing by 0
    emb = emb / norms

    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source
|
|
|
|
# Everything below runs once at import time and fills the module-level
# lookup tables used by the callbacks.
_BANNER_RULE = "β" * 60
print(_BANNER_RULE)
print(" Bangla Book Recommender β startup")
print(_BANNER_RULE)

print("Loading metadataβ¦")
METADATA = load_metadata()
print(f" β {len(METADATA):,} books in catalogue")

print("Loading model embeddingsβ¦")
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    EMBEDDINGS[label] = dict(
        emb=emb,
        ids=ids,
        id_to_row=id_to_row,
        dim=cfg["dim"],
        color=cfg["color"],
        source=source,
    )
    print(f" β {label:30s} {str(emb.shape):16s} [{source}]")


def _book_label(row) -> str:
    """Return the dropdown label for one catalogue row (title, plus author if any)."""
    if row.author:
        return f"{row.title} β {row.author}"
    return row.title


# One label per catalogue row, in METADATA row order.
ALL_LABELS = [_book_label(row) for row in METADATA.itertuples(index=False)]

# Label -> book id. Rows that share a label collapse onto the last one.
LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).values))

# Book id -> full metadata row.
BOOKID_TO_META = {row.book_id: row for row in METADATA.itertuples(index=False)}

# Labels shown before the user has typed anything: the highest-rated books.
_DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first")
DEFAULT_LABELS = [_book_label(row) for row in _DEFAULT_SAMPLE.itertuples(index=False)]
|
|
|
|
| |
| |
| |
def search_books(query: str, currently_selected: List[str]):
    """Return a dropdown update with up to SEARCH_RESULT_LIMIT matching labels.

    The query is trimmed, lower-cased and matched as a plain substring
    against the ``_search`` column of ``METADATA``. A blank query yields
    ``DEFAULT_LABELS``.

    Only the dropdown's ``choices`` change; ``value`` is passed back as
    received. Already-selected labels are placed ahead of the search results
    in ``choices``, so earlier picks stay selected after the user searches
    for something else.

    Args:
        query: Text typed into the search box.
        currently_selected: Labels currently selected in the dropdown.

    Returns:
        A ``gr.update`` carrying the merged ``choices`` and the unchanged
        ``value``.
    """
    selected = list(currently_selected or [])
    needle = (query or "").strip().lower()

    if not needle:
        choices = DEFAULT_LABELS
    else:
        mask = METADATA["_search"].str.contains(needle, regex=False, na=False)
        # ALL_LABELS is built from METADATA in row order, so a row position
        # indexes it directly. Reusing it guarantees every returned label is
        # a key of LABEL_TO_BOOKID instead of re-deriving the label format.
        hit_positions = np.flatnonzero(mask.to_numpy())[:SEARCH_RESULT_LIMIT]
        choices = [ALL_LABELS[pos] for pos in hit_positions]

    # dict.fromkeys removes duplicates while keeping first-seen order.
    merged = list(dict.fromkeys(selected + list(choices)))
    return gr.update(choices=merged, value=selected)
|
|
|
|
| |
| |
| |
def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """Recommend the books closest to the mean embedding of the chosen seeds.

    Args:
        seed_labels: Dropdown labels of the books the user picked.
        model_label: Key into ``EMBEDDINGS`` selecting the model.
        top_k: Requested number of recommendations. It is capped at the
            number of books in the model that are not seeds.

    Returns:
        ``(html, picker_update)``: the rendered recommendation cards (or an
        empty-state message), and a ``gr.update`` whose ``choices`` are the
        labels of the recommended books with nothing selected.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    model = EMBEDDINGS[model_label]
    emb, id_to_row = model["emb"], model["id_to_row"]

    # A model may cover only part of the catalogue; keep the seeds it knows.
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "<b>Neural Two-Tower</b> which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    # Taste vector: mean of the seed embeddings, rescaled to unit length.
    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)

    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend a seed back to the user

    # Cap at the number of non-seed rows. Capping at len(scores) - 1 let the
    # masked seeds (score -inf) fill the tail of the result whenever fewer
    # than top_k other books existed.
    n_candidates = len(scores) - len(set(seed_rows))
    top_k = max(0, min(int(top_k), n_candidates))

    if top_k > 0:
        # Partial selection of the top_k best rows, then an exact sort of
        # just those rows by descending score.
        top_idx = np.argpartition(-scores, top_k - 1)[:top_k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]
    else:
        top_idx = np.array([], dtype=int)

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # The model knows this id but the catalogue does not; skip it.
            continue
        recs.append({
            "rank": len(recs) + 1,
            "title": meta.title,
            "author": meta.author,
            "category": meta.category,
            "rating": meta.rating,
            "summary": (meta.summary or "")[:240],
            "book_url": meta.book_url,
            "score": float(scores[row_i]),
        })

    cards_html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    # Same label format as the seed dropdown, so a recommendation can be
    # promoted to a seed and resolved through LABEL_TO_BOOKID.
    rec_choices = [
        f"{r['title']} β {r['author']}" if r["author"] else r["title"]
        for r in recs
    ]
    return cards_html, gr.update(choices=rec_choices, value=[])
|
|
|
|
def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """Append the picked recommendations to the user's selected books.

    Args:
        current_favs: Labels currently selected in the taste-profile dropdown.
        to_add: Labels picked in the refine dropdown.

    Returns:
        A ``gr.update`` for the taste-profile dropdown. With nothing to add
        only ``value`` is set (to the current selection). Otherwise both
        ``choices`` and ``value`` become the de-duplicated, order-preserving
        union of the two lists.
    """
    existing = current_favs or []
    if not to_add:
        return gr.update(value=existing)

    combined = list(dict.fromkeys(existing + to_add))
    return gr.update(choices=combined, value=combined)
|
|
|
|
| |
| |
| |
def _empty_state_html(message: str = "Search for books you have enjoyed, "
                      "select a few, then click <b>Get Recommendations</b>.") -> str:
    """Return the centred placeholder panel shown when there are no results.

    ``message`` is inserted into the markup as-is, so it may contain inline
    HTML such as ``<b>``.
    """
    markup = [
        "",
        '<div style="text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);">',
        '<div style="font-size:2.6rem;margin-bottom:.5rem;">π</div>',
        f'<div style="font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;">{message}</div>',
        "</div>",
        "",
    ]
    return "\n".join(markup)
|
|
|
|
def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
    """Build the HTML shown in the recommendations panel.

    Args:
        recs: Recommendation dicts with keys ``rank``, ``title``, ``author``,
            ``category``, ``rating``, ``summary``, ``book_url`` and ``score``.
        model_label: Key into ``EMBEDDINGS``; supplies the accent colour and
            tells whether the embeddings are synthetic.
        seeds_in_model: Number of seed books found in the model.
        total_seeds: Number of seed books the user selected.

    Returns:
        An HTML string made of optional warning banners, a header line and
        one card per recommendation. When ``recs`` is empty the empty-state
        panel is returned instead.
    """
    color = EMBEDDINGS[model_label]["color"]
    source = EMBEDDINGS[model_label]["source"]
    if not recs:
        return _empty_state_html("No recommendations could be produced.")

    # Every value interpolated into the markup below is escaped first, so
    # text containing <, >, & or quotes cannot break or inject markup.
    safe_label = html.escape(str(model_label))

    # NOTE(review): several literals in this function look mis-decoded
    # (e.g. the ellipsis, arrow and banner icons). They are kept as found.
    # Only the rating star is restored, because a stray line break inside
    # that literal made it unparseable.
    cards = []
    for r in recs:
        title = html.escape(str(r["title"]))
        author = html.escape(str(r["author"]))

        rating_html = (
            f"<span style='color:#f59e0b;font-weight:600;'>★{r['rating']:.1f}</span>"
            if r["rating"] is not None and not pd.isna(r["rating"]) else ""
        )
        category_html = (
            f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{html.escape(str(r['category']))}</span>"
            if r["category"] else ""
        )
        summary_html = (
            f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{html.escape(str(r['summary']))}β¦</div>"
            if r["summary"] else ""
        )
        # The URL sits inside a single-quoted attribute; html.escape also
        # encodes quote characters, so it cannot terminate the attribute.
        url_html = (
            f"<div style='margin-top:.7rem;'><a href='{html.escape(str(r['book_url']))}' target='_blank' rel='noopener' "
            f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari β</a></div>"
            if r["book_url"] else ""
        )

        cards.append(f"""
        <div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
        background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
        <div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
        <div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
        <div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {title}</div>
        <div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
        </div>
        <div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{author}</div>
        <div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
        {summary_html}{url_html}
        </div>
        """)

    grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
            + "".join(cards) + "</div>")

    warnings = []
    if source == "synthetic":
        warnings.append(
            "<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            "β οΈ This model is using <b>synthetic random embeddings</b>.</div>"
        )
    if seeds_in_model < total_seeds:
        skipped = total_seeds - seeds_in_model
        warnings.append(
            f"<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            f"βΉοΈ {skipped} of your {total_seeds} seed book(s) are not in the "
            f"<b>{safe_label}</b> embedding space and were skipped.</div>"
        )

    header = f"""
    <div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
    font-size:.9rem;color:var(--body-text-color-subdued,#666);">
    <span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
    <span><strong>{len(recs)}</strong> recommendations from <strong>{safe_label}</strong></span>
    </div>
    """
    return "".join(warnings) + header + grid
|
|
|
|
| |
| |
| |
# Markdown shown at the top of the page. It is an f-string, so the catalogue
# size is interpolated once, when the module is imported.
INTRO_MD = f"""
# π Bangla Book Recommender
### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).

Type a book name or author in the search box below, pick a few you have enjoyed,
choose a model, and get nearest-neighbour recommendations from the same embedding
space the model was trained in.
"""


# Markdown shown inside the collapsed "About this Space" accordion.
ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste vector*
is computed as the mean of those books' embeddings, and the books with the
highest cosine similarity to your taste vector are surfaced.

**Models.**
- **Neural Two-Tower** β best benchmarked model. Item tower fuses ID, content
(title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** β pure graph collaborative filtering with 4 GCN layers.

**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""
|
|
|
|
# Page layout and event wiring. Components are created inside nested context
# managers; the nesting below is reconstructed from those `with` statements.
with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)

    with gr.Accordion("βΉοΈ About this Space", open=False):
        gr.Markdown(ABOUT_MD)

    with gr.Row():
        # Left column: the inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")

            # NOTE(review): the placeholder asks for at least 2 letters, but
            # search_books does not enforce a minimum query length.
            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or Englishβ¦",
                show_label=True,
            )

            # Multi-select holding the user's seed books. Its choices start
            # as DEFAULT_LABELS and are replaced by search_books on each
            # change of the search box.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3β5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )

            gr.Markdown("### 2. Choose a model")
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN, maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT, step=1,
                label="Number of recommendations",
            )

            run_btn = gr.Button("π Get Recommendations", variant="primary", size="lg")

            gr.Markdown("### 3. Refine (optional)")
            # Starts empty; recommend() fills its choices with the labels of
            # the recommended books.
            refine_picker = gr.Dropdown(
                choices=[], value=[], multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("β Add to my favourites", size="sm")

        # Right column: the rendered recommendation cards.
        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            output_html = gr.HTML(value=_empty_state_html())

    # Footer with dataset, code, paper and licence links.
    gr.Markdown(
        """
        ---
        <div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
        Built on the
        <a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
        Β·
        <a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
        Β·
        <a href='https://arxiv.org/abs/2602.12129'>Paper</a>
        Β·
        CC BY-NC 4.0
        </div>
        """
    )

    # A change to the search text refreshes the seed dropdown's choices; the
    # current selection is passed in so search_books can hand it back.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    # The main button renders the cards and fills the refine picker.
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    # Merge the picked recommendations into the seed dropdown.
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )
|
|
|
|
# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link. Confirm
    # this is wanted, given the app is described as a Hugging Face Space.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "share": True,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)