""" Bangla Book Recommender — Hugging Face Space ============================================ Cold-start recommendation interface for the RokomariBG dataset using two benchmarked recommender models from the paper "Towards Personalized Bangla Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset" (https://arxiv.org/abs/2602.12129). Performance note ---------------- The catalogue has 127K books. Loading all of them into a Dropdown component freezes the browser. Instead, this app uses a search-as-you-type pattern: the user types a query, the backend filters titles/authors and returns up to ~50 matches, and only those matches are rendered in a Dropdown. This keeps the UI responsive even on free CPU tiers. """ from __future__ import annotations from pathlib import Path from typing import Dict, List import gradio as gr import numpy as np import pandas as pd # ───────────────────────────────────────────────────────────────────────────── # Configuration # ───────────────────────────────────────────────────────────────────────────── ROOT = Path(__file__).parent TOP_K_MIN, TOP_K_DEFAULT, TOP_K_MAX = 5, 10, 30 SEARCH_RESULT_LIMIT = 50 # max books shown in the search dropdown at any time MODEL_CONFIG: Dict[str, Dict] = { "Neural Two-Tower (best)": {"key": "two_tower", "dim": 256, "color": "#7c3aed"}, "LightGCN": {"key": "lightgcn", "dim": 64, "color": "#0891b2"}, } # ───────────────────────────────────────────────────────────────────────────── # Loading # ───────────────────────────────────────────────────────────────────────────── def load_metadata() -> pd.DataFrame: path = ROOT / "books_metadata.parquet" if path.exists(): df = pd.read_parquet(path) print(f" → metadata loaded from {path.name}") else: print(f" ⚠️ {path.name} not found — using synthetic fallback") rng = np.random.default_rng(42) n = 500 df = pd.DataFrame({ "book_id": [f"demo_{i:05d}" for i in range(n)], "title": [f"Demo Book #{i}" for i in range(n)], "author": [f"Demo Author {i % 30}" for i in range(n)], "category": rng.choice(["Fiction", "History", "Science"], size=n), "rating": np.round(3 + rng.random(n) * 2, 1), "summary": ["Synthetic placeholder summary."] * n, "book_url": [""] * n, }) for col, default in [("book_id", ""), ("title", ""), ("author", ""), ("category", ""), ("summary", ""), ("book_url", "")]: if col not in df.columns: df[col] = default df[col] = df[col].fillna(default).astype(str) if "rating" not in df.columns: df["rating"] = np.nan df["rating"] = pd.to_numeric(df["rating"], errors="coerce").clip(lower=1.0, upper=5.0) # Precompute lowercased search index for fast filtering df["_search"] = (df["title"].str.lower() + " " + df["author"].str.lower()) return df.reset_index(drop=True) def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int): emb_path = ROOT / f"{model_key}_book_emb.npy" ids_path = ROOT / f"{model_key}_book_ids.npy" if emb_path.exists() and ids_path.exists(): emb = np.load(emb_path).astype(np.float32) ids = np.load(ids_path, allow_pickle=True).astype(str) source = "real" else: rng = np.random.default_rng(seed=abs(hash(model_key)) % (2**32)) emb = rng.standard_normal((fallback_n, fallback_dim)).astype(np.float32) ids = METADATA["book_id"].values[:fallback_n].astype(str) source = "synthetic" norms = np.linalg.norm(emb, axis=1, keepdims=True) norms[norms == 0] = 1.0 emb = emb / norms id_to_row = {bid: i for i, bid in enumerate(ids)} return emb, ids, id_to_row, source print("─" * 60) print(" Bangla Book Recommender — startup") print("─" * 60) print("Loading metadata…") METADATA = load_metadata() print(f" → {len(METADATA):,} books in catalogue") print("Loading model embeddings…") EMBEDDINGS: Dict[str, Dict] = {} for label, cfg in MODEL_CONFIG.items(): emb, ids, id_to_row, source = load_embedding_pair( cfg["key"], len(METADATA), cfg["dim"] ) EMBEDDINGS[label] = { "emb": emb, "ids": ids, "id_to_row": id_to_row, "dim": cfg["dim"], "color": cfg["color"], "source": source, } print(f" → {label:30s} {str(emb.shape):16s} [{source}]") # Pre-computed display labels for ALL books (used for showing selected books) ALL_LABELS = [ f"{row.title} — {row.author}" if row.author else row.title for row in METADATA.itertuples(index=False) ] LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).values)) BOOKID_TO_META = {row.book_id: row for row in METADATA.itertuples(index=False)} # Surface a small set of "popular" labels by default (highest-rated) # This gives the dropdown something to show before the user types. _DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first") DEFAULT_LABELS = [ f"{row.title} — {row.author}" if row.author else row.title for row in _DEFAULT_SAMPLE.itertuples(index=False) ] # ───────────────────────────────────────────────────────────────────────────── # Search — the core UX fix # ───────────────────────────────────────────────────────────────────────────── def search_books(query: str, currently_selected: List[str]): """ Return up to SEARCH_RESULT_LIMIT book labels matching the query. The dropdown's `choices` is updated, but `value` (currently selected items) is preserved exactly. This means a user can search → pick → search something else → pick, and earlier picks remain selected even though they're no longer in the search results. """ selected = currently_selected or [] if not query or not query.strip(): choices = DEFAULT_LABELS else: q = query.strip().lower() mask = METADATA["_search"].str.contains(q, regex=False, na=False) matched = METADATA.loc[mask].head(SEARCH_RESULT_LIMIT) choices = [ f"{row.title} — {row.author}" if row.author else row.title for row in matched.itertuples(index=False) ] # Always include the currently-selected items so they remain visible # even if they don't match the new query. merged = list(dict.fromkeys(selected + choices)) return gr.update(choices=merged, value=selected) # ───────────────────────────────────────────────────────────────────────────── # Recommendation # ───────────────────────────────────────────────────────────────────────────── def recommend(seed_labels: List[str], model_label: str, top_k: int): if not seed_labels: return _empty_state_html(), gr.update(choices=[], value=[]) seed_book_ids = [ LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID ] if not seed_book_ids: return (_empty_state_html("No valid seed books selected."), gr.update(choices=[], value=[])) model = EMBEDDINGS[model_label] emb, id_to_row = model["emb"], model["id_to_row"] seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row] if not seed_rows: msg = (f"None of the selected books exist in the {model_label} embedding " "space. This model was trained on a subset of books with sufficient " "interaction history. Try different books, or switch to " "Neural Two-Tower which has broader coverage.") return _empty_state_html(msg), gr.update(choices=[], value=[]) user_vec = emb[seed_rows].mean(axis=0) user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8) scores = emb @ user_vec scores[seed_rows] = -np.inf top_k = min(int(top_k), len(scores) - 1) top_idx = np.argpartition(-scores, top_k)[:top_k] top_idx = top_idx[np.argsort(-scores[top_idx])] recs = [] for row_i in top_idx: book_id = str(model["ids"][row_i]) meta = BOOKID_TO_META.get(book_id) if meta is None: continue recs.append({ "rank": len(recs) + 1, "title": meta.title, "author": meta.author, "category": meta.category, "rating": meta.rating, "summary": (meta.summary or "")[:240], "book_url": meta.book_url, "score": float(scores[row_i]), }) html = _render_recommendation_cards( recs, model_label, len(seed_rows), len(seed_book_ids) ) rec_choices = [ f"{r['title']} — {r['author']}" if r["author"] else r["title"] for r in recs ] return html, gr.update(choices=rec_choices, value=[]) def add_to_favourites(current_favs: List[str], to_add: List[str]): if not to_add: return gr.update(value=current_favs or []) merged = list(dict.fromkeys((current_favs or []) + to_add)) return gr.update(choices=merged, value=merged) # ───────────────────────────────────────────────────────────────────────────── # Rendering # ───────────────────────────────────────────────────────────────────────────── def _empty_state_html(message: str = "Search for books you have enjoyed, " "select a few, then click Get Recommendations.") -> str: return f"""