Spaces:

DevnilMaster1
/

Bangla-Book-Recommender

Running

App Files Files Community

Bangla-Book-Recommender / app.py

DevnilMaster1

Replace 127K-item dropdown with server-side search-as-you-type

a964075 verified 17 days ago

raw

history blame contribute delete

19.6 kB

	"""
	Bangla Book Recommender — Hugging Face Space
	============================================

	Cold-start recommendation interface for the RokomariBG dataset using two
	benchmarked recommender models from the paper "Towards Personalized Bangla
	Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
	(https://arxiv.org/abs/2602.12129).

	Performance note
	----------------
	The catalogue has 127K books. Loading all of them into a Dropdown component
	freezes the browser. Instead, this app uses a search-as-you-type pattern:
	the user types a query, the backend filters titles/authors and returns up
	to ~50 matches, and only those matches are rendered in a Dropdown. This
	keeps the UI responsive even on free CPU tiers.
	"""

	from __future__ import annotations

	import html
	import zlib
	from pathlib import Path
	from typing import Dict, List

	import gradio as gr
	import numpy as np
	import pandas as pd

	# ─────────────────────────────────────────────────────────────────────────────
	# Configuration
	# ─────────────────────────────────────────────────────────────────────────────
	# Directory containing this script; data and embedding files are resolved against it.
	ROOT = Path(__file__).parents[0]

	# Lower bound, initial value and upper bound of the "number of recommendations" slider.
	TOP_K_MIN = 5
	TOP_K_DEFAULT = 10
	TOP_K_MAX = 30

	# Maximum number of books listed in the search dropdown at any time.
	SEARCH_RESULT_LIMIT = 50

	# UI label -> settings for one model:
	#   key   : file-name prefix of the model's embedding / id arrays
	#   dim   : embedding width used when synthetic embeddings must be generated
	#   color : accent colour of the model's recommendation cards
	MODEL_CONFIG: Dict[str, Dict] = {
	    "Neural Two-Tower (best)": dict(key="two_tower", dim=256, color="#7c3aed"),
	    "LightGCN": dict(key="lightgcn", dim=64, color="#0891b2"),
	}


	# ─────────────────────────────────────────────────────────────────────────────
	# Loading
	# ─────────────────────────────────────────────────────────────────────────────
	def load_metadata() -> pd.DataFrame:
	    """Load the book catalogue and normalise it for search and display.

	    Reads ``books_metadata.parquet`` from ``ROOT`` when the file exists;
	    otherwise builds a 500-row synthetic catalogue (fixed RNG seed) so the
	    app can still start.

	    Returns:
	        A frame with a fresh 0..n-1 index in which ``book_id``, ``title``,
	        ``author``, ``category``, ``summary`` and ``book_url`` are NaN-free
	        strings, ``rating`` is numeric and clipped to [1, 5] (NaN where
	        missing or unparseable), and ``_search`` holds the lowercased
	        "title author" text used for substring search.
	    """
	    parquet_file = ROOT / "books_metadata.parquet"
	    if not parquet_file.exists():
	        print(f" ⚠️ {parquet_file.name} not found — using synthetic fallback")
	        rng = np.random.default_rng(42)
	        n = 500
	        positions = range(n)
	        # The two RNG draws stay in this order so the synthetic data is unchanged.
	        categories = rng.choice(["Fiction", "History", "Science"], size=n)
	        ratings = np.round(3 + rng.random(n) * 2, 1)
	        df = pd.DataFrame({
	            "book_id": [f"demo_{i:05d}" for i in positions],
	            "title": [f"Demo Book #{i}" for i in positions],
	            "author": [f"Demo Author {i % 30}" for i in positions],
	            "category": categories,
	            "rating": ratings,
	            "summary": ["Synthetic placeholder summary."] * n,
	            "book_url": [""] * n,
	        })
	    else:
	        df = pd.read_parquet(parquet_file)
	        print(f" → metadata loaded from {parquet_file.name}")

	    # Every text column must exist and hold plain strings (missing -> "").
	    text_columns = ("book_id", "title", "author", "category", "summary", "book_url")
	    for name in text_columns:
	        if name not in df.columns:
	            df[name] = ""
	        df[name] = df[name].fillna("").astype(str)

	    # Ratings become floats; unparseable values turn into NaN, the rest are clipped.
	    if "rating" not in df.columns:
	        df["rating"] = np.nan
	    numeric_rating = pd.to_numeric(df["rating"], errors="coerce")
	    df["rating"] = numeric_rating.clip(lower=1.0, upper=5.0)

	    # Lowercased "title author" text, computed once so searching is a cheap substring test.
	    lowered_title = df["title"].str.lower()
	    lowered_author = df["author"].str.lower()
	    df["_search"] = lowered_title + " " + lowered_author
	    return df.reset_index(drop=True)


	def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int):
	    """Load, or synthesise, the L2-normalised book embeddings of one model.

	    Looks for ``{model_key}_book_emb.npy`` and ``{model_key}_book_ids.npy``
	    under ``ROOT``. If either file is missing, random embeddings are
	    generated for the first ``fallback_n`` catalogue books instead.

	    Args:
	        model_key: File-name prefix identifying the model.
	        fallback_n: Number of rows to generate when falling back.
	        fallback_dim: Embedding width to generate when falling back.

	    Returns:
	        ``(emb, ids, id_to_row, source)``: the row-normalised float32
	        matrix, the book id of each row as strings, a book id -> row index
	        mapping, and ``"real"`` or ``"synthetic"``.
	    """
	    emb_path = ROOT / f"{model_key}_book_emb.npy"
	    ids_path = ROOT / f"{model_key}_book_ids.npy"

	    if emb_path.exists() and ids_path.exists():
	        emb = np.load(emb_path).astype(np.float32)
	        # NOTE(review): allow_pickle=True unpickles arbitrary objects. That is
	        # acceptable only while the id file ships with this repository; never
	        # point this at an externally supplied file.
	        ids = np.load(ids_path, allow_pickle=True).astype(str)
	        source = "real"
	    else:
	        # Built-in hash() of a str is salted per process, so it cannot give a
	        # reproducible seed; CRC32 of the key is stable across runs.
	        seed = zlib.crc32(model_key.encode("utf-8"))
	        rng = np.random.default_rng(seed=seed)
	        emb = rng.standard_normal((fallback_n, fallback_dim)).astype(np.float32)
	        ids = METADATA["book_id"].values[:fallback_n].astype(str)
	        source = "synthetic"

	    # Rows and ids must pair up one-to-one; otherwise id_to_row would point at
	    # rows that do not exist in the matrix.
	    if len(emb) != len(ids):
	        n_common = min(len(emb), len(ids))
	        print(f" ⚠️ {model_key}: {len(emb)} embedding rows vs {len(ids)} ids — "
	              f"truncating both to {n_common}")
	        emb, ids = emb[:n_common], ids[:n_common]

	    # Unit-normalise rows so a dot product equals cosine similarity;
	    # all-zero rows are left as zeros instead of dividing by zero.
	    norms = np.linalg.norm(emb, axis=1, keepdims=True)
	    norms[norms == 0] = 1.0
	    emb = emb / norms

	    id_to_row = {bid: i for i, bid in enumerate(ids)}
	    return emb, ids, id_to_row, source


	# Startup banner (three lines on stdout).
	print("─" * 60, " Bangla Book Recommender — startup", "─" * 60, sep="\n")

	print("Loading metadata…")
	METADATA = load_metadata()
	print(f" → {len(METADATA):,} books in catalogue")

	print("Loading model embeddings…")
	# UI label -> loaded embedding matrix plus lookup tables and display settings.
	EMBEDDINGS: Dict[str, Dict] = {}
	for label, cfg in MODEL_CONFIG.items():
	    emb, ids, id_to_row, source = load_embedding_pair(cfg["key"], len(METADATA), cfg["dim"])
	    EMBEDDINGS[label] = dict(
	        emb=emb,
	        ids=ids,
	        id_to_row=id_to_row,
	        dim=cfg["dim"],
	        color=cfg["color"],
	        source=source,
	    )
	    print(f" → {label:30s} {str(emb.shape):16s} [{source}]")

	# Display label of every book, in catalogue order: "title — author",
	# or just the title when the author is empty.
	ALL_LABELS = [
	    title if not author else f"{title} — {author}"
	    for title, author in zip(METADATA["title"], METADATA["author"])
	]
	# Display label -> book id (a later row wins when two books share a label).
	LABEL_TO_BOOKID = {
	    shown: book_id
	    for shown, book_id in zip(ALL_LABELS, METADATA["book_id"].astype(str).values)
	}
	# Book id -> full metadata row (named tuple).
	BOOKID_TO_META = dict(zip(METADATA["book_id"], METADATA.itertuples(index=False)))

	# Highest-rated books, offered in the dropdown before the user has typed anything.
	_DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first")
	DEFAULT_LABELS = [
	    title if not author else f"{title} — {author}"
	    for title, author in zip(_DEFAULT_SAMPLE["title"], _DEFAULT_SAMPLE["author"])
	]


	# ─────────────────────────────────────────────────────────────────────────────
	# Search — the core UX fix
	# ─────────────────────────────────────────────────────────────────────────────
	def search_books(query: str, currently_selected: List[str]):
	    """Refresh the dropdown choices for a search query, keeping the selection.

	    Args:
	        query: Text typed into the search box; matched case-insensitively as
	            a plain substring of "title author".
	        currently_selected: Labels currently picked in the dropdown.

	    Returns:
	        A ``gr.update`` whose ``choices`` are the current picks followed by
	        up to ``SEARCH_RESULT_LIMIT`` matches (duplicates removed), and whose
	        ``value`` is the unchanged selection. A blank query yields the
	        default labels instead of search matches.
	    """
	    kept = currently_selected or []
	    needle = (query or "").strip().lower()

	    if needle:
	        hits = METADATA["_search"].str.contains(needle, regex=False, na=False)
	        top_hits = METADATA[hits].head(SEARCH_RESULT_LIMIT)
	        found = [
	            title if not author else f"{title} — {author}"
	            for title, author in zip(top_hits["title"], top_hits["author"])
	        ]
	    else:
	        found = DEFAULT_LABELS

	    # Picks go first so they stay listed even when they no longer match the
	    # query; dict.fromkeys removes duplicates while preserving order.
	    merged = list(dict.fromkeys([*kept, *found]))
	    return gr.update(choices=merged, value=kept)


	# ─────────────────────────────────────────────────────────────────────────────
	# Recommendation
	# ─────────────────────────────────────────────────────────────────────────────
	def recommend(seed_labels: List[str], model_label: str, top_k: int):
	    """Recommend the books closest to the mean embedding of the chosen seeds.

	    Args:
	        seed_labels: Display labels picked in the taste-profile dropdown.
	        model_label: Key into ``EMBEDDINGS`` selecting the model to use.
	        top_k: Requested number of recommendations.

	    Returns:
	        ``(html, update)``: the HTML for the results pane, and a
	        ``gr.update`` that installs the recommended labels as the choices of
	        the refine picker with nothing selected. When no recommendation can
	        be made, the HTML is an explanatory empty state and the choices are
	        empty.
	    """
	    if not seed_labels:
	        return _empty_state_html(), gr.update(choices=[], value=[])

	    seed_book_ids = [
	        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
	    ]
	    if not seed_book_ids:
	        return (_empty_state_html("No valid seed books selected."),
	                gr.update(choices=[], value=[]))

	    model = EMBEDDINGS[model_label]
	    emb, id_to_row = model["emb"], model["id_to_row"]

	    # A model may cover only part of the catalogue; keep the seeds it knows.
	    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
	    if not seed_rows:
	        msg = (f"None of the selected books exist in the {model_label} embedding "
	               "space. This model was trained on a subset of books with sufficient "
	               "interaction history. Try different books, or switch to "
	               "<b>Neural Two-Tower</b> which has broader coverage.")
	        return _empty_state_html(msg), gr.update(choices=[], value=[])

	    # Taste vector = normalised mean of the seed embeddings.
	    user_vec = emb[seed_rows].mean(axis=0)
	    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)

	    scores = emb @ user_vec
	    scores[seed_rows] = -np.inf  # never recommend a book the user already picked

	    # Cap k at the number of non-seed books. Capping at len(scores) - 1 would
	    # let the masked seeds (score -inf) back in whenever the candidate pool
	    # is smaller than the requested top_k.
	    n_candidates = len(scores) - len(set(seed_rows))
	    k = max(0, min(int(top_k), n_candidates))
	    if k:
	        top_idx = np.argpartition(-scores, k - 1)[:k]
	        top_idx = top_idx[np.argsort(-scores[top_idx])]
	        # Drop anything without a usable similarity (masked seeds, NaN rows).
	        top_idx = top_idx[np.isfinite(scores[top_idx])]
	    else:
	        top_idx = np.array([], dtype=np.intp)

	    recs = []
	    for row_i in top_idx:
	        book_id = str(model["ids"][row_i])
	        meta = BOOKID_TO_META.get(book_id)
	        if meta is None:
	            # Embedding row without catalogue metadata: nothing to display.
	            continue
	        recs.append({
	            "rank": len(recs) + 1,
	            "title": meta.title,
	            "author": meta.author,
	            "category": meta.category,
	            "rating": meta.rating,
	            "summary": (meta.summary or "")[:240],
	            "book_url": meta.book_url,
	            "score": float(scores[row_i]),
	        })

	    cards_html = _render_recommendation_cards(
	        recs, model_label, len(seed_rows), len(seed_book_ids)
	    )
	    # Labels offered in the refine picker; de-duplicated because two books
	    # can share the same "title — author" label.
	    rec_choices = list(dict.fromkeys(
	        f"{r['title']} — {r['author']}" if r["author"] else r["title"]
	        for r in recs
	    ))
	    return cards_html, gr.update(choices=rec_choices, value=[])


	def add_to_favourites(current_favs: List[str], to_add: List[str]):
	    """Append the picked recommendations to the taste-profile selection.

	    Args:
	        current_favs: Labels currently selected in the taste profile.
	        to_add: Labels picked in the refine dropdown.

	    Returns:
	        A ``gr.update``. With nothing to add it only re-asserts the current
	        selection; otherwise both ``choices`` and ``value`` become the
	        order-preserving, duplicate-free union of the two lists.
	    """
	    existing = current_favs or []
	    if to_add:
	        combined = list(dict.fromkeys(existing + to_add))
	        return gr.update(choices=combined, value=combined)
	    return gr.update(value=existing)


	# ─────────────────────────────────────────────────────────────────────────────
	# Rendering
	# ─────────────────────────────────────────────────────────────────────────────
	def _empty_state_html(
	    message: str = "Search for books you have enjoyed, "
	                   "select a few, then click <b>Get Recommendations</b>.",
	) -> str:
	    """Return the centred placeholder panel shown when there are no results.

	    Args:
	        message: Text placed under the book icon. It is inserted as-is, so it
	            may contain HTML markup.

	    Returns:
	        An HTML fragment: one wrapper ``<div>`` holding the icon and the message.
	    """
	    panel = f"""
	<div style="text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);">
	<div style="font-size:2.6rem;margin-bottom:.5rem;">📚</div>
	<div style="font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;">{message}</div>
	</div>
	"""
	    return panel


	def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
	    """Render recommendations as an HTML card grid with a header and notices.

	    Catalogue text (title, author, category, summary, URL) is HTML-escaped
	    before being interpolated, so markup inside the data cannot alter the page.

	    Args:
	        recs: Dicts with the keys ``rank``, ``title``, ``author``,
	            ``category``, ``rating``, ``summary``, ``book_url`` and ``score``.
	        model_label: Key into ``EMBEDDINGS``; supplies the accent colour and
	            whether the embeddings are synthetic.
	        seeds_in_model: Number of seed books found in the model's embeddings.
	        total_seeds: Number of seed books the user selected.

	    Returns:
	        An HTML string: optional warning banners, a header line, then the
	        card grid. When ``recs`` is empty, the empty-state panel instead.
	    """
	    color = EMBEDDINGS[model_label]["color"]
	    source = EMBEDDINGS[model_label]["source"]
	    if not recs:
	        return _empty_state_html("No recommendations could be produced.")

	    safe_label = html.escape(str(model_label))

	    cards = []
	    for r in recs:
	        title = html.escape(str(r["title"]))
	        author = html.escape(str(r["author"]))
	        category = html.escape(str(r["category"]))
	        summary = html.escape(str(r["summary"]))

	        # Only plain web links are rendered; any other scheme (for example
	        # "javascript:") would be executed by the browser on click.
	        raw_url = str(r["book_url"] or "").strip()
	        if raw_url.lower().startswith(("http://", "https://")):
	            safe_url = html.escape(raw_url)
	        else:
	            safe_url = ""

	        rating_html = (
	            f"<span style='color:#f59e0b;font-weight:600;'>★ {r['rating']:.1f}</span>"
	            if r["rating"] is not None and not pd.isna(r["rating"]) else ""
	        )
	        category_html = (
	            f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{category}</span>"
	            if category else ""
	        )
	        summary_html = (
	            f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{summary}…</div>"
	            if summary else ""
	        )
	        url_html = (
	            f"<div style='margin-top:.7rem;'><a href='{safe_url}' target='_blank' rel='noopener' "
	            f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari →</a></div>"
	            if safe_url else ""
	        )

	        cards.append(f"""
	<div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
	background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
	<div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
	<div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
	<div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {title}</div>
	<div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
	</div>
	<div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{author}</div>
	<div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
	{summary_html}{url_html}
	</div>
	""")

	    grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
	            + "".join(cards) + "</div>")

	    warnings = []
	    if source == "synthetic":
	        warnings.append(
	            "<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
	            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
	            "⚠️ This model is using <b>synthetic random embeddings</b>.</div>"
	        )
	    if seeds_in_model < total_seeds:
	        # Tell the user that some picks had no embedding in this model.
	        skipped = total_seeds - seeds_in_model
	        warnings.append(
	            f"<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
	            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
	            f"ℹ️ {skipped} of your {total_seeds} seed book(s) are not in the "
	            f"<b>{safe_label}</b> embedding space and were skipped.</div>"
	        )

	    header = f"""
	<div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
	font-size:.9rem;color:var(--body-text-color-subdued,#666);">
	<span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
	<span><strong>{len(recs)}</strong> recommendations from <strong>{safe_label}</strong></span>
	</div>
	"""
	    return "".join(warnings) + header + grid


	# ─────────────────────────────────────────────────────────────────────────────
	# UI
	# ─────────────────────────────────────────────────────────────────────────────
	# Markdown rendered at the top of the page. The catalogue size is
	# interpolated once, when the module is imported.
	INTRO_MD = f"""
	# 📚 Bangla Book Recommender
	### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).

	Type a book name or author in the search box below, pick a few you have enjoyed,
	choose a model, and get nearest-neighbour recommendations from the same embedding
	space the model was trained in.
	"""

	# Markdown rendered inside the initially collapsed "About this Space" accordion.
	ABOUT_MD = """
	How it works. Each model was trained on the RokomariBG corpus (127K books,
	63K users, 209K reviews scraped from Rokomari.com) to produce a vector
	representation of every book. When you pick books you like, your taste vector
	is computed as the mean of those books' embeddings, and the books with the
	highest cosine similarity to your taste vector are surfaced.

	Models.
	- Neural Two-Tower — best benchmarked model. Item tower fuses ID, content
	(title, summary, author, publisher), and metadata. Strongest at cold-start.
	- LightGCN — pure graph collaborative filtering with 4 GCN layers.

	Citation. [Paper](https://arxiv.org/abs/2602.12129).
	"""


	# Page layout: intro, collapsible "about" text, then a two-column row with
	# the controls on the left and the results on the right, a footer, and
	# finally the event wiring.
	with gr.Blocks(title="Bangla Book Recommender") as demo:
	    gr.Markdown(INTRO_MD)

	    with gr.Accordion("ℹ️ About this Space", open=False):
	        gr.Markdown(ABOUT_MD)

	    with gr.Row():
	        # Left column: search, seed selection, model settings, refinement.
	        with gr.Column(scale=1):
	            gr.Markdown("### 1. Search and pick books you have enjoyed")

	            search_box = gr.Textbox(
	                show_label=True,
	                label="Search by title or author",
	                placeholder="Type at least 2 letters in Bangla or English…",
	            )

	            # Choices are replaced by search_books on every query change, so
	            # the component's own client-side filtering is switched off.
	            seeds = gr.Dropdown(
	                label="Your taste profile",
	                info="Pick 3–5 books from the search results.",
	                choices=DEFAULT_LABELS,
	                value=[],
	                multiselect=True,
	                max_choices=20,
	                filterable=False,
	                allow_custom_value=False,
	            )

	            gr.Markdown("### 2. Choose a model")
	            model_choice = gr.Radio(
	                label="Recommendation model",
	                choices=list(MODEL_CONFIG),
	                value="Neural Two-Tower (best)",
	            )
	            top_k = gr.Slider(
	                label="Number of recommendations",
	                minimum=TOP_K_MIN,
	                maximum=TOP_K_MAX,
	                step=1,
	                value=TOP_K_DEFAULT,
	            )

	            run_btn = gr.Button("🔍 Get Recommendations", variant="primary", size="lg")

	            gr.Markdown("### 3. Refine (optional)")
	            refine_picker = gr.Dropdown(
	                label="Promote recommendations into your taste profile",
	                info="After recommendations appear, pick the ones you like, then add them.",
	                choices=[],
	                value=[],
	                multiselect=True,
	            )
	            add_btn = gr.Button("➕ Add to my favourites", size="sm")

	        # Right column: the rendered recommendation cards.
	        with gr.Column(scale=2):
	            gr.Markdown("### Recommendations")
	            output_html = gr.HTML(value=_empty_state_html())

	    # Footer with dataset, code and paper links.
	    gr.Markdown(
	        """
	---
	<div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
	Built on the
	<a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
	·
	<a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
	·
	<a href='https://arxiv.org/abs/2602.12129'>Paper</a>
	·
	CC BY-NC 4.0
	</div>
	"""
	    )

	    # Typing in the search box refreshes the dropdown choices (no spinner).
	    search_box.change(
	        search_books,
	        inputs=[search_box, seeds],
	        outputs=[seeds],
	        show_progress="hidden",
	    )
	    # The button computes recommendations and fills the refine picker.
	    run_btn.click(
	        recommend,
	        inputs=[seeds, model_choice, top_k],
	        outputs=[output_html, refine_picker],
	    )
	    # Picked recommendations are merged into the taste profile.
	    add_btn.click(
	        add_to_favourites,
	        inputs=[seeds, refine_picker],
	        outputs=[seeds],
	    )


	# Script entry point: serve the app on all interfaces at port 7860.
	if __name__ == "__main__":
	    # `share` is deliberately not forced to True. Hugging Face Spaces do not
	    # support share links, and on a local run it would open a public tunnel
	    # to this machine without the operator asking for one. Leaving it unset
	    # keeps Gradio's default (no public link).
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        ssr_mode=False,
	    )