Spaces:

DevnilMaster1
/

Bangla-Book-Recommender

Running

File size: 19,575 Bytes

"""
Bangla Book Recommender — Hugging Face Space
============================================

Cold-start recommendation interface for the RokomariBG dataset using two
benchmarked recommender models from the paper "Towards Personalized Bangla
Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
(https://arxiv.org/abs/2602.12129).

Performance note
----------------
The catalogue has 127K books. Loading all of them into a Dropdown component
freezes the browser. Instead, this app uses a search-as-you-type pattern:
the user types a query, the backend filters titles/authors and returns up
to ~50 matches, and only those matches are rendered in a Dropdown. This
keeps the UI responsive even on free CPU tiers.
"""

from __future__ import annotations

from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
# Directory containing this script; data files are resolved relative to it.
ROOT = Path(__file__).parent

# Lower bound, initial value, and upper bound of the "Number of recommendations" slider.
TOP_K_MIN, TOP_K_DEFAULT, TOP_K_MAX = 5, 10, 30
SEARCH_RESULT_LIMIT = 50   # max books shown in the search dropdown at any time

# One entry per selectable model, keyed by the label shown in the UI:
#   key   — filename prefix of `<key>_book_emb.npy` / `<key>_book_ids.npy`
#   dim   — embedding width, used when synthetic fallback embeddings are generated
#   color — accent colour applied to that model's rendered results
MODEL_CONFIG: Dict[str, Dict] = {
    "Neural Two-Tower (best)": {"key": "two_tower", "dim": 256, "color": "#7c3aed"},
    "LightGCN":                {"key": "lightgcn",  "dim": 64,  "color": "#0891b2"},
}


# ─────────────────────────────────────────────────────────────────────────────
# Loading
# ─────────────────────────────────────────────────────────────────────────────
def load_metadata() -> pd.DataFrame:
    """Load the book catalogue and normalise it for searching and display.

    Reads ``books_metadata.parquet`` from ``ROOT`` when it exists; otherwise
    a 500-row synthetic catalogue is generated from a fixed seed so the app
    can still start.

    The returned frame always has the string columns ``book_id``, ``title``,
    ``author``, ``category``, ``summary`` and ``book_url`` (missing values
    become ``""``), a numeric ``rating`` column clipped to ``[1.0, 5.0]``
    (unparseable values become NaN), a lowercased ``_search`` column holding
    ``"<title> <author>"``, and a fresh 0..n-1 index.
    """
    parquet_file = ROOT / "books_metadata.parquet"

    if parquet_file.exists():
        frame = pd.read_parquet(parquet_file)
        print(f"  → metadata loaded from {parquet_file.name}")
    else:
        print(f"  ⚠️  {parquet_file.name} not found — using synthetic fallback")
        generator = np.random.default_rng(42)
        count = 500
        indices = range(count)
        # Draw categories before ratings: the order of RNG calls determines
        # the generated values.
        categories = generator.choice(["Fiction", "History", "Science"], size=count)
        ratings = np.round(3 + generator.random(count) * 2, 1)
        frame = pd.DataFrame({
            "book_id":  [f"demo_{i:05d}" for i in indices],
            "title":    [f"Demo Book #{i}" for i in indices],
            "author":   [f"Demo Author {i % 30}" for i in indices],
            "category": categories,
            "rating":   ratings,
            "summary":  ["Synthetic placeholder summary."] * count,
            "book_url": [""] * count,
        })

    # Every text column must exist and hold plain strings (never NaN/None).
    blank = ""
    text_columns = ("book_id", "title", "author", "category", "summary", "book_url")
    for name in text_columns:
        if name not in frame.columns:
            frame[name] = blank
        frame[name] = frame[name].fillna(blank).astype(str)

    # Ratings: coerce to numbers, then bound them to the 1–5 star range.
    if "rating" not in frame.columns:
        frame["rating"] = np.nan
    numeric_rating = pd.to_numeric(frame["rating"], errors="coerce")
    frame["rating"] = numeric_rating.clip(lower=1.0, upper=5.0)

    # Lowercased "title author" text, computed once so each search is a
    # plain substring scan.
    lowered_title = frame["title"].str.lower()
    lowered_author = frame["author"].str.lower()
    frame["_search"] = lowered_title + " " + lowered_author

    return frame.reset_index(drop=True)


def load_embedding_pair(model_key: str, fallback_n: int, fallback_dim: int):
    """Load one model's book embeddings and ids, or synthesise a stand-in.

    Looks under ``ROOT`` for ``<model_key>_book_emb.npy`` and
    ``<model_key>_book_ids.npy``. When both exist they are loaded; otherwise
    standard-normal embeddings are generated for (at most) the first
    ``fallback_n`` book ids of ``METADATA``.

    Args:
        model_key: Filename prefix identifying the model.
        fallback_n: Number of books to synthesise embeddings for.
        fallback_dim: Embedding width of the synthetic matrix.

    Returns:
        ``(emb, ids, id_to_row, source)`` where ``emb`` is a float32 matrix
        whose rows are scaled to unit L2 norm (all-zero rows stay zero),
        ``ids`` is the string array of book ids aligned with the rows,
        ``id_to_row`` maps each id to its row index, and ``source`` is
        ``"real"`` or ``"synthetic"``.

    Raises:
        ValueError: If the files on disk do not describe one id per
            embedding row.
    """
    import zlib  # function-scope import: only needed for the fallback seed

    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"

    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
        # objects, which can execute code. Only ship id files from a trusted
        # source (or re-save them as a fixed-width string array).
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        # Fail fast on misaligned artefacts: otherwise rows would be
        # attributed to the wrong books or index past the end of `ids`.
        if emb.ndim != 2 or ids.ndim != 1 or emb.shape[0] != ids.shape[0]:
            raise ValueError(
                f"{emb_path.name} has shape {emb.shape} but {ids_path.name} "
                f"has shape {ids.shape}; expected one id per embedding row"
            )
        source = "real"
    else:
        # The built-in hash() of a str is salted per process, so it would
        # give different "random" embeddings on every restart. CRC32 is
        # stable and already fits in 32 bits.
        seed = zlib.crc32(model_key.encode("utf-8"))
        rng = np.random.default_rng(seed=seed)
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        # Size the matrix from the ids actually available so every row has
        # an id even if fallback_n exceeds the catalogue size.
        emb = rng.standard_normal((len(ids), fallback_dim)).astype(np.float32)
        source = "synthetic"

    # Unit-normalise rows so a dot product equals cosine similarity.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing all-zero rows by zero
    emb = emb / norms

    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source


# Module-level startup: everything below runs once at import time and
# populates the globals that the callbacks read.
print("─" * 60)
print(" Bangla Book Recommender — startup")
print("─" * 60)

print("Loading metadata…")
METADATA = load_metadata()
print(f"  → {len(METADATA):,} books in catalogue")

print("Loading model embeddings…")
# UI label -> {"emb", "ids", "id_to_row", "dim", "color", "source"} for each
# model in MODEL_CONFIG.
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    # len(METADATA) and cfg["dim"] only size the synthetic fallback.
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    EMBEDDINGS[label] = {
        "emb": emb, "ids": ids, "id_to_row": id_to_row,
        "dim": cfg["dim"], "color": cfg["color"], "source": source,
    }
    print(f"  → {label:30s} {str(emb.shape):16s}  [{source}]")

# Pre-computed display labels for ALL books, in METADATA row order:
# "<title> — <author>", or just the title when the author is empty.
ALL_LABELS = [
    f"{row.title} — {row.author}" if row.author else row.title
    for row in METADATA.itertuples(index=False)
]
# Display label -> book_id. NOTE: books sharing the same title and author
# produce the same label; dict() keeps only the last such book_id.
LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).values))
# book_id -> metadata row (a namedtuple exposing the columns as attributes).
BOOKID_TO_META  = {row.book_id: row for row in METADATA.itertuples(index=False)}

# Surface a small set of "popular" labels by default (highest-rated)
# This gives the dropdown something to show before the user types.
_DEFAULT_SAMPLE = METADATA.nlargest(SEARCH_RESULT_LIMIT, "rating", keep="first")
DEFAULT_LABELS = [
    f"{row.title} — {row.author}" if row.author else row.title
    for row in _DEFAULT_SAMPLE.itertuples(index=False)
]


# ─────────────────────────────────────────────────────────────────────────────
# Search — the core UX fix
# ─────────────────────────────────────────────────────────────────────────────
def search_books(query: str, currently_selected: List[str]):
    """
    Refresh the seed dropdown with books whose title/author contain *query*.

    A blank (or whitespace-only) query falls back to DEFAULT_LABELS.
    Otherwise the query is trimmed, lowercased, and matched as a literal
    substring against the precomputed `_search` column; at most
    SEARCH_RESULT_LIMIT hits are kept, in catalogue order.

    The returned update replaces the dropdown's `choices` but passes the
    current selection straight back as `value`, so earlier picks survive
    even when they do not match the new query.
    """
    kept = currently_selected or []
    needle = (query or "").strip().lower()

    if needle:
        is_hit = METADATA["_search"].str.contains(needle, regex=False, na=False)
        hits = METADATA[is_hit].iloc[:SEARCH_RESULT_LIMIT]
        found = [
            title if not author else f"{title} — {author}"
            for title, author in zip(hits["title"], hits["author"])
        ]
    else:
        found = DEFAULT_LABELS

    # Selected labels go first and stay in `choices`; dict.fromkeys removes
    # duplicates while preserving order.
    options = list(dict.fromkeys(kept + found))
    return gr.update(choices=options, value=kept)


# ─────────────────────────────────────────────────────────────────────────────
# Recommendation
# ─────────────────────────────────────────────────────────────────────────────
def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """
    Recommend books similar to the selected seeds under one embedding model.

    The seeds' embedding rows are averaged into a taste vector, every book
    in the model is scored by dot product against it, and the best-scoring
    books that are not themselves seeds are returned.

    Args:
        seed_labels: Display labels picked in the taste-profile dropdown.
        model_label: Key into ``EMBEDDINGS`` selecting the model.
        top_k: Requested number of recommendations. It is capped at the
            number of non-seed books the model knows about.

    Returns:
        ``(html, picker_update)``: the rendered cards (or an explanatory
        empty state) and a ``gr.update`` whose ``choices`` are the labels of
        the recommended books, with nothing selected.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    model = EMBEDDINGS[model_label]
    emb, id_to_row = model["emb"], model["id_to_row"]

    # A model may cover only part of the catalogue; keep the seeds it knows.
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "<b>Neural Two-Tower</b> which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    # Taste vector: mean of the seed embeddings, re-normalised to unit length
    # (the epsilon guards against a zero-length mean).
    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)

    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend what the user already picked

    # Cap top_k at the number of non-seed books. Without the cap, a request
    # larger than that would pull the masked (-inf) seed rows back into the
    # result. The cap also keeps the argpartition pivot strictly in range.
    n_candidates = len(scores) - len(set(seed_rows))
    top_k = max(0, min(int(top_k), n_candidates))

    if top_k == 0:
        top_idx = np.empty(0, dtype=np.intp)
    else:
        # Select the top_k best scores, then sort just those (descending).
        top_idx = np.argpartition(-scores, top_k)[:top_k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # The model knows this id but the catalogue does not; skip it.
            continue
        recs.append({
            "rank":     len(recs) + 1,
            "title":    meta.title,
            "author":   meta.author,
            "category": meta.category,
            "rating":   meta.rating,
            "summary":  (meta.summary or "")[:240],
            "book_url": meta.book_url,
            "score":    float(scores[row_i]),
        })

    html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    # Same label format as the seed dropdown, so picks can be promoted to it.
    rec_choices = [
        f"{r['title']} — {r['author']}" if r["author"] else r["title"]
        for r in recs
    ]
    return html, gr.update(choices=rec_choices, value=[])


def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """
    Append the picked recommendations to the current taste profile.

    With nothing to add, only the existing selection is echoed back.
    Otherwise the two lists are concatenated (existing picks first),
    de-duplicated in order, and returned as both the dropdown's `choices`
    and its `value`.
    """
    existing = current_favs or []
    if to_add:
        combined = list(dict.fromkeys(existing + to_add))
        return gr.update(choices=combined, value=combined)
    return gr.update(value=existing)


# ─────────────────────────────────────────────────────────────────────────────
# Rendering
# ─────────────────────────────────────────────────────────────────────────────
def _empty_state_html(message: str = "Search for books you have enjoyed, "
                                     "select a few, then click <b>Get Recommendations</b>.") -> str:
    return f"""
    <div style="text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);">
      <div style="font-size:2.6rem;margin-bottom:.5rem;">📚</div>
      <div style="font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;">{message}</div>
    </div>
    """


def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
    """Render recommendations as an HTML card grid with a header and notices.

    Args:
        recs: List of dicts with the keys ``rank``, ``title``, ``author``,
            ``category``, ``rating``, ``summary``, ``book_url`` and ``score``.
        model_label: Key into ``EMBEDDINGS``; supplies the accent colour and
            the ``source`` flag, and is shown in the header.
        seeds_in_model: How many seed books the model had embeddings for.
        total_seeds: How many valid seed books the user selected.

    Returns:
        An HTML string: optional warning banners (synthetic embeddings,
        skipped seeds), a header line, and one card per recommendation; or
        the empty-state panel when ``recs`` is empty.
    """
    from html import escape  # function-scope import: only this block needs it

    color = EMBEDDINGS[model_label]["color"]
    source = EMBEDDINGS[model_label]["source"]
    if not recs:
        return _empty_state_html("No recommendations could be produced.")

    safe_model_label = escape(str(model_label))

    cards = []
    for r in recs:
        # Catalogue text is scraped data: escape it so characters such as
        # <, & or quotes are shown literally and cannot break the markup.
        title = escape(str(r["title"]))
        author = escape(str(r["author"]))
        category = escape(str(r["category"])) if r["category"] else ""
        summary = escape(str(r["summary"])) if r["summary"] else ""
        # quote=True also escapes ' and ", so the value cannot terminate the
        # single-quoted href attribute.
        book_url = escape(str(r["book_url"]), quote=True) if r["book_url"] else ""

        rating_html = (
            f"<span style='color:#f59e0b;font-weight:600;'>★ {r['rating']:.1f}</span>"
            if r["rating"] is not None and not pd.isna(r["rating"]) else ""
        )
        category_html = (
            f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{category}</span>"
            if category else ""
        )
        summary_html = (
            f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{summary}…</div>"
            if summary else ""
        )
        url_html = (
            f"<div style='margin-top:.7rem;'><a href='{book_url}' target='_blank' rel='noopener' "
            f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari →</a></div>"
            if book_url else ""
        )

        cards.append(f"""
        <div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
                    background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
          <div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
          <div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
            <div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {title}</div>
            <div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
          </div>
          <div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{author}</div>
          <div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
          {summary_html}{url_html}
        </div>
        """)

    grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
            + "".join(cards) + "</div>")

    warnings = []
    if source == "synthetic":
        # Results from random vectors are meaningless; say so prominently.
        warnings.append(
            "<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            "⚠️  This model is using <b>synthetic random embeddings</b>.</div>"
        )
    if seeds_in_model < total_seeds:
        skipped = total_seeds - seeds_in_model
        warnings.append(
            f"<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            f"ℹ️  {skipped} of your {total_seeds} seed book(s) are not in the "
            f"<b>{safe_model_label}</b> embedding space and were skipped.</div>"
        )

    header = f"""
    <div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
                font-size:.9rem;color:var(--body-text-color-subdued,#666);">
      <span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
      <span><strong>{len(recs)}</strong> recommendations from <strong>{safe_model_label}</strong></span>
    </div>
    """
    return "".join(warnings) + header + grid


# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Page heading rendered at the top of the app. It is an f-string, so the
# catalogue size is filled in from METADATA when the module is imported.
INTRO_MD = f"""
# 📚 Bangla Book Recommender
### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).

Type a book name or author in the search box below, pick a few you have enjoyed,
choose a model, and get nearest-neighbour recommendations from the same embedding
space the model was trained in.
"""

# Static explanatory text shown inside the "About this Space" accordion.
ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste vector*
is computed as the mean of those books' embeddings, and the books with the
highest cosine similarity to your taste vector are surfaced.

**Models.**
- **Neural Two-Tower** — best benchmarked model. Item tower fuses ID, content
  (title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** — pure graph collaborative filtering with 4 GCN layers.

**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""


# Layout: a narrow left column with the controls (search, seed picker, model,
# top-k, refine) and a wider right column with the rendered recommendations.
# The event wiring follows the layout at the bottom of this block.
with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)

    with gr.Accordion("ℹ️  About this Space", open=False):
        gr.Markdown(ABOUT_MD)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")

            # NOTE(review): the placeholder mentions a 2-letter minimum, but
            # search_books runs for any non-blank query — confirm intent.
            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or English…",
                show_label=True,
            )

            # Holds the user's selected seed books. Its choices start as
            # DEFAULT_LABELS and are replaced by search_books on each search.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3–5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )

            gr.Markdown("### 2. Choose a model")
            # The radio values are the MODEL_CONFIG labels, which are also
            # the keys of EMBEDDINGS.
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN, maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT, step=1,
                label="Number of recommendations",
            )

            run_btn = gr.Button("🔍  Get Recommendations", variant="primary", size="lg")

            gr.Markdown("### 3. Refine (optional)")
            # Starts empty; recommend() fills its choices with the labels of
            # the books it returned.
            refine_picker = gr.Dropdown(
                choices=[], value=[], multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("➕  Add to my favourites", size="sm")

        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            # Shows the placeholder panel until the first recommendation run.
            output_html = gr.HTML(value=_empty_state_html())

    # Footer with dataset, code, paper, and licence links.
    gr.Markdown(
        """
        ---
        <div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
          Built on the
          <a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
          ·
          <a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
          ·
          <a href='https://arxiv.org/abs/2602.12129'>Paper</a>
          ·
          CC BY-NC 4.0
        </div>
        """
    )

    # Search box changes -> search_books replaces the seed dropdown's choices
    # and passes the current selection back as its value.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    # Button click -> recommend renders the cards and refills the refine picker.
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    # Button click -> add_to_favourites merges the refine picks into the seeds.
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )


# Script entry point: start the Gradio server, listening on all network
# interfaces at port 7860.
if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link. Gradio's
    # documentation describes share links as unsupported on Hugging Face
    # Spaces — confirm whether this flag is still wanted for local runs.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        ssr_mode=False,
    )