File size: 19,575 Bytes
be8cb65
 
 
 
32da19c
be8cb65
 
 
 
a964075
 
 
 
 
 
 
be8cb65
 
 
 
 
 
 
 
 
 
 
 
 
 
32da19c
be8cb65
 
a964075
be8cb65
 
 
32da19c
be8cb65
 
 
 
 
 
 
32da19c
be8cb65
 
32da19c
be8cb65
32da19c
be8cb65
 
 
 
 
 
a964075
be8cb65
a964075
32da19c
be8cb65
32da19c
 
 
be8cb65
32da19c
 
 
 
 
a964075
32da19c
a964075
 
be8cb65
 
 
 
32da19c
 
be8cb65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a964075
 
be8cb65
 
 
a964075
be8cb65
 
a964075
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be8cb65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32da19c
 
 
be8cb65
 
 
 
 
 
a964075
be8cb65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32da19c
be8cb65
 
 
a964075
 
 
be8cb65
 
 
 
 
 
 
 
 
 
 
a964075
be8cb65
 
 
 
 
a964075
 
be8cb65
 
 
 
 
 
 
 
a964075
be8cb65
 
 
a964075
be8cb65
 
 
 
 
 
 
 
a964075
be8cb65
 
 
a964075
be8cb65
 
32da19c
a964075
 
32da19c
 
be8cb65
 
a964075
 
be8cb65
a964075
 
 
be8cb65
a964075
 
 
be8cb65
 
 
a964075
 
be8cb65
32da19c
be8cb65
32da19c
 
 
a964075
32da19c
 
 
 
 
 
 
a964075
32da19c
be8cb65
 
 
 
 
 
 
 
32da19c
be8cb65
 
 
 
 
 
 
32da19c
be8cb65
a964075
 
 
be8cb65
 
 
a964075
 
 
 
 
 
 
 
 
 
 
 
be8cb65
 
 
9e7aae8
be8cb65
 
 
 
 
 
 
a964075
 
 
 
 
 
 
 
be8cb65
a964075
be8cb65
 
 
a964075
be8cb65
 
a964075
be8cb65
 
 
 
 
 
 
 
 
 
 
 
 
 
a964075
be8cb65
 
 
a964075
be8cb65
a964075
be8cb65
 
 
 
 
 
 
 
 
 
a964075
be8cb65
 
 
 
 
 
 
 
 
 
 
 
a964075
 
 
 
 
 
be8cb65
 
 
 
 
 
 
 
 
 
 
 
fa4b1c8
cb3d29e
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
"""
Bangla Book Recommender — Hugging Face Space
============================================

Cold-start recommendation interface for the RokomariBG dataset using two
benchmarked recommender models from the paper "Towards Personalized Bangla
Book Recommendation: A Large-Scale Multi-Entity Book Graph Dataset"
(https://arxiv.org/abs/2602.12129).

Performance note
----------------
The catalogue has 127K books. Loading all of them into a Dropdown component
freezes the browser. Instead, this app uses a search-as-you-type pattern:
the user types a query, the backend filters titles/authors and returns up
to ~50 matches, and only those matches are rendered in a Dropdown. This
keeps the UI responsive even on free CPU tiers.
"""

from __future__ import annotations

from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd

# ─────────────────────────────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────────────────────────────
# Directory holding this script; data files are looked up relative to it.
ROOT = Path(__file__).parent

# Lower bound, initial value and upper bound for the "how many results" control.
TOP_K_MIN = 5
TOP_K_DEFAULT = 10
TOP_K_MAX = 30

# Cap on the number of books listed in the search dropdown at any time.
SEARCH_RESULT_LIMIT = 50

# Display label -> settings for that model: "key" is the file-name prefix of
# its embedding files, "dim" the vector width used for the synthetic fallback,
# "color" the accent colour of its result cards.
MODEL_CONFIG: Dict[str, Dict] = {
    "Neural Two-Tower (best)": {
        "key": "two_tower",
        "dim": 256,
        "color": "#7c3aed",
    },
    "LightGCN": {
        "key": "lightgcn",
        "dim": 64,
        "color": "#0891b2",
    },
}


# ─────────────────────────────────────────────────────────────────────────────
# Loading
# ─────────────────────────────────────────────────────────────────────────────
def load_metadata() -> pd.DataFrame:
    """Load the book catalogue, or build a small synthetic one if it is missing.

    Reads ``books_metadata.parquet`` from ``ROOT`` when the file exists;
    otherwise generates 500 placeholder rows from a fixed seed so the app can
    still start.

    Returns:
        A frame with a fresh ``0..n-1`` index in which

        * ``book_id``, ``title``, ``author``, ``category``, ``summary`` and
          ``book_url`` are strings (missing columns/values become ``""``);
        * ``rating`` is numeric, unparseable values become NaN, and the rest
          are clipped to the range 1.0-5.0;
        * ``_search`` holds the lower-cased ``"title author"`` text used for
          substring search.
    """
    path = ROOT / "books_metadata.parquet"
    if path.exists():
        df = pd.read_parquet(path)
        print(f"  → metadata loaded from {path.name}")
    else:
        print(f"  ⚠️  {path.name} not found — using synthetic fallback")
        rng = np.random.default_rng(42)
        n = 500
        df = pd.DataFrame({
            "book_id":  [f"demo_{i:05d}" for i in range(n)],
            "title":    [f"Demo Book #{i}" for i in range(n)],
            "author":   [f"Demo Author {i % 30}" for i in range(n)],
            "category": rng.choice(["Fiction", "History", "Science"], size=n),
            "rating":   np.round(3 + rng.random(n) * 2, 1),
            "summary":  ["Synthetic placeholder summary."] * n,
            "book_url": [""] * n,
        })

    text_columns = ("book_id", "title", "author", "category", "summary", "book_url")
    for col in text_columns:
        if col not in df.columns:
            df[col] = ""
        # Cast to object before filling: a categorical column rejects
        # fillna("") because "" is not one of its categories.
        df[col] = df[col].astype(object).fillna("").astype(str)

    if "rating" not in df.columns:
        df["rating"] = np.nan
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce").clip(lower=1.0, upper=5.0)

    # Precompute the lower-cased search text once so each query is a single
    # vectorised substring scan.
    df["_search"] = df["title"].str.lower() + " " + df["author"].str.lower()
    return df.reset_index(drop=True)


def load_embedding_pair(
    model_key: str, fallback_n: int, fallback_dim: int
) -> tuple[np.ndarray, np.ndarray, dict, str]:
    """Load one model's book embeddings and ids, L2-normalised by row.

    Looks for ``{model_key}_book_emb.npy`` and ``{model_key}_book_ids.npy``
    under ``ROOT``. If either file is missing, random embeddings are generated
    for the first ``fallback_n`` catalogue books instead.

    Args:
        model_key: File-name prefix identifying the model.
        fallback_n: Maximum number of catalogue books to cover in the
            synthetic fallback.
        fallback_dim: Embedding width used for the synthetic fallback.

    Returns:
        ``(emb, ids, id_to_row, source)`` where ``emb`` is a float32 matrix
        with unit-length rows (all-zero rows stay zero), ``ids`` the matching
        book ids as strings, ``id_to_row`` maps a book id to its row index,
        and ``source`` is ``"real"`` or ``"synthetic"``.

    Raises:
        ValueError: If the embedding matrix is not 2-D or its row count does
            not match the number of ids.
    """
    import zlib  # local import: only the synthetic fallback needs it

    emb_path = ROOT / f"{model_key}_book_emb.npy"
    ids_path = ROOT / f"{model_key}_book_ids.npy"

    if emb_path.exists() and ids_path.exists():
        emb = np.load(emb_path).astype(np.float32)
        # NOTE(security): allow_pickle=True can execute code embedded in the
        # file. Acceptable only because these files ship with the app; do not
        # point this at user-supplied paths.
        ids = np.load(ids_path, allow_pickle=True).astype(str)
        source = "real"
    else:
        # hash() of a str is salted per process, so it cannot give a
        # reproducible seed; CRC32 of the key is stable across runs.
        rng = np.random.default_rng(seed=zlib.crc32(model_key.encode("utf-8")))
        ids = METADATA["book_id"].values[:fallback_n].astype(str)
        # Size the matrix from the ids actually available so rows and ids
        # always line up, even if fallback_n exceeds the catalogue size.
        emb = rng.standard_normal((len(ids), fallback_dim)).astype(np.float32)
        source = "synthetic"

    if emb.ndim != 2 or emb.shape[0] != len(ids):
        raise ValueError(
            f"{model_key}: embedding shape {emb.shape} does not match "
            f"{len(ids)} book ids"
        )

    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    emb = emb / norms

    id_to_row = {bid: i for i, bid in enumerate(ids)}
    return emb, ids, id_to_row, source


# Startup: load the catalogue and every configured model's embeddings once,
# at import time, logging progress to stdout.
print("─" * 60)
print(" Bangla Book Recommender — startup")
print("─" * 60)

print("Loading metadata…")
METADATA = load_metadata()
print(f"  → {len(METADATA):,} books in catalogue")

print("Loading model embeddings…")
# Display label -> {"emb", "ids", "id_to_row", "dim", "color", "source"}.
EMBEDDINGS: Dict[str, Dict] = {}
for label, cfg in MODEL_CONFIG.items():
    # The catalogue size and configured width are only used if the model's
    # .npy files are missing and synthetic embeddings have to be generated.
    emb, ids, id_to_row, source = load_embedding_pair(
        cfg["key"], len(METADATA), cfg["dim"]
    )
    EMBEDDINGS[label] = {
        "emb": emb, "ids": ids, "id_to_row": id_to_row,
        "dim": cfg["dim"], "color": cfg["color"], "source": source,
    }
    print(f"  → {label:30s} {str(emb.shape):16s}  [{source}]")

# One display label per catalogue row, in METADATA row order: the title,
# followed by the author when one is known.
ALL_LABELS = [
    title if not author else f"{title} β€” {author}"
    for title, author in zip(METADATA["title"], METADATA["author"])
]

# Reverse lookups used by the handlers: label -> book id, book id -> row tuple.
# When two rows share a label or an id, the later row wins.
LABEL_TO_BOOKID = dict(zip(ALL_LABELS, METADATA["book_id"].astype(str).tolist()))
BOOKID_TO_META = {
    record.book_id: record for record in METADATA.itertuples(index=False)
}

# Highest-rated books, offered in the dropdown before anything has been typed.
_DEFAULT_SAMPLE = METADATA.nlargest(
    n=SEARCH_RESULT_LIMIT, columns="rating", keep="first"
)
DEFAULT_LABELS = [
    title if not author else f"{title} β€” {author}"
    for title, author in zip(_DEFAULT_SAMPLE["title"], _DEFAULT_SAMPLE["author"])
]


# ─────────────────────────────────────────────────────────────────────────────
# Search — the core UX fix
# ─────────────────────────────────────────────────────────────────────────────
def search_books(query: str, currently_selected: List[str]):
    """
    Build the dropdown update for a search query.

    A blank query offers DEFAULT_LABELS; otherwise the first
    SEARCH_RESULT_LIMIT catalogue rows whose lower-cased "title author" text
    contains the query (plain substring match, case-insensitive) are offered.

    The returned update replaces the dropdown's `choices` but passes the
    current selection back unchanged as `value`. Selected labels are put at
    the front of the choices, so earlier picks stay available even when they
    do not match the new query.
    """
    kept = currently_selected or []
    needle = (query or "").strip().lower()

    if needle:
        hits = METADATA.loc[
            METADATA["_search"].str.contains(needle, regex=False, na=False)
        ].head(SEARCH_RESULT_LIMIT)
        found = [
            row.title if not row.author else f"{row.title} β€” {row.author}"
            for row in hits.itertuples(index=False)
        ]
    else:
        found = DEFAULT_LABELS

    # dict.fromkeys de-duplicates while keeping first-seen order.
    options = list(dict.fromkeys(kept + found))
    return gr.update(choices=options, value=kept)


# ─────────────────────────────────────────────────────────────────────────────
# Recommendation
# ─────────────────────────────────────────────────────────────────────────────
def recommend(seed_labels: List[str], model_label: str, top_k: int):
    """Recommend books similar to the mean embedding of the selected seeds.

    Args:
        seed_labels: Display labels of the books the user picked.
        model_label: Key into EMBEDDINGS selecting the model.
        top_k: Requested number of recommendations.

    Returns:
        ``(html, refine_update)``: the rendered result (or an empty-state
        message) and a dropdown update whose choices are the labels of the
        recommended books. The refine dropdown is cleared whenever no
        recommendations can be computed.
    """
    if not seed_labels:
        return _empty_state_html(), gr.update(choices=[], value=[])

    seed_book_ids = [
        LABEL_TO_BOOKID[label] for label in seed_labels if label in LABEL_TO_BOOKID
    ]
    if not seed_book_ids:
        return (_empty_state_html("No valid seed books selected."),
                gr.update(choices=[], value=[]))

    model = EMBEDDINGS[model_label]
    emb, id_to_row = model["emb"], model["id_to_row"]

    # Seeds unknown to this model are dropped here and reported by the renderer.
    seed_rows = [id_to_row[bid] for bid in seed_book_ids if bid in id_to_row]
    if not seed_rows:
        msg = (f"None of the selected books exist in the {model_label} embedding "
               "space. This model was trained on a subset of books with sufficient "
               "interaction history. Try different books, or switch to "
               "<b>Neural Two-Tower</b> which has broader coverage.")
        return _empty_state_html(msg), gr.update(choices=[], value=[])

    # Taste vector: mean of the seed embeddings, re-normalised to unit length
    # (the epsilon guards against a zero mean).
    user_vec = emb[seed_rows].mean(axis=0)
    user_vec = user_vec / (np.linalg.norm(user_vec) + 1e-8)

    scores = emb @ user_vec
    scores[seed_rows] = -np.inf  # never recommend a seed back to the user

    # Only non-seed rows are eligible. Without this clamp a catalogue with
    # fewer than top_k non-seed books would return the masked seeds as well.
    n_candidates = len(scores) - len(set(seed_rows))
    k = max(0, min(int(top_k), n_candidates))
    if k == 0:
        top_idx = np.empty(0, dtype=np.intp)
    else:
        # Partial selection of the k best, then sort just those k by score.
        top_idx = np.argpartition(-scores, k - 1)[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]

    recs = []
    for row_i in top_idx:
        book_id = str(model["ids"][row_i])
        meta = BOOKID_TO_META.get(book_id)
        if meta is None:
            # Embedding row with no catalogue entry: nothing to display.
            continue
        recs.append({
            "rank":     len(recs) + 1,
            "title":    meta.title,
            "author":   meta.author,
            "category": meta.category,
            "rating":   meta.rating,
            "summary":  (meta.summary or "")[:240],
            "book_url": meta.book_url,
            "score":    float(scores[row_i]),
        })

    html = _render_recommendation_cards(
        recs, model_label, len(seed_rows), len(seed_book_ids)
    )
    # These labels must be built exactly like the keys of LABEL_TO_BOOKID so
    # that recommendations promoted into the taste profile resolve to book ids.
    # NOTE(review): the separator looks mis-encoded (probably an em dash); if
    # it is fixed, fix every label builder in this file in the same change.
    rec_choices = [
        f"{r['title']} β€” {r['author']}" if r["author"] else r["title"]
        for r in recs
    ]
    return html, gr.update(choices=rec_choices, value=[])


def add_to_favourites(current_favs: List[str], to_add: List[str]):
    """Merge the picked recommendations into the taste-profile dropdown.

    With nothing to add, the current selection is passed back unchanged.
    Otherwise the existing favourites followed by the new picks
    (de-duplicated, first occurrence kept) become both the dropdown's choices
    and its selected value.
    """
    favourites = current_favs or []
    if to_add:
        combined = list(dict.fromkeys(favourites + to_add))
        return gr.update(choices=combined, value=combined)
    return gr.update(value=favourites)


# ─────────────────────────────────────────────────────────────────────────────
# Rendering
# ─────────────────────────────────────────────────────────────────────────────
def _empty_state_html(message: str = "Search for books you have enjoyed, "
                                     "select a few, then click <b>Get Recommendations</b>.") -> str:
    return f"""
    <div style="text-align:center;padding:3rem 1rem;color:var(--body-text-color-subdued,#888);">
      <div style="font-size:2.6rem;margin-bottom:.5rem;">πŸ“š</div>
      <div style="font-size:.95rem;max-width:32rem;margin:0 auto;line-height:1.5;">{message}</div>
    </div>
    """


def _render_recommendation_cards(recs, model_label, seeds_in_model, total_seeds):
    """Render recommendations as an HTML card grid, preceded by any notices.

    Args:
        recs: Dicts with "rank", "title", "author", "category", "rating",
            "summary", "book_url" and "score" keys.
        model_label: Key into EMBEDDINGS; supplies the accent colour and
            whether the embeddings are real or synthetic.
        seeds_in_model: Number of seed books found in the model's embeddings.
        total_seeds: Number of seed books resolved from the user's picks.

    Returns:
        HTML made of optional notices (synthetic embeddings, skipped seeds),
        a header line and the card grid; the empty-state panel when ``recs``
        is empty.
    """
    from html import escape  # local import: the module does not import html

    color = EMBEDDINGS[model_label]["color"]
    source = EMBEDDINGS[model_label]["source"]
    if not recs:
        return _empty_state_html("No recommendations could be produced.")

    safe_model = escape(str(model_label))

    cards = []
    for r in recs:
        # Catalogue text comes from scraped data, so it is escaped before
        # being placed in markup; otherwise a stray "<" or quote would break
        # the layout or inject HTML.
        title = escape(str(r["title"]))
        author = escape(str(r["author"]))
        category = escape(str(r["category"] or ""))
        summary = escape(str(r["summary"] or ""))
        book_url = escape(str(r["book_url"] or ""), quote=True)
        rating = r["rating"]

        rating_html = (
            f"<span style='color:#f59e0b;font-weight:600;'>★ {rating:.1f}</span>"
            if rating is not None and not pd.isna(rating) else ""
        )
        category_html = (
            f"<span style='color:var(--body-text-color-subdued,#888);font-size:.78rem;'>{category}</span>"
            if category else ""
        )
        summary_html = (
            f"<div style='margin-top:.6rem;font-size:.82rem;line-height:1.5;color:var(--body-text-color,#444);opacity:.85;'>{summary}…</div>"
            if summary else ""
        )
        url_html = (
            f"<div style='margin-top:.7rem;'><a href='{book_url}' target='_blank' rel='noopener' "
            f"style='font-size:.78rem;color:{color};text-decoration:none;font-weight:500;'>View on Rokomari →</a></div>"
            if book_url else ""
        )

        cards.append(f"""
        <div style="border:1px solid var(--border-color-primary,#e5e5e5);border-radius:12px;padding:1.1rem;
                    background:var(--background-fill-secondary,#fff);position:relative;overflow:hidden;">
          <div style="position:absolute;top:0;left:0;height:3px;width:100%;background:{color};"></div>
          <div style="display:flex;justify-content:space-between;align-items:flex-start;gap:.5rem;margin-bottom:.4rem;">
            <div style="font-weight:600;font-size:1rem;line-height:1.3;flex:1;">#{r['rank']}. {title}</div>
            <div style="font-size:.68rem;color:#999;white-space:nowrap;font-family:ui-monospace,monospace;">sim {r['score']:.3f}</div>
          </div>
          <div style="color:var(--body-text-color,#555);font-size:.88rem;margin-bottom:.5rem;">{author}</div>
          <div style="display:flex;justify-content:space-between;align-items:center;font-size:.85rem;">{rating_html}{category_html}</div>
          {summary_html}{url_html}
        </div>
        """)

    grid = ("<div style='display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:1rem;'>"
            + "".join(cards) + "</div>")

    warnings = []
    if source == "synthetic":
        warnings.append(
            "<div style='background:#fef3c7;border:1px solid #fbbf24;color:#78350f;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            "⚠️  This model is using <b>synthetic random embeddings</b>.</div>"
        )
    if seeds_in_model < total_seeds:
        skipped = total_seeds - seeds_in_model
        warnings.append(
            "<div style='background:#dbeafe;border:1px solid #60a5fa;color:#1e40af;"
            "padding:.6rem .9rem;border-radius:8px;font-size:.82rem;margin-bottom:1rem;'>"
            f"ℹ️  {skipped} of your {total_seeds} seed book(s) are not in the "
            f"<b>{safe_model}</b> embedding space and were skipped.</div>"
        )

    header = f"""
    <div style="display:flex;align-items:center;gap:.5rem;margin-bottom:1rem;
                font-size:.9rem;color:var(--body-text-color-subdued,#666);">
      <span style="display:inline-block;width:.6rem;height:.6rem;border-radius:50%;background:{color};"></span>
      <span><strong>{len(recs)}</strong> recommendations from <strong>{safe_model}</strong></span>
    </div>
    """
    return "".join(warnings) + header + grid


# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Markdown shown at the top of the page; the catalogue size is filled in from
# the loaded metadata.
INTRO_MD = f"""
# 📚 Bangla Book Recommender
### Cold-start recommendations from {len(METADATA):,} Bangla titles, powered by two benchmarked models on the [RokomariBG dataset](https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset).

Type a book name or author in the search box below, pick a few you have enjoyed,
choose a model, and get nearest-neighbour recommendations from the same embedding
space the model was trained in.
"""

# Markdown shown inside the collapsible "About this Space" section.
ABOUT_MD = """
**How it works.** Each model was trained on the RokomariBG corpus (127K books,
63K users, 209K reviews scraped from Rokomari.com) to produce a vector
representation of every book. When you pick books you like, your *taste vector*
is computed as the mean of those books' embeddings, and the books with the
highest cosine similarity to your taste vector are surfaced.

**Models.**
- **Neural Two-Tower** — best benchmarked model. Item tower fuses ID, content
  (title, summary, author, publisher), and metadata. Strongest at cold-start.
- **LightGCN** — pure graph collaborative filtering with 4 GCN layers.

**Citation.** [Paper](https://arxiv.org/abs/2602.12129).
"""


# Page layout: controls in the left column, rendered recommendations on the
# right, then the event wiring for search, recommend and "add to favourites".
with gr.Blocks(title="Bangla Book Recommender") as demo:
    gr.Markdown(INTRO_MD)

    with gr.Accordion("ℹ️  About this Space", open=False):
        gr.Markdown(ABOUT_MD)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Search and pick books you have enjoyed")

            search_box = gr.Textbox(
                label="Search by title or author",
                placeholder="Type at least 2 letters in Bangla or English…",
                show_label=True,
            )

            # Client-side filtering is off: the choices are replaced by the
            # server-side search on every change of the search box.
            seeds = gr.Dropdown(
                choices=DEFAULT_LABELS,
                value=[],
                multiselect=True,
                label="Your taste profile",
                info="Pick 3–5 books from the search results.",
                allow_custom_value=False,
                max_choices=20,
                filterable=False,
            )

            gr.Markdown("### 2. Choose a model")
            model_choice = gr.Radio(
                choices=list(MODEL_CONFIG.keys()),
                value="Neural Two-Tower (best)",
                label="Recommendation model",
            )
            top_k = gr.Slider(
                minimum=TOP_K_MIN, maximum=TOP_K_MAX,
                value=TOP_K_DEFAULT, step=1,
                label="Number of recommendations",
            )

            run_btn = gr.Button("🔍  Get Recommendations", variant="primary", size="lg")

            gr.Markdown("### 3. Refine (optional)")
            refine_picker = gr.Dropdown(
                choices=[], value=[], multiselect=True,
                label="Promote recommendations into your taste profile",
                info="After recommendations appear, pick the ones you like, then add them.",
            )
            add_btn = gr.Button("➕  Add to my favourites", size="sm")

        with gr.Column(scale=2):
            gr.Markdown("### Recommendations")
            output_html = gr.HTML(value=_empty_state_html())

    gr.Markdown(
        """
        ---
        <div style='text-align:center;font-size:.82rem;color:var(--body-text-color-subdued,#777);'>
          Built on the
          <a href='https://huggingface.co/datasets/DevnilMaster1/Bangla-Book-Recommendation-Dataset'>RokomariBG dataset</a>
          ·
          <a href='https://github.com/DevnilMaster/Bangla-Book-Recommendation-Dataset'>Code</a>
          ·
          <a href='https://arxiv.org/abs/2602.12129'>Paper</a>
          ·
          CC BY-NC 4.0
        </div>
        """
    )

    # Every change of the search text refreshes the dropdown choices while
    # passing the current selection through unchanged.
    search_box.change(
        fn=search_books,
        inputs=[search_box, seeds],
        outputs=[seeds],
        show_progress="hidden",
    )
    run_btn.click(
        fn=recommend,
        inputs=[seeds, model_choice, top_k],
        outputs=[output_html, refine_picker],
    )
    add_btn.click(
        fn=add_to_favourites,
        inputs=[seeds, refine_picker],
        outputs=[seeds],
    )


if __name__ == "__main__":
    # `share` is deliberately left unset: share links are not supported on
    # Hugging Face Spaces (Gradio warns and ignores the flag there), and when
    # run locally share=True would open a public tunnel to this machine.
    # Set the GRADIO_SHARE=True environment variable to opt in.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
    )