rakeshjv2000 commited on
Commit
516e2ce
·
verified ·
1 Parent(s): c3027ad

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. app.py +181 -0
  3. books.index +3 -0
  4. books_with_emotions.csv +0 -0
  5. cover-not-found.jpg +0 -0
  6. id_map.npy +3 -0
  7. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ books.index filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import numpy as np
import pandas as pd
import faiss
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import InferenceClient

# Pull variables from a local .env into the environment; a no-op on HF Spaces,
# where Secrets are already injected as environment variables.
load_dotenv()

# -----------------------------
# CONFIG
# -----------------------------
BOOKS_CSV = "books_with_emotions.csv"
FAISS_INDEX_PATH = "books.index"
ID_MAP_PATH = "id_map.npy"  # isbn13 list aligned with FAISS vectors

HF_TOKEN = os.getenv("HF_TOKEN")
HF_EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

if not HF_TOKEN:
    # Works locally if you set env var / .env, and on Spaces if set as Secret.
    raise RuntimeError("HF_TOKEN missing. Set in .env (local) or HF Spaces Secrets.")

client = InferenceClient(provider="hf-inference", api_key=HF_TOKEN)

# -----------------------------
# LOAD DATA
# -----------------------------
books = pd.read_csv(BOOKS_CSV)
# isbn13 is the join key against id_map; force string dtype so merges match.
books["isbn13"] = books["isbn13"].astype(str)

# Keep your thumbnail behavior exactly
# NOTE: in pandas, NaN + "&fife=w800" stays NaN, so the np.where below still
# catches rows with a missing thumbnail and substitutes the placeholder image.
books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(
    books["large_thumbnail"].isna(),
    "cover-not-found.jpg",
    books["large_thumbnail"],
)

# Load FAISS + id_map (must match index order)
index = faiss.read_index(FAISS_INDEX_PATH)
id_map = np.load(ID_MAP_PATH, allow_pickle=True).astype(str)
44
+
45
+ # -----------------------------
46
+ # EMBEDDING: HF InferenceClient
47
+ # -----------------------------
48
def hf_embed_query(text: str, retry=3, sleep_s=2.0) -> np.ndarray:
    """Embed a query string via the HF Inference API, with retries.

    Parameters
    ----------
    text : query text to embed.
    retry : number of attempts before giving up.
    sleep_s : base back-off in seconds; attempt i waits sleep_s * (i + 1).

    Returns
    -------
    np.ndarray of shape (1, dim), float32, L2-normalized so that inner
    product against an IndexFlatIP behaves as cosine similarity.

    Raises
    ------
    RuntimeError if all attempts fail (chained to the last API error).
    """
    import time  # stdlib; local import keeps the module import section untouched

    last_err = None
    for attempt in range(retry):
        try:
            out = client.feature_extraction(text, model=HF_EMBEDDING_MODEL)
            arr = np.array(out, dtype=np.float32)

            # Normalize the API's possible output shapes to one vector:
            if arr.ndim == 2:
                # token-level output (tokens, dim) -> mean pool
                v = arr.mean(axis=0)
            elif arr.ndim == 1:
                # already a sentence-level embedding
                v = arr
            else:
                # batched/extra-dim output: flatten to (n, dim), then mean pool
                v = arr.reshape(-1, arr.shape[-1]).mean(axis=0)

            v = v.reshape(1, -1).astype(np.float32)
            faiss.normalize_L2(v)  # in-place; required for cosine via IP index
            return v
        except Exception as e:
            last_err = e
            # Only back off if another attempt remains — the original slept
            # even after the final failure, delaying the raise for no reason.
            if attempt + 1 < retry:
                time.sleep(sleep_s * (attempt + 1))

    raise RuntimeError(f"HF query embedding failed after retries: {last_err}") from last_err
75
+
76
+ # -----------------------------
77
+ # RETRIEVAL + FILTERING (same logic)
78
+ # -----------------------------
79
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Vector-search books for `query`, then filter by category and sort by tone.

    Parameters
    ----------
    query : free-text book description from the user.
    category : value of `simple_categories` to keep; None or "All" disables it.
    tone : one of Happy/Surprising/Angry/Suspenseful/Sad; None or "All" keeps
        the semantic ranking.
    initial_top_k : candidates fetched from FAISS before filtering.
    final_top_k : rows returned after filtering.

    Returns
    -------
    pd.DataFrame of up to `final_top_k` book rows, in ranked order.
    """
    # 1) Vector search
    qv = hf_embed_query(query)
    scores, idx = index.search(qv, initial_top_k)

    # 2) Map FAISS positions -> isbn13.
    # FAISS pads idx with -1 when fewer than initial_top_k neighbors exist;
    # id_map[-1] would silently alias the LAST book, so drop those slots.
    positions = [int(i) for i in idx[0] if i >= 0]
    retrieved_isbns = [str(x) for x in id_map[positions]]

    # 3) Preserve retrieval order using rank column
    rank_df = pd.DataFrame({"isbn13": retrieved_isbns, "rank": range(len(retrieved_isbns))})

    book_recs = (
        books.merge(rank_df, on="isbn13", how="inner")
        .sort_values("rank")
        .head(initial_top_k)
        .copy()
    )

    # 4) Category filter ("All" / None / "" means no filtering)
    if category and category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category].head(final_top_k)
    else:
        book_recs = book_recs.head(final_top_k)

    # 5) Tone sorting: each tone maps to an emotion-score column; any other
    # value (including "All") leaves the semantic ranking untouched.
    # Plain sort_values (no inplace) avoids SettingWithCopy pitfalls on the
    # filtered frame.
    tone_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }.get(tone)
    if tone_column:
        book_recs = book_recs.sort_values(by=tone_column, ascending=False)

    return book_recs
125
+
126
+ # -----------------------------
127
+ # OUTPUT FORMAT (same as yours)
128
+ # -----------------------------
129
def recommend_books(query: str, category: str, tone: str):
    """Build (image, caption) tuples for the gr.Gallery output.

    Parameters
    ----------
    query / category / tone : forwarded to retrieve_semantic_recommendations.

    Returns
    -------
    list[tuple[str, str]] of (thumbnail URL or placeholder path, caption).
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # Missing descriptions arrive as NaN; str(NaN) == "nan" would leak the
        # literal word "nan" into the caption, so blank them out first.
        raw_description = row.get("description", "")
        description = "" if pd.isna(raw_description) else str(raw_description)
        words = description.split()
        truncated_description = " ".join(words[:30]) + "..." if words else ""

        # Same NaN guard for authors; the CSV separates co-authors with ";".
        raw_authors = row.get("authors", "")
        authors_raw = "" if pd.isna(raw_authors) else str(raw_authors)
        authors_split = [a.strip() for a in authors_raw.split(";") if a.strip()]

        # Render the author list in natural English ("A and B", "A, B, and C").
        if len(authors_split) == 2:
            authors_str = f"{authors_split[0]} and {authors_split[1]}"
        elif len(authors_split) > 2:
            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
        else:
            authors_str = authors_raw

        caption = f"{row.get('title','')} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))

    return results
152
+
153
+ # -----------------------------
154
+ # UI (unchanged)
155
+ # -----------------------------
156
# -----------------------------
# UI (unchanged)
# -----------------------------
# Dropdown choices; "All" disables the corresponding filter downstream.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic book recommender")

    # Input row: free-text query plus the two filters and the submit button.
    with gr.Row():
        query_box = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness"
        )
        category_select = gr.Dropdown(choices=categories, label="Select a category:", value="All")
        tone_select = gr.Dropdown(choices=tones, label="Select an emotional tone:", value="All")
        search_button = gr.Button("Find recommendations")

    # Results gallery fed by recommend_books' (image, caption) tuples.
    gr.Markdown("## Recommendations")
    results_gallery = gr.Gallery(label="Recommended books", columns=8, rows=2)

    search_button.click(
        fn=recommend_books,
        inputs=[query_box, category_select, tone_select],
        outputs=results_gallery
    )

if __name__ == "__main__":
    # 0.0.0.0:7860 is the standard bind for HF Spaces / containers.
    dashboard.launch(server_name="0.0.0.0", server_port=7860)
books.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fabd79cca3f7ae45ca2df4e649d107658bc91b0196b4a07753c8bc759a0c4c36
3
+ size 7982637
books_with_emotions.csv ADDED
The diff for this file is too large to render. See raw diff
 
cover-not-found.jpg ADDED
id_map.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:465089598bbd2651e8fd947f5738740fa2188e508319a12b487887a14b173986
3
+ size 83449
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
4
+ python-dotenv
5
+ faiss-cpu
6
+ huggingface_hub