edwinbh committed on
Commit
a7449c5
·
verified ·
1 Parent(s): abd5c27

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +469 -262
src/streamlit_app.py CHANGED
@@ -1,24 +1,28 @@
1
  """
2
- Streamlit Dashboard for DLRM Book Recommendation System - Hugging Face Space Compatible
3
- Simple interface for DLRM-based book recommendations optimized for HF Spaces
4
  """
5
 
6
- import os
7
- import sys
8
  import streamlit as st
9
-
10
- # Force CPU-only mode for Hugging Face Spaces
11
- os.environ['CPU_ONLY'] = 'true'
12
- os.environ['CUDA_VISIBLE_DEVICES'] = ''
13
-
14
- # Disable Streamlit telemetry for HF Spaces
15
- os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
16
-
17
  import pandas as pd
18
  import numpy as np
 
 
 
 
 
19
  import warnings
20
  warnings.filterwarnings('ignore')
21
 
 
 
 
 
 
 
 
 
 
22
  # Page configuration
23
  st.set_page_config(
24
  page_title="DLRM Book Recommendations",
@@ -36,14 +40,18 @@ st.markdown("""
36
  text-align: center;
37
  margin-bottom: 2rem;
38
  }
39
- .cpu-mode-banner {
40
- background-color: #d4edda;
41
- color: #155724;
42
- padding: 0.75rem;
 
 
 
 
 
43
  border-radius: 0.5rem;
44
- border-left: 4px solid #28a745;
45
  margin: 1rem 0;
46
- text-align: center;
47
  }
48
  .book-card {
49
  background-color: #ffffff;
@@ -56,72 +64,38 @@ st.markdown("""
56
  """, unsafe_allow_html=True)
57
 
58
  @st.cache_data
59
- def load_sample_data():
60
- """Load sample data for demo purposes"""
61
- # Sample book data
62
- sample_books = {
63
- 'ISBN': ['0439023483', '0439358078', '0316666343', '0452264464', '0061120081'],
64
- 'Book-Title': [
65
- 'The Hunger Games',
66
- 'Harry Potter and the Chamber of Secrets',
67
- 'The Catcher in the Rye',
68
- '1984',
69
- 'To Kill a Mockingbird'
70
- ],
71
- 'Book-Author': [
72
- 'Suzanne Collins',
73
- 'J.K. Rowling',
74
- 'J.D. Salinger',
75
- 'George Orwell',
76
- 'Harper Lee'
77
- ],
78
- 'Year-Of-Publication': [2008, 1999, 1951, 1949, 1960],
79
- 'Publisher': ['Scholastic', 'Scholastic', 'Little, Brown', 'Signet', 'Harper']
80
- }
81
-
82
- # Sample users
83
- sample_users = {
84
- 'User-ID': [1, 2, 3, 4, 5],
85
- 'Age': [25, 32, 19, 45, 28],
86
- 'Location': ['New York, USA', 'London, UK', 'Tokyo, Japan', 'Berlin, Germany', 'Toronto, Canada']
87
- }
88
-
89
- # Sample ratings
90
- sample_ratings = {
91
- 'User-ID': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
92
- 'ISBN': ['0439023483', '0439358078', '0316666343', '0452264464', '0061120081', '0439023483', '0316666343', '0439358078', '0452264464', '0061120081'],
93
- 'Book-Rating': [9, 8, 7, 10, 8, 6, 9, 7, 8, 9]
94
- }
95
-
96
- books_df = pd.DataFrame(sample_books)
97
- users_df = pd.DataFrame(sample_users)
98
- ratings_df = pd.DataFrame(sample_ratings)
99
-
100
- return books_df, users_df, ratings_df
101
 
102
- def simulate_dlrm_prediction(user_id, book_isbn, user_data=None, book_data=None):
103
- """Simulate DLRM prediction for demo purposes"""
104
- # Simple heuristic-based simulation
105
- np.random.seed(hash(f"{user_id}_{book_isbn}") % 2**32)
106
-
107
- base_score = 0.5
108
-
109
- # User preferences (simulated)
110
- user_bias = np.random.uniform(-0.2, 0.2)
111
-
112
- # Book popularity (simulated)
113
- book_bias = np.random.uniform(-0.1, 0.1)
114
-
115
- # Add some randomness
116
- noise = np.random.uniform(-0.05, 0.05)
117
 
118
- final_score = base_score + user_bias + book_bias + noise
119
- final_score = max(0.0, min(1.0, final_score)) # Clamp to [0,1]
120
-
121
- return final_score
 
 
122
 
123
  def display_book_info(book_isbn, books_df, show_rating=None):
124
- """Display book information"""
125
  book_info = books_df[books_df['ISBN'] == book_isbn]
126
 
127
  if len(book_info) == 0:
@@ -133,8 +107,26 @@ def display_book_info(book_isbn, books_df, show_rating=None):
133
  col1, col2 = st.columns([1, 3])
134
 
135
  with col1:
136
- # Placeholder book cover
137
- st.image("https://via.placeholder.com/150x200?text=πŸ“š&color=1f77b4&bg=f0f2f6", width=150)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  with col2:
140
  st.markdown(f"**{book['Book-Title']}**")
@@ -150,50 +142,68 @@ def main():
150
  # Header
151
  st.markdown('<h1 class="main-header">📚 DLRM Book Recommendation System</h1>', unsafe_allow_html=True)
152
  st.markdown("### Deep Learning Recommendation Model for Personalized Book Suggestions")
 
153
 
154
- # HF Space optimized banner
155
- st.markdown('''
156
- <div class="cpu-mode-banner">
157
- πŸš€ Optimized for Hugging Face Spaces - CPU-only mode with simulated DLRM predictions
158
- </div>
159
- ''', unsafe_allow_html=True)
160
 
161
- st.markdown("---")
162
-
163
- # Load sample data
164
- with st.spinner("Loading sample data..."):
165
- books_df, users_df, ratings_df = load_sample_data()
 
 
166
 
167
  # Sidebar info
168
- st.sidebar.title("πŸ“Š Demo Dataset")
169
- st.sidebar.metric("πŸ“š Sample Books", len(books_df))
170
- st.sidebar.metric("πŸ‘₯ Sample Users", len(users_df))
171
- st.sidebar.metric("⭐ Sample Ratings", len(ratings_df))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
 
 
 
173
  st.sidebar.markdown("---")
174
- st.sidebar.markdown("""
175
- ### πŸ”§ HF Space Features:
176
- - CPU-only processing
177
- - Simulated DLRM predictions
178
- - Sample dataset demo
179
- - No GPU dependencies
180
- """)
181
 
182
  # Main interface
183
- tab1, tab2, tab3 = st.tabs(["🎯 Get Recommendations", "πŸ“Š How DLRM Works", "πŸ” Book Explorer"])
184
 
185
  with tab1:
186
- st.header("🎯 DLRM Book Recommendations (Simulated)")
187
- st.info("Demo of DLRM-based recommendations using simulated predictions")
188
 
189
  # User selection
190
  col1, col2 = st.columns([2, 1])
191
 
192
  with col1:
193
- selected_user_id = st.selectbox("Select a user", users_df['User-ID'].tolist())
 
194
 
195
  with col2:
196
- num_recommendations = st.slider("Number of recommendations", 3, 5, 5)
197
 
198
  # Show user info
199
  user_info = users_df[users_df['User-ID'] == selected_user_id]
@@ -204,194 +214,391 @@ def main():
204
  # User's reading history
205
  user_ratings = ratings_df[ratings_df['User-ID'] == selected_user_id]
206
  if len(user_ratings) > 0:
207
- with st.expander(f"πŸ“– User's Reading History ({len(user_ratings)} books)", expanded=True):
208
- for _, rating in user_ratings.iterrows():
 
209
  book_info = books_df[books_df['ISBN'] == rating['ISBN']]
210
  if len(book_info) > 0:
211
  book = book_info.iloc[0]
212
  st.write(f"• **{book['Book-Title']}** by {book['Book-Author']} - {rating['Book-Rating']}/10 ⭐")
213
 
214
- if st.button("πŸš€ Get Simulated DLRM Recommendations", type="primary"):
215
- with st.spinner("πŸ€– Simulating DLRM analysis..."):
216
 
217
- # Get books not rated by user
218
  user_rated_books = set(user_ratings['ISBN']) if len(user_ratings) > 0 else set()
219
- candidate_books = [isbn for isbn in books_df['ISBN'] if isbn not in user_rated_books]
220
 
221
- # Get simulated recommendations
222
- recommendations = []
223
- for book_isbn in candidate_books:
224
- score = simulate_dlrm_prediction(selected_user_id, book_isbn)
225
- recommendations.append((book_isbn, score))
226
 
227
- # Sort and take top recommendations
228
- recommendations.sort(key=lambda x: x[1], reverse=True)
229
- recommendations = recommendations[:num_recommendations]
230
-
231
- st.success(f"Generated {len(recommendations)} simulated DLRM recommendations!")
232
-
233
- st.subheader("🎯 Simulated DLRM Recommendations")
 
 
234
 
235
- for i, (book_isbn, score) in enumerate(recommendations, 1):
236
- with st.expander(f"{i}. Recommendation (Simulated DLRM Score: {score:.4f})", expanded=(i <= 2)):
237
- display_book_info(book_isbn, books_df, show_rating=score)
238
-
239
- # Additional info
240
- st.markdown(f"""
241
- **πŸ“Š Prediction Details:**
242
- - User ID: {selected_user_id}
243
- - Book ISBN: {book_isbn}
244
- - Simulated DLRM Confidence: {score:.1%}
245
- - Recommendation Rank: #{i}
246
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
  with tab2:
249
- st.header("πŸ“Š How DLRM Works for Book Recommendations")
250
-
251
- st.markdown("""
252
- ## πŸ€– Deep Learning Recommendation Model (DLRM)
253
-
254
- DLRM is specifically designed for recommendation systems and offers several advantages over traditional approaches:
255
-
256
- ### πŸ—οΈ Architecture Benefits:
257
- """)
258
 
259
  col1, col2 = st.columns(2)
260
 
261
  with col1:
262
- st.markdown("""
263
- **πŸ”§ Technical Features:**
264
- - Multi-feature processing
265
- - Embedding tables for categorical features
266
- - Cross-feature interactions
267
- - Scalable design for large datasets
268
- - Real-time inference capability
269
- """)
270
 
271
  with col2:
272
- st.markdown("""
273
- **πŸ“Š Input Features:**
274
- - User ID, Age, Location
275
- - Book ID, Publisher, Publication Year
276
- - Rating patterns and user activity
277
- - Cross-feature interactions
278
- """)
279
-
280
- st.markdown("""
281
- ### 🎯 Why DLRM vs Traditional Methods:
282
-
283
- | Feature | DLRM | Traditional CF | Content-Based |
284
- |---------|------|----------------|---------------|
285
- | **Feature Integration** | ✅ Excellent | ❌ Limited | ⚠️ Moderate |
286
- | **Cold Start Problem** | ✅ Handles well | ❌ Poor | ✅ Good |
287
- | **Scalability** | ✅ Highly scalable | ⚠️ Moderate | ✅ Good |
288
- | **Accuracy** | ✅ High | ⚠️ Moderate | ⚠️ Moderate |
289
- | **Real-time Inference** | ✅ Fast | ⚠️ Slow | ✅ Fast |
290
-
291
- ### πŸ’‘ Best Use Cases:
292
- - **E-commerce**: Product recommendations
293
- - **Streaming**: Content recommendations
294
- - **Publishing**: Book/article suggestions
295
- - **Social Media**: Feed optimization
296
- """)
297
 
298
- # Demo architecture visualization
299
- st.subheader("πŸ—οΈ DLRM Architecture Overview")
300
-
301
- st.markdown("""
302
- ```
303
- User Features Book Features
304
- β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
305
- β”‚ User ID β”‚ β”‚ Book ID β”‚
306
- β”‚ Age Group β”‚ β”‚ Publisher β”‚
307
- β”‚ Location β”‚ β”‚ Decade β”‚
308
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
309
- β”‚ β”‚
310
- β–Ό β–Ό
311
- β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
312
- β”‚ Embedding Tables β”‚
313
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
314
- β”‚
315
- β–Ό
316
- β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
317
- β”‚ Cross-Feature Network β”‚
318
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
319
- β”‚
320
- β–Ό
321
- β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
322
- β”‚ Rating Prediction β”‚
323
- β”‚ (0.0 - 1.0 score) β”‚
324
- β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
325
- ```
326
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
  with tab3:
329
- st.header("πŸ” Book Explorer")
330
- st.info("Browse sample books and see simulated DLRM predictions")
331
-
332
- # Book selection
333
- selected_book_isbn = st.selectbox("Select a book", books_df['ISBN'].tolist())
334
- selected_user_for_prediction = st.selectbox("Select user for prediction", users_df['User-ID'].tolist(), key="pred_user")
335
-
336
- # Display selected book
337
- st.subheader("πŸ“š Selected Book")
338
- display_book_info(selected_book_isbn, books_df)
339
 
340
- # Show prediction
341
- if st.button("🎯 Get Simulated DLRM Prediction"):
342
- with st.spinner("Calculating simulated prediction..."):
343
- prediction_score = simulate_dlrm_prediction(selected_user_for_prediction, selected_book_isbn)
344
 
345
- st.success(f"Simulated DLRM Prediction: {prediction_score:.4f}")
 
 
 
 
 
 
 
 
346
 
347
- # Interpretation
348
- if prediction_score > 0.7:
349
- st.success("🎯 High recommendation confidence - User likely to enjoy this book!")
350
- elif prediction_score > 0.5:
351
- st.info("βš–οΈ Moderate recommendation confidence - Could be interesting for user")
352
- else:
353
- st.warning("πŸ“‰ Low recommendation confidence - May not match user preferences")
 
 
 
354
 
355
- # All books overview
356
- st.subheader("πŸ“š All Sample Books")
357
 
358
- for _, book in books_df.iterrows():
359
- with st.expander(f"{book['Book-Title']} by {book['Book-Author']}"):
360
- col1, col2 = st.columns([2, 1])
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
- with col1:
363
- st.write(f"**ISBN:** {book['ISBN']}")
364
- st.write(f"**Publisher:** {book['Publisher']}")
365
- st.write(f"**Year:** {book['Year-Of-Publication']}")
366
 
367
- with col2:
368
- # Show ratings from sample users
369
- book_ratings = ratings_df[ratings_df['ISBN'] == book['ISBN']]
370
- if len(book_ratings) > 0:
371
- avg_rating = book_ratings['Book-Rating'].mean()
372
- st.metric("Avg Rating", f"{avg_rating:.1f}/10")
373
- st.metric("# Ratings", len(book_ratings))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
- # Footer
376
  st.markdown("---")
377
  st.markdown("""
378
- ### πŸš€ About this Demo
 
 
379
 
380
- This is a **Hugging Face Space** compatible version of a DLRM Book Recommendation System:
 
 
 
 
381
 
382
- - **CPU-only processing**: No GPU or NVIDIA drivers required
383
- - **Simulated predictions**: Demonstrates DLRM concept with heuristic-based scoring
384
- - **Sample dataset**: 5 popular books and 5 sample users
385
- - **Educational purpose**: Shows how DLRM would work in production
386
 
387
- **For production use:**
388
- - Train actual DLRM model with PyTorch/TorchRec
389
- - Use full book datasets (millions of books/users)
390
- - Deploy on GPU infrastructure for better performance
391
- - Implement proper feature engineering and preprocessing
392
 
393
- **πŸ”— Learn more about DLRM:** [Facebook Research DLRM](https://github.com/facebookresearch/dlrm)
 
 
 
 
 
 
 
 
 
 
 
394
  """)
395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  if __name__ == "__main__":
397
- main()
 
1
  """
2
+ Streamlit Dashboard for DLRM Book Recommendation System
3
+ Simple interface for DLRM-based book recommendations
4
  """
5
 
 
 
6
  import streamlit as st
 
 
 
 
 
 
 
 
7
  import pandas as pd
8
  import numpy as np
9
+ # import torch
10
+ import pickle
11
+ import os
12
+ import sys
13
+ from typing import Dict, List, Tuple, Optional
14
  import warnings
15
  warnings.filterwarnings('ignore')
16
 
17
+ # Try to import DLRM components
18
+ try:
19
+ sys.path.append('.')
20
+ from dlrm_inference import DLRMBookRecommender, load_dlrm_recommender
21
+ DLRM_AVAILABLE = True
22
+ except ImportError as e:
23
+ DLRM_AVAILABLE = False
24
+ st.error(f"DLRM components not available: {e}")
25
+
26
  # Page configuration
27
  st.set_page_config(
28
  page_title="DLRM Book Recommendations",
 
40
  text-align: center;
41
  margin-bottom: 2rem;
42
  }
43
+ .metric-card {
44
+ background-color: #f0f2f6;
45
+ padding: 1rem;
46
+ border-radius: 0.5rem;
47
+ border-left: 5px solid #1f77b4;
48
+ }
49
+ .dlrm-explanation {
50
+ background-color: #e8f4fd;
51
+ padding: 1rem;
52
  border-radius: 0.5rem;
53
+ border-left: 4px solid #0066cc;
54
  margin: 1rem 0;
 
55
  }
56
  .book-card {
57
  background-color: #ffffff;
 
64
  """, unsafe_allow_html=True)
65
 
66
  @st.cache_data
67
+ def load_data():
68
+ """Load and cache the book data"""
69
+ try:
70
+ books_df = pd.read_csv('Books.csv', encoding='latin-1', low_memory=False)
71
+ users_df = pd.read_csv('Users.csv', encoding='latin-1', low_memory=False)
72
+ ratings_df = pd.read_csv('Ratings.csv', encoding='latin-1', low_memory=False)
73
+
74
+ # Clean column names
75
+ books_df.columns = books_df.columns.str.replace('"', '')
76
+ users_df.columns = users_df.columns.str.replace('"', '')
77
+ ratings_df.columns = ratings_df.columns.str.replace('"', '')
78
+
79
+ return books_df, users_df, ratings_df
80
+ except Exception as e:
81
+ st.error(f"Error loading data: {e}")
82
+ return None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
+ @st.cache_resource
85
+ def load_dlrm_model():
86
+ """Load and cache the DLRM model"""
87
+ if not DLRM_AVAILABLE:
88
+ return None
 
 
 
 
 
 
 
 
 
 
89
 
90
+ try:
91
+ recommender = load_dlrm_recommender("file")
92
+ return recommender
93
+ except Exception as e:
94
+ st.error(f"Error loading DLRM model: {e}")
95
+ return None
96
 
97
  def display_book_info(book_isbn, books_df, show_rating=None):
98
+ """Display book information with actual book cover"""
99
  book_info = books_df[books_df['ISBN'] == book_isbn]
100
 
101
  if len(book_info) == 0:
 
107
  col1, col2 = st.columns([1, 3])
108
 
109
  with col1:
110
+ # Try to display actual book cover from Image-URL-M
111
+ image_url = book.get('Image-URL-M', '')
112
+
113
+ if image_url and pd.notna(image_url) and str(image_url) != 'nan':
114
+ try:
115
+ # Clean the URL (sometimes there are issues with Amazon URLs)
116
+ clean_url = str(image_url).strip()
117
+ if clean_url and 'http' in clean_url:
118
+ st.image(clean_url, width=150, caption="📚")
119
+ else:
120
+ # Fallback to placeholder
121
+ st.image("https://via.placeholder.com/150x200?text=📚&color=1f77b4&bg=f0f2f6", width=150)
122
+ except Exception as e:
123
+ # If image loading fails, show placeholder
124
+ st.image("https://via.placeholder.com/150x200?text=📚&color=1f77b4&bg=f0f2f6", width=150)
125
+ st.caption("⚠️ Cover unavailable")
126
+ else:
127
+ # Show placeholder if no image URL
128
+ st.image("https://via.placeholder.com/150x200?text=📚&color=1f77b4&bg=f0f2f6", width=150)
129
+ st.caption("📚 No cover")
130
 
131
  with col2:
132
  st.markdown(f"**{book['Book-Title']}**")
 
142
  # Header
143
  st.markdown('<h1 class="main-header">📚 DLRM Book Recommendation System</h1>', unsafe_allow_html=True)
144
  st.markdown("### Deep Learning Recommendation Model for Personalized Book Suggestions")
145
+ st.markdown("---")
146
 
147
+ if not DLRM_AVAILABLE:
148
+ st.error("DLRM components are not available. Please ensure TorchRec is properly installed.")
149
+ st.info("To install TorchRec: `pip install torchrec`")
150
+ return
 
 
151
 
152
+ # Load data
153
+ with st.spinner("Loading book data..."):
154
+ books_df, users_df, ratings_df = load_data()
155
+
156
+ if books_df is None:
157
+ st.error("Failed to load data. Please check if CSV files are available.")
158
+ return
159
 
160
  # Sidebar info
161
+ st.sidebar.title("📊 Dataset Information")
162
+ st.sidebar.metric("📚 Books", f"{len(books_df):,}")
163
+ st.sidebar.metric("👥 Users", f"{len(users_df):,}")
164
+ st.sidebar.metric("⭐ Ratings", f"{len(ratings_df):,}")
165
+
166
+ # Load DLRM model
167
+ with st.spinner("Loading DLRM model..."):
168
+ recommender = load_dlrm_model()
169
+
170
+ if recommender is None or recommender.model is None:
171
+ st.error("❌ DLRM model not available")
172
+ st.info("Please run the training script first: `python train_dlrm_books.py`")
173
+
174
+ st.markdown("### Available Options:")
175
+ st.markdown("1. **Train DLRM Model**: Run `python train_dlrm_books.py`")
176
+ st.markdown("2. **Prepare Data**: Run `python dlrm_book_recommender.py`")
177
+ st.markdown("3. **Check Files**: Ensure preprocessing files exist")
178
+
179
+ return
180
 
181
+ st.success("✅ DLRM model loaded successfully!")
182
+
183
+ # Model info
184
  st.sidebar.markdown("---")
185
+ st.sidebar.subheader("🤖 DLRM Model Info")
186
+ if recommender.preprocessing_info:
187
+ st.sidebar.write(f"Dense features: {len(recommender.dense_cols)}")
188
+ st.sidebar.write(f"Categorical features: {len(recommender.cat_cols)}")
189
+ st.sidebar.write(f"Embedding dim: 64")
 
 
190
 
191
  # Main interface
192
+ tab1, tab2, tab3, tab4 = st.tabs(["🎯 Get Recommendations", "🔍 Test Predictions", "📊 Model Analysis", "📸 Book Gallery"])
193
 
194
  with tab1:
195
+ st.header("🎯 DLRM Book Recommendations")
196
+ st.info("Get personalized book recommendations using the trained DLRM model")
197
 
198
  # User selection
199
  col1, col2 = st.columns([2, 1])
200
 
201
  with col1:
202
+ user_ids = sorted(users_df['User-ID'].unique())
203
+ selected_user_id = st.selectbox("Select a user", user_ids[:1000]) # Limit for performance
204
 
205
  with col2:
206
+ num_recommendations = st.slider("Number of recommendations", 5, 20, 10)
207
 
208
  # Show user info
209
  user_info = users_df[users_df['User-ID'] == selected_user_id]
 
214
  # User's reading history
215
  user_ratings = ratings_df[ratings_df['User-ID'] == selected_user_id]
216
  if len(user_ratings) > 0:
217
+ with st.expander(f"📖 User's Reading History ({len(user_ratings)} books)", expanded=False):
218
+ top_rated = user_ratings.sort_values('Book-Rating', ascending=False).head(10)
219
+ for _, rating in top_rated.iterrows():
220
  book_info = books_df[books_df['ISBN'] == rating['ISBN']]
221
  if len(book_info) > 0:
222
  book = book_info.iloc[0]
223
  st.write(f"• **{book['Book-Title']}** by {book['Book-Author']} - {rating['Book-Rating']}/10 ⭐")
224
 
225
+ if st.button("🚀 Get DLRM Recommendations", type="primary"):
226
+ with st.spinner("🤖 DLRM is analyzing user preferences..."):
227
 
228
+ # Get candidate books (popular books not rated by user)
229
  user_rated_books = set(user_ratings['ISBN']) if len(user_ratings) > 0 else set()
 
230
 
231
+ # Get popular books as candidates
232
+ book_popularity = ratings_df.groupby('ISBN').size().sort_values(ascending=False)
233
+ candidate_books = [isbn for isbn in book_popularity.head(100).index if isbn not in user_rated_books]
 
 
234
 
235
+ if len(candidate_books) < num_recommendations:
236
+ candidate_books = book_popularity.head(200).index.tolist()
237
+
238
+ # Get recommendations
239
+ recommendations = recommender.get_user_recommendations(
240
+ user_id=selected_user_id,
241
+ candidate_books=candidate_books,
242
+ k=num_recommendations
243
+ )
244
 
245
+ if recommendations:
246
+ st.success(f"Generated {len(recommendations)} DLRM recommendations!")
247
+
248
+ st.subheader("🎯 DLRM Recommendations")
249
+
250
+ for i, (book_isbn, score) in enumerate(recommendations, 1):
251
+ book_info = books_df[books_df['ISBN'] == book_isbn]
252
+ if len(book_info) > 0:
253
+ with st.expander(f"{i}. Recommendation (DLRM Score: {score:.4f})", expanded=(i <= 3)):
254
+ display_book_info(book_isbn, books_df, show_rating=score)
255
+
256
+ # Additional book stats
257
+ book_ratings = ratings_df[ratings_df['ISBN'] == book_isbn]
258
+ if len(book_ratings) > 0:
259
+ avg_rating = book_ratings['Book-Rating'].mean()
260
+ num_ratings = len(book_ratings)
261
+
262
+ st.markdown('<div class="dlrm-explanation">', unsafe_allow_html=True)
263
+ st.markdown("**📊 Book Statistics:**")
264
+ st.write(f"Average Rating: {avg_rating:.1f}/10 from {num_ratings} readers")
265
+ st.write(f"DLRM Confidence: {score:.1%}")
266
+ st.markdown('</div>', unsafe_allow_html=True)
267
+ else:
268
+ st.write(f"Book with ISBN {book_isbn} not found in database")
269
+ else:
270
+ st.warning("No recommendations generated")
271
 
272
  with tab2:
273
+ st.header("🔍 Test DLRM Predictions")
274
+ st.info("Test how well DLRM predicts actual user ratings")
 
 
 
 
 
 
 
275
 
276
  col1, col2 = st.columns(2)
277
 
278
  with col1:
279
+ test_user_id = st.selectbox("Select user for testing", user_ids[:500], key="test_user")
 
 
 
 
 
 
 
280
 
281
  with col2:
282
+ test_mode = st.radio("Test mode", ["Random books", "User's actual books"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
+ if st.button("🧪 Test Predictions", type="secondary"):
285
+ with st.spinner("Testing DLRM predictions..."):
286
+
287
+ if test_mode == "User's actual books":
288
+ # Test on user's actual rated books
289
+ user_test_ratings = ratings_df[ratings_df['User-ID'] == test_user_id].sample(min(10, len(user_ratings)))
290
+
291
+ if len(user_test_ratings) > 0:
292
+ st.subheader("🎯 DLRM vs Actual Ratings")
293
+
294
+ predictions = []
295
+ actuals = []
296
+
297
+ for _, rating in user_test_ratings.iterrows():
298
+ book_isbn = rating['ISBN']
299
+ actual_rating = rating['Book-Rating']
300
+
301
+ # Get DLRM prediction
302
+ dlrm_score = recommender.predict_rating(test_user_id, book_isbn)
303
+
304
+ predictions.append(dlrm_score)
305
+ actuals.append(actual_rating >= 6) # Convert to binary
306
+
307
+ # Display comparison
308
+ book_info = books_df[books_df['ISBN'] == book_isbn]
309
+ if len(book_info) > 0:
310
+ book = book_info.iloc[0]
311
+
312
+ col1, col2, col3 = st.columns([2, 1, 1])
313
+ with col1:
314
+ st.write(f"**{book['Book-Title']}**")
315
+ st.write(f"*by {book['Book-Author']}*")
316
+
317
+ with col2:
318
+ st.metric("Actual Rating", f"{actual_rating}/10")
319
+
320
+ with col3:
321
+ st.metric("DLRM Score", f"{dlrm_score:.3f}")
322
+
323
+ # Calculate accuracy
324
+ if predictions and actuals:
325
+ # Convert DLRM scores to binary predictions
326
+ binary_preds = [1 if p > 0.5 else 0 for p in predictions]
327
+ accuracy = sum(p == a for p, a in zip(binary_preds, actuals)) / len(actuals)
328
+
329
+ st.markdown("---")
330
+ st.success(f"🎯 DLRM Accuracy: {accuracy:.1%}")
331
+
332
+ # Show correlation
333
+ actual_numeric = [rating['Book-Rating'] for _, rating in user_test_ratings.iterrows()]
334
+ correlation = np.corrcoef(predictions, actual_numeric)[0, 1] if len(predictions) > 1 else 0
335
+ st.info(f"📊 Correlation with actual ratings: {correlation:.3f}")
336
+
337
+ else:
338
+ st.warning("No ratings found for this user")
339
+
340
+ else:
341
+ # Test on random books
342
+ random_books = books_df.sample(10)['ISBN'].tolist()
343
+
344
+ st.subheader("🎲 Random Book Predictions")
345
+
346
+ for book_isbn in random_books:
347
+ dlrm_score = recommender.predict_rating(test_user_id, book_isbn)
348
+
349
+ book_info = books_df[books_df['ISBN'] == book_isbn]
350
+ if len(book_info) > 0:
351
+ book = book_info.iloc[0]
352
+
353
+ col1, col2 = st.columns([3, 1])
354
+ with col1:
355
+ st.write(f"**{book['Book-Title']}** by *{book['Book-Author']}*")
356
+
357
+ with col2:
358
+ st.metric("DLRM Score", f"{dlrm_score:.4f}")
359
 
360
  with tab3:
361
+ st.header("📊 DLRM Model Analysis")
362
+ st.info("Analysis of the DLRM model performance and characteristics")
 
 
 
 
 
 
 
 
363
 
364
+ # Model architecture info
365
+ if recommender and recommender.preprocessing_info:
366
+ col1, col2 = st.columns(2)
 
367
 
368
+ with col1:
369
+ st.subheader("🏗️ Model Architecture")
370
+ st.write(f"**Dense Features ({len(recommender.dense_cols)}):**")
371
+ for col in recommender.dense_cols:
372
+ st.write(f"• {col}")
373
+
374
+ st.write(f"**Categorical Features ({len(recommender.cat_cols)}):**")
375
+ for i, col in enumerate(recommender.cat_cols):
376
+ st.write(f"• {col}: {recommender.emb_counts[i]} embeddings")
377
 
378
+ with col2:
379
+ st.subheader("📈 Dataset Statistics")
380
+ total_samples = recommender.preprocessing_info.get('total_samples', 0)
381
+ positive_rate = recommender.preprocessing_info.get('positive_rate', 0)
382
+
383
+ st.metric("Total Samples", f"{total_samples:,}")
384
+ st.metric("Positive Rate", f"{positive_rate:.1%}")
385
+ st.metric("Train Samples", f"{recommender.preprocessing_info.get('train_samples', 0):,}")
386
+ st.metric("Validation Samples", f"{recommender.preprocessing_info.get('val_samples', 0):,}")
387
+ st.metric("Test Samples", f"{recommender.preprocessing_info.get('test_samples', 0):,}")
388
 
389
+ # Feature importance analysis
390
+ st.subheader("🔍 Feature Analysis")
391
 
392
+ if st.button("Analyze Feature Importance"):
393
+ with st.spinner("Analyzing feature importance..."):
394
+
395
+ # Sample some users and books
396
+ sample_users = users_df['User-ID'].sample(20).tolist()
397
+ sample_books = books_df['ISBN'].sample(20).tolist()
398
+
399
+ # Test different feature combinations
400
+ st.write("**Feature Impact Analysis:**")
401
+
402
+ base_predictions = []
403
+ for user_id in sample_users[:5]:
404
+ for book_isbn in sample_books[:5]:
405
+ score = recommender.predict_rating(user_id, book_isbn)
406
+ base_predictions.append(score)
407
 
408
+ avg_prediction = np.mean(base_predictions)
409
+ st.metric("Average Prediction Score", f"{avg_prediction:.4f}")
 
 
410
 
411
+ st.success("✅ Feature analysis completed!")
412
+
413
+ # Load training results if available
414
+ if os.path.exists('dlrm_book_training_results.pkl'):
415
+ with open('dlrm_book_training_results.pkl', 'rb') as f:
416
+ training_results = pickle.load(f)
417
+
418
+ st.subheader("📈 Training Results")
419
+
420
+ col1, col2 = st.columns(2)
421
+
422
+ with col1:
423
+ st.metric("Final Validation AUROC", f"{training_results.get('final_val_auroc', 0):.4f}")
424
+ st.metric("Test AUROC", f"{training_results.get('test_auroc', 0):.4f}")
425
+
426
+ with col2:
427
+ val_history = training_results.get('val_aurocs_history', [])
428
+ if val_history:
429
+ st.line_chart(pd.DataFrame({
430
+ 'Epoch': range(len(val_history)),
431
+ 'Validation AUROC': val_history
432
+ }).set_index('Epoch'))
433
 
434
+ # Instructions
435
  st.markdown("---")
436
  st.markdown("""
437
+ ## 🚀 How DLRM Works for Book Recommendations
438
+
439
+ **DLRM (Deep Learning Recommendation Model)** is specifically designed for recommendation systems and offers several advantages:
440
 
441
+ ### 🏗️ Architecture Benefits:
442
+ - **Multi-feature Processing**: Handles both categorical (user ID, book ID, publisher) and numerical (age, ratings) features
443
+ - **Embedding Tables**: Learns rich representations for categorical features
444
+ - **Cross-feature Interactions**: Captures complex relationships between different features
445
+ - **Scalable Design**: Efficiently handles large-scale recommendation datasets
446
 
447
+ ### 📊 Features Used:
448
+ **Categorical Features:**
449
+ - User ID, Book ID, Publisher, Country, Age Group, Publication Decade, Rating Level
 
450
 
451
+ **Dense Features:**
452
+ - Normalized Age, Publication Year, User Activity, Book Popularity, Average Ratings
 
 
 
453
 
454
+ ### 🎯 Why DLRM vs LLM for Recommendations:
455
+ - **Purpose-built**: Specifically designed for recommendation systems
456
+ - **Feature Integration**: Better at combining diverse feature types
457
+ - **Scalability**: More efficient for large-scale recommendation tasks
458
+ - **Performance**: Higher accuracy for rating prediction tasks
459
+ - **Production Ready**: Optimized for real-time inference
460
+
461
+ ### 💡 Best Use Cases:
462
+ - **Personalized Recommendations**: Based on user behavior and item characteristics
463
+ - **Rating Prediction**: Accurately predicts user preferences
464
+ - **Cold Start**: Handles new users and items through content features
465
+ - **Real-time Serving**: Fast inference for production systems
466
  """)
467
 
468
+ with tab4:
469
+ st.header("📸 Book Gallery")
470
+ st.info("Browse book covers and discover new titles")
471
+
472
+ # Gallery options
473
+ col1, col2 = st.columns([2, 1])
474
+
475
+ with col1:
476
+ gallery_mode = st.selectbox(
477
+ "Choose gallery mode",
478
+ ["Popular Books", "Recent Publications", "Random Selection", "Search Results"]
479
+ )
480
+
481
+ with col2:
482
+ books_per_row = st.slider("Books per row", 2, 6, 4)
483
+ max_books = st.slider("Maximum books", 10, 50, 20)
484
+
485
+ # Get books based on selected mode
486
+ if gallery_mode == "Popular Books":
487
+ # Get most rated books
488
+ book_popularity = ratings_df.groupby('ISBN').size().sort_values(ascending=False)
489
+ gallery_books = books_df[books_df['ISBN'].isin(book_popularity.head(max_books).index)]
490
+
491
+ elif gallery_mode == "Recent Publications":
492
+ # Get recent books
493
+ books_df_temp = books_df.copy()
494
+ books_df_temp['Year-Of-Publication'] = pd.to_numeric(books_df_temp['Year-Of-Publication'], errors='coerce')
495
+ recent_books = books_df_temp.sort_values('Year-Of-Publication', ascending=False, na_position='last')
496
+ gallery_books = recent_books.head(max_books)
497
+
498
+ elif gallery_mode == "Random Selection":
499
+ # Random books
500
+ gallery_books = books_df.sample(min(max_books, len(books_df)))
501
+
502
+ else: # Search Results
503
+ search_query = st.text_input("Search books for gallery", placeholder="Enter title, author, or publisher")
504
+ if search_query:
505
+ mask = (
506
+ books_df['Book-Title'].str.contains(search_query, case=False, na=False) |
507
+ books_df['Book-Author'].str.contains(search_query, case=False, na=False) |
508
+ books_df['Publisher'].str.contains(search_query, case=False, na=False)
509
+ )
510
+ gallery_books = books_df[mask].head(max_books)
511
+ else:
512
+ gallery_books = books_df.head(max_books)
513
+
514
+ # Display gallery
515
+ if len(gallery_books) > 0:
516
+ st.markdown(f"**πŸ“š Showing {len(gallery_books)} books**")
517
+
518
+ # Create grid layout
519
+ books_list = gallery_books.to_dict('records')
520
+
521
+ # Display books in rows
522
+ for i in range(0, len(books_list), books_per_row):
523
+ cols = st.columns(books_per_row)
524
+
525
+ for j, col in enumerate(cols):
526
+ if i + j < len(books_list):
527
+ book = books_list[i + j]
528
+
529
+ with col:
530
+ # Book cover
531
+ image_url = book.get('Image-URL-M', '')
532
+
533
+ if image_url and pd.notna(image_url) and str(image_url) != 'nan':
534
+ try:
535
+ clean_url = str(image_url).strip()
536
+ if clean_url and 'http' in clean_url:
537
+ st.image(clean_url, width='stretch')
538
+ else:
539
+ st.image("https://via.placeholder.com/150x200?text=πŸ“š&color=1f77b4&bg=f0f2f6", width='stretch')
540
+ except:
541
+ st.image("https://via.placeholder.com/150x200?text=πŸ“š&color=1f77b4&bg=f0f2f6", width='stretch')
542
+ else:
543
+ st.image("https://via.placeholder.com/150x200?text=πŸ“š&color=1f77b4&bg=f0f2f6", width='stretch')
544
+
545
+ # Book info
546
+ title = book['Book-Title']
547
+ if len(title) > 40:
548
+ title = title[:37] + "..."
549
+
550
+ author = book['Book-Author']
551
+ if len(author) > 25:
552
+ author = author[:22] + "..."
553
+
554
+ st.markdown(f"**{title}**")
555
+ st.write(f"*{author}*")
556
+ st.write(f"πŸ“… {book.get('Year-Of-Publication', 'Unknown')}")
557
+
558
+ # Book statistics
559
+ book_stats = ratings_df[ratings_df['ISBN'] == book['ISBN']]
560
+ if len(book_stats) > 0:
561
+ avg_rating = book_stats['Book-Rating'].mean()
562
+ num_ratings = len(book_stats)
563
+ st.write(f"⭐ {avg_rating:.1f}/10 ({num_ratings} ratings)")
564
+ else:
565
+ st.write("⭐ No ratings")
566
+
567
+ # DLRM prediction button
568
+ if recommender and recommender.model:
569
+ if st.button(f"🎯 DLRM Score", key=f"dlrm_{book['ISBN']}"):
570
+ with st.spinner("Calculating..."):
571
+ # Use first user as example
572
+ sample_user = users_df['User-ID'].iloc[0]
573
+ dlrm_score = recommender.predict_rating(sample_user, book['ISBN'])
574
+ st.success(f"DLRM Score: {dlrm_score:.3f}")
575
+ else:
576
+ st.info("No books found for the selected criteria")
577
+
578
+ # Quick stats
579
+ st.markdown("---")
580
+ st.subheader("πŸ“Š Gallery Statistics")
581
+
582
+ col1, col2, col3, col4 = st.columns(4)
583
+
584
+ with col1:
585
+ books_with_covers = sum(1 for _, book in gallery_books.iterrows()
586
+ if book.get('Image-URL-M') and pd.notna(book.get('Image-URL-M')))
587
+ st.metric("Books with Covers", f"{books_with_covers}/{len(gallery_books)}")
588
+
589
+ with col2:
590
+ # Convert Year-Of-Publication to numeric, coercing errors to NaN
591
+ years = pd.to_numeric(gallery_books['Year-Of-Publication'], errors='coerce')
592
+ avg_year = years.mean()
593
+ st.metric("Average Publication Year", f"{avg_year:.0f}" if not pd.isna(avg_year) else "Unknown")
594
+
595
+ with col3:
596
+ unique_authors = gallery_books['Book-Author'].nunique()
597
+ st.metric("Unique Authors", unique_authors)
598
+
599
+ with col4:
600
+ unique_publishers = gallery_books['Publisher'].nunique()
601
+ st.metric("Unique Publishers", unique_publishers)
602
+
603
  # Entry-point guard: call main() only when this file is executed as the main module, not when it is imported.
  if __name__ == "__main__":
604
+ main()