edwinbh committed on
Commit
d3f7b04
·
verified ·
1 Parent(s): d27a328

Update src/dlrm_inference.py

Browse files
Files changed (1) hide show
  1. src/dlrm_inference.py +792 -420
src/dlrm_inference.py CHANGED
@@ -1,314 +1,692 @@
1
- """
2
- DLRM Inference Engine for Book Recommendations
3
- Loads trained DLRM model and provides recommendation functionality
4
- """
5
 
6
- import os
7
- import sys
8
- import torch
9
- import numpy as np
10
- import pandas as pd
11
- import pickle
12
- import mlflow
13
- from mlflow import MlflowClient
14
- import tempfile
15
- from typing import List, Dict, Tuple, Optional, Any
16
- from functools import partial
17
- import warnings
18
- warnings.filterwarnings('ignore')
19
 
20
- # Check for CPU_ONLY environment variable
21
- CPU_ONLY = os.environ.get('CPU_ONLY', 'false').lower() == 'true'
22
 
23
- # Disable CUDA if CPU_ONLY is set
24
- if CPU_ONLY:
25
- os.environ['CUDA_VISIBLE_DEVICES'] = ''
26
- print("๐Ÿ”„ Running in CPU-only mode (CUDA disabled)")
27
 
28
- # Only import torchrec if not in CPU_ONLY mode
29
- TORCHREC_AVAILABLE = False
30
- if not CPU_ONLY:
31
- try:
32
- from torchrec import EmbeddingBagCollection
33
- from torchrec.models.dlrm import DLRM, DLRMTrain
34
- from torchrec.modules.embedding_configs import EmbeddingBagConfig
35
- from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
36
- from torchrec.datasets.utils import Batch
37
- TORCHREC_AVAILABLE = True
38
- except ImportError as e:
39
- print(f"โš ๏ธ Warning: torchrec import error: {e}")
40
- print("โš ๏ธ Some functionality will be limited")
41
- else:
42
- print("โš ๏ธ Running in CPU-only mode without torchrec")
43
 
44
- class DLRMBookRecommender:
45
- """DLRM-based book recommender for inference"""
46
 
47
- def __init__(self, model_path: str = None, run_id: str = None):
48
- """
49
- Initialize DLRM book recommender
50
-
51
- Args:
52
- model_path: Path to saved model state dict
53
- run_id: MLflow run ID to load model from
54
- """
55
- self.device = torch.device("cpu")
56
- self.model = None
57
- self.preprocessing_info = None
58
- self.torchrec_available = TORCHREC_AVAILABLE
59
- self.cpu_only = CPU_ONLY
60
- self.dense_cols = []
61
- self.cat_cols = []
62
- self.emb_counts = []
63
-
64
- if self.cpu_only:
65
- print("โš ๏ธ Running in CPU-only mode with limited functionality")
66
- # Load minimal preprocessing info for browsing
67
- self._load_minimal_preprocessing()
68
- return
69
-
70
- if not self.torchrec_available:
71
- print("โš ๏ธ Running in limited mode without torchrec")
72
- return
73
-
74
- # Load preprocessing info
75
- self._load_preprocessing_info()
76
-
77
- # Load model
78
- if model_path and os.path.exists(model_path):
79
- self._load_model_from_path(model_path)
80
- elif run_id:
81
- self._load_model_from_mlflow(run_id)
82
- else:
83
- print("โš ๏ธ No model loaded. Please provide model_path or run_id")
84
 
85
- def _load_minimal_preprocessing(self):
86
- """Load minimal preprocessing info for CPU-only mode"""
87
- try:
88
- if os.path.exists('book_dlrm_preprocessing.pkl'):
89
- with open('book_dlrm_preprocessing.pkl', 'rb') as f:
90
- self.preprocessing_info = pickle.load(f)
91
 
92
- self.dense_cols = self.preprocessing_info.get('dense_cols', [])
93
- self.cat_cols = self.preprocessing_info.get('cat_cols', [])
94
- self.emb_counts = self.preprocessing_info.get('emb_counts', [])
95
 
96
- print("โœ… Minimal preprocessing info loaded for CPU-only mode")
97
- else:
98
- print("โš ๏ธ No preprocessing info found for CPU-only mode")
99
- except Exception as e:
100
- print(f"โš ๏ธ Error loading minimal preprocessing: {e}")
101
-
102
- def _load_preprocessing_info(self):
103
- """Load preprocessing information"""
104
- if os.path.exists('book_dlrm_preprocessing.pkl'):
105
- with open('book_dlrm_preprocessing.pkl', 'rb') as f:
106
- self.preprocessing_info = pickle.load(f)
107
-
108
- self.dense_cols = self.preprocessing_info['dense_cols']
109
- self.cat_cols = self.preprocessing_info['cat_cols']
110
- self.emb_counts = self.preprocessing_info['emb_counts']
111
- self.user_encoder = self.preprocessing_info['user_encoder']
112
- self.book_encoder = self.preprocessing_info['book_encoder']
113
- self.publisher_encoder = self.preprocessing_info['publisher_encoder']
114
- self.location_encoder = self.preprocessing_info['location_encoder']
115
- self.scaler = self.preprocessing_info['scaler']
116
-
117
- print("โœ… Preprocessing info loaded")
118
- else:
119
- raise FileNotFoundError("book_dlrm_preprocessing.pkl not found. Run preprocessing first.")
120
 
121
- def _load_model_from_path(self, model_path: str):
122
- """Load model from saved state dict"""
123
- try:
124
- # Create model architecture
125
- eb_configs = [
126
- EmbeddingBagConfig(
127
- name=f"t_{feature_name}",
128
- embedding_dim=64, # Default embedding dim
129
- num_embeddings=self.emb_counts[feature_idx],
130
- feature_names=[feature_name],
131
- )
132
- for feature_idx, feature_name in enumerate(self.cat_cols)
133
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- dlrm_model = DLRM(
136
- embedding_bag_collection=EmbeddingBagCollection(
137
- tables=eb_configs, device=self.device
138
- ),
139
- dense_in_features=len(self.dense_cols),
140
- dense_arch_layer_sizes=[256, 128, 64],
141
- over_arch_layer_sizes=[512, 256, 128, 1],
142
- dense_device=self.device,
143
- )
144
 
145
- # Load state dict
146
- state_dict = torch.load(model_path, map_location=self.device)
147
 
148
- # Remove 'model.' prefix if present
149
- if any(key.startswith('model.') for key in state_dict.keys()):
150
- state_dict = {k[6:]: v for k, v in state_dict.items()}
151
 
152
- dlrm_model.load_state_dict(state_dict)
153
- self.model = dlrm_model
154
- self.model.eval()
155
 
156
- print(f"โœ… Model loaded from {model_path}")
157
 
158
- except Exception as e:
159
- print(f"โŒ Error loading model: {e}")
160
 
161
- def _load_model_from_mlflow(self, run_id: str):
162
- """Load model from MLflow"""
163
- try:
164
- client = MlflowClient()
165
- run = client.get_run(run_id)
166
-
167
- # Get model parameters from MLflow
168
- params = run.data.params
169
- cat_cols = eval(params.get('cat_cols'))
170
- emb_counts = eval(params.get('emb_counts'))
171
- dense_cols = eval(params.get('dense_cols'))
172
- embedding_dim = int(params.get('embedding_dim', 64))
173
- dense_arch_layer_sizes = eval(params.get('dense_arch_layer_sizes'))
174
- over_arch_layer_sizes = eval(params.get('over_arch_layer_sizes'))
175
-
176
- # Download model from MLflow
177
- temp_dir = tempfile.mkdtemp()
178
-
179
- # Try different artifact paths
180
- for artifact_path in ['model_state_dict_final', 'model_state_dict_2', 'model_state_dict_1', 'model_state_dict_0']:
181
- try:
182
- client.download_artifacts(run_id, f"{artifact_path}/state_dict.pth", temp_dir)
183
- state_dict = mlflow.pytorch.load_state_dict(f"{temp_dir}/{artifact_path}")
184
- break
185
- except:
186
- continue
187
- else:
188
- raise Exception("No model artifacts found")
189
-
190
- # Create model
191
- eb_configs = [
192
- EmbeddingBagConfig(
193
- name=f"t_{feature_name}",
194
- embedding_dim=embedding_dim,
195
- num_embeddings=emb_counts[feature_idx],
196
- feature_names=[feature_name],
197
- )
198
- for feature_idx, feature_name in enumerate(cat_cols)
199
- ]
200
 
201
- dlrm_model = DLRM(
202
- embedding_bag_collection=EmbeddingBagCollection(
203
- tables=eb_configs, device=self.device
204
- ),
205
- dense_in_features=len(dense_cols),
206
- dense_arch_layer_sizes=dense_arch_layer_sizes,
207
- over_arch_layer_sizes=over_arch_layer_sizes,
208
- dense_device=self.device,
209
- )
210
 
211
- # Remove prefix and load state dict
212
- if any(key.startswith('model.') for key in state_dict.keys()):
213
- state_dict = {k[6:]: v for k, v in state_dict.items()}
214
 
215
- dlrm_model.load_state_dict(state_dict)
216
- self.model = dlrm_model
217
- self.model.eval()
218
 
219
- print(f"โœ… Model loaded from MLflow run: {run_id}")
220
 
221
- except Exception as e:
222
- print(f"โŒ Error loading model from MLflow: {e}")
223
 
224
- def _prepare_user_features(self, user_id: int, user_data: Optional[Dict] = None) -> Tuple[torch.Tensor, KeyedJaggedTensor]:
225
- """Prepare user features for inference"""
226
-
227
- if user_data is None:
228
- # Create default user features
229
- user_data = {
230
- 'User-ID': user_id,
231
- 'Age': 30, # Default age
232
- 'Location': 'usa', # Default location
233
- }
234
-
235
- # Encode categorical features
236
- try:
237
- user_id_encoded = self.user_encoder.transform([str(user_id)])[0]
238
- except:
239
- # Handle unknown user
240
- user_id_encoded = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
- try:
243
- location = str(user_data.get('Location', 'usa')).split(',')[-1].strip().lower()
244
- country_encoded = self.location_encoder.transform([location])[0]
245
- except:
246
- country_encoded = 0
247
-
248
- # Age group
249
- age = user_data.get('Age', 30)
250
- if age < 18:
251
- age_group = 0
252
- elif age < 25:
253
- age_group = 1
254
- elif age < 35:
255
- age_group = 2
256
- elif age < 50:
257
- age_group = 3
258
- elif age < 65:
259
- age_group = 4
260
- else:
261
- age_group = 5
262
 
263
- # Get user statistics (if available)
264
- user_activity = user_data.get('user_activity', 10) # Default
265
- user_avg_rating = user_data.get('user_avg_rating', 6.0) # Default
266
- age_normalized = user_data.get('Age', 30)
267
 
268
- # Normalize dense features
269
- dense_features = np.array([[age_normalized, 2000, user_activity, 10, user_avg_rating, 6.0]]) # Default values
270
- dense_features = self.scaler.transform(dense_features)
271
- dense_features = torch.tensor(dense_features, dtype=torch.float32)
272
 
273
- return dense_features, user_id_encoded, country_encoded, age_group
 
 
274
 
275
- def _prepare_book_features(self, book_isbn: str, book_data: Optional[Dict] = None) -> Tuple[int, int, int, int]:
276
- """Prepare book features for inference"""
277
-
278
- if book_data is None:
279
- book_data = {}
280
-
281
- # Encode book ID
282
- try:
283
- book_id_encoded = self.book_encoder.transform([str(book_isbn)])[0]
284
- except:
285
- book_id_encoded = 0
 
 
 
 
286
 
287
- # Encode publisher
288
- try:
289
- publisher = str(book_data.get('Publisher', 'Unknown'))
290
- publisher_encoded = self.publisher_encoder.transform([publisher])[0]
291
- except:
292
- publisher_encoded = 0
293
-
294
- # Publication decade
295
- year = book_data.get('Year-Of-Publication', 2000)
296
- decade = ((int(year) // 10) * 10)
297
- try:
298
- decade_encoded = preprocessing_info.get('decade_encoder', LabelEncoder()).transform([str(decade)])[0]
299
- except:
300
- decade_encoded = 6 # Default to 2000s
301
 
302
- # Rating level (default to medium)
303
- rating_level = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
- return book_id_encoded, publisher_encoded, decade_encoded, rating_level
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
  def predict_rating(self, user_id: int, book_isbn: str,
308
  user_data: Optional[Dict] = None,
309
  book_data: Optional[Dict] = None) -> float:
310
  """
311
- Predict rating probability for user-book pair
312
 
313
  Args:
314
  user_id: User ID
@@ -319,43 +697,79 @@ class DLRMBookRecommender:
319
  Returns:
320
  Prediction probability (0-1)
321
  """
322
- if self.cpu_only:
323
- print("โš ๏ธ Cannot make predictions in CPU-only mode")
324
- return 0.5 # Return default neutral prediction
325
-
326
- if self.model is None:
327
- print("โŒ Model not loaded")
328
- return 0.0
329
-
330
- if not self.torchrec_available:
331
- print("โŒ Cannot make predictions without torchrec")
332
- return 0.5 # Return default neutral prediction
333
-
334
  try:
335
- # Prepare features
336
- dense_features, user_id_encoded, country_encoded, age_group = self._prepare_user_features(user_id, user_data)
337
- book_id_encoded, publisher_encoded, decade_encoded, rating_level = self._prepare_book_features(book_isbn, book_data)
338
-
339
- # Create sparse features
340
- kjt_values = [user_id_encoded, book_id_encoded, publisher_encoded, country_encoded, age_group, decade_encoded, rating_level]
341
- kjt_lengths = [1] * len(kjt_values)
342
-
343
- sparse_features = KeyedJaggedTensor.from_lengths_sync(
344
- self.cat_cols,
345
- torch.tensor(kjt_values),
346
- torch.tensor(kjt_lengths, dtype=torch.int32),
347
- )
348
-
349
- # Make prediction
350
- with torch.no_grad():
351
- logits = self.model(dense_features=dense_features, sparse_features=sparse_features)
352
- prediction = torch.sigmoid(logits).item()
353
-
354
  return prediction
355
 
356
  except Exception as e:
357
  print(f"Error in prediction: {e}")
358
- return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
  def get_user_recommendations(self, user_id: int,
361
  candidate_books: List[str],
@@ -373,14 +787,10 @@ class DLRMBookRecommender:
373
  Returns:
374
  List of (book_isbn, prediction_score) tuples
375
  """
376
- if self.cpu_only or self.model is None or not self.torchrec_available:
377
- print("โŒ Model not loaded, CPU-only mode, or torchrec not available")
378
- return []
379
 
380
  recommendations = []
381
 
382
- print(f"Generating recommendations for user {user_id} from {len(candidate_books)} candidates...")
383
-
384
  for book_isbn in candidate_books:
385
  score = self.predict_rating(user_id, book_isbn, user_data)
386
  recommendations.append((book_isbn, score))
@@ -447,117 +857,108 @@ class DLRMBookRecommender:
447
  if len(scores) > 0:
448
  scores_array = np.array(scores)
449
  # Calculate correlation as similarity measure
450
- correlation = np.corrcoef(target_scores, scores_array)[0, 1]
451
- if not np.isnan(correlation):
452
- similarities.append((book_isbn, correlation))
 
 
 
 
 
453
 
454
  # Sort by similarity and return top-k
455
  similarities.sort(key=lambda x: x[1], reverse=True)
456
  return similarities[:k]
457
 
458
 
459
- def load_dlrm_recommender(model_source: str = "latest") -> DLRMBookRecommender:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  """
461
- Load DLRM recommender from various sources
462
 
463
  Args:
464
- model_source: "latest" for latest MLflow run, "file" for local file, or specific run_id
465
 
466
  Returns:
467
  DLRMBookRecommender instance
468
  """
469
- # Check if we're in CPU-only mode
470
- cpu_only = os.environ.get('CPU_ONLY', 'false').lower() == 'true'
471
- if cpu_only:
472
- print("๐Ÿ”„ Loading recommender in CPU-only mode")
473
- # In CPU-only mode, just return a basic recommender instance
474
- return DLRMBookRecommender()
475
-
476
- # Create recommender instance
477
- recommender = DLRMBookRecommender()
478
-
479
- # If torchrec is not available, return limited recommender
480
- if not TORCHREC_AVAILABLE:
481
- print("โš ๏ธ torchrec not available, returning limited recommender")
482
- return recommender
483
 
484
- if model_source == "latest":
485
- # Try to get latest MLflow run
486
- try:
487
- experiment = mlflow.get_experiment_by_name('dlrm-book-recommendation-book_recommender')
488
- if experiment:
489
- runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
490
- order_by=["start_time desc"], max_results=1)
491
- if len(runs) > 0:
492
- latest_run_id = runs.iloc[0].run_id
493
- recommender = DLRMBookRecommender(run_id=latest_run_id)
494
- return recommender
495
- except Exception as e:
496
- print(f"โš ๏ธ Error loading from MLflow: {e}")
497
-
498
- elif model_source == "file":
499
- # Try to load from local file
500
- for filename in [
501
- '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_final.pth',
502
- '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_2.pth',
503
- '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_0.pth',
504
- '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_1.pth']:
505
- if os.path.exists(filename):
506
- try:
507
- recommender = DLRMBookRecommender(model_path=filename)
508
- return recommender
509
- except Exception as e:
510
- print(f"โš ๏ธ Error loading from {filename}: {e}")
511
-
512
- else:
513
- # Treat as run_id
514
- try:
515
- recommender = DLRMBookRecommender(run_id=model_source)
516
- return recommender
517
- except Exception as e:
518
- print(f"โš ๏ธ Error loading from run_id {model_source}: {e}")
519
 
520
- print("โš ๏ธ Could not load any trained model")
521
  return recommender
522
 
523
 
524
  def demo_dlrm_recommendations():
525
  """Demo function to show DLRM recommendations"""
526
 
527
- print("๐Ÿš€ DLRM Book Recommendation Demo")
528
  print("=" * 50)
529
 
530
- # Load book data for demo
531
- books_df = pd.read_csv('Books.csv', encoding='latin-1', low_memory=False)
532
- users_df = pd.read_csv('Users.csv', encoding='latin-1', low_memory=False)
533
- ratings_df = pd.read_csv('Ratings.csv', encoding='latin-1', low_memory=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
- books_df.columns = books_df.columns.str.replace('"', '')
536
- users_df.columns = users_df.columns.str.replace('"', '')
537
- ratings_df.columns = ratings_df.columns.str.replace('"', '')
538
 
539
  # Load recommender
540
- recommender = load_dlrm_recommender("file")
541
 
542
- if recommender.model is None:
543
- print("โŒ No trained model found. Please run training first.")
544
- return
545
 
546
- # Get sample user and books
547
- sample_user_id = ratings_df['User-ID'].iloc[0]
548
- sample_books = books_df['ISBN'].head(20).tolist()
549
-
550
- print(f"\n๐Ÿ“š Getting recommendations for User {sample_user_id}")
551
  print(f"Testing with {len(sample_books)} candidate books...")
552
 
553
  # Get recommendations
554
  recommendations = recommender.get_user_recommendations(
555
  user_id=sample_user_id,
556
  candidate_books=sample_books,
557
- k=10
 
558
  )
559
 
560
- print(f"\n๐ŸŽฏ Top 10 DLRM Recommendations:")
561
  print("-" * 50)
562
 
563
  for i, (book_isbn, score) in enumerate(recommendations, 1):
@@ -568,42 +969,13 @@ def demo_dlrm_recommendations():
568
  title = book['Book-Title']
569
  author = book['Book-Author']
570
  print(f"{i:2d}. {title} by {author}")
571
- print(f" ISBN: {book_isbn}, Score: {score:.4f}")
572
  else:
573
- print(f"{i:2d}. ISBN: {book_isbn}, Score: {score:.4f}")
574
  print()
575
 
576
- # Show user's actual ratings for comparison
577
- user_ratings = ratings_df[ratings_df['User-ID'] == sample_user_id]
578
- if len(user_ratings) > 0:
579
- print(f"\n๐Ÿ“– User {sample_user_id}'s Actual Reading History:")
580
- print("-" * 50)
581
-
582
- for _, rating in user_ratings.head(5).iterrows():
583
- book_info = books_df[books_df['ISBN'] == rating['ISBN']]
584
- if len(book_info) > 0:
585
- book = book_info.iloc[0]
586
- print(f"โ€ข {book['Book-Title']} by {book['Book-Author']} - Rating: {rating['Book-Rating']}/10")
587
-
588
- # Test book similarity
589
- if len(recommendations) > 0:
590
- target_book = recommendations[0][0]
591
- print(f"\n๐Ÿ” Finding books similar to: {target_book}")
592
-
593
- similar_books = recommender.get_similar_books(
594
- target_book_isbn=target_book,
595
- candidate_books=sample_books,
596
- sample_users=ratings_df['User-ID'].head(10).tolist(),
597
- k=5
598
- )
599
-
600
- print(f"\n๐Ÿ“š Similar Books:")
601
- print("-" * 30)
602
- for i, (book_isbn, similarity) in enumerate(similar_books, 1):
603
- book_info = books_df[books_df['ISBN'] == book_isbn]
604
- if len(book_info) > 0:
605
- book = book_info.iloc[0]
606
- print(f"{i}. {book['Book-Title']} (similarity: {similarity:.3f})")
607
 
608
  if __name__ == "__main__":
609
  demo_dlrm_recommendations()
 
1
+ # """
2
+ # DLRM Inference Engine for Book Recommendations
3
+ # Loads trained DLRM model and provides recommendation functionality
4
+ # """
5
 
6
+ # import os
7
+ # import sys
8
+ # import torch
9
+ # import numpy as np
10
+ # import pandas as pd
11
+ # import pickle
12
+ # import mlflow
13
+ # from mlflow import MlflowClient
14
+ # import tempfile
15
+ # from typing import List, Dict, Tuple, Optional, Any
16
+ # from functools import partial
17
+ # import warnings
18
+ # warnings.filterwarnings('ignore')
19
 
20
+ # # Check for CPU_ONLY environment variable
21
+ # CPU_ONLY = os.environ.get('CPU_ONLY', 'false').lower() == 'true'
22
 
23
+ # # Disable CUDA if CPU_ONLY is set
24
+ # if CPU_ONLY:
25
+ # os.environ['CUDA_VISIBLE_DEVICES'] = ''
26
+ # print("๐Ÿ”„ Running in CPU-only mode (CUDA disabled)")
27
 
28
+ # # Only import torchrec if not in CPU_ONLY mode
29
+ # TORCHREC_AVAILABLE = False
30
+ # if not CPU_ONLY:
31
+ # try:
32
+ # from torchrec import EmbeddingBagCollection
33
+ # from torchrec.models.dlrm import DLRM, DLRMTrain
34
+ # from torchrec.modules.embedding_configs import EmbeddingBagConfig
35
+ # from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
36
+ # from torchrec.datasets.utils import Batch
37
+ # TORCHREC_AVAILABLE = True
38
+ # except ImportError as e:
39
+ # print(f"โš ๏ธ Warning: torchrec import error: {e}")
40
+ # print("โš ๏ธ Some functionality will be limited")
41
+ # else:
42
+ # print("โš ๏ธ Running in CPU-only mode without torchrec")
43
 
44
+ # class DLRMBookRecommender:
45
+ # """DLRM-based book recommender for inference"""
46
 
47
+ # def __init__(self, model_path: str = None, run_id: str = None):
48
+ # """
49
+ # Initialize DLRM book recommender
50
+
51
+ # Args:
52
+ # model_path: Path to saved model state dict
53
+ # run_id: MLflow run ID to load model from
54
+ # """
55
+ # self.device = torch.device("cpu")
56
+ # self.model = None
57
+ # self.preprocessing_info = None
58
+ # self.torchrec_available = TORCHREC_AVAILABLE
59
+ # self.cpu_only = CPU_ONLY
60
+ # self.dense_cols = []
61
+ # self.cat_cols = []
62
+ # self.emb_counts = []
63
+
64
+ # if self.cpu_only:
65
+ # print("โš ๏ธ Running in CPU-only mode with limited functionality")
66
+ # # Load minimal preprocessing info for browsing
67
+ # self._load_minimal_preprocessing()
68
+ # return
69
+
70
+ # if not self.torchrec_available:
71
+ # print("โš ๏ธ Running in limited mode without torchrec")
72
+ # return
73
+
74
+ # # Load preprocessing info
75
+ # self._load_preprocessing_info()
76
+
77
+ # # Load model
78
+ # if model_path and os.path.exists(model_path):
79
+ # self._load_model_from_path(model_path)
80
+ # elif run_id:
81
+ # self._load_model_from_mlflow(run_id)
82
+ # else:
83
+ # print("โš ๏ธ No model loaded. Please provide model_path or run_id")
84
 
85
+ # def _load_minimal_preprocessing(self):
86
+ # """Load minimal preprocessing info for CPU-only mode"""
87
+ # try:
88
+ # if os.path.exists('book_dlrm_preprocessing.pkl'):
89
+ # with open('book_dlrm_preprocessing.pkl', 'rb') as f:
90
+ # self.preprocessing_info = pickle.load(f)
91
 
92
+ # self.dense_cols = self.preprocessing_info.get('dense_cols', [])
93
+ # self.cat_cols = self.preprocessing_info.get('cat_cols', [])
94
+ # self.emb_counts = self.preprocessing_info.get('emb_counts', [])
95
 
96
+ # print("โœ… Minimal preprocessing info loaded for CPU-only mode")
97
+ # else:
98
+ # print("โš ๏ธ No preprocessing info found for CPU-only mode")
99
+ # except Exception as e:
100
+ # print(f"โš ๏ธ Error loading minimal preprocessing: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ # def _load_preprocessing_info(self):
103
+ # """Load preprocessing information"""
104
+ # if os.path.exists('book_dlrm_preprocessing.pkl'):
105
+ # with open('book_dlrm_preprocessing.pkl', 'rb') as f:
106
+ # self.preprocessing_info = pickle.load(f)
107
+
108
+ # self.dense_cols = self.preprocessing_info['dense_cols']
109
+ # self.cat_cols = self.preprocessing_info['cat_cols']
110
+ # self.emb_counts = self.preprocessing_info['emb_counts']
111
+ # self.user_encoder = self.preprocessing_info['user_encoder']
112
+ # self.book_encoder = self.preprocessing_info['book_encoder']
113
+ # self.publisher_encoder = self.preprocessing_info['publisher_encoder']
114
+ # self.location_encoder = self.preprocessing_info['location_encoder']
115
+ # self.scaler = self.preprocessing_info['scaler']
116
+
117
+ # print("โœ… Preprocessing info loaded")
118
+ # else:
119
+ # raise FileNotFoundError("book_dlrm_preprocessing.pkl not found. Run preprocessing first.")
120
+
121
+ # def _load_model_from_path(self, model_path: str):
122
+ # """Load model from saved state dict"""
123
+ # try:
124
+ # # Create model architecture
125
+ # eb_configs = [
126
+ # EmbeddingBagConfig(
127
+ # name=f"t_{feature_name}",
128
+ # embedding_dim=64, # Default embedding dim
129
+ # num_embeddings=self.emb_counts[feature_idx],
130
+ # feature_names=[feature_name],
131
+ # )
132
+ # for feature_idx, feature_name in enumerate(self.cat_cols)
133
+ # ]
134
 
135
+ # dlrm_model = DLRM(
136
+ # embedding_bag_collection=EmbeddingBagCollection(
137
+ # tables=eb_configs, device=self.device
138
+ # ),
139
+ # dense_in_features=len(self.dense_cols),
140
+ # dense_arch_layer_sizes=[256, 128, 64],
141
+ # over_arch_layer_sizes=[512, 256, 128, 1],
142
+ # dense_device=self.device,
143
+ # )
144
 
145
+ # # Load state dict
146
+ # state_dict = torch.load(model_path, map_location=self.device)
147
 
148
+ # # Remove 'model.' prefix if present
149
+ # if any(key.startswith('model.') for key in state_dict.keys()):
150
+ # state_dict = {k[6:]: v for k, v in state_dict.items()}
151
 
152
+ # dlrm_model.load_state_dict(state_dict)
153
+ # self.model = dlrm_model
154
+ # self.model.eval()
155
 
156
+ # print(f"โœ… Model loaded from {model_path}")
157
 
158
+ # except Exception as e:
159
+ # print(f"โŒ Error loading model: {e}")
160
 
161
+ # def _load_model_from_mlflow(self, run_id: str):
162
+ # """Load model from MLflow"""
163
+ # try:
164
+ # client = MlflowClient()
165
+ # run = client.get_run(run_id)
166
+
167
+ # # Get model parameters from MLflow
168
+ # params = run.data.params
169
+ # cat_cols = eval(params.get('cat_cols'))
170
+ # emb_counts = eval(params.get('emb_counts'))
171
+ # dense_cols = eval(params.get('dense_cols'))
172
+ # embedding_dim = int(params.get('embedding_dim', 64))
173
+ # dense_arch_layer_sizes = eval(params.get('dense_arch_layer_sizes'))
174
+ # over_arch_layer_sizes = eval(params.get('over_arch_layer_sizes'))
175
+
176
+ # # Download model from MLflow
177
+ # temp_dir = tempfile.mkdtemp()
178
+
179
+ # # Try different artifact paths
180
+ # for artifact_path in ['model_state_dict_final', 'model_state_dict_2', 'model_state_dict_1', 'model_state_dict_0']:
181
+ # try:
182
+ # client.download_artifacts(run_id, f"{artifact_path}/state_dict.pth", temp_dir)
183
+ # state_dict = mlflow.pytorch.load_state_dict(f"{temp_dir}/{artifact_path}")
184
+ # break
185
+ # except:
186
+ # continue
187
+ # else:
188
+ # raise Exception("No model artifacts found")
189
+
190
+ # # Create model
191
+ # eb_configs = [
192
+ # EmbeddingBagConfig(
193
+ # name=f"t_{feature_name}",
194
+ # embedding_dim=embedding_dim,
195
+ # num_embeddings=emb_counts[feature_idx],
196
+ # feature_names=[feature_name],
197
+ # )
198
+ # for feature_idx, feature_name in enumerate(cat_cols)
199
+ # ]
200
 
201
+ # dlrm_model = DLRM(
202
+ # embedding_bag_collection=EmbeddingBagCollection(
203
+ # tables=eb_configs, device=self.device
204
+ # ),
205
+ # dense_in_features=len(dense_cols),
206
+ # dense_arch_layer_sizes=dense_arch_layer_sizes,
207
+ # over_arch_layer_sizes=over_arch_layer_sizes,
208
+ # dense_device=self.device,
209
+ # )
210
 
211
+ # # Remove prefix and load state dict
212
+ # if any(key.startswith('model.') for key in state_dict.keys()):
213
+ # state_dict = {k[6:]: v for k, v in state_dict.items()}
214
 
215
+ # dlrm_model.load_state_dict(state_dict)
216
+ # self.model = dlrm_model
217
+ # self.model.eval()
218
 
219
+ # print(f"โœ… Model loaded from MLflow run: {run_id}")
220
 
221
+ # except Exception as e:
222
+ # print(f"โŒ Error loading model from MLflow: {e}")
223
 
224
+ # def _prepare_user_features(self, user_id: int, user_data: Optional[Dict] = None) -> Tuple[torch.Tensor, KeyedJaggedTensor]:
225
+ # """Prepare user features for inference"""
226
+
227
+ # if user_data is None:
228
+ # # Create default user features
229
+ # user_data = {
230
+ # 'User-ID': user_id,
231
+ # 'Age': 30, # Default age
232
+ # 'Location': 'usa', # Default location
233
+ # }
234
+
235
+ # # Encode categorical features
236
+ # try:
237
+ # user_id_encoded = self.user_encoder.transform([str(user_id)])[0]
238
+ # except:
239
+ # # Handle unknown user
240
+ # user_id_encoded = 0
241
+
242
+ # try:
243
+ # location = str(user_data.get('Location', 'usa')).split(',')[-1].strip().lower()
244
+ # country_encoded = self.location_encoder.transform([location])[0]
245
+ # except:
246
+ # country_encoded = 0
247
+
248
+ # # Age group
249
+ # age = user_data.get('Age', 30)
250
+ # if age < 18:
251
+ # age_group = 0
252
+ # elif age < 25:
253
+ # age_group = 1
254
+ # elif age < 35:
255
+ # age_group = 2
256
+ # elif age < 50:
257
+ # age_group = 3
258
+ # elif age < 65:
259
+ # age_group = 4
260
+ # else:
261
+ # age_group = 5
262
+
263
+ # # Get user statistics (if available)
264
+ # user_activity = user_data.get('user_activity', 10) # Default
265
+ # user_avg_rating = user_data.get('user_avg_rating', 6.0) # Default
266
+ # age_normalized = user_data.get('Age', 30)
267
+
268
+ # # Normalize dense features
269
+ # dense_features = np.array([[age_normalized, 2000, user_activity, 10, user_avg_rating, 6.0]]) # Default values
270
+ # dense_features = self.scaler.transform(dense_features)
271
+ # dense_features = torch.tensor(dense_features, dtype=torch.float32)
272
+
273
+ # return dense_features, user_id_encoded, country_encoded, age_group
274
+
275
+ # def _prepare_book_features(self, book_isbn: str, book_data: Optional[Dict] = None) -> Tuple[int, int, int, int]:
276
+ # """Prepare book features for inference"""
277
+
278
+ # if book_data is None:
279
+ # book_data = {}
280
+
281
+ # # Encode book ID
282
+ # try:
283
+ # book_id_encoded = self.book_encoder.transform([str(book_isbn)])[0]
284
+ # except:
285
+ # book_id_encoded = 0
286
+
287
+ # # Encode publisher
288
+ # try:
289
+ # publisher = str(book_data.get('Publisher', 'Unknown'))
290
+ # publisher_encoded = self.publisher_encoder.transform([publisher])[0]
291
+ # except:
292
+ # publisher_encoded = 0
293
+
294
+ # # Publication decade
295
+ # year = book_data.get('Year-Of-Publication', 2000)
296
+ # decade = ((int(year) // 10) * 10)
297
+ # try:
298
+ # decade_encoded = preprocessing_info.get('decade_encoder', LabelEncoder()).transform([str(decade)])[0]
299
+ # except:
300
+ # decade_encoded = 6 # Default to 2000s
301
+
302
+ # # Rating level (default to medium)
303
+ # rating_level = 1
304
+
305
+ # return book_id_encoded, publisher_encoded, decade_encoded, rating_level
306
+
307
+ # def predict_rating(self, user_id: int, book_isbn: str,
308
+ # user_data: Optional[Dict] = None,
309
+ # book_data: Optional[Dict] = None) -> float:
310
+ # """
311
+ # Predict rating probability for user-book pair
312
+
313
+ # Args:
314
+ # user_id: User ID
315
+ # book_isbn: Book ISBN
316
+ # user_data: Additional user data (optional)
317
+ # book_data: Additional book data (optional)
318
+
319
+ # Returns:
320
+ # Prediction probability (0-1)
321
+ # """
322
+ # if self.cpu_only:
323
+ # print("โš ๏ธ Cannot make predictions in CPU-only mode")
324
+ # return 0.5 # Return default neutral prediction
325
+
326
+ # if self.model is None:
327
+ # print("โŒ Model not loaded")
328
+ # return 0.0
329
+
330
+ # if not self.torchrec_available:
331
+ # print("โŒ Cannot make predictions without torchrec")
332
+ # return 0.5 # Return default neutral prediction
333
+
334
+ # try:
335
+ # # Prepare features
336
+ # dense_features, user_id_encoded, country_encoded, age_group = self._prepare_user_features(user_id, user_data)
337
+ # book_id_encoded, publisher_encoded, decade_encoded, rating_level = self._prepare_book_features(book_isbn, book_data)
338
+
339
+ # # Create sparse features
340
+ # kjt_values = [user_id_encoded, book_id_encoded, publisher_encoded, country_encoded, age_group, decade_encoded, rating_level]
341
+ # kjt_lengths = [1] * len(kjt_values)
342
+
343
+ # sparse_features = KeyedJaggedTensor.from_lengths_sync(
344
+ # self.cat_cols,
345
+ # torch.tensor(kjt_values),
346
+ # torch.tensor(kjt_lengths, dtype=torch.int32),
347
+ # )
348
+
349
+ # # Make prediction
350
+ # with torch.no_grad():
351
+ # logits = self.model(dense_features=dense_features, sparse_features=sparse_features)
352
+ # prediction = torch.sigmoid(logits).item()
353
+
354
+ # return prediction
355
+
356
+ # except Exception as e:
357
+ # print(f"Error in prediction: {e}")
358
+ # return 0.0
359
+
360
+ # def get_user_recommendations(self, user_id: int,
361
+ # candidate_books: List[str],
362
+ # k: int = 10,
363
+ # user_data: Optional[Dict] = None) -> List[Tuple[str, float]]:
364
+ # """
365
+ # Get top-k book recommendations for a user
366
+
367
+ # Args:
368
+ # user_id: User ID
369
+ # candidate_books: List of candidate book ISBNs
370
+ # k: Number of recommendations
371
+ # user_data: Additional user data
372
+
373
+ # Returns:
374
+ # List of (book_isbn, prediction_score) tuples
375
+ # """
376
+ # if self.cpu_only or self.model is None or not self.torchrec_available:
377
+ # print("โŒ Model not loaded, CPU-only mode, or torchrec not available")
378
+ # return []
379
 
380
+ # recommendations = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
+ # print(f"Generating recommendations for user {user_id} from {len(candidate_books)} candidates...")
 
 
 
383
 
384
+ # for book_isbn in candidate_books:
385
+ # score = self.predict_rating(user_id, book_isbn, user_data)
386
+ # recommendations.append((book_isbn, score))
 
387
 
388
+ # # Sort by score and return top-k
389
+ # recommendations.sort(key=lambda x: x[1], reverse=True)
390
+ # return recommendations[:k]
391
 
392
+ # def batch_recommend(self, user_ids: List[int],
393
+ # candidate_books: List[str],
394
+ # k: int = 10) -> Dict[int, List[Tuple[str, float]]]:
395
+ # """
396
+ # Generate recommendations for multiple users
397
+
398
+ # Args:
399
+ # user_ids: List of user IDs
400
+ # candidate_books: List of candidate book ISBNs
401
+ # k: Number of recommendations per user
402
+
403
+ # Returns:
404
+ # Dictionary mapping user_id to recommendations
405
+ # """
406
+ # results = {}
407
 
408
+ # for user_id in user_ids:
409
+ # results[user_id] = self.get_user_recommendations(user_id, candidate_books, k)
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
+ # return results
412
+
413
+ # def get_similar_books(self, target_book_isbn: str,
414
+ # candidate_books: List[str],
415
+ # sample_users: List[int],
416
+ # k: int = 10) -> List[Tuple[str, float]]:
417
+ # """
418
+ # Find books similar to target book by comparing user preferences
419
+
420
+ # Args:
421
+ # target_book_isbn: Target book ISBN
422
+ # candidate_books: List of candidate book ISBNs
423
+ # sample_users: Sample users to test similarity with
424
+ # k: Number of similar books
425
+
426
+ # Returns:
427
+ # List of (book_isbn, similarity_score) tuples
428
+ # """
429
+ # target_scores = []
430
+ # candidate_scores = {book: [] for book in candidate_books}
431
+
432
+ # # Get predictions for target book and candidates across sample users
433
+ # for user_id in sample_users:
434
+ # target_score = self.predict_rating(user_id, target_book_isbn)
435
+ # target_scores.append(target_score)
436
+
437
+ # for book_isbn in candidate_books:
438
+ # if book_isbn != target_book_isbn:
439
+ # score = self.predict_rating(user_id, book_isbn)
440
+ # candidate_scores[book_isbn].append(score)
441
+
442
+ # # Calculate similarity based on correlation of user preferences
443
+ # similarities = []
444
+ # target_scores = np.array(target_scores)
445
+
446
+ # for book_isbn, scores in candidate_scores.items():
447
+ # if len(scores) > 0:
448
+ # scores_array = np.array(scores)
449
+ # # Calculate correlation as similarity measure
450
+ # correlation = np.corrcoef(target_scores, scores_array)[0, 1]
451
+ # if not np.isnan(correlation):
452
+ # similarities.append((book_isbn, correlation))
453
+
454
+ # # Sort by similarity and return top-k
455
+ # similarities.sort(key=lambda x: x[1], reverse=True)
456
+ # return similarities[:k]
457
+
458
+
459
+ # def load_dlrm_recommender(model_source: str = "latest") -> DLRMBookRecommender:
460
+ # """
461
+ # Load DLRM recommender from various sources
462
+
463
+ # Args:
464
+ # model_source: "latest" for latest MLflow run, "file" for local file, or specific run_id
465
+
466
+ # Returns:
467
+ # DLRMBookRecommender instance
468
+ # """
469
+ # # Check if we're in CPU-only mode
470
+ # cpu_only = os.environ.get('CPU_ONLY', 'false').lower() == 'true'
471
+ # if cpu_only:
472
+ # print("๐Ÿ”„ Loading recommender in CPU-only mode")
473
+ # # In CPU-only mode, just return a basic recommender instance
474
+ # return DLRMBookRecommender()
475
+
476
+ # # Create recommender instance
477
+ # recommender = DLRMBookRecommender()
478
+
479
+ # # If torchrec is not available, return limited recommender
480
+ # if not TORCHREC_AVAILABLE:
481
+ # print("โš ๏ธ torchrec not available, returning limited recommender")
482
+ # return recommender
483
+
484
+ # if model_source == "latest":
485
+ # # Try to get latest MLflow run
486
+ # try:
487
+ # experiment = mlflow.get_experiment_by_name('dlrm-book-recommendation-book_recommender')
488
+ # if experiment:
489
+ # runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
490
+ # order_by=["start_time desc"], max_results=1)
491
+ # if len(runs) > 0:
492
+ # latest_run_id = runs.iloc[0].run_id
493
+ # recommender = DLRMBookRecommender(run_id=latest_run_id)
494
+ # return recommender
495
+ # except Exception as e:
496
+ # print(f"โš ๏ธ Error loading from MLflow: {e}")
497
+
498
+ # elif model_source == "file":
499
+ # # Try to load from local file
500
+ # for filename in [
501
+ # '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_final.pth',
502
+ # '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_2.pth',
503
+ # '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_0.pth',
504
+ # '/home/mr-behdadi/PROJECT/ICE/notebooks/dlrm_book_model_epoch_1.pth']:
505
+ # if os.path.exists(filename):
506
+ # try:
507
+ # recommender = DLRMBookRecommender(model_path=filename)
508
+ # return recommender
509
+ # except Exception as e:
510
+ # print(f"โš ๏ธ Error loading from {filename}: {e}")
511
+
512
+ # else:
513
+ # # Treat as run_id
514
+ # try:
515
+ # recommender = DLRMBookRecommender(run_id=model_source)
516
+ # return recommender
517
+ # except Exception as e:
518
+ # print(f"โš ๏ธ Error loading from run_id {model_source}: {e}")
519
+
520
+ # print("โš ๏ธ Could not load any trained model")
521
+ # return recommender
522
+
523
+
524
+ # def demo_dlrm_recommendations():
525
+ # """Demo function to show DLRM recommendations"""
526
+
527
+ # print("๐Ÿš€ DLRM Book Recommendation Demo")
528
+ # print("=" * 50)
529
+
530
+ # # Load book data for demo
531
+ # books_df = pd.read_csv('Books.csv', encoding='latin-1', low_memory=False)
532
+ # users_df = pd.read_csv('Users.csv', encoding='latin-1', low_memory=False)
533
+ # ratings_df = pd.read_csv('Ratings.csv', encoding='latin-1', low_memory=False)
534
+
535
+ # books_df.columns = books_df.columns.str.replace('"', '')
536
+ # users_df.columns = users_df.columns.str.replace('"', '')
537
+ # ratings_df.columns = ratings_df.columns.str.replace('"', '')
538
+
539
+ # # Load recommender
540
+ # recommender = load_dlrm_recommender("file")
541
+
542
+ # if recommender.model is None:
543
+ # print("โŒ No trained model found. Please run training first.")
544
+ # return
545
+
546
+ # # Get sample user and books
547
+ # sample_user_id = ratings_df['User-ID'].iloc[0]
548
+ # sample_books = books_df['ISBN'].head(20).tolist()
549
+
550
+ # print(f"\n๐Ÿ“š Getting recommendations for User {sample_user_id}")
551
+ # print(f"Testing with {len(sample_books)} candidate books...")
552
+
553
+ # # Get recommendations
554
+ # recommendations = recommender.get_user_recommendations(
555
+ # user_id=sample_user_id,
556
+ # candidate_books=sample_books,
557
+ # k=10
558
+ # )
559
+
560
+ # print(f"\n๐ŸŽฏ Top 10 DLRM Recommendations:")
561
+ # print("-" * 50)
562
+
563
+ # for i, (book_isbn, score) in enumerate(recommendations, 1):
564
+ # # Get book info
565
+ # book_info = books_df[books_df['ISBN'] == book_isbn]
566
+ # if len(book_info) > 0:
567
+ # book = book_info.iloc[0]
568
+ # title = book['Book-Title']
569
+ # author = book['Book-Author']
570
+ # print(f"{i:2d}. {title} by {author}")
571
+ # print(f" ISBN: {book_isbn}, Score: {score:.4f}")
572
+ # else:
573
+ # print(f"{i:2d}. ISBN: {book_isbn}, Score: {score:.4f}")
574
+ # print()
575
+
576
+ # # Show user's actual ratings for comparison
577
+ # user_ratings = ratings_df[ratings_df['User-ID'] == sample_user_id]
578
+ # if len(user_ratings) > 0:
579
+ # print(f"\n๐Ÿ“– User {sample_user_id}'s Actual Reading History:")
580
+ # print("-" * 50)
581
+
582
+ # for _, rating in user_ratings.head(5).iterrows():
583
+ # book_info = books_df[books_df['ISBN'] == rating['ISBN']]
584
+ # if len(book_info) > 0:
585
+ # book = book_info.iloc[0]
586
+ # print(f"โ€ข {book['Book-Title']} by {book['Book-Author']} - Rating: {rating['Book-Rating']}/10")
587
+
588
+ # # Test book similarity
589
+ # if len(recommendations) > 0:
590
+ # target_book = recommendations[0][0]
591
+ # print(f"\n๐Ÿ” Finding books similar to: {target_book}")
592
+
593
+ # similar_books = recommender.get_similar_books(
594
+ # target_book_isbn=target_book,
595
+ # candidate_books=sample_books,
596
+ # sample_users=ratings_df['User-ID'].head(10).tolist(),
597
+ # k=5
598
+ # )
599
+
600
+ # print(f"\n๐Ÿ“š Similar Books:")
601
+ # print("-" * 30)
602
+ # for i, (book_isbn, similarity) in enumerate(similar_books, 1):
603
+ # book_info = books_df[books_df['ISBN'] == book_isbn]
604
+ # if len(book_info) > 0:
605
+ # book = book_info.iloc[0]
606
+ # print(f"{i}. {book['Book-Title']} (similarity: {similarity:.3f})")
607
+
608
+ # if __name__ == "__main__":
609
+ # demo_dlrm_recommendations()
610
+
611
+
612
+ """
613
+ DLRM Inference Engine for Book Recommendations - Hugging Face Space Compatible
614
+ Lightweight version without PyTorch dependencies
615
+ """
616
+
617
+ import os
618
+ import sys
619
+ import numpy as np
620
+ import pandas as pd
621
+ import pickle
622
+ from typing import List, Dict, Tuple, Optional, Any
623
+ import warnings
624
+ warnings.filterwarnings('ignore')
625
+
626
# Pin this process to CPU-only operation: set the module flag, mirror it in
# the environment, and blank CUDA_VISIBLE_DEVICES.
CPU_ONLY = True
os.environ.update({'CPU_ONLY': 'true', 'CUDA_VISIBLE_DEVICES': ''})

print("๐Ÿ”„ Running in HF Spaces CPU-only mode (no PyTorch dependencies)")
632
+
633
+ class DLRMBookRecommender:
634
+ """DLRM-based book recommender for inference - HF Spaces compatible"""
635
+
636
+ def __init__(self, model_path: str = None, run_id: str = None):
637
+ """
638
+ Initialize DLRM book recommender
639
 
640
+ Args:
641
+ model_path: Path to saved model state dict (not used in HF Spaces)
642
+ run_id: MLflow run ID (not used in HF Spaces)
643
+ """
644
+ self.device = "cpu"
645
+ self.model = None
646
+ self.preprocessing_info = None
647
+ self.cpu_only = True
648
+ self.dense_cols = [
649
+ 'Age_normalized', 'Year-Of-Publication', 'user_activity',
650
+ 'book_popularity', 'user_avg_rating', 'book_avg_rating'
651
+ ]
652
+ self.cat_cols = [
653
+ 'User-ID', 'ISBN', 'Publisher', 'Country',
654
+ 'Age_Group', 'Publication_Decade', 'Rating_Level'
655
+ ]
656
+ self.emb_counts = [1000, 5000, 500, 50, 6, 8, 3] # Example counts
657
+
658
+ print("โœ… DLRM recommender initialized in HF Spaces mode")
659
+
660
+ # Load minimal preprocessing info for demo
661
+ self._create_demo_preprocessing_info()
662
+
663
+ def _create_demo_preprocessing_info(self):
664
+ """Create demo preprocessing info for HF Spaces"""
665
+ self.preprocessing_info = {
666
+ 'dense_cols': self.dense_cols,
667
+ 'cat_cols': self.cat_cols,
668
+ 'emb_counts': self.emb_counts,
669
+ 'total_samples': 100000,
670
+ 'positive_rate': 0.6,
671
+ 'train_samples': 70000,
672
+ 'val_samples': 15000,
673
+ 'test_samples': 15000
674
+ }
675
+
676
+ # Create mock encoders (for demo purposes)
677
+ self.user_encoder = MockEncoder()
678
+ self.book_encoder = MockEncoder()
679
+ self.publisher_encoder = MockEncoder()
680
+ self.location_encoder = MockEncoder()
681
+ self.scaler = MockScaler()
682
+
683
+ print("โœ… Demo preprocessing info created")
684
 
685
  def predict_rating(self, user_id: int, book_isbn: str,
686
  user_data: Optional[Dict] = None,
687
  book_data: Optional[Dict] = None) -> float:
688
  """
689
+ Predict rating probability for user-book pair using simulation
690
 
691
  Args:
692
  user_id: User ID
 
697
  Returns:
698
  Prediction probability (0-1)
699
  """
 
 
 
 
 
 
 
 
 
 
 
 
700
  try:
701
+ # Simulate DLRM prediction using heuristic approach
702
+ prediction = self._simulate_dlrm_prediction(user_id, book_isbn, user_data, book_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  return prediction
704
 
705
  except Exception as e:
706
  print(f"Error in prediction: {e}")
707
+ return 0.5 # Return neutral prediction on error
708
+
709
+ def _simulate_dlrm_prediction(self, user_id: int, book_isbn: str,
710
+ user_data: Optional[Dict] = None,
711
+ book_data: Optional[Dict] = None) -> float:
712
+ """Simulate DLRM prediction using heuristics"""
713
+
714
+ # Use deterministic random based on inputs
715
+ np.random.seed(hash(f"{user_id}_{book_isbn}") % 2**32)
716
+
717
+ # Base prediction
718
+ base_score = 0.5
719
+
720
+ # User factors
721
+ user_age = user_data.get('Age', 30) if user_data else 30
722
+ age_factor = 0.0
723
+ if 18 <= user_age <= 25:
724
+ age_factor = 0.1 # Young adults like popular books
725
+ elif 26 <= user_age <= 40:
726
+ age_factor = 0.05 # Adults have varied tastes
727
+ else:
728
+ age_factor = -0.05 # Older users are more selective
729
+
730
+ # Book factors
731
+ book_year = book_data.get('Year-Of-Publication', 2000) if book_data else 2000
732
+ book_year = int(book_year) if str(book_year).isdigit() else 2000
733
+
734
+ year_factor = 0.0
735
+ if book_year >= 2010:
736
+ year_factor = 0.1 # Recent books get slight boost
737
+ elif book_year >= 2000:
738
+ year_factor = 0.05
739
+ elif book_year >= 1990:
740
+ year_factor = 0.0
741
+ else:
742
+ year_factor = -0.1 # Very old books less likely
743
+
744
+ # Publisher factor (simplified)
745
+ publisher = book_data.get('Publisher', '') if book_data else ''
746
+ publisher_factor = 0.0
747
+ popular_publishers = ['penguin', 'random', 'harper', 'simon', 'macmillan']
748
+ if any(pub in publisher.lower() for pub in popular_publishers):
749
+ publisher_factor = 0.08
750
+
751
+ # User-book interaction simulation
752
+ interaction_factor = np.random.uniform(-0.15, 0.15)
753
+
754
+ # Genre preference simulation (based on book title/publisher)
755
+ genre_factor = np.random.uniform(-0.1, 0.1)
756
+
757
+ # Combine all factors
758
+ final_score = (base_score +
759
+ age_factor +
760
+ year_factor +
761
+ publisher_factor +
762
+ interaction_factor +
763
+ genre_factor)
764
+
765
+ # Add some controlled randomness
766
+ noise = np.random.uniform(-0.05, 0.05)
767
+ final_score += noise
768
+
769
+ # Clamp to valid range
770
+ final_score = max(0.0, min(1.0, final_score))
771
+
772
+ return final_score
773
 
774
  def get_user_recommendations(self, user_id: int,
775
  candidate_books: List[str],
 
787
  Returns:
788
  List of (book_isbn, prediction_score) tuples
789
  """
790
+ print(f"Generating simulated recommendations for user {user_id} from {len(candidate_books)} candidates...")
 
 
791
 
792
  recommendations = []
793
 
 
 
794
  for book_isbn in candidate_books:
795
  score = self.predict_rating(user_id, book_isbn, user_data)
796
  recommendations.append((book_isbn, score))
 
857
  if len(scores) > 0:
858
  scores_array = np.array(scores)
859
  # Calculate correlation as similarity measure
860
+ if len(scores_array) > 1:
861
+ correlation = np.corrcoef(target_scores, scores_array)[0, 1]
862
+ if not np.isnan(correlation):
863
+ similarities.append((book_isbn, correlation))
864
+ else:
865
+ # Fallback similarity measure
866
+ similarity = 1.0 - abs(target_scores[0] - scores_array[0])
867
+ similarities.append((book_isbn, similarity))
868
 
869
  # Sort by similarity and return top-k
870
  similarities.sort(key=lambda x: x[1], reverse=True)
871
  return similarities[:k]
872
 
873
 
874
class MockEncoder:
    """Demo stand-in for a label encoder: maps values to stable bucket ids.

    Codes come from a CRC32 of the value's string form, so they are the same
    in every process. The builtin ``hash()`` of a str is salted per
    interpreter run and would give different codes on each start.
    """

    def __init__(self, num_buckets: int = 1000):
        """Create the encoder.

        Args:
            num_buckets: Number of distinct codes; results lie in
                ``[0, num_buckets)``. Defaults to 1000.

        Raises:
            ValueError: If ``num_buckets`` is not positive.
        """
        if num_buckets <= 0:
            raise ValueError("num_buckets must be a positive integer")
        self.num_buckets = num_buckets
        # Always empty: nothing is ever fitted on this mock.
        self.classes_ = []

    def transform(self, values):
        """Return one bucket id per element of ``values``, as a list."""
        import zlib  # local import: the file-level imports do not include it

        return [
            zlib.crc32(str(val).encode("utf-8")) % self.num_buckets
            for val in values
        ]
883
+
884
+
885
class MockScaler:
    """Demo stand-in for a fitted scaler."""

    def transform(self, X):
        """Min-max scale ``X`` using the minimum and maximum of the whole input.

        Scaling is global (one min and one max for the entire array), not
        per column. A small epsilon in the denominator keeps constant input
        from dividing by zero; such input maps to all zeros.

        Args:
            X: Array-like of numbers.

        Returns:
            Float ndarray with the same shape as ``X``. Empty input is
            returned as an empty float array instead of raising.
        """
        data = np.asarray(X, dtype=float)
        if data.size == 0:
            # min()/max() raise ValueError on zero-size arrays.
            return data
        lowest = data.min()
        span = data.max() - lowest
        return (data - lowest) / (span + 1e-8)
893
+
894
+
895
def load_dlrm_recommender(model_source: str = "demo") -> DLRMBookRecommender:
    """
    Build the demo recommender used in HF Spaces.

    Args:
        model_source: Not read by this function; a default-constructed
            recommender is returned whatever value is passed.

    Returns:
        A newly constructed DLRMBookRecommender.
    """
    print("๐Ÿ”„ Loading DLRM recommender in HF Spaces demo mode")

    # The constructor takes no model path or run id in this variant.
    demo_instance = DLRMBookRecommender()

    print("โœ… DLRM recommender loaded successfully")
    return demo_instance
912
 
913
 
914
  def demo_dlrm_recommendations():
915
  """Demo function to show DLRM recommendations"""
916
 
917
+ print("๐Ÿš€ DLRM Book Recommendation Demo - HF Spaces Version")
918
  print("=" * 50)
919
 
920
+ # Create sample data for demo
921
+ sample_books_data = {
922
+ 'ISBN': ['0439023483', '0439358078', '0316666343', '0452264464', '0061120081'],
923
+ 'Book-Title': [
924
+ 'The Hunger Games',
925
+ 'Harry Potter and the Chamber of Secrets',
926
+ 'The Catcher in the Rye',
927
+ '1984',
928
+ 'To Kill a Mockingbird'
929
+ ],
930
+ 'Book-Author': [
931
+ 'Suzanne Collins',
932
+ 'J.K. Rowling',
933
+ 'J.D. Salinger',
934
+ 'George Orwell',
935
+ 'Harper Lee'
936
+ ],
937
+ 'Year-Of-Publication': [2008, 1999, 1951, 1949, 1960],
938
+ 'Publisher': ['Scholastic', 'Scholastic', 'Little, Brown', 'Signet', 'Harper']
939
+ }
940
 
941
+ books_df = pd.DataFrame(sample_books_data)
 
 
942
 
943
  # Load recommender
944
+ recommender = load_dlrm_recommender()
945
 
946
+ # Demo user
947
+ sample_user_id = 1
948
+ sample_books = books_df['ISBN'].tolist()
949
 
950
+ print(f"\n๐Ÿ“š Getting simulated recommendations for User {sample_user_id}")
 
 
 
 
951
  print(f"Testing with {len(sample_books)} candidate books...")
952
 
953
  # Get recommendations
954
  recommendations = recommender.get_user_recommendations(
955
  user_id=sample_user_id,
956
  candidate_books=sample_books,
957
+ k=3,
958
+ user_data={'Age': 25, 'Location': 'New York, USA'}
959
  )
960
 
961
+ print(f"\n๐ŸŽฏ Top 3 Simulated DLRM Recommendations:")
962
  print("-" * 50)
963
 
964
  for i, (book_isbn, score) in enumerate(recommendations, 1):
 
969
  title = book['Book-Title']
970
  author = book['Book-Author']
971
  print(f"{i:2d}. {title} by {author}")
972
+ print(f" ISBN: {book_isbn}, Simulated Score: {score:.4f}")
973
  else:
974
+ print(f"{i:2d}. ISBN: {book_isbn}, Simulated Score: {score:.4f}")
975
  print()
976
 
977
+ print("โœ… Demo completed successfully!")
978
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
979
 
980
  if __name__ == "__main__":
981
  demo_dlrm_recommendations()