""" Content-Based Recommendation System for CMU Landmarks This model provides personalized landmark recommendations based on user preferences using content-based filtering with cosine similarity. """ import numpy as np from typing import List, Dict, Tuple, Optional from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import StandardScaler, LabelEncoder import json import pickle class ContentBasedRecommender: """ Content-Based Recommendation System (Trained-from-scratch) Uses landmark features to recommend similar landmarks based on user preferences. This is a trained-from-scratch model that learns from the landmark dataset. """ def __init__(self, landmarks_data: List[Dict] = None): self.landmarks = landmarks_data or [] self.feature_matrix = None self.scaler = StandardScaler() self.class_encoder = LabelEncoder() self.landmark_ids = [] if landmarks_data: self._build_feature_matrix() def _build_feature_matrix(self): """Build feature matrix from landmark data""" features = [] all_classes = [] # Collect all unique classes for encoding for lm in self.landmarks: all_classes.extend(lm.get('Class', [])) unique_classes = sorted(list(set(all_classes))) if unique_classes: self.class_encoder.fit(unique_classes) # Create feature vectors for each landmark for lm in self.landmarks: feature_vector = self._extract_features(lm, unique_classes) features.append(feature_vector) self.landmark_ids.append(lm['id']) # Convert to numpy array and scale if features: self.feature_matrix = np.array(features) self.feature_matrix = self.scaler.fit_transform(self.feature_matrix) def _extract_features(self, landmark: Dict, all_classes: List[str]) -> np.ndarray: """Extract numerical features from a landmark""" features = [] # Rating (normalized to 0-1) rating = landmark.get('rating', 0.0) features.append(rating / 5.0) # Indoor/outdoor (binary encoding) io_type = landmark.get('indoor/outdoor', 'outdoor') features.append(1.0 if io_type == 'indoor' else 0.0) # Dwell time (normalized) dwell_min = landmark.get('time taken to explore', 30) features.append(dwell_min / 480.0) # Class encoding (multi-hot encoding) class_vector = np.zeros(len(all_classes)) landmark_classes = landmark.get('Class', []) for cls in landmark_classes: if cls in all_classes: idx = all_classes.index(cls) class_vector[idx] = 1.0 features.extend(class_vector) # Geographic features (normalized lat/lon around CMU) cmu_lat, cmu_lon = 40.4433, -79.9436 geocoord = landmark.get('geocoord', {'lat': cmu_lat, 'lon': cmu_lon}) features.append(abs(geocoord['lat'] - cmu_lat) / 0.1) features.append(abs(geocoord['lon'] - cmu_lon) / 0.1) return np.array(features) def get_user_preference_vector(self, selected_classes: List[str], indoor_pref: Optional[str] = None, min_rating: float = 0.0) -> np.ndarray: """Create user preference vector from selections""" if not self.feature_matrix.size: return np.array([]) all_classes = self.class_encoder.classes_ # Start with average landmark profile user_vector = np.mean(self.feature_matrix, axis=0) # Boost selected classes if selected_classes: class_mask = np.zeros(len(all_classes)) for cls in selected_classes: if cls in all_classes: idx = list(all_classes).index(cls) class_mask[idx] = 1.0 # Add class preferences to user vector class_start_idx = 3 # After rating, indoor/outdoor, dwell_time class_end_idx = class_start_idx + len(all_classes) user_vector[class_start_idx:class_end_idx] += class_mask * 0.5 # Indoor/outdoor preference if indoor_pref == 
    def recommend(self, selected_classes: List[str],
                  indoor_pref: Optional[str] = None,
                  min_rating: float = 0.0,
                  diversity_weight: float = 0.6,
                  exclude_ids: Optional[List[str]] = None,
                  top_k: int = 10) -> List[Tuple[str, float]]:
        """
        Get recommendations based on user preferences.

        Returns a list of (landmark_id, score) tuples sorted by score,
        highest first.
        """
        if self.feature_matrix is None or not self.feature_matrix.size:
            return []
        if exclude_ids is None:
            exclude_ids = []

        # Build the user preference vector
        user_vector = self.get_user_preference_vector(selected_classes,
                                                      indoor_pref, min_rating)

        # Cosine similarity between the user vector and every landmark
        similarities = cosine_similarity(user_vector.reshape(1, -1),
                                         self.feature_matrix)[0]

        # Filter by minimum rating and excluded IDs, then apply the
        # diversity bonus
        filtered_results = []
        for i, lm in enumerate(self.landmarks):
            if (lm.get('rating', 0) >= min_rating
                    and lm['id'] not in exclude_ids
                    and i < len(similarities)):
                base_score = similarities[i]
                class_diversity = self._calculate_diversity_bonus(lm, selected_classes)
                final_score = base_score + diversity_weight * class_diversity
                filtered_results.append((lm['id'], final_score))

        # Sort by score (descending) and return the top_k results
        sorted_results = sorted(filtered_results, key=lambda x: x[1], reverse=True)
        return sorted_results[:top_k]

    def _calculate_diversity_bonus(self, landmark: Dict,
                                   selected_classes: List[str]) -> float:
        """Small bonus (0.1 per class) for classes the user has not selected."""
        landmark_classes = set(landmark.get('Class', []))
        new_classes = landmark_classes - set(selected_classes)
        return len(new_classes) * 0.1

    def save_model(self, filepath: str):
        """Save the fitted model to a JSON file."""
        model_data = {
            'feature_matrix': (self.feature_matrix.tolist()
                               if self.feature_matrix is not None else None),
            'landmark_ids': self.landmark_ids,
            'scaler_mean': (self.scaler.mean_.tolist()
                            if hasattr(self.scaler, 'mean_') else None),
            'scaler_scale': (self.scaler.scale_.tolist()
                             if hasattr(self.scaler, 'scale_') else None),
            'class_encoder_classes': (self.class_encoder.classes_.tolist()
                                      if hasattr(self.class_encoder, 'classes_') else None),
        }
        with open(filepath, 'w') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """Load a model previously saved with save_model()."""
        with open(filepath, 'r') as f:
            model_data = json.load(f)

        self.feature_matrix = (np.array(model_data['feature_matrix'])
                               if model_data['feature_matrix'] is not None else None)
        self.landmark_ids = model_data['landmark_ids']
        if model_data['scaler_mean'] is not None:
            self.scaler.mean_ = np.array(model_data['scaler_mean'])
            self.scaler.scale_ = np.array(model_data['scaler_scale'])
        if model_data['class_encoder_classes'] is not None:
            self.class_encoder.classes_ = np.array(model_data['class_encoder_classes'])


def load_model_from_data(data_path: str) -> ContentBasedRecommender:
    """Build a recommender from a landmarks JSON file."""
    with open(data_path, 'r') as f:
        landmarks = json.load(f)
    return ContentBasedRecommender(landmarks)


# Example usage
if __name__ == "__main__":
    # Load landmarks data
    with open('data/landmarks.json', 'r') as f:
        landmarks = json.load(f)

    # Initialize the recommender
    recommender = ContentBasedRecommender(landmarks)

    # Get recommendations
    recommendations = recommender.recommend(
        selected_classes=['Culture', 'Research'],
        indoor_pref='indoor',
        min_rating=4.0,
        top_k=5,
    )

    print("Top 5 recommendations:")
    for lm_id, score in recommendations:
        print(f"{lm_id}: {score:.3f}")
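
    # Persistence roundtrip: a minimal sketch. The 'recommender_model.json'
    # filename is an assumption, not part of the original project layout.
    recommender.save_model('recommender_model.json')

    restored = ContentBasedRecommender()
    restored.load_model('recommender_model.json')
    # Only the numeric state is serialized, so the raw landmark dicts must
    # be reattached before recommend() can filter and score candidates.
    restored.landmarks = landmarks
    restored_recs = restored.recommend(selected_classes=['Culture', 'Research'],
                                       top_k=5)
    print(f"Restored model returned {len(restored_recs)} recommendations")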