# Uploaded by ysakhale - "Upload 2 files" (commit 0b94a95, verified)
"""
Content-Based Recommendation System for CMU Landmarks
This model provides personalized landmark recommendations based on user preferences
using content-based filtering with cosine similarity.
"""
import numpy as np
from typing import List, Dict, Tuple, Optional
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
import pickle
class ContentBasedRecommender:
    """
    Content-Based Recommendation System (trained from scratch).

    Every landmark is converted into a numeric feature vector (rating,
    indoor/outdoor flag, dwell time, multi-hot class membership and absolute
    latitude/longitude offset from a fixed CMU reference point). The vectors
    are standardized with a ``StandardScaler`` and candidates are ranked by
    cosine similarity between a user preference vector and each landmark row.
    """

    def __init__(self, landmarks_data: Optional[List[Dict]] = None):
        """
        Create a recommender and, when data is supplied, build its features.

        Args:
            landmarks_data: Landmark records (dicts). Each record must have an
                ``'id'`` key; ``'rating'``, ``'indoor/outdoor'``,
                ``'time taken to explore'``, ``'Class'`` and ``'geocoord'``
                are optional and fall back to defaults.
        """
        # Raw landmark records, row-aligned with ``feature_matrix``.
        self.landmarks = landmarks_data or []
        # Standardized (n_landmarks, n_features) matrix; None until built/loaded.
        self.feature_matrix = None
        self.scaler = StandardScaler()
        # Fitted on the sorted unique class names; defines the class columns.
        self.class_encoder = LabelEncoder()
        # Landmark ids in feature-matrix row order.
        self.landmark_ids = []
        if landmarks_data:
            self._build_feature_matrix()

    def _build_feature_matrix(self):
        """Build and standardize the feature matrix from ``self.landmarks``."""
        # Start from a clean id list so a repeated call cannot duplicate ids.
        self.landmark_ids = []

        # Sorted unique class names fix the order of the multi-hot columns.
        unique_classes = sorted(
            {cls for lm in self.landmarks for cls in (lm.get('Class') or [])}
        )
        if unique_classes:
            self.class_encoder.fit(unique_classes)

        features = []
        for lm in self.landmarks:
            features.append(self._extract_features(lm, unique_classes))
            self.landmark_ids.append(lm['id'])

        if features:
            self.feature_matrix = self.scaler.fit_transform(np.array(features))

    def _extract_features(self, landmark: Dict, all_classes: List[str]) -> np.ndarray:
        """
        Extract the numeric feature vector of a single landmark.

        Layout: ``[rating, indoor, dwell, <one column per class>, lat, lon]``.
        Missing or ``None`` fields fall back to the defaults used below.

        Args:
            landmark: Landmark record.
            all_classes: Ordered class names defining the multi-hot columns.

        Returns:
            1-D float array of length ``3 + len(all_classes) + 2``.
        """
        # Rating divided by 5 (assumes a 0-5 rating scale - confirm with data).
        rating = landmark.get('rating')
        rating_feature = (0.0 if rating is None else rating) / 5.0

        # Anything other than the exact string 'indoor' counts as outdoor.
        indoor_feature = 1.0 if landmark.get('indoor/outdoor', 'outdoor') == 'indoor' else 0.0

        # Dwell time in minutes divided by 480 (presumably an 8-hour cap).
        dwell_min = landmark.get('time taken to explore')
        dwell_feature = (30 if dwell_min is None else dwell_min) / 480.0

        # Multi-hot class membership; the first position wins on duplicates,
        # matching ``list.index`` semantics, but lookup is O(1) per class.
        class_index = {}
        for position, name in enumerate(all_classes):
            class_index.setdefault(name, position)
        class_vector = np.zeros(len(all_classes))
        for cls in landmark.get('Class') or []:
            position = class_index.get(cls)
            if position is not None:
                class_vector[position] = 1.0

        # Absolute offset from the CMU reference point, in units of 0.1 degree.
        cmu_lat, cmu_lon = 40.4433, -79.9436
        geocoord = landmark.get('geocoord') or {}
        lat = geocoord.get('lat')
        lon = geocoord.get('lon')
        lat = cmu_lat if lat is None else lat
        lon = cmu_lon if lon is None else lon
        geo_features = [abs(lat - cmu_lat) / 0.1, abs(lon - cmu_lon) / 0.1]

        return np.concatenate(
            ([rating_feature, indoor_feature, dwell_feature], class_vector, geo_features)
        )

    def get_user_preference_vector(self, selected_classes: List[str],
                                   indoor_pref: Optional[str] = None,
                                   min_rating: float = 0.0) -> np.ndarray:
        """
        Create a user preference vector in feature-matrix space.

        Args:
            selected_classes: Class names the user is interested in; unknown
                names are ignored.
            indoor_pref: ``'indoor'``, ``'outdoor'`` or ``None`` (no preference).
            min_rating: Accepted for signature compatibility; not used here
                (rating filtering happens in ``recommend``).

        Returns:
            1-D array with one entry per feature column, or an empty array
            when no feature matrix is available.
        """
        if self.feature_matrix is None or not self.feature_matrix.size:
            return np.array([])

        # An encoder that was never fitted has no ``classes_`` attribute.
        all_classes = list(getattr(self.class_encoder, 'classes_', []))

        # Start from the average landmark profile. For a matrix produced by
        # ``_build_feature_matrix`` the columns are standardized, so this
        # baseline is approximately zero and the boosts below dominate.
        user_vector = np.mean(self.feature_matrix, axis=0)

        if selected_classes and all_classes:
            class_index = {name: position for position, name in enumerate(all_classes)}
            class_mask = np.zeros(len(all_classes))
            for cls in selected_classes:
                position = class_index.get(cls)
                if position is not None:
                    class_mask[position] = 1.0

            # Class columns follow rating, indoor/outdoor and dwell time.
            class_start_idx = 3
            class_end_idx = class_start_idx + len(all_classes)
            user_vector[class_start_idx:class_end_idx] += class_mask * 0.5

        # Column 1 is the indoor flag: nudge it toward the stated preference.
        if indoor_pref == 'indoor':
            user_vector[1] += 0.3
        elif indoor_pref == 'outdoor':
            user_vector[1] -= 0.3

        return user_vector

    def recommend(self, selected_classes: List[str],
                  indoor_pref: Optional[str] = None,
                  min_rating: float = 0.0,
                  diversity_weight: float = 0.6,
                  exclude_ids: Optional[List[str]] = None,
                  top_k: int = 10) -> List[Tuple[str, float]]:
        """
        Get recommendations based on user preferences.

        Args:
            selected_classes: Class names the user is interested in.
            indoor_pref: ``'indoor'``, ``'outdoor'`` or ``None``.
            min_rating: Landmarks rated below this value are dropped; a
                missing rating counts as 0.
            diversity_weight: Multiplier applied to the diversity bonus.
            exclude_ids: Landmark ids that must not be returned.
            top_k: Maximum number of results.

        Returns:
            List of ``(landmark_id, score)`` tuples sorted by descending
            score; empty when no feature matrix is available.
        """
        if self.feature_matrix is None or not self.feature_matrix.size:
            return []

        selected_classes = selected_classes or []
        excluded = set(exclude_ids or ())

        user_vector = self.get_user_preference_vector(selected_classes, indoor_pref, min_rating)
        similarities = cosine_similarity([user_vector], self.feature_matrix)[0]

        scored = []
        # zip stops at the shorter sequence, so a landmark without a matching
        # feature row is skipped instead of raising an IndexError.
        for lm, base_score in zip(self.landmarks, similarities):
            rating = lm.get('rating')
            if not (0 if rating is None else rating) >= min_rating:
                continue
            if lm['id'] in excluded:
                continue
            class_diversity = self._calculate_diversity_bonus(lm, selected_classes)
            scored.append((lm['id'], base_score + diversity_weight * class_diversity))

        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:top_k]

    def _calculate_diversity_bonus(self, landmark: Dict, selected_classes: List[str]) -> float:
        """
        Calculate the diversity bonus for a landmark.

        Returns 0.1 for every class of the landmark that is not among the
        user's selected classes.
        """
        landmark_classes = set(landmark.get('Class') or [])
        new_classes = landmark_classes - set(selected_classes or [])
        return len(new_classes) * 0.1  # Small bonus for diversity

    def save_model(self, filepath: str):
        """
        Save the trained model as JSON.

        The landmark records are stored alongside the feature matrix so that
        a model loaded into a fresh instance can still filter and score
        landmarks; they must therefore be JSON-serializable.

        Args:
            filepath: Destination path of the JSON file.
        """
        def _attr_as_list(obj, attr):
            # Fitted estimators expose arrays; unfitted ones lack the attribute.
            if not hasattr(obj, attr):
                return None
            return np.asarray(getattr(obj, attr)).tolist()

        model_data = {
            'feature_matrix': self.feature_matrix.tolist() if self.feature_matrix is not None else None,
            'landmark_ids': self.landmark_ids,
            'landmarks': self.landmarks,
            'scaler_mean': _attr_as_list(self.scaler, 'mean_'),
            'scaler_scale': _attr_as_list(self.scaler, 'scale_'),
            'class_encoder_classes': _attr_as_list(self.class_encoder, 'classes_'),
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """
        Load a trained model previously written by ``save_model``.

        Files written without the ``'landmarks'`` key are still accepted; in
        that case the instance's current ``landmarks`` are left untouched.

        Args:
            filepath: Path of the JSON file to read.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)

        matrix = model_data.get('feature_matrix')
        self.feature_matrix = np.array(matrix) if matrix else None
        self.landmark_ids = model_data.get('landmark_ids') or []

        # Restore the records that the feature rows were computed from so
        # that ``recommend`` stays row-aligned with the loaded matrix.
        saved_landmarks = model_data.get('landmarks')
        if saved_landmarks:
            self.landmarks = saved_landmarks

        scaler_mean = model_data.get('scaler_mean')
        scaler_scale = model_data.get('scaler_scale')
        if scaler_mean:
            self.scaler.mean_ = np.array(scaler_mean)
            if scaler_scale is not None:
                self.scaler.scale_ = np.array(scaler_scale)

        encoder_classes = model_data.get('class_encoder_classes')
        if encoder_classes:
            self.class_encoder.classes_ = np.array(encoder_classes)
def load_model_from_data(data_path: str) -> ContentBasedRecommender:
    """
    Build a recommender from a landmarks JSON file.

    Args:
        data_path: Path to a JSON file whose content is passed to
            ``ContentBasedRecommender`` as the landmark data.

    Returns:
        A ``ContentBasedRecommender`` constructed from the loaded data.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        landmarks = json.load(f)
    return ContentBasedRecommender(landmarks)
# Example usage
if __name__ == "__main__":
    # Build the recommender through the module's shared loader instead of
    # repeating the open/parse/construct steps here.
    recommender = load_model_from_data('data/landmarks.json')

    # Ask for the five best indoor landmarks rated at least 4.0 that match
    # the selected classes.
    recommendations = recommender.recommend(
        selected_classes=['Culture', 'Research'],
        indoor_pref='indoor',
        min_rating=4.0,
        top_k=5,
    )

    print("Top 5 recommendations:")
    for lm_id, score in recommendations:
        print(f"{lm_id}: {score:.3f}")