|
|
"""
Content-Based Recommendation System for CMU Landmarks

This model provides personalized landmark recommendations based on user preferences
using content-based filtering with cosine similarity.
"""
|
|
|
|
|
|
import numpy as np
|
|
|
from typing import List, Dict, Tuple, Optional
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
import json
|
|
|
import pickle
|
|
|
|
|
|
|
|
|
class ContentBasedRecommender:
    """
    Content-Based Recommendation System (Trained-from-scratch)

    Uses landmark features to recommend landmarks based on user preferences.
    "Training" consists of fitting a LabelEncoder on the landmark classes and a
    StandardScaler on the extracted feature matrix.

    Each landmark is encoded (before scaling) as the vector::

        [rating / 5, indoor flag, dwell / 480, one-hot classes..., |dlat| / 0.1, |dlon| / 0.1]

    Landmarks are dicts; the keys read by this class are 'id', 'rating',
    'indoor/outdoor', 'time taken to explore', 'Class' and 'geocoord'
    (a dict with 'lat' and 'lon').
    """

    # Normalisation constants for the raw features.
    _MAX_RATING = 5.0
    _MAX_DWELL = 480.0      # divisor for 'time taken to explore' (presumably minutes — confirm)
    _DEFAULT_DWELL = 30     # used when a landmark has no 'time taken to explore'
    _CAMPUS_LAT = 40.4433   # reference point the coordinate offsets are measured from
    _CAMPUS_LON = -79.9436
    _GEO_SCALE = 0.1

    # Layout / weighting constants for the user preference vector.
    _CLASS_OFFSET = 3       # class one-hot block starts after rating, indoor flag, dwell
    _INDOOR_INDEX = 1
    _CLASS_BOOST = 0.5
    _INDOOR_SHIFT = 0.3
    _NEW_CLASS_BONUS = 0.1

    def __init__(self, landmarks_data: Optional[List[Dict]] = None):
        """
        Create a recommender and, when landmarks are given, build its features.

        Args:
            landmarks_data: List of landmark dicts. When omitted or empty the
                recommender starts without a feature matrix.
        """
        self.landmarks = landmarks_data or []
        self.feature_matrix = None
        self.scaler = StandardScaler()
        self.class_encoder = LabelEncoder()
        self.landmark_ids = []

        if landmarks_data:
            self._build_feature_matrix()

    @staticmethod
    def _value_or(mapping: Dict, key: str, default):
        """Return mapping[key], or default when the key is missing or None."""
        value = mapping.get(key)
        return default if value is None else value

    @staticmethod
    def _landmark_classes(landmark: Dict) -> List[str]:
        """Return the landmark's 'Class' entry as a list (missing/None -> [])."""
        classes = landmark.get('Class')
        if classes is None:
            return []
        if isinstance(classes, str):
            # A bare string would otherwise be iterated character by character.
            return [classes]
        return list(classes)

    @staticmethod
    def _index_by_class(all_classes) -> Dict[str, int]:
        """Map each class name to its first position in all_classes."""
        positions: Dict[str, int] = {}
        for position, name in enumerate(all_classes):
            positions.setdefault(name, position)
        return positions

    def _build_feature_matrix(self):
        """Build feature matrix from landmark data"""
        unique_classes = sorted(
            {cls for lm in self.landmarks for cls in self._landmark_classes(lm)}
        )
        if unique_classes:
            self.class_encoder.fit(unique_classes)

        # Rebuild from scratch so a repeated call does not duplicate ids.
        self.landmark_ids = [lm['id'] for lm in self.landmarks]
        features = [self._extract_features(lm, unique_classes) for lm in self.landmarks]

        if features:
            self.feature_matrix = self.scaler.fit_transform(np.array(features))

    def _extract_features(self, landmark: Dict, all_classes: List[str]) -> np.ndarray:
        """
        Extract numerical features from a landmark

        Args:
            landmark: Landmark dict. Missing or None values fall back to
                rating 0.0, 'outdoor', dwell 30 and the reference coordinates.
            all_classes: Ordered class names defining the one-hot block.

        Returns:
            1-D float array of length ``len(all_classes) + 5``.
        """
        rating = self._value_or(landmark, 'rating', 0.0)
        io_type = landmark.get('indoor/outdoor', 'outdoor')
        dwell = self._value_or(landmark, 'time taken to explore', self._DEFAULT_DWELL)

        class_vector = np.zeros(len(all_classes))
        positions = self._index_by_class(all_classes)
        for cls in self._landmark_classes(landmark):
            idx = positions.get(cls)
            if idx is not None:
                class_vector[idx] = 1.0

        geocoord = landmark.get('geocoord') or {}
        lat = self._value_or(geocoord, 'lat', self._CAMPUS_LAT)
        lon = self._value_or(geocoord, 'lon', self._CAMPUS_LON)

        head = [
            rating / self._MAX_RATING,
            1.0 if io_type == 'indoor' else 0.0,
            dwell / self._MAX_DWELL,
        ]
        tail = [
            abs(lat - self._CAMPUS_LAT) / self._GEO_SCALE,
            abs(lon - self._CAMPUS_LON) / self._GEO_SCALE,
        ]
        return np.concatenate((head, class_vector, tail))

    def get_user_preference_vector(self, selected_classes: List[str],
                                   indoor_pref: Optional[str] = None,
                                   min_rating: float = 0.0) -> np.ndarray:
        """
        Create user preference vector from selections

        Starts from the column-wise mean of the feature matrix, adds 0.5 to the
        slot of every selected class known to the encoder, and shifts the
        indoor slot by +0.3 ('indoor') or -0.3 ('outdoor').

        Args:
            selected_classes: Class names the user is interested in.
            indoor_pref: 'indoor', 'outdoor', or None for no preference.
            min_rating: Accepted for signature compatibility; not used here
                (rating filtering happens in ``recommend``).

        Returns:
            The preference vector, or an empty array when no feature matrix
            has been built or loaded.
        """
        if self.feature_matrix is None or not self.feature_matrix.size:
            return np.array([])

        # The encoder is only fitted when at least one landmark has a class.
        all_classes = getattr(self.class_encoder, 'classes_', [])

        # NOTE(review): the matrix is StandardScaler output, so this mean is
        # expected to be close to zero in every column — confirm that is intended.
        user_vector = np.mean(self.feature_matrix, axis=0)

        if selected_classes:
            class_mask = np.zeros(len(all_classes))
            positions = self._index_by_class(all_classes)
            for cls in selected_classes:
                idx = positions.get(cls)
                if idx is not None:
                    class_mask[idx] = 1.0

            class_end_idx = self._CLASS_OFFSET + len(all_classes)
            user_vector[self._CLASS_OFFSET:class_end_idx] += class_mask * self._CLASS_BOOST

        if indoor_pref == 'indoor':
            user_vector[self._INDOOR_INDEX] += self._INDOOR_SHIFT
        elif indoor_pref == 'outdoor':
            user_vector[self._INDOOR_INDEX] -= self._INDOOR_SHIFT

        return user_vector

    def recommend(self, selected_classes: List[str],
                  indoor_pref: Optional[str] = None,
                  min_rating: float = 0.0,
                  diversity_weight: float = 0.6,
                  exclude_ids: Optional[List[str]] = None,
                  top_k: int = 10) -> List[Tuple[str, float]]:
        """
        Get recommendations based on user preferences

        A landmark's score is its cosine similarity to the user preference
        vector plus ``diversity_weight`` times its diversity bonus.

        Args:
            selected_classes: Class names the user is interested in.
            indoor_pref: 'indoor', 'outdoor', or None for no preference.
            min_rating: Landmarks rated below this are skipped.
            diversity_weight: Multiplier applied to the diversity bonus.
            exclude_ids: Landmark ids that must not be returned.
            top_k: Maximum number of results; values <= 0 yield an empty list.

        Returns list of (landmark_id, similarity_score) tuples, best first.
        """
        if self.feature_matrix is None or not self.feature_matrix.size or top_k <= 0:
            return []

        excluded = set(exclude_ids or ())
        selected_classes = selected_classes or []

        user_vector = self.get_user_preference_vector(selected_classes, indoor_pref, min_rating)
        similarities = cosine_similarity([user_vector], self.feature_matrix)[0]

        scored = []
        # zip stops at the shorter sequence, so landmarks without a matrix row
        # (e.g. after load_model with a different dataset) are skipped.
        for lm, base_score in zip(self.landmarks, similarities):
            if self._value_or(lm, 'rating', 0) >= min_rating and lm['id'] not in excluded:
                bonus = self._calculate_diversity_bonus(lm, selected_classes)
                scored.append((lm['id'], float(base_score + diversity_weight * bonus)))

        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:top_k]

    def _calculate_diversity_bonus(self, landmark: Dict, selected_classes: List[str]) -> float:
        """
        Calculate diversity bonus for a landmark

        Returns 0.1 for every class of the landmark that is NOT among the
        selected classes.
        """
        new_classes = set(self._landmark_classes(landmark)) - set(selected_classes or ())
        return len(new_classes) * self._NEW_CLASS_BONUS

    def save_model(self, filepath: str):
        """
        Save the trained model

        Writes the feature matrix, landmark ids, scaler mean/scale and encoder
        classes as JSON. The landmark dicts themselves are not saved.
        """
        scaler_mean = getattr(self.scaler, 'mean_', None)
        scaler_scale = getattr(self.scaler, 'scale_', None)
        encoder_classes = getattr(self.class_encoder, 'classes_', None)

        model_data = {
            'feature_matrix': self.feature_matrix.tolist() if self.feature_matrix is not None else None,
            'landmark_ids': self.landmark_ids,
            'scaler_mean': scaler_mean.tolist() if scaler_mean is not None else None,
            'scaler_scale': scaler_scale.tolist() if scaler_scale is not None else None,
            'class_encoder_classes': encoder_classes.tolist() if encoder_classes is not None else None,
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """
        Load a trained model

        Restores the feature matrix, landmark ids, the scaler's ``mean_`` and
        ``scale_`` and the encoder's ``classes_`` from a file written by
        ``save_model``. ``self.landmarks`` is left untouched, and ``recommend``
        iterates over it, so the landmark list must be supplied separately.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)

        matrix = model_data.get('feature_matrix')
        self.feature_matrix = np.array(matrix) if matrix else None
        self.landmark_ids = model_data.get('landmark_ids') or []

        if model_data.get('scaler_mean'):
            self.scaler.mean_ = np.array(model_data['scaler_mean'])
            self.scaler.scale_ = np.array(model_data['scaler_scale'])

        if model_data.get('class_encoder_classes'):
            self.class_encoder.classes_ = np.array(model_data['class_encoder_classes'])
|
|
|
|
|
|
|
|
|
def load_model_from_data(data_path: str) -> ContentBasedRecommender:
    """
    Load model from landmarks data

    Args:
        data_path: Path to a JSON file whose content is passed directly to
            ``ContentBasedRecommender`` as the landmark data.

    Returns:
        A recommender constructed from the loaded landmarks.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        landmarks = json.load(f)

    return ContentBasedRecommender(landmarks)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo: build a recommender from data/landmarks.json and print the top picks.
    top_k = 5

    # Reuse the module's loader instead of duplicating the open/json.load steps.
    recommender = load_model_from_data('data/landmarks.json')

    recommendations = recommender.recommend(
        selected_classes=['Culture', 'Research'],
        indoor_pref='indoor',
        min_rating=4.0,
        top_k=top_k,
    )

    # Header is derived from top_k so the two cannot drift apart.
    print(f"Top {top_k} recommendations:")
    for lm_id, score in recommendations:
        print(f"{lm_id}: {score:.3f}")
|
|
|
|