File size: 9,200 Bytes
0b94a95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
"""

Content-Based Recommendation System for CMU Landmarks



This model provides personalized landmark recommendations based on user preferences

using content-based filtering with cosine similarity.

"""

import numpy as np
from typing import List, Dict, Tuple, Optional
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
import json
import pickle


class ContentBasedRecommender:
    """Content-based recommender for landmarks (trained from scratch).

    Every landmark is turned into a numeric feature vector, the vectors are
    standardized with a ``StandardScaler``, and recommendations are ranked by
    cosine similarity between a user preference vector and each landmark.

    Feature layout produced by ``_extract_features`` (in order):

    * index 0: ``rating / 5.0``
    * index 1: ``1.0`` for ``'indoor'``, else ``0.0``
    * index 2: ``'time taken to explore' / 480.0``
    * next ``len(classes)`` slots: multi-hot encoding of ``'Class'``
    * last two slots: absolute lat / lon offset from the CMU reference
      point, divided by ``0.1``
    """

    def __init__(self, landmarks_data: Optional[List[Dict]] = None):
        """Store the landmarks and build the feature matrix when any are given.

        Args:
            landmarks_data: List of landmark dicts. Each dict must have an
                ``'id'`` key; all other keys read by ``_extract_features``
                are optional. ``None`` or an empty list leaves the model
                untrained (``feature_matrix`` stays ``None``).
        """
        self.landmarks = landmarks_data or []
        self.feature_matrix = None
        self.scaler = StandardScaler()
        self.class_encoder = LabelEncoder()
        self.landmark_ids = []

        if landmarks_data:
            self._build_feature_matrix()

    def _has_features(self) -> bool:
        """Return True when a non-empty feature matrix is available."""
        return self.feature_matrix is not None and self.feature_matrix.size > 0

    def _build_feature_matrix(self):
        """Build and standardize the feature matrix from ``self.landmarks``.

        Also fits ``self.class_encoder`` on the sorted set of class names and
        fills ``self.landmark_ids`` in landmark order.

        Raises:
            KeyError: If a landmark has no ``'id'`` key.
        """
        # Sorted unique class names define the column order of the multi-hot
        # part of every feature vector.
        unique_classes = sorted(
            {cls for lm in self.landmarks for cls in (lm.get('Class') or [])}
        )
        if unique_classes:
            self.class_encoder.fit(unique_classes)

        # Rebuild the id list from scratch so calling this method again does
        # not append duplicate ids.
        self.landmark_ids = [lm['id'] for lm in self.landmarks]

        features = [
            self._extract_features(lm, unique_classes) for lm in self.landmarks
        ]
        if features:
            self.feature_matrix = self.scaler.fit_transform(np.array(features))

    def _extract_features(self, landmark: Dict, all_classes: List[str]) -> np.ndarray:
        """Extract the (unscaled) numeric feature vector of one landmark.

        Args:
            landmark: Landmark dict. Missing keys fall back to: rating 0.0,
                ``'outdoor'``, 30 for ``'time taken to explore'``, no
                classes, and the CMU reference coordinates.
            all_classes: Ordered class names; position ``i`` in this list is
                slot ``3 + i`` of the returned vector.

        Returns:
            1-D array of length ``3 + len(all_classes) + 2``.
        """
        features = []

        # Rating divided by 5 (presumably a 0-5 scale -- TODO confirm).
        rating = landmark.get('rating', 0.0)
        features.append(rating / 5.0)

        # Indoor/outdoor as a binary flag; anything but 'indoor' maps to 0.
        io_type = landmark.get('indoor/outdoor', 'outdoor')
        features.append(1.0 if io_type == 'indoor' else 0.0)

        # Dwell time divided by 480 (presumably minutes, 8 h max -- TODO confirm).
        dwell_min = landmark.get('time taken to explore', 30)
        features.append(dwell_min / 480.0)

        # Multi-hot class encoding. Build the name -> slot map once instead of
        # a linear ``.index()`` search per class; ``setdefault`` keeps the
        # first position if a name is repeated, matching ``list.index``.
        class_slot = {}
        for position, name in enumerate(all_classes):
            class_slot.setdefault(name, position)

        class_vector = np.zeros(len(all_classes))
        for cls in (landmark.get('Class') or []):
            slot = class_slot.get(cls)
            if slot is not None:
                class_vector[slot] = 1.0
        features.extend(class_vector)

        # Absolute offset from the CMU reference point, in units of 0.1 degree.
        cmu_lat, cmu_lon = 40.4433, -79.9436
        geocoord = landmark.get('geocoord', {'lat': cmu_lat, 'lon': cmu_lon})
        features.append(abs(geocoord['lat'] - cmu_lat) / 0.1)
        features.append(abs(geocoord['lon'] - cmu_lon) / 0.1)

        return np.array(features)

    def get_user_preference_vector(self, selected_classes: List[str],
                                 indoor_pref: Optional[str] = None,
                                 min_rating: float = 0.0) -> np.ndarray:
        """Create a user preference vector in the scaled feature space.

        The vector starts as the column-wise mean of the feature matrix, then
        each selected class slot is raised by 0.5 and the indoor slot is
        shifted by +0.3 (``'indoor'``) or -0.3 (``'outdoor'``).

        Args:
            selected_classes: Class names the user is interested in. Names
                unknown to the encoder are ignored.
            indoor_pref: ``'indoor'``, ``'outdoor'`` or ``None`` for no
                preference.
            min_rating: Accepted for signature compatibility; not used here
                (rating filtering happens in ``recommend``).

        Returns:
            1-D preference vector, or an empty array when the model has no
            feature matrix.
        """
        # ``feature_matrix`` is None until landmarks are provided or a model
        # is loaded, so it must be checked before touching ``.size``.
        if not self._has_features():
            return np.array([])

        # ``classes_`` only exists once the encoder has been fitted, which
        # does not happen when no landmark declares a class.
        all_classes = list(getattr(self.class_encoder, 'classes_', []))

        # Start with the average landmark profile.
        user_vector = np.mean(self.feature_matrix, axis=0)

        # Boost selected classes.
        if selected_classes and all_classes:
            class_slot = {name: position for position, name in enumerate(all_classes)}
            class_mask = np.zeros(len(all_classes))
            for cls in selected_classes:
                slot = class_slot.get(cls)
                if slot is not None:
                    class_mask[slot] = 1.0

            class_start_idx = 3  # After rating, indoor/outdoor, dwell_time
            class_end_idx = class_start_idx + len(all_classes)
            user_vector[class_start_idx:class_end_idx] += class_mask * 0.5

        # Indoor/outdoor preference.
        if indoor_pref == 'indoor':
            user_vector[1] += 0.3
        elif indoor_pref == 'outdoor':
            user_vector[1] -= 0.3

        return user_vector

    def recommend(self, selected_classes: List[str],
                 indoor_pref: Optional[str] = None,
                 min_rating: float = 0.0,
                 diversity_weight: float = 0.6,
                 exclude_ids: Optional[List[str]] = None,
                 top_k: int = 10) -> List[Tuple[str, float]]:
        """Get recommendations based on user preferences.

        Args:
            selected_classes: Class names the user is interested in.
            indoor_pref: ``'indoor'``, ``'outdoor'`` or ``None``.
            min_rating: Landmarks whose ``'rating'`` (default 0) is below
                this value are dropped.
            diversity_weight: Multiplier applied to the diversity bonus
                before it is added to the cosine similarity.
            exclude_ids: Landmark ids that must not be returned.
            top_k: Maximum number of results.

        Returns:
            List of ``(landmark_id, score)`` tuples sorted by descending
            score; empty when the model has no feature matrix. Only entries
            of ``self.landmarks`` are ranked, so a model restored with
            ``load_model`` alone (which does not restore landmarks) yields
            an empty list.
        """
        if not self._has_features():
            return []

        # Set membership keeps the exclusion test O(1) per landmark.
        excluded = set(exclude_ids) if exclude_ids else set()

        user_vector = self.get_user_preference_vector(selected_classes, indoor_pref, min_rating)

        # One similarity per feature-matrix row.
        similarities = cosine_similarity([user_vector], self.feature_matrix)[0]

        # ``zip`` stops at the shorter sequence, so landmarks without a
        # matching matrix row are skipped.
        scored = []
        for lm, base_score in zip(self.landmarks, similarities):
            if lm.get('rating', 0) >= min_rating and lm['id'] not in excluded:
                class_diversity = self._calculate_diversity_bonus(lm, selected_classes)
                scored.append((lm['id'], base_score + diversity_weight * class_diversity))

        # Stable sort: ties keep landmark order.
        scored.sort(key=lambda item: item[1], reverse=True)
        return scored[:top_k]

    def _calculate_diversity_bonus(self, landmark: Dict, selected_classes: List[str]) -> float:
        """Return 0.1 for every landmark class that was not selected.

        ``selected_classes`` may be ``None`` or empty, in which case every
        class of the landmark counts as new.
        """
        landmark_classes = set(landmark.get('Class') or [])
        new_classes = landmark_classes - set(selected_classes or [])
        return len(new_classes) * 0.1  # Small bonus for diversity

    def save_model(self, filepath: str):
        """Save the trained model state as JSON.

        Writes the feature matrix, landmark ids, scaler mean/scale and the
        encoder classes. The landmark dicts themselves are not written.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        model_data = {
            'feature_matrix': self.feature_matrix.tolist() if self.feature_matrix is not None else None,
            'landmark_ids': self.landmark_ids,
            'scaler_mean': self.scaler.mean_.tolist() if hasattr(self.scaler, 'mean_') else None,
            'scaler_scale': self.scaler.scale_.tolist() if hasattr(self.scaler, 'scale_') else None,
            'class_encoder_classes': self.class_encoder.classes_.tolist() if hasattr(self.class_encoder, 'classes_') else None
        }

        with open(filepath, 'w') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """Load model state previously written by ``save_model``.

        Restores the feature matrix, landmark ids, scaler mean/scale and the
        encoder classes. ``self.landmarks`` is left unchanged.

        Args:
            filepath: Path of the JSON file to read.
        """
        with open(filepath, 'r') as f:
            model_data = json.load(f)

        self.feature_matrix = np.array(model_data['feature_matrix']) if model_data['feature_matrix'] else None
        self.landmark_ids = model_data['landmark_ids']

        if model_data['scaler_mean']:
            self.scaler.mean_ = np.array(model_data['scaler_mean'])
            self.scaler.scale_ = np.array(model_data['scaler_scale'])

        if model_data['class_encoder_classes']:
            self.class_encoder.classes_ = np.array(model_data['class_encoder_classes'])


def load_model_from_data(data_path: str) -> ContentBasedRecommender:
    """Build a recommender from a landmarks JSON file.

    Args:
        data_path: Path to a JSON file whose top-level value is passed
            unchanged to ``ContentBasedRecommender`` (presumably a list of
            landmark dicts -- verify against the data file).

    Returns:
        A ``ContentBasedRecommender`` constructed from the parsed data.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when decoding landmark names and descriptions.
    with open(data_path, 'r', encoding='utf-8') as f:
        landmarks = json.load(f)

    return ContentBasedRecommender(landmarks)


# Example usage: rank landmarks for a sample set of preferences and print them.
if __name__ == "__main__":
    # Load landmarks data. JSON is UTF-8 by specification, so decode it
    # explicitly instead of relying on the platform default encoding.
    with open('data/landmarks.json', 'r', encoding='utf-8') as f:
        landmarks = json.load(f)

    # Initialize recommender (builds the feature matrix from the landmarks).
    recommender = ContentBasedRecommender(landmarks)

    # Get recommendations for a user who likes indoor culture/research spots
    # rated at least 4.0.
    recommendations = recommender.recommend(
        selected_classes=['Culture', 'Research'],
        indoor_pref='indoor',
        min_rating=4.0,
        top_k=5,
    )

    print("Top 5 recommendations:")
    for lm_id, score in recommendations:
        print(f"{lm_id}: {score:.3f}")