""" Rating Prediction Model for CMU Landmarks This model predicts landmark ratings based on feature characteristics using a Random Forest regressor from scikit-learn. """ import numpy as np import json from typing import List, Dict, Tuple from sklearn.ensemble import RandomForestRegressor import pickle class RatingPredictor: """ Rating Prediction Model (Off-the-shelf + Fine-tuning) Uses Random Forest to predict landmark ratings based on features. This can help validate existing ratings or predict ratings for new landmarks. """ def __init__(self, landmarks_data: List[Dict] = None): self.landmarks = landmarks_data or [] self.model = RandomForestRegressor(n_estimators=100, random_state=42) self.feature_columns = [] self.is_trained = False if landmarks_data: self.train() def _prepare_training_data(self) -> Tuple[np.ndarray, np.ndarray]: """Prepare training data for rating prediction""" features = [] ratings = [] # Collect all unique classes all_classes = set() for lm in self.landmarks: all_classes.update(lm.get('Class', [])) all_classes = sorted(list(all_classes)) # Create feature matrix for lm in self.landmarks: feature_vector = [] # Basic features feature_vector.append(1.0 if lm.get('indoor/outdoor', 'outdoor') == 'indoor' else 0.0) dwell_time = lm.get('time taken to explore', 30) feature_vector.append(dwell_time / 480.0) # normalized dwell time # Class features (multi-hot encoding) class_vector = np.zeros(len(all_classes)) landmark_classes = lm.get('Class', []) for cls in landmark_classes: if cls in all_classes: idx = all_classes.index(cls) class_vector[idx] = 1.0 feature_vector.extend(class_vector) # Geographic features cmu_lat, cmu_lon = 40.4433, -79.9436 geocoord = lm.get('geocoord', {'lat': cmu_lat, 'lon': cmu_lon}) feature_vector.append(abs(geocoord['lat'] - cmu_lat)) feature_vector.append(abs(geocoord['lon'] - cmu_lon)) features.append(feature_vector) ratings.append(lm.get('rating', 0.0)) # Store feature column names self.feature_columns = ['is_indoor', 'dwell_time_norm'] + all_classes + ['dist_from_cmu_lat', 'dist_from_cmu_lon'] return np.array(features), np.array(ratings) def train(self): """Train the rating prediction model""" if not self.landmarks: raise ValueError("No landmarks data provided for training") X, y = self._prepare_training_data() self.model.fit(X, y) self.is_trained = True # Calculate and return training metrics train_score = self.model.score(X, y) return train_score def predict_rating(self, landmark: Dict) -> float: """Predict rating for a landmark""" if not self.is_trained: raise ValueError("Model must be trained before making predictions") # Extract features for this landmark all_classes = set() for lm in self.landmarks: all_classes.update(lm.get('Class', [])) all_classes = sorted(list(all_classes)) feature_vector = [] feature_vector.append(1.0 if landmark.get('indoor/outdoor', 'outdoor') == 'indoor' else 0.0) dwell_time = landmark.get('time taken to explore', 30) feature_vector.append(dwell_time / 480.0) class_vector = np.zeros(len(all_classes)) landmark_classes = landmark.get('Class', []) for cls in landmark_classes: if cls in all_classes: idx = all_classes.index(cls) class_vector[idx] = 1.0 feature_vector.extend(class_vector) cmu_lat, cmu_lon = 40.4433, -79.9436 geocoord = landmark.get('geocoord', {'lat': cmu_lat, 'lon': cmu_lon}) feature_vector.append(abs(geocoord['lat'] - cmu_lat)) feature_vector.append(abs(geocoord['lon'] - cmu_lon)) X = np.array([feature_vector]) prediction = self.model.predict(X)[0] # Clamp to valid rating range return max(0.0, min(5.0, prediction)) def get_feature_importance(self) -> Dict[str, float]: """Get feature importance from the trained model""" if not self.is_trained: raise ValueError("Model must be trained before getting feature importance") importances = self.model.feature_importances_ return dict(zip(self.feature_columns, importances)) def get_predictions_for_all(self) -> Dict[str, float]: """Get predictions for all landmarks in training data""" if not self.is_trained: raise ValueError("Model must be trained before making predictions") predictions = {} for lm in self.landmarks: predictions[lm['id']] = self.predict_rating(lm) return predictions def save_model(self, filepath: str): """Save the trained model""" model_data = { 'feature_columns': self.feature_columns, 'is_trained': self.is_trained, 'model': pickle.dumps(self.model).hex(), 'landmarks_sample': self.landmarks[:5] if self.landmarks else [] # Store sample for reference } with open(filepath, 'w') as f: json.dump(model_data, f) def load_model(self, filepath: str): """Load a trained model""" with open(filepath, 'r') as f: model_data = json.load(f) self.feature_columns = model_data['feature_columns'] self.is_trained = model_data['is_trained'] self.model = pickle.loads(bytes.fromhex(model_data['model'])) def load_model_from_data(data_path: str) -> RatingPredictor: """Load and train model from landmarks data""" with open(data_path, 'r') as f: landmarks = json.load(f) predictor = RatingPredictor(landmarks) return predictor # Example usage if __name__ == "__main__": # Load landmarks data with open('data/landmarks.json', 'r') as f: landmarks = json.load(f) # Initialize predictor predictor = RatingPredictor(landmarks) # Get feature importance importance = predictor.get_feature_importance() print("Feature importance:") for feature, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10]: print(f"{feature}: {imp:.3f}") # Get predictions for all landmarks predictions = predictor.get_predictions_for_all() # Calculate metrics actual_ratings = [lm['rating'] for lm in landmarks] predicted_ratings = list(predictions.values()) mae = np.mean(np.abs(np.array(actual_ratings) - np.array(predicted_ratings))) mse = np.mean((np.array(actual_ratings) - np.array(predicted_ratings)) ** 2) print(f"\nModel Performance:") print(f"Mean Absolute Error: {mae:.3f}") print(f"Mean Squared Error: {mse:.3f}") print(f"Training Score: {predictor.model.score(predictor._prepare_training_data()[0], predictor._prepare_training_data()[1]):.3f}")