# Source exported from a Hugging Face repository upload ("Upload 2 files", commit 21dc841).
"""
Rating Prediction Model for CMU Landmarks
This model predicts landmark ratings based on feature characteristics using
a Random Forest regressor from scikit-learn.
"""
import numpy as np
import json
from typing import List, Dict, Tuple
from sklearn.ensemble import RandomForestRegressor
import pickle
class RatingPredictor:
    """Rating Prediction Model (Off-the-shelf + Fine-tuning).

    Uses a Random Forest regressor to predict landmark ratings from a small
    feature set: an indoor/outdoor flag, normalized dwell time, a multi-hot
    encoding of landmark classes, and absolute lat/lon distance from the CMU
    campus center. Can be used to validate existing ratings or predict
    ratings for new landmarks.
    """

    # Reference point for the geographic distance features (CMU campus center).
    CMU_LAT = 40.4433
    CMU_LON = -79.9436
    # Dwell time is normalized against an 8-hour (480-minute) ceiling.
    MAX_DWELL_MINUTES = 480.0

    def __init__(self, landmarks_data: List[Dict] = None):
        """Create a predictor; trains immediately if landmark data is given.

        Args:
            landmarks_data: list of landmark dicts (keys used: 'rating',
                'Class', 'indoor/outdoor', 'time taken to explore',
                'geocoord', 'id').
        """
        self.landmarks = landmarks_data or []
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.feature_columns: List[str] = []
        # Sorted class vocabulary, fixed at train time so feature vectors
        # always have a consistent length (including after load_model()).
        self.all_classes: List[str] = []
        self.is_trained = False
        if landmarks_data:
            self.train()

    def _extract_features(self, landmark: Dict) -> List[float]:
        """Build the feature vector for one landmark.

        Relies on ``self.all_classes`` having been fixed (by training or by
        load_model()); unknown classes are silently ignored so new landmarks
        can still be scored.
        """
        features = [
            1.0 if landmark.get('indoor/outdoor', 'outdoor') == 'indoor' else 0.0,
            landmark.get('time taken to explore', 30) / self.MAX_DWELL_MINUTES,
        ]
        # Multi-hot encoding of the landmark's classes; dict lookup avoids
        # the O(n) list.index() call per class.
        class_index = {cls: i for i, cls in enumerate(self.all_classes)}
        class_vector = np.zeros(len(self.all_classes))
        for cls in landmark.get('Class', []):
            idx = class_index.get(cls)
            if idx is not None:
                class_vector[idx] = 1.0
        features.extend(class_vector)
        # Absolute lat/lon offsets from campus; a missing or partial
        # 'geocoord' defaults to the campus center (zero distance).
        geocoord = landmark.get('geocoord') or {}
        features.append(abs(geocoord.get('lat', self.CMU_LAT) - self.CMU_LAT))
        features.append(abs(geocoord.get('lon', self.CMU_LON) - self.CMU_LON))
        return features

    def _prepare_training_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare the (X, y) matrices for rating prediction.

        Side effects: fixes ``self.all_classes`` (sorted for a deterministic
        column order) and ``self.feature_columns``.
        """
        vocabulary = set()
        for lm in self.landmarks:
            vocabulary.update(lm.get('Class', []))
        self.all_classes = sorted(vocabulary)
        self.feature_columns = (
            ['is_indoor', 'dwell_time_norm']
            + self.all_classes
            + ['dist_from_cmu_lat', 'dist_from_cmu_lon']
        )
        features = [self._extract_features(lm) for lm in self.landmarks]
        ratings = [lm.get('rating', 0.0) for lm in self.landmarks]
        return np.array(features), np.array(ratings)

    def train(self):
        """Train the rating prediction model.

        Returns:
            The training R^2 score of the fitted model.

        Raises:
            ValueError: if no landmarks data was provided.
        """
        if not self.landmarks:
            raise ValueError("No landmarks data provided for training")
        X, y = self._prepare_training_data()
        self.model.fit(X, y)
        self.is_trained = True
        return self.model.score(X, y)

    def predict_rating(self, landmark: Dict) -> float:
        """Predict a rating for a landmark, clamped to the valid [0, 5] range.

        Raises:
            ValueError: if called before the model is trained or loaded.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        # Uses the vocabulary fixed at train time (previously this was
        # recomputed from self.landmarks on every call, which both repeated
        # work and broke prediction after load_model()).
        X = np.array([self._extract_features(landmark)])
        prediction = self.model.predict(X)[0]
        # Clamp to the valid rating range.
        return max(0.0, min(5.0, prediction))

    def get_feature_importance(self) -> Dict[str, float]:
        """Map each feature column name to its Random Forest importance."""
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")
        return dict(zip(self.feature_columns, self.model.feature_importances_))

    def get_predictions_for_all(self) -> Dict[str, float]:
        """Predict ratings for all training landmarks, keyed by landmark 'id'."""
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return {lm['id']: self.predict_rating(lm) for lm in self.landmarks}

    def save_model(self, filepath: str):
        """Save the trained model as JSON (estimator pickled and hex-encoded)."""
        model_data = {
            'feature_columns': self.feature_columns,
            'all_classes': self.all_classes,  # needed to rebuild feature vectors
            'is_trained': self.is_trained,
            'model': pickle.dumps(self.model).hex(),
            'landmarks_sample': self.landmarks[:5] if self.landmarks else []  # Store sample for reference
        }
        with open(filepath, 'w') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """Load a trained model previously written by save_model().

        SECURITY NOTE: this unpickles data embedded in the file; pickle can
        execute arbitrary code, so only load files from trusted sources.
        """
        with open(filepath, 'r') as f:
            model_data = json.load(f)
        self.feature_columns = model_data['feature_columns']
        # Older save files lack 'all_classes'; recover it from the column
        # layout [is_indoor, dwell_time_norm, *classes, dist_lat, dist_lon].
        self.all_classes = model_data.get('all_classes', self.feature_columns[2:-2])
        self.is_trained = model_data['is_trained']
        self.model = pickle.loads(bytes.fromhex(model_data['model']))
def load_model_from_data(data_path: str) -> RatingPredictor:
    """Build a RatingPredictor trained on the landmarks stored at *data_path*.

    The constructor trains the model immediately when given data, so the
    returned predictor is ready for predictions.
    """
    with open(data_path, 'r') as f:
        landmark_records = json.load(f)
    return RatingPredictor(landmark_records)
# Example usage
if __name__ == "__main__":
    # Load landmarks data.
    with open('data/landmarks.json', 'r') as f:
        landmarks = json.load(f)

    # Initialize the predictor (training happens in the constructor).
    predictor = RatingPredictor(landmarks)

    # Show the ten most important features.
    importance = predictor.get_feature_importance()
    print("Feature importance:")
    for feature, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{feature}: {imp:.3f}")

    # Predict a rating for every landmark in the training data.
    predictions = predictor.get_predictions_for_all()

    # Compare predictions against the stored ratings (insertion order of the
    # predictions dict matches the landmarks list).
    actual_ratings = np.array([lm['rating'] for lm in landmarks])
    predicted_ratings = np.array(list(predictions.values()))
    mae = np.mean(np.abs(actual_ratings - predicted_ratings))
    mse = np.mean((actual_ratings - predicted_ratings) ** 2)
    print("\nModel Performance:")
    print(f"Mean Absolute Error: {mae:.3f}")
    print(f"Mean Squared Error: {mse:.3f}")
    # Prepare (X, y) once instead of twice in a single print expression.
    X, y = predictor._prepare_training_data()
    print(f"Training Score: {predictor.model.score(X, y):.3f}")