|
|
"""
|
|
|
Rating Prediction Model for CMU Landmarks
|
|
|
|
|
|
This model predicts landmark ratings based on feature characteristics using
|
|
|
a Random Forest regressor from scikit-learn.
|
|
|
"""
|
|
|
|
|
|
import json
import pickle
from typing import Dict, List, Optional, Tuple

import numpy as np
from sklearn.ensemble import RandomForestRegressor
|
|
|
|
|
|
|
|
|
class RatingPredictor:
    """
    Rating Prediction Model (Off-the-shelf + Fine-tuning)

    Uses Random Forest to predict landmark ratings based on features.
    This can help validate existing ratings or predict ratings for new landmarks.

    Feature layout (in order): is_indoor, normalized dwell time, one-hot
    class-membership flags (sorted label order), absolute lat/lon distance
    from the CMU campus reference point.
    """

    # CMU campus reference coordinates used as the distance origin and as
    # the default geocoord for landmarks that lack one.
    CMU_LAT = 40.4433
    CMU_LON = -79.9436
    # Dwell-time normalization cap, in minutes (8 hours).
    MAX_DWELL_MINUTES = 480.0
    # Default dwell time (minutes) for landmarks missing the field.
    DEFAULT_DWELL_MINUTES = 30

    def __init__(self, landmarks_data: Optional[List[Dict]] = None):
        """Create the predictor; trains immediately if data is supplied.

        Args:
            landmarks_data: list of landmark dicts (keys such as 'rating',
                'Class', 'indoor/outdoor', 'time taken to explore',
                'geocoord' are read; all are optional per-landmark).
        """
        self.landmarks = landmarks_data or []
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.feature_columns = []
        self.is_trained = False

        if landmarks_data:
            self.train()

    @staticmethod
    def _collect_class_labels(landmarks: List[Dict]) -> List[str]:
        """Return the sorted union of 'Class' labels across *landmarks*."""
        labels = set()
        for lm in landmarks:
            labels.update(lm.get('Class', []))
        return sorted(labels)

    def _featurize(self, lm: Dict, class_labels: List[str]) -> List[float]:
        """Build the numeric feature vector for one landmark.

        Shared by training and prediction so the two can never drift apart.
        *class_labels* fixes the one-hot layout; unknown classes are ignored.
        """
        vec = [1.0 if lm.get('indoor/outdoor', 'outdoor') == 'indoor' else 0.0]
        dwell_time = lm.get('time taken to explore', self.DEFAULT_DWELL_MINUTES)
        vec.append(dwell_time / self.MAX_DWELL_MINUTES)

        # One-hot class membership; dict lookup instead of list.index (O(k) vs O(k^2)).
        label_index = {cls: i for i, cls in enumerate(class_labels)}
        class_vec = [0.0] * len(class_labels)
        for cls in lm.get('Class', []):
            idx = label_index.get(cls)
            if idx is not None:
                class_vec[idx] = 1.0
        vec.extend(class_vec)

        # Absolute lat/lon offsets from campus; missing geocoord maps to campus
        # itself, i.e. zero distance.
        geocoord = lm.get('geocoord', {'lat': self.CMU_LAT, 'lon': self.CMU_LON})
        vec.append(abs(geocoord['lat'] - self.CMU_LAT))
        vec.append(abs(geocoord['lon'] - self.CMU_LON))
        return vec

    def _prepare_training_data(self) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare training data for rating prediction.

        Returns:
            (X, y): feature matrix and rating vector, in self.landmarks order.
            Also records the column names in self.feature_columns.
        """
        class_labels = self._collect_class_labels(self.landmarks)

        features = [self._featurize(lm, class_labels) for lm in self.landmarks]
        # Missing ratings default to 0.0, matching the prediction clamp floor.
        ratings = [lm.get('rating', 0.0) for lm in self.landmarks]

        self.feature_columns = (['is_indoor', 'dwell_time_norm']
                                + class_labels
                                + ['dist_from_cmu_lat', 'dist_from_cmu_lon'])

        return np.array(features), np.array(ratings)

    def train(self):
        """Train the rating prediction model.

        Returns:
            float: R^2 score on the training data itself.

        Raises:
            ValueError: if no landmarks data was provided.
        """
        if not self.landmarks:
            raise ValueError("No landmarks data provided for training")

        X, y = self._prepare_training_data()
        self.model.fit(X, y)
        self.is_trained = True

        return self.model.score(X, y)

    def predict_rating(self, landmark: Dict) -> float:
        """Predict rating for a landmark, clamped to the [0, 5] scale.

        Raises:
            ValueError: if called before training (or loading) a model.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")

        # Derive the class labels from the persisted column layout rather than
        # from self.landmarks, so prediction still works (with the correct
        # feature width) after load_model(), which restores no landmarks.
        class_labels = self.feature_columns[2:-2]

        X = np.array([self._featurize(landmark, class_labels)])
        prediction = self.model.predict(X)[0]

        # Clamp to the valid rating range.
        return max(0.0, min(5.0, prediction))

    def get_feature_importance(self) -> Dict[str, float]:
        """Get feature importance from the trained model.

        Returns:
            Mapping of feature column name -> importance score.

        Raises:
            ValueError: if called before training.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        importances = self.model.feature_importances_
        return dict(zip(self.feature_columns, importances))

    def get_predictions_for_all(self) -> Dict[str, float]:
        """Get predictions for all landmarks in training data, keyed by 'id'.

        Raises:
            ValueError: if called before training.
            KeyError: if a landmark lacks an 'id' field.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")

        return {lm['id']: self.predict_rating(lm) for lm in self.landmarks}

    def save_model(self, filepath: str):
        """Save the trained model as JSON.

        The sklearn model is pickled and hex-encoded so the whole payload
        stays JSON-serializable.
        """
        model_data = {
            'feature_columns': self.feature_columns,
            'is_trained': self.is_trained,
            'model': pickle.dumps(self.model).hex(),
            'landmarks_sample': self.landmarks[:5] if self.landmarks else []
        }

        with open(filepath, 'w') as f:
            json.dump(model_data, f)

    def load_model(self, filepath: str):
        """Load a trained model previously written by save_model().

        SECURITY NOTE: this unpickles data from *filepath*. pickle can execute
        arbitrary code during deserialization — only load files you trust.
        """
        with open(filepath, 'r') as f:
            model_data = json.load(f)

        self.feature_columns = model_data['feature_columns']
        self.is_trained = model_data['is_trained']
        self.model = pickle.loads(bytes.fromhex(model_data['model']))
|
|
|
|
|
|
|
|
|
def load_model_from_data(data_path: str) -> RatingPredictor:
    """Read landmarks JSON from *data_path* and return a trained predictor."""
    with open(data_path, 'r') as fh:
        landmark_records = json.load(fh)
    # RatingPredictor trains itself in __init__ when given data.
    return RatingPredictor(landmark_records)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo: train on the campus landmarks dataset and report basic metrics.
    with open('data/landmarks.json', 'r') as f:
        landmarks = json.load(f)

    # Training happens in the constructor when data is supplied.
    predictor = RatingPredictor(landmarks)

    # Show the ten most informative features.
    importance = predictor.get_feature_importance()
    print("Feature importance:")
    for feature, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{feature}: {imp:.3f}")

    # In-sample predictions; dict insertion order matches the landmarks list,
    # so values() aligns element-wise with the actual ratings below.
    predictions = predictor.get_predictions_for_all()

    actual = np.array([lm['rating'] for lm in landmarks])
    predicted = np.array(list(predictions.values()))

    mae = np.mean(np.abs(actual - predicted))
    mse = np.mean((actual - predicted) ** 2)

    print(f"\nModel Performance:")
    print(f"Mean Absolute Error: {mae:.3f}")
    print(f"Mean Squared Error: {mse:.3f}")
    # Build the training matrices ONCE (the original rebuilt them twice
    # inside a single print call) and reuse them for the score.
    X_train, y_train = predictor._prepare_training_data()
    print(f"Training Score: {predictor.model.score(X_train, y_train):.3f}")
|
|
|
|