# goodreads/src/model_training.py
# fguryel's picture
# Deploy ML project
# ce92e54
"""
Model training module for book popularity prediction.
This module handles model training, evaluation, and saving.
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from data_preprocessing import BookDataPreprocessor
class BookPopularityModel:
    """Train, evaluate, persist and query a book popularity regressor.

    The class wraps a scikit-learn regressor together with a
    ``BookDataPreprocessor`` that supplies the data, the feature matrix,
    the author encoder and the scaler used at prediction time.
    """

    def __init__(self):
        # Fitted estimator; stays None until train_model() or load_model() succeeds.
        self.model = None
        # Project preprocessor that loads, cleans and encodes the data.
        self.preprocessor = BookDataPreprocessor()
        # Filled by evaluate_model() when the estimator exposes feature_importances_.
        self.feature_importance = None

    def train_model(self, model_type='random_forest'):
        """Train the book popularity prediction model.

        Runs the preprocessor pipeline, fits the requested estimator,
        prints evaluation metrics and saves both model and preprocessor.

        Args:
            model_type: ``'random_forest'`` or ``'linear'``.

        Returns:
            True on success, False when the preprocessor could not load data.

        Raises:
            ValueError: if ``model_type`` is not one of the supported names.
        """
        print("Loading and preprocessing data...")
        df = self.preprocessor.load_data()
        if df is None:
            return False

        df_clean = self.preprocessor.clean_data(df)
        df_engineered = self.preprocessor.engineer_features(df_clean)
        # The third returned value (the final dataframe) is not needed here.
        X, y, _ = self.preprocessor.prepare_features(df_engineered, fit=True)

        X_train, X_test, y_train, y_test = self.preprocessor.split_data(X, y)
        print(f"Training data shape: {X_train.shape}")
        print(f"Test data shape: {X_test.shape}")

        if model_type == 'random_forest':
            self.model = RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1,
            )
        elif model_type == 'linear':
            self.model = LinearRegression()
        else:
            raise ValueError("Model type must be 'random_forest' or 'linear'")

        print(f"Training {model_type} model...")
        self.model.fit(X_train, y_train)

        self.evaluate_model(X_train, X_test, y_train, y_test)

        # Persist both artifacts so load_model() can restore a usable predictor.
        self.save_model()
        self.preprocessor.save_preprocessor()
        return True

    def evaluate_model(self, X_train, X_test, y_train, y_test):
        """Print train/test metrics, 5-fold CV R² and feature importances.

        Args:
            X_train: training feature matrix.
            X_test: held-out feature matrix.
            y_train: training targets.
            y_test: held-out targets.

        Side effects:
            Prints a report to stdout and stores ``feature_importances_`` in
            ``self.feature_importance`` when the estimator provides it.
        """
        y_train_pred = self.model.predict(X_train)
        y_test_pred = self.model.predict(X_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        separator = "=" * 50
        # A real newline escape: the doubled backslash printed "\n" literally.
        print("\n" + separator)
        print("MODEL EVALUATION RESULTS")
        print(separator)
        print(f"Train MSE: {train_mse:.4f}")
        print(f"Test MSE: {test_mse:.4f}")
        print(f"Train R²: {train_r2:.4f}")
        print(f"Test R²: {test_r2:.4f}")
        print(f"Train MAE: {train_mae:.4f}")
        print(f"Test MAE: {test_mae:.4f}")

        # Cross-validation is run on the training split only.
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=5, scoring='r2')
        print(f"\nCross-validation R² scores: {cv_scores}")
        print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        # Only some estimators (e.g. the random forest) expose importances.
        if hasattr(self.model, 'feature_importances_'):
            self.feature_importance = self.model.feature_importances_
            feature_names = self.preprocessor.get_feature_names()
            print("\nFeature Importance:")
            for name, importance in zip(feature_names, self.feature_importance):
                print(f"{name}: {importance:.4f}")
        print(separator)

    def save_model(self, models_dir='models'):
        """Save the trained model to ``<models_dir>/book_popularity_model.pkl``.

        Args:
            models_dir: target directory; created if it does not exist.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(models_dir, exist_ok=True)
        model_path = f'{models_dir}/book_popularity_model.pkl'
        joblib.dump(self.model, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, models_dir='models'):
        """Load a previously saved model and its preprocessor.

        Args:
            models_dir: directory holding the saved artifacts.

        Returns:
            True when both loads succeed, False when a file is missing.
        """
        try:
            model_path = f'{models_dir}/book_popularity_model.pkl'
            # NOTE: joblib.load unpickles the file; only load trusted artifacts.
            self.model = joblib.load(model_path)
            self.preprocessor.load_preprocessor(models_dir)
            print("Model and preprocessor loaded successfully!")
            return True
        except FileNotFoundError as e:
            print(f"Model not found: {e}")
            return False

    def predict_single_book(self, author, ratings_count, reviews_count=None):
        """Predict popularity for a single book.

        Args:
            author: author name, looked up in the preprocessor's encoder.
            ratings_count: number of ratings; must be non-negative.
            reviews_count: number of text reviews; when None it is estimated
                as ``ratings_count / 8.33``.

        Returns:
            The prediction rounded to two decimals, or None when no model
            has been trained or loaded.

        Raises:
            ValueError: if either count is negative.
        """
        if self.model is None:
            print("Model not trained or loaded!")
            return None

        if reviews_count is None:
            reviews_count = ratings_count / 8.33  # Average ratio from dataset

        # Negative counts would otherwise divide by zero (reviews_count == -1)
        # or feed -inf/nan from log1p into the scaler.
        if ratings_count < 0 or reviews_count < 0:
            raise ValueError("ratings_count and reviews_count must be non-negative")

        rating_to_review_ratio = ratings_count / (reviews_count + 1)
        log_ratings_count = np.log1p(ratings_count)
        log_reviews_count = np.log1p(reviews_count)

        # The real per-author book count is not stored from training,
        # so fixed placeholder values are used instead.
        if author in self.preprocessor.top_authors:
            author_book_count = 5
        else:
            author_book_count = 1

        encoder = self.preprocessor.author_encoder
        if author in encoder.classes_:
            author_encoded = encoder.transform([author])[0]
        else:
            # Fallback for unseen authors: code 0 is simply the first encoder
            # class (presumably the alphabetically first one if this is a
            # LabelEncoder — TODO confirm), not necessarily the most common.
            author_encoded = 0

        # Feature order is assumed to match the order the preprocessor used
        # when fitting the scaler — TODO confirm against get_feature_names().
        features = np.array([[
            ratings_count, reviews_count, rating_to_review_ratio,
            log_ratings_count, log_reviews_count, author_book_count,
            author_encoded,
        ]])

        features_scaled = self.preprocessor.scaler.transform(features)
        prediction = self.model.predict(features_scaled)[0]
        return round(prediction, 2)

    def get_model_info(self):
        """Return a summary dict of the current model, or None if there is none.

        Returns:
            A dict with ``model_type``, ``top_authors`` and ``feature_names``,
            plus ``feature_importance`` (name -> importance) when the
            estimator exposes ``feature_importances_``.
        """
        if self.model is None:
            return None

        feature_names = self.preprocessor.get_feature_names()
        info = {
            'model_type': type(self.model).__name__,
            'top_authors': self.preprocessor.top_authors,
            'feature_names': feature_names,
        }
        if hasattr(self.model, 'feature_importances_'):
            info['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_)
            )
        return info
def main():
    """Train and evaluate the book popularity prediction model.

    Trains a random-forest model, then prints one sample prediction and a
    short summary of the fitted model. Prints a failure message and returns
    early when training does not succeed.
    """
    print("Starting book popularity model training...")

    model_trainer = BookPopularityModel()
    success = model_trainer.train_model(model_type='random_forest')
    if not success:
        print("Model training failed!")
        return

    # Real newline escapes: the doubled backslash printed "\n" literally.
    print("\nModel training completed successfully!")

    print("\nTesting single prediction...")
    # NOTE(review): original comment says this is the most frequent author
    # in the dataset — not verifiable from this file.
    test_author = "Stephen King"
    test_ratings = 100000
    test_reviews = 12000
    prediction = model_trainer.predict_single_book(
        test_author, test_ratings, test_reviews
    )
    print(f"Prediction for '{test_author}' book with {test_ratings:,} ratings and {test_reviews:,} reviews:")
    print(f"Predicted average rating: {prediction}")

    model_info = model_trainer.get_model_info()
    print(f"\nModel type: {model_info['model_type']}")
    print(f"Number of top authors: {len(model_info['top_authors'])}")

    # Importances are only present for estimators that expose them.
    if 'feature_importance' in model_info:
        print("\nTop 3 most important features:")
        sorted_features = sorted(
            model_info['feature_importance'].items(),
            key=lambda x: x[1],
            reverse=True,
        )
        for feature, importance in sorted_features[:3]:
            print(f" {feature}: {importance:.4f}")


if __name__ == "__main__":
    main()