Spaces:

fguryel
/

goodreads

Sleeping

App Files Files Community

goodreads / src /model_training.py

fguryel

Deploy ML project

ce92e54 5 months ago

raw

history blame contribute delete

8.84 kB

	"""
	Model training module for book popularity prediction.
	This module handles model training, evaluation, and saving.
	"""

	import pandas as pd
	import numpy as np
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
	from sklearn.model_selection import cross_val_score
	import joblib
	import os
	import matplotlib.pyplot as plt
	import seaborn as sns
	from data_preprocessing import BookDataPreprocessor

	class BookPopularityModel:
	    """Train, evaluate, persist and query a book popularity regressor.

	    The class owns a scikit-learn estimator (``self.model``) and a
	    ``BookDataPreprocessor`` (``self.preprocessor``) that supplies the data
	    pipeline, the author encoder and the feature scaler.
	    """

	    # File name of the serialized estimator inside ``models_dir``.
	    MODEL_FILENAME = 'book_popularity_model.pkl'
	    # Ratings-per-review ratio used to estimate a missing ``reviews_count``
	    # (described in the original code as the average ratio from the dataset).
	    DEFAULT_RATINGS_PER_REVIEW = 8.33
	    # Placeholder book counts used at prediction time; the real per-author
	    # counts are not stored from training.
	    TOP_AUTHOR_BOOK_COUNT = 5
	    UNKNOWN_AUTHOR_BOOK_COUNT = 1

	    def __init__(self):
	        self.model = None  # fitted estimator, or None until trained/loaded
	        self.preprocessor = BookDataPreprocessor()
	        self.feature_importance = None  # filled in by evaluate_model()

	    @staticmethod
	    def _build_estimator(model_type):
	        """Return an unfitted estimator for ``model_type`` or raise ValueError."""
	        if model_type == 'random_forest':
	            return RandomForestRegressor(
	                n_estimators=100,
	                max_depth=10,
	                random_state=42,
	                n_jobs=-1,
	            )
	        if model_type == 'linear':
	            return LinearRegression()
	        raise ValueError("Model type must be 'random_forest' or 'linear'")

	    def _model_path(self, models_dir):
	        """Return the path of the serialized estimator inside ``models_dir``."""
	        return os.path.join(models_dir, self.MODEL_FILENAME)

	    def train_model(self, model_type='random_forest'):
	        """Run the full training pipeline and persist the results.

	        Loads the data through the preprocessor, cleans it, engineers and
	        prepares the features, splits them, fits the estimator, prints the
	        evaluation report and saves both the model and the preprocessor.

	        Args:
	            model_type: ``'random_forest'`` or ``'linear'``.

	        Returns:
	            True when training completed, False when the preprocessor's
	            ``load_data()`` returned None.

	        Raises:
	            ValueError: if ``model_type`` is not supported. This is checked
	                before any data is loaded so a typo fails immediately.
	        """
	        # Validate first: no point in loading/preprocessing data for a
	        # model type that cannot be built.
	        estimator = self._build_estimator(model_type)

	        print("Loading and preprocessing data...")
	        df = self.preprocessor.load_data()
	        if df is None:
	            return False

	        df_clean = self.preprocessor.clean_data(df)
	        df_engineered = self.preprocessor.engineer_features(df_clean)
	        # NOTE(review): features are prepared with fit=True on the whole
	        # dataset before the split; if this fits a scaler, test-set
	        # statistics may leak into training - confirm in the preprocessor.
	        X, y, _ = self.preprocessor.prepare_features(df_engineered, fit=True)

	        X_train, X_test, y_train, y_test = self.preprocessor.split_data(X, y)

	        print(f"Training data shape: {X_train.shape}")
	        print(f"Test data shape: {X_test.shape}")

	        # Only replace the current model once the data is ready.
	        self.model = estimator

	        print(f"Training {model_type} model...")
	        self.model.fit(X_train, y_train)

	        self.evaluate_model(X_train, X_test, y_train, y_test)

	        self.save_model()
	        self.preprocessor.save_preprocessor()

	        return True

	    def evaluate_model(self, X_train, X_test, y_train, y_test):
	        """Print an evaluation report for the fitted model.

	        Reports MSE, R² and MAE on the train and test sets, 5-fold
	        cross-validated R² on the training set and, when the estimator
	        exposes ``feature_importances_``, the importance of each feature
	        (also stored on ``self.feature_importance``).

	        Args:
	            X_train, X_test: feature matrices.
	            y_train, y_test: matching targets.

	        Returns:
	            dict with the keys ``train_mse``, ``test_mse``, ``train_r2``,
	            ``test_r2``, ``train_mae``, ``test_mae`` and ``cv_scores``.
	        """
	        y_train_pred = self.model.predict(X_train)
	        y_test_pred = self.model.predict(X_test)

	        metrics = {
	            'train_mse': mean_squared_error(y_train, y_train_pred),
	            'test_mse': mean_squared_error(y_test, y_test_pred),
	            'train_r2': r2_score(y_train, y_train_pred),
	            'test_r2': r2_score(y_test, y_test_pred),
	            'train_mae': mean_absolute_error(y_train, y_train_pred),
	            'test_mae': mean_absolute_error(y_test, y_test_pred),
	        }

	        separator = "=" * 50
	        print("\n" + separator)
	        print("MODEL EVALUATION RESULTS")
	        print(separator)
	        print(f"Train MSE: {metrics['train_mse']:.4f}")
	        print(f"Test MSE: {metrics['test_mse']:.4f}")
	        print(f"Train R²: {metrics['train_r2']:.4f}")
	        print(f"Test R²: {metrics['test_r2']:.4f}")
	        print(f"Train MAE: {metrics['train_mae']:.4f}")
	        print(f"Test MAE: {metrics['test_mae']:.4f}")

	        cv_scores = cross_val_score(
	            self.model, X_train, y_train, cv=5, scoring='r2'
	        )
	        metrics['cv_scores'] = cv_scores
	        print(f"\nCross-validation R² scores: {cv_scores}")
	        print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

	        # Only some estimators (e.g. the random forest) expose importances.
	        if hasattr(self.model, 'feature_importances_'):
	            self.feature_importance = self.model.feature_importances_
	            feature_names = self.preprocessor.get_feature_names()

	            print("\nFeature Importance:")
	            for name, importance in zip(feature_names, self.feature_importance):
	                print(f"{name}: {importance:.4f}")

	        print(separator)
	        return metrics

	    def save_model(self, models_dir='models'):
	        """Serialize ``self.model`` into ``models_dir`` with joblib.

	        Args:
	            models_dir: target directory; created if it does not exist.
	        """
	        # exist_ok avoids the check-then-create race of exists() + makedirs().
	        os.makedirs(models_dir, exist_ok=True)

	        model_path = self._model_path(models_dir)
	        joblib.dump(self.model, model_path)
	        print(f"Model saved to {model_path}")

	    def load_model(self, models_dir='models'):
	        """Load a previously saved model and its preprocessor.

	        Args:
	            models_dir: directory that ``save_model`` wrote to.

	        Returns:
	            True on success, False when a required file is missing. On
	            failure ``self.model`` keeps its previous value.
	        """
	        try:
	            # NOTE: joblib.load unpickles the file, which can execute
	            # arbitrary code - only load model files from a trusted source.
	            loaded_model = joblib.load(self._model_path(models_dir))
	            self.preprocessor.load_preprocessor(models_dir)
	        except FileNotFoundError as e:
	            print(f"Model not found: {e}")
	            return False

	        # Assign only after both loads succeeded so a failed load never
	        # leaves a model paired with a stale preprocessor.
	        self.model = loaded_model
	        print("Model and preprocessor loaded successfully!")
	        return True

	    def predict_single_book(self, author, ratings_count, reviews_count=None):
	        """Predict the target value for a single book.

	        Args:
	            author: author name; unknown authors fall back to defaults.
	            ratings_count: number of ratings, must be >= 0.
	            reviews_count: number of reviews, must be >= 0. When None it is
	                estimated as ``ratings_count / DEFAULT_RATINGS_PER_REVIEW``.

	        Returns:
	            The prediction rounded to two decimals, or None when no model
	            has been trained or loaded.

	        Raises:
	            ValueError: if a count is negative (a negative value would break
	                the ratio and log features below).
	        """
	        if self.model is None:
	            print("Model not trained or loaded!")
	            return None

	        if ratings_count < 0 or (reviews_count is not None and reviews_count < 0):
	            raise ValueError("ratings_count and reviews_count must be non-negative")

	        if reviews_count is None:
	            reviews_count = ratings_count / self.DEFAULT_RATINGS_PER_REVIEW

	        # The +1 keeps the ratio finite for books without reviews.
	        rating_to_review_ratio = ratings_count / (reviews_count + 1)
	        log_ratings_count = np.log1p(ratings_count)
	        log_reviews_count = np.log1p(reviews_count)

	        # Real per-author book counts are not stored from training, so a
	        # fixed placeholder is used for top authors and for everyone else.
	        if author in self.preprocessor.top_authors:
	            author_book_count = self.TOP_AUTHOR_BOOK_COUNT
	        else:
	            author_book_count = self.UNKNOWN_AUTHOR_BOOK_COUNT

	        encoder = self.preprocessor.author_encoder
	        if author in encoder.classes_:
	            author_encoded = encoder.transform([author])[0]
	        else:
	            # NOTE(review): code 0 is just the first encoder class, which is
	            # not necessarily the most frequent author - confirm this is
	            # the intended fallback for unseen authors.
	            author_encoded = 0

	        # NOTE(review): the column order is assumed to match the order the
	        # scaler and model were fitted with - verify against
	        # preprocessor.get_feature_names().
	        features = np.array([[
	            ratings_count, reviews_count, rating_to_review_ratio,
	            log_ratings_count, log_reviews_count, author_book_count,
	            author_encoded,
	        ]])

	        features_scaled = self.preprocessor.scaler.transform(features)
	        prediction = self.model.predict(features_scaled)[0]

	        # float() turns the numpy scalar into a plain Python float.
	        return round(float(prediction), 2)

	    def get_model_info(self):
	        """Describe the current model.

	        Returns:
	            None when no model is available, otherwise a dict with
	            ``model_type``, ``top_authors`` and ``feature_names``, plus
	            ``feature_importance`` (name -> importance) when the estimator
	            exposes ``feature_importances_``.
	        """
	        if self.model is None:
	            return None

	        feature_names = self.preprocessor.get_feature_names()
	        info = {
	            'model_type': type(self.model).__name__,
	            'top_authors': self.preprocessor.top_authors,
	            'feature_names': feature_names,
	        }

	        if hasattr(self.model, 'feature_importances_'):
	            info['feature_importance'] = dict(
	                zip(feature_names, self.model.feature_importances_)
	            )

	        return info

	def main():
	    """Train the random-forest model, then print a demo prediction and summary.

	    Prints "Model training failed!" and returns early when
	    ``train_model`` reports failure.
	    """
	    print("Starting book popularity model training...")

	    model_trainer = BookPopularityModel()
	    success = model_trainer.train_model(model_type='random_forest')

	    if not success:
	        print("Model training failed!")
	        return

	    print("\nModel training completed successfully!")

	    # Smoke-test the freshly trained model with one hand-picked book.
	    print("\nTesting single prediction...")
	    test_author = "Stephen King"  # Most frequent author in dataset
	    test_ratings = 100000
	    test_reviews = 12000

	    prediction = model_trainer.predict_single_book(
	        test_author, test_ratings, test_reviews
	    )

	    print(f"Prediction for '{test_author}' book with {test_ratings:,} ratings and {test_reviews:,} reviews:")
	    print(f"Predicted average rating: {prediction}")

	    model_info = model_trainer.get_model_info()
	    print(f"\nModel type: {model_info['model_type']}")
	    print(f"Number of top authors: {len(model_info['top_authors'])}")

	    # Importances are only present for estimators that expose them.
	    if 'feature_importance' in model_info:
	        print("\nTop 3 most important features:")
	        sorted_features = sorted(
	            model_info['feature_importance'].items(),
	            key=lambda item: item[1],
	            reverse=True,
	        )
	        for feature, importance in sorted_features[:3]:
	            print(f" {feature}: {importance:.4f}")


	# Run the training demo only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()