# goodreads/src/data_preprocessing.py
# Uploaded by fguryel — commit ce92e54 ("Deploy ML project")
"""
Data preprocessing module for book popularity prediction.
This module handles data cleaning, feature engineering, and preparation for ML model.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os
class BookDataPreprocessor:
    """Preprocessing pipeline for the Goodreads book-rating dataset.

    Handles loading, cleaning, feature engineering, author encoding, and
    feature scaling.  The fitted components (label encoder, scaler, and the
    top-author list) can be persisted with joblib and reloaded for inference.
    """

    def __init__(self):
        # Fitted on the top authors only; unseen authors map to -1 at transform time.
        self.author_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        # Most frequent authors in the training data (populated by engineer_features).
        self.top_authors = []
        self.is_fitted = False

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load and return the raw dataset, or None when the file is missing."""
        try:
            df = pd.read_csv(data_path)
            print(f"Data loaded successfully. Shape: {df.shape}")
            return df
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None

    def clean_data(self, df):
        """Clean the dataset: drop rows missing the target, fill count NaNs, dedupe.

        Returns a new DataFrame; the caller's frame is not modified.
        """
        print("Missing values per column:")
        print(df.isnull().sum())
        # Rows without the target variable cannot be used for training.
        df = df.dropna(subset=['average_rating'])
        # Missing engagement counts are treated as zero activity.
        df['reviews_count'] = df['reviews_count'].fillna(0)
        df['ratings_count'] = df['ratings_count'].fillna(0)
        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")
        return df

    def engineer_features(self, df):
        """Add derived features and record the 20 most frequent authors.

        Returns a new DataFrame; the caller's frame is not modified.
        """
        # Work on a copy so the input DataFrame is never mutated in place.
        df = df.copy()
        if 'rating_to_review_ratio' not in df.columns:
            # +1 guards against division by zero for books with no reviews.
            df['rating_to_review_ratio'] = df['ratings_count'] / (df['reviews_count'] + 1)
        # Log transform to reduce the heavy right skew of the count features.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])
        # Author prolificness: number of books by the same author in this dataset.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)
        # Remember the top authors for the web-app dropdown / training filter.
        self.top_authors = author_counts.head(20).index.tolist()
        return df

    def prepare_features(self, df, fit=True):
        """Build the scaled feature matrix and target vector.

        With fit=True, restricts rows to the top authors and fits the
        encoder/scaler; with fit=False, applies the already-fitted components
        (unknown authors are encoded as -1).

        Returns (X_scaled_df, y, df_filtered).
        """
        feature_columns = [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count'
        ]
        if fit:
            # Keep only top authors for better model performance.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            self.author_encoder.fit(df_filtered['author'])
            df_filtered['author_encoded'] = self.author_encoder.transform(df_filtered['author'])
        else:
            df_filtered = df.copy()
            # Vectorized lookup instead of a per-row transform() call.
            # LabelEncoder codes are the positions in the sorted classes_ array,
            # so enumerate(classes_) reproduces transform() exactly.
            code_for = {cls: code for code, cls in enumerate(self.author_encoder.classes_)}
            df_filtered['author_encoded'] = (
                df_filtered['author'].map(code_for).fillna(-1).astype(int)
            )
        feature_columns.append('author_encoded')
        X = df_filtered[feature_columns]
        y = df_filtered['average_rating']
        # Scale all features (including the encoded author) with one scaler.
        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)
        # Convert back to DataFrame, preserving the original row index.
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns, index=X.index)
        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split features/target into train and test sets (fixed seed by default)."""
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def save_preprocessor(self, models_dir='models'):
        """Persist the fitted encoder, scaler, and top-author list to models_dir."""
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(models_dir, exist_ok=True)
        joblib.dump(self.author_encoder, f'{models_dir}/author_encoder.pkl')
        joblib.dump(self.scaler, f'{models_dir}/scaler.pkl')
        joblib.dump(self.top_authors, f'{models_dir}/top_authors.pkl')
        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously fitted components; returns True on success."""
        try:
            self.author_encoder = joblib.load(f'{models_dir}/author_encoder.pkl')
            self.scaler = joblib.load(f'{models_dir}/scaler.pkl')
            self.top_authors = joblib.load(f'{models_dir}/top_authors.pkl')
            self.is_fitted = True
            print("Preprocessor components loaded successfully!")
            return True
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False

    def get_feature_names(self):
        """Return the feature column names in the order the model expects."""
        return [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count',
            'author_encoded'
        ]
def main():
    """Exercise the full preprocessing pipeline end-to-end as a smoke test."""
    pre = BookDataPreprocessor()

    # Load the raw CSV; bail out early when the file is absent.
    frame = pre.load_data()
    if frame is None:
        return

    print(f"\nOriginal data shape: {frame.shape}")
    print("\nFirst few rows:")
    print(frame.head())

    # Cleaning pass: drop missing targets, fill counts, dedupe.
    cleaned = pre.clean_data(frame)
    print(f"\nCleaned data shape: {cleaned.shape}")

    # Derived features plus top-author bookkeeping.
    enriched = pre.engineer_features(cleaned)
    print(f"\nEngineered data shape: {enriched.shape}")
    print(f"\nTop 20 authors: {pre.top_authors}")

    # Encode, scale, and assemble the modeling matrix.
    features, target, _ = pre.prepare_features(enriched, fit=True)
    print(f"\nFinal feature matrix shape: {features.shape}")
    print(f"Target variable shape: {target.shape}")
    print(f"\nFeature columns: {pre.get_feature_names()}")

    # Hold out a test split, mirroring what the training script does.
    X_train, X_test, y_train, y_test = pre.split_data(features, target)
    print(f"\nTrain set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Persist the fitted components for the web app / inference path.
    pre.save_preprocessor()
    print("\nPreprocessing pipeline completed successfully!")


if __name__ == "__main__":
    main()