"""
Data preprocessing module for book popularity prediction.
This module handles data cleaning, feature engineering, and preparation for ML model.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os
class BookDataPreprocessor:
    """Clean, feature-engineer, scale, and persist Goodreads book data for ML.

    Typical flow: load_data -> clean_data -> engineer_features ->
    prepare_features(fit=True) -> split_data -> save_preprocessor.
    For inference, load_preprocessor then prepare_features(fit=False).
    """

    def __init__(self):
        self.author_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        # Filled by engineer_features(); drives the web-app author dropdown
        # and the fit-time filtering in prepare_features().
        self.top_authors = []
        self.is_fitted = False

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load and return the raw dataset as a DataFrame, or None if missing.

        Args:
            data_path: CSV file path to read.
        """
        try:
            df = pd.read_csv(data_path)
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None
        print(f"Data loaded successfully. Shape: {df.shape}")
        return df

    def clean_data(self, df):
        """Return a cleaned copy: target NaNs dropped, counts filled, deduped.

        The caller's DataFrame is left untouched.
        """
        # Report missing values before any cleaning.
        print("Missing values per column:")
        print(df.isnull().sum())
        # Rows without the target (average_rating) are unusable for training.
        # .copy() here prevents SettingWithCopyWarning on the fillna
        # assignments below and keeps the caller's frame unmodified.
        df = df.dropna(subset=['average_rating']).copy()
        # Missing engagement counts are treated as zero engagement.
        df['reviews_count'] = df['reviews_count'].fillna(0)
        df['ratings_count'] = df['ratings_count'].fillna(0)
        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")
        return df

    def engineer_features(self, df):
        """Return a copy of df with engineered features added.

        Adds rating_to_review_ratio, log-transformed counts, and
        author_book_count; records the 20 most frequent authors in
        self.top_authors (used by the web-app dropdown).
        """
        df = df.copy()  # do not mutate the caller's frame
        if 'rating_to_review_ratio' not in df.columns:
            # +1 avoids division by zero for books with no reviews.
            df['rating_to_review_ratio'] = df['ratings_count'] / (df['reviews_count'] + 1)
        # log1p tames the heavy right skew of raw engagement counts.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])
        # Author popularity proxy: number of books by that author in the data.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)
        self.top_authors = author_counts.head(20).index.tolist()
        return df

    def prepare_features(self, df, fit=True):
        """Build the scaled feature matrix and target.

        Args:
            df: frame produced by engineer_features (must contain
                'average_rating' and 'author').
            fit: True to fit encoder/scaler on top-author rows only;
                 False to transform with the already-fitted components.

        Returns:
            (X_scaled_df, y, df_filtered): scaled features, target series,
            and the (possibly filtered) frame they were built from.
        """
        feature_columns = [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count'
        ]
        if fit:
            # Restrict training to top authors for better model performance.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            self.author_encoder.fit(df_filtered['author'])
            df_filtered['author_encoded'] = self.author_encoder.transform(df_filtered['author'])
        else:
            df_filtered = df.copy()
            # LabelEncoder.transform assigns each class its index in the
            # sorted classes_ array, so a dict built once reproduces it
            # exactly — without the per-row transform() call overhead.
            # Authors unseen at fit time get the sentinel -1.
            code_for = {cls: code for code, cls in enumerate(self.author_encoder.classes_)}
            df_filtered['author_encoded'] = df_filtered['author'].apply(
                lambda a: code_for.get(a, -1)
            )
        feature_columns.append('author_encoded')
        X = df_filtered[feature_columns]
        y = df_filtered['average_rating']
        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)
        # Back to a DataFrame so downstream code keeps column names/index.
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns, index=X.index)
        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split data into train and test sets (X_train, X_test, y_train, y_test)."""
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def save_preprocessor(self, models_dir='models'):
        """Persist the fitted encoder, scaler, and top-author list to disk."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(models_dir, exist_ok=True)
        joblib.dump(self.author_encoder, f'{models_dir}/author_encoder.pkl')
        joblib.dump(self.scaler, f'{models_dir}/scaler.pkl')
        joblib.dump(self.top_authors, f'{models_dir}/top_authors.pkl')
        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously saved components; return True on success."""
        try:
            self.author_encoder = joblib.load(f'{models_dir}/author_encoder.pkl')
            self.scaler = joblib.load(f'{models_dir}/scaler.pkl')
            self.top_authors = joblib.load(f'{models_dir}/top_authors.pkl')
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False
        self.is_fitted = True
        print("Preprocessor components loaded successfully!")
        return True

    def get_feature_names(self):
        """Return the names of features used in the model"""
        return [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count',
            'author_encoded'
        ]
def main():
    """Test the preprocessing pipeline"""
    prep = BookDataPreprocessor()

    # Load the raw CSV; bail out early if it is not on disk.
    df = prep.load_data()
    if df is None:
        return
    print(f"\nOriginal data shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())

    # Clean, then add engineered columns.
    df_clean = prep.clean_data(df)
    print(f"\nCleaned data shape: {df_clean.shape}")
    df_engineered = prep.engineer_features(df_clean)
    print(f"\nEngineered data shape: {df_engineered.shape}")
    print(f"\nTop 20 authors: {prep.top_authors}")

    # Fit encoder/scaler and build the model-ready matrix.
    X, y, df_final = prep.prepare_features(df_engineered, fit=True)
    print(f"\nFinal feature matrix shape: {X.shape}")
    print(f"Target variable shape: {y.shape}")
    print(f"\nFeature columns: {prep.get_feature_names()}")

    # Hold out a test split, then persist the fitted components.
    X_train, X_test, y_train, y_test = prep.split_data(X, y)
    print(f"\nTrain set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    prep.save_preprocessor()
    print("\nPreprocessing pipeline completed successfully!")


if __name__ == "__main__":
    main()