"""
Data preprocessing module for book popularity prediction.

This module handles data cleaning, feature engineering, and preparation of
the feature matrix for the ML model.
"""

import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


class BookDataPreprocessor:
    """Clean, enrich and scale book data ahead of rating prediction.

    Typical training flow (see ``main``): ``load_data`` -> ``clean_data`` ->
    ``engineer_features`` -> ``prepare_features(fit=True)`` ->
    ``split_data`` -> ``save_preprocessor``.

    Attributes set by the pipeline:
        author_encoder: ``LabelEncoder`` fitted on the top authors.
        scaler: ``StandardScaler`` fitted on the full feature matrix.
        top_authors: most frequent authors seen by ``engineer_features``.
        is_fitted: True once the scaler has been fitted or loaded.
    """

    # Numeric model inputs, in the column order the scaler is fitted on.
    NUMERIC_FEATURES = (
        'ratings_count',
        'reviews_count',
        'rating_to_review_ratio',
        'log_ratings_count',
        'log_reviews_count',
        'author_book_count',
    )
    # Name of the label-encoded author column appended after the numerics.
    AUTHOR_FEATURE = 'author_encoded'
    # Code assigned to authors the encoder was not fitted on (fit=False only).
    UNKNOWN_AUTHOR_CODE = -1
    # How many of the most frequent authors are kept as "top authors".
    TOP_AUTHOR_COUNT = 20

    def __init__(self):
        self.author_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.top_authors = []
        self.is_fitted = False

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load and return the raw dataset.

        Args:
            data_path: Path of the CSV file to read.

        Returns:
            The loaded DataFrame, or ``None`` when the file does not exist
            (a message is printed in that case; callers must check for None).
        """
        try:
            df = pd.read_csv(data_path)
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None
        print(f"Data loaded successfully. Shape: {df.shape}")
        return df

    def clean_data(self, df):
        """Clean the dataset by handling missing values and duplicates.

        Rows without ``average_rating`` (the target) are dropped, missing
        ``reviews_count`` / ``ratings_count`` become 0, and exact duplicate
        rows are removed. The input frame is not modified.

        Args:
            df: Raw DataFrame with at least ``average_rating``,
                ``reviews_count`` and ``ratings_count`` columns.

        Returns:
            A new, cleaned DataFrame.
        """
        # Report missing values before anything is dropped or filled.
        print("Missing values per column:")
        print(df.isnull().sum())

        # Explicit copy: the fills below must write to an independent frame,
        # not to a possible view of the caller's data.
        df = df.dropna(subset=['average_rating']).copy()

        df['reviews_count'] = df['reviews_count'].fillna(0)
        df['ratings_count'] = df['ratings_count'].fillna(0)

        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")

        return df

    def engineer_features(self, df):
        """Create additional features for better prediction.

        Adds ``rating_to_review_ratio`` (only if absent),
        ``log_ratings_count``, ``log_reviews_count`` and
        ``author_book_count``, and records the most frequent authors in
        ``self.top_authors``.

        NOTE: ``self.top_authors`` is overwritten on every call, including
        calls made after ``load_preprocessor``; it always reflects the frame
        passed in most recently.

        Args:
            df: Cleaned DataFrame with ``ratings_count``, ``reviews_count``
                and ``author`` columns.

        Returns:
            A new DataFrame with the extra columns; ``df`` itself is left
            untouched.
        """
        # Work on a copy so new columns never leak into the caller's frame
        # (and so pandas does not warn about writing to a slice).
        df = df.copy()

        if 'rating_to_review_ratio' not in df.columns:
            # +1 in the denominator avoids division by zero.
            df['rating_to_review_ratio'] = (
                df['ratings_count'] / (df['reviews_count'] + 1)
            )

        # Log transform of counts to handle skewness.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])

        # Author popularity: number of books by that author in this frame.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)

        # Top authors (will be used for the web app dropdown).
        self.top_authors = (
            author_counts.head(self.TOP_AUTHOR_COUNT).index.tolist()
        )

        return df

    def prepare_features(self, df, fit=True):
        """Prepare features for the machine learning model.

        With ``fit=True`` the frame is restricted to rows whose author is in
        ``self.top_authors``, and the author encoder and scaler are fitted
        on it. With ``fit=False`` every row is kept, the already-fitted
        encoder and scaler are applied, and authors unknown to the encoder
        are given ``UNKNOWN_AUTHOR_CODE``.

        Args:
            df: Output of ``engineer_features``.
            fit: Fit the encoder/scaler (training) or reuse them (inference).

        Returns:
            Tuple ``(X_scaled_df, y, df_filtered)``: the scaled feature
            DataFrame, the ``average_rating`` Series, and the frame the
            features were built from (with ``author_encoded`` added).
            ``y`` is ``None`` when ``fit=False`` and the frame has no
            ``average_rating`` column (unlabeled inference data).

        Raises:
            ValueError: ``fit=True`` but no row belongs to a top author
                (e.g. ``engineer_features`` was never run).
            KeyError: ``fit=True`` and ``average_rating`` is missing.
        """
        feature_columns = self.get_feature_names()

        if fit:
            # Keep only top authors for better model performance.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            if df_filtered.empty:
                # Fail here with a clear message instead of letting the
                # scaler raise its own ValueError on a zero-row matrix.
                raise ValueError(
                    "No rows belong to the top authors; run "
                    "engineer_features() on the training data first."
                )
            self.author_encoder.fit(df_filtered['author'])
            df_filtered[self.AUTHOR_FEATURE] = self.author_encoder.transform(
                df_filtered['author']
            )
        else:
            df_filtered = df.copy()
            # LabelEncoder codes are positions in classes_, so one dict
            # lookup per row replaces a transform() call per row.
            author_codes = {
                author: code
                for code, author in enumerate(self.author_encoder.classes_)
            }
            df_filtered[self.AUTHOR_FEATURE] = (
                df_filtered['author']
                .map(author_codes)
                .fillna(self.UNKNOWN_AUTHOR_CODE)
                .astype('int64')
            )

        X = df_filtered[feature_columns]

        # The target is mandatory for training but optional at inference.
        if fit or 'average_rating' in df_filtered.columns:
            y = df_filtered['average_rating']
        else:
            y = None

        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)

        # Back to a DataFrame so column names and row index are preserved.
        X_scaled_df = pd.DataFrame(
            X_scaled, columns=feature_columns, index=X.index
        )

        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split data into train and test sets.

        Args:
            X: Feature matrix.
            y: Target values aligned with ``X``.
            test_size: Passed through to ``train_test_split``.
            random_state: Seed passed through to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` as returned by
            ``train_test_split``.
        """
        return train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def save_preprocessor(self, models_dir='models'):
        """Save the fitted preprocessor components.

        Writes ``author_encoder.pkl``, ``scaler.pkl`` and ``top_authors.pkl``
        into ``models_dir``, creating the directory if needed.

        Args:
            models_dir: Target directory for the three files.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(models_dir, exist_ok=True)

        joblib.dump(
            self.author_encoder,
            os.path.join(models_dir, 'author_encoder.pkl'),
        )
        joblib.dump(self.scaler, os.path.join(models_dir, 'scaler.pkl'))
        joblib.dump(
            self.top_authors, os.path.join(models_dir, 'top_authors.pkl')
        )

        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously fitted preprocessor components.

        Args:
            models_dir: Directory written by ``save_preprocessor``.

        Returns:
            True when all three files were loaded (``is_fitted`` is then
            set), False when any of them is missing.
        """
        # SECURITY: joblib.load unpickles arbitrary objects. Only point this
        # at directories whose contents you trust.
        try:
            self.author_encoder = joblib.load(
                os.path.join(models_dir, 'author_encoder.pkl')
            )
            self.scaler = joblib.load(os.path.join(models_dir, 'scaler.pkl'))
            self.top_authors = joblib.load(
                os.path.join(models_dir, 'top_authors.pkl')
            )
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False

        self.is_fitted = True
        print("Preprocessor components loaded successfully!")
        return True

    def get_feature_names(self):
        """Return the names of features used in the model.

        Returns:
            A new list: the numeric features followed by ``author_encoded``,
            in the column order used by ``prepare_features``.
        """
        return [*self.NUMERIC_FEATURES, self.AUTHOR_FEATURE]


def main():
    """Test the preprocessing pipeline end to end on the default dataset."""
    preprocessor = BookDataPreprocessor()

    # Load data
    df = preprocessor.load_data()
    if df is None:
        return

    print(f"\nOriginal data shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())

    # Clean data
    df_clean = preprocessor.clean_data(df)
    print(f"\nCleaned data shape: {df_clean.shape}")

    # Engineer features
    df_engineered = preprocessor.engineer_features(df_clean)
    print(f"\nEngineered data shape: {df_engineered.shape}")
    print(f"\nTop 20 authors: {preprocessor.top_authors}")

    # Prepare features
    X, y, df_final = preprocessor.prepare_features(df_engineered, fit=True)
    print(f"\nFinal feature matrix shape: {X.shape}")
    print(f"Target variable shape: {y.shape}")
    print(f"\nFeature columns: {preprocessor.get_feature_names()}")

    # Split data
    X_train, X_test, y_train, y_test = preprocessor.split_data(X, y)
    print(f"\nTrain set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Save preprocessor
    preprocessor.save_preprocessor()

    print("\nPreprocessing pipeline completed successfully!")


if __name__ == "__main__":
    main()