File size: 7,134 Bytes
ce92e54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
Data preprocessing module for book popularity prediction.
This module handles data cleaning, feature engineering, and preparation for ML model.
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os

class BookDataPreprocessor:
    """Clean, feature-engineer, and prepare Goodreads book data for an ML
    model predicting ``average_rating``.

    Typical training flow:
        load_data -> clean_data -> engineer_features ->
        prepare_features(fit=True) -> split_data -> save_preprocessor
    At inference time call ``load_preprocessor`` then
    ``prepare_features(fit=False)``.
    """

    # Single source of truth for the model's feature order.  Previously this
    # list was duplicated in prepare_features() and get_feature_names(),
    # which risked the two silently drifting apart.
    FEATURE_COLUMNS = [
        'ratings_count', 'reviews_count', 'rating_to_review_ratio',
        'log_ratings_count', 'log_reviews_count', 'author_book_count',
        'author_encoded',
    ]

    def __init__(self):
        self.author_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.top_authors = []    # filled by engineer_features()
        self.is_fitted = False   # True once scaler/encoder have been fitted

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load the raw dataset from CSV.

        Returns the DataFrame, or None when the file does not exist.
        """
        try:
            df = pd.read_csv(data_path)
            print(f"Data loaded successfully. Shape: {df.shape}")
            return df
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None

    def clean_data(self, df):
        """Drop rows missing the target, zero-fill count NaNs, drop duplicates.

        Returns a new DataFrame; the caller's frame is not modified.
        """
        print("Missing values per column:")
        print(df.isnull().sum())

        # Rows without the target variable are unusable for training.
        df = df.dropna(subset=['average_rating'])

        # Missing engagement counts are treated as zero engagement.  Guarded
        # so a dataset lacking one of these columns no longer raises KeyError.
        for col in ('reviews_count', 'ratings_count'):
            if col in df.columns:
                df[col] = df[col].fillna(0)

        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")

        return df

    def engineer_features(self, df):
        """Add derived features and record the 20 most frequent authors.

        Works on a copy so the caller's DataFrame is left untouched (the
        original mutated its argument in place).
        """
        df = df.copy()

        # Ratio of ratings to reviews; +1 avoids division by zero.
        if 'rating_to_review_ratio' not in df.columns:
            df['rating_to_review_ratio'] = df['ratings_count'] / (df['reviews_count'] + 1)

        # log1p tames the heavy right skew of the raw counts.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])

        # Author prolificness: number of books by that author in the dataset.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)

        # Used by the web app's author dropdown.
        self.top_authors = author_counts.head(20).index.tolist()

        return df

    def prepare_features(self, df, fit=True):
        """Encode authors, assemble the feature matrix, and scale it.

        Args:
            df: output of engineer_features().
            fit: True to fit the encoder/scaler (training); False to reuse
                the already-fitted ones (inference).

        Returns:
            (X_scaled_df, y, df_filtered).  y is None when the
            'average_rating' column is absent (e.g. prediction-time data;
            the original raised KeyError in that case).
        """
        feature_columns = list(self.FEATURE_COLUMNS)

        if fit:
            # Restrict training to the top authors for better model performance.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            self.author_encoder.fit(df_filtered['author'])
            df_filtered['author_encoded'] = self.author_encoder.transform(
                df_filtered['author']
            )
        else:
            df_filtered = df.copy()
            # Hoist classes_ into a set: membership tests against the raw
            # ndarray were an O(n) scan per row.  Unknown authors get -1.
            known = set(self.author_encoder.classes_)
            df_filtered['author_encoded'] = df_filtered['author'].apply(
                lambda a: self.author_encoder.transform([a])[0] if a in known else -1
            )

        X = df_filtered[feature_columns]
        # Target may legitimately be absent at inference time.
        y = df_filtered['average_rating'] if 'average_rating' in df_filtered.columns else None

        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)

        # Back to a DataFrame so downstream code keeps column names/index.
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns, index=X.index)

        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split features/target into train and test sets."""
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def save_preprocessor(self, models_dir='models'):
        """Persist the fitted encoder, scaler, and top-author list.

        exist_ok=True replaces the race-prone exists()/makedirs() pair.
        """
        os.makedirs(models_dir, exist_ok=True)

        joblib.dump(self.author_encoder, f'{models_dir}/author_encoder.pkl')
        joblib.dump(self.scaler, f'{models_dir}/scaler.pkl')
        joblib.dump(self.top_authors, f'{models_dir}/top_authors.pkl')

        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously saved preprocessor components.

        Returns True on success, False when any artifact is missing.
        """
        try:
            self.author_encoder = joblib.load(f'{models_dir}/author_encoder.pkl')
            self.scaler = joblib.load(f'{models_dir}/scaler.pkl')
            self.top_authors = joblib.load(f'{models_dir}/top_authors.pkl')
            self.is_fitted = True
            print("Preprocessor components loaded successfully!")
            return True
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False

    def get_feature_names(self):
        """Return a fresh list of the model's feature names, in order."""
        return list(self.FEATURE_COLUMNS)

def main():
    """Run the full preprocessing pipeline end to end as a smoke test."""
    prep = BookDataPreprocessor()

    # Bail out early when the CSV is absent.
    raw = prep.load_data()
    if raw is None:
        return

    print(f"\nOriginal data shape: {raw.shape}")
    print("\nFirst few rows:")
    print(raw.head())

    # Stage 1: cleaning.
    cleaned = prep.clean_data(raw)
    print(f"\nCleaned data shape: {cleaned.shape}")

    # Stage 2: feature engineering (also records the top-author list).
    engineered = prep.engineer_features(cleaned)
    print(f"\nEngineered data shape: {engineered.shape}")
    print(f"\nTop 20 authors: {prep.top_authors}")

    # Stage 3: encoding and scaling.
    X, y, _ = prep.prepare_features(engineered, fit=True)
    print(f"\nFinal feature matrix shape: {X.shape}")
    print(f"Target variable shape: {y.shape}")
    print(f"\nFeature columns: {prep.get_feature_names()}")

    # Stage 4: train/test split.
    X_train, X_test, y_train, y_test = prep.split_data(X, y)
    print(f"\nTrain set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Stage 5: persist fitted components for the web app.
    prep.save_preprocessor()

    print("\nPreprocessing pipeline completed successfully!")


if __name__ == "__main__":
    main()