| | """ |
| | Data preprocessing module for book popularity prediction. |
| | This module handles data cleaning, feature engineering, and preparation for ML model. |
| | """ |
| |
|
| | import pandas as pd |
| | import numpy as np |
| | from sklearn.preprocessing import LabelEncoder, StandardScaler |
| | from sklearn.model_selection import train_test_split |
| | import joblib |
| | import os |
| |
|
class BookDataPreprocessor:
    """Clean, feature-engineer, and scale Goodreads book data for an ML model.

    Typical training flow: load_data -> clean_data -> engineer_features ->
    prepare_features(fit=True) -> split_data, then save_preprocessor so the
    fitted encoder/scaler/top-author list can be reloaded at inference time
    with load_preprocessor and applied with fit=False.
    """

    # Numeric columns fed to the scaler; 'author_encoded' is appended in
    # prepare_features once the label encoder has produced it. Kept in one
    # place so prepare_features and get_feature_names cannot drift apart.
    BASE_FEATURES = [
        'ratings_count', 'reviews_count', 'rating_to_review_ratio',
        'log_ratings_count', 'log_reviews_count', 'author_book_count',
    ]

    def __init__(self):
        self.author_encoder = LabelEncoder()  # maps author name -> int id
        self.scaler = StandardScaler()        # z-scores the feature matrix
        self.top_authors = []                 # fitted: 20 most frequent authors
        self.is_fitted = False                # True once scaler/encoder are fit

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load the raw dataset; return a DataFrame, or None if the file is missing."""
        try:
            df = pd.read_csv(data_path)
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None
        print(f"Data loaded successfully. Shape: {df.shape}")
        return df

    def clean_data(self, df):
        """Drop rows missing the target rating, zero-fill count NaNs, dedupe.

        Returns a new DataFrame; the input is not modified.
        """
        print("Missing values per column:")
        print(df.isnull().sum())

        # The target column must be present for supervised training.
        df = df.dropna(subset=['average_rating'])

        # Single dict-based fillna returns a fresh frame and avoids the
        # chained-assignment (SettingWithCopyWarning) pattern of per-column
        # fillna on a frame derived from dropna.
        df = df.fillna({'reviews_count': 0, 'ratings_count': 0})

        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")

        return df

    def engineer_features(self, df, fit=True):
        """Add derived feature columns; return a new DataFrame.

        Args:
            df: cleaned DataFrame with 'ratings_count', 'reviews_count', 'author'.
            fit: when True (training), refresh self.top_authors from this data.
                 Pass False at inference — previously the top-author list was
                 unconditionally overwritten, clobbering the fitted state.
        """
        df = df.copy()  # don't mutate the caller's frame

        if 'rating_to_review_ratio' not in df.columns:
            # +1 guards against division by zero for books with no reviews.
            df['rating_to_review_ratio'] = df['ratings_count'] / (df['reviews_count'] + 1)

        # log1p tames the heavy right skew of raw popularity counts.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])

        # How prolific each author is within this dataset.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)

        if fit:
            # Learn the 20 most frequent authors only during training.
            self.top_authors = author_counts.head(20).index.tolist()

        return df

    def prepare_features(self, df, fit=True):
        """Encode authors, select feature columns, and scale them.

        Args:
            df: DataFrame produced by engineer_features.
            fit: True to restrict to books by self.top_authors and fit the
                 encoder/scaler; False to apply the already-fitted transforms
                 (unseen authors are encoded as -1).

        Returns:
            (X_scaled_df, y, df_filtered) — scaled features, target ratings,
            and the (possibly filtered) rows they came from.

        Raises:
            RuntimeError: if fit=False is requested before the preprocessor
                has been fitted or loaded.
        """
        if not fit and not self.is_fitted:
            raise RuntimeError(
                "Preprocessor is not fitted; call prepare_features(fit=True) "
                "or load_preprocessor() first."
            )

        feature_columns = list(self.BASE_FEATURES)

        if fit:
            # Train only on books by the well-represented authors.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            self.author_encoder.fit(df_filtered['author'])
            df_filtered['author_encoded'] = self.author_encoder.transform(df_filtered['author'])
        else:
            df_filtered = df.copy()
            # Set membership is O(1) per row vs O(n) scans of classes_.
            known_authors = set(self.author_encoder.classes_)
            df_filtered['author_encoded'] = df_filtered['author'].apply(
                lambda a: self.author_encoder.transform([a])[0] if a in known_authors else -1
            )

        feature_columns.append('author_encoded')

        X = df_filtered[feature_columns]
        y = df_filtered['average_rating']

        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)

        # Preserve the original index so rows can be matched back to df_filtered.
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns, index=X.index)

        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split features/target into train and test sets (reproducible by default)."""
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def save_preprocessor(self, models_dir='models'):
        """Persist the fitted encoder, scaler, and top-author list to models_dir."""
        # exist_ok avoids the check-then-create race of the previous version.
        os.makedirs(models_dir, exist_ok=True)

        joblib.dump(self.author_encoder, os.path.join(models_dir, 'author_encoder.pkl'))
        joblib.dump(self.scaler, os.path.join(models_dir, 'scaler.pkl'))
        joblib.dump(self.top_authors, os.path.join(models_dir, 'top_authors.pkl'))

        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously fitted components; return True on success, False if missing."""
        try:
            self.author_encoder = joblib.load(os.path.join(models_dir, 'author_encoder.pkl'))
            self.scaler = joblib.load(os.path.join(models_dir, 'scaler.pkl'))
            self.top_authors = joblib.load(os.path.join(models_dir, 'top_authors.pkl'))
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False
        self.is_fitted = True
        print("Preprocessor components loaded successfully!")
        return True

    def get_feature_names(self):
        """Return the names of the model's feature columns, in order."""
        return list(self.BASE_FEATURES) + ['author_encoded']
| |
|
def main():
    """Run the preprocessing pipeline end to end as a smoke test."""
    prep = BookDataPreprocessor()

    raw = prep.load_data()
    if raw is None:
        return

    print(f"\nOriginal data shape: {raw.shape}")
    print("\nFirst few rows:")
    print(raw.head())

    cleaned = prep.clean_data(raw)
    print(f"\nCleaned data shape: {cleaned.shape}")

    engineered = prep.engineer_features(cleaned)
    print(f"\nEngineered data shape: {engineered.shape}")
    print(f"\nTop 20 authors: {prep.top_authors}")

    features, target, _ = prep.prepare_features(engineered, fit=True)
    print(f"\nFinal feature matrix shape: {features.shape}")
    print(f"Target variable shape: {target.shape}")
    print(f"\nFeature columns: {prep.get_feature_names()}")

    train_X, test_X, train_y, test_y = prep.split_data(features, target)
    print(f"\nTrain set shape: {train_X.shape}")
    print(f"Test set shape: {test_X.shape}")

    prep.save_preprocessor()

    print("\nPreprocessing pipeline completed successfully!")
| |
|
| | if __name__ == "__main__": |
| | main() |