# goodreads/src/data_preprocessing.py
# Uploaded by fguryel — commit ce92e54 ("Deploy ML project")
"""
Data preprocessing module for book popularity prediction.
This module handles data cleaning, feature engineering, and preparation for ML model.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os
class BookDataPreprocessor:
    """Preprocessing pipeline for the Goodreads book-rating dataset.

    Handles loading, cleaning, feature engineering, author encoding, and
    feature scaling.  The fitted components (label encoder, scaler, and the
    top-author list) can be persisted with joblib and reloaded for inference.
    """

    def __init__(self):
        # Fitted on the top authors only; unseen authors map to -1 at transform time.
        self.author_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        # Most frequent authors in the training data (populated by engineer_features).
        self.top_authors = []
        self.is_fitted = False

    def load_data(self, data_path='data/goodreads_books.csv'):
        """Load and return the raw dataset, or None when the file is missing."""
        try:
            df = pd.read_csv(data_path)
            print(f"Data loaded successfully. Shape: {df.shape}")
            return df
        except FileNotFoundError:
            print(f"Data file not found at {data_path}")
            return None

    def clean_data(self, df):
        """Clean the dataset: drop rows missing the target, fill count NaNs, dedupe.

        Returns a new DataFrame; the caller's frame is not modified.
        """
        print("Missing values per column:")
        print(df.isnull().sum())
        # Rows without the target variable cannot be used for training.
        df = df.dropna(subset=['average_rating'])
        # Missing engagement counts are treated as zero activity.
        df['reviews_count'] = df['reviews_count'].fillna(0)
        df['ratings_count'] = df['ratings_count'].fillna(0)
        initial_rows = len(df)
        df = df.drop_duplicates()
        print(f"Removed {initial_rows - len(df)} duplicate rows")
        return df

    def engineer_features(self, df):
        """Add derived features and record the 20 most frequent authors.

        Returns a new DataFrame; the caller's frame is not modified.
        """
        # Work on a copy so the input DataFrame is never mutated in place.
        df = df.copy()
        if 'rating_to_review_ratio' not in df.columns:
            # +1 guards against division by zero for books with no reviews.
            df['rating_to_review_ratio'] = df['ratings_count'] / (df['reviews_count'] + 1)
        # Log transform to reduce the heavy right skew of the count features.
        df['log_ratings_count'] = np.log1p(df['ratings_count'])
        df['log_reviews_count'] = np.log1p(df['reviews_count'])
        # Author prolificness: number of books by the same author in this dataset.
        author_counts = df['author'].value_counts()
        df['author_book_count'] = df['author'].map(author_counts)
        # Remember the top authors for the web-app dropdown / training filter.
        self.top_authors = author_counts.head(20).index.tolist()
        return df

    def prepare_features(self, df, fit=True):
        """Build the scaled feature matrix and target vector.

        With fit=True, restricts rows to the top authors and fits the
        encoder/scaler; with fit=False, applies the already-fitted components
        (unknown authors are encoded as -1).

        Returns (X_scaled_df, y, df_filtered).
        """
        feature_columns = [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count'
        ]
        if fit:
            # Keep only top authors for better model performance.
            df_filtered = df[df['author'].isin(self.top_authors)].copy()
            self.author_encoder.fit(df_filtered['author'])
            df_filtered['author_encoded'] = self.author_encoder.transform(df_filtered['author'])
        else:
            df_filtered = df.copy()
            # Vectorized lookup instead of a per-row transform() call.
            # LabelEncoder codes are the positions in the sorted classes_ array,
            # so enumerate(classes_) reproduces transform() exactly.
            code_for = {cls: code for code, cls in enumerate(self.author_encoder.classes_)}
            df_filtered['author_encoded'] = (
                df_filtered['author'].map(code_for).fillna(-1).astype(int)
            )
        feature_columns.append('author_encoded')
        X = df_filtered[feature_columns]
        y = df_filtered['average_rating']
        # Scale all features (including the encoded author) with one scaler.
        if fit:
            X_scaled = self.scaler.fit_transform(X)
            self.is_fitted = True
        else:
            X_scaled = self.scaler.transform(X)
        # Convert back to DataFrame, preserving the original row index.
        X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns, index=X.index)
        return X_scaled_df, y, df_filtered

    def split_data(self, X, y, test_size=0.2, random_state=42):
        """Split features/target into train and test sets (fixed seed by default)."""
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    def save_preprocessor(self, models_dir='models'):
        """Persist the fitted encoder, scaler, and top-author list to models_dir."""
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(models_dir, exist_ok=True)
        joblib.dump(self.author_encoder, f'{models_dir}/author_encoder.pkl')
        joblib.dump(self.scaler, f'{models_dir}/scaler.pkl')
        joblib.dump(self.top_authors, f'{models_dir}/top_authors.pkl')
        print("Preprocessor components saved successfully!")

    def load_preprocessor(self, models_dir='models'):
        """Load previously fitted components; returns True on success."""
        try:
            self.author_encoder = joblib.load(f'{models_dir}/author_encoder.pkl')
            self.scaler = joblib.load(f'{models_dir}/scaler.pkl')
            self.top_authors = joblib.load(f'{models_dir}/top_authors.pkl')
            self.is_fitted = True
            print("Preprocessor components loaded successfully!")
            return True
        except FileNotFoundError as e:
            print(f"Preprocessor components not found: {e}")
            return False

    def get_feature_names(self):
        """Return the feature column names in the order the model expects."""
        return [
            'ratings_count', 'reviews_count', 'rating_to_review_ratio',
            'log_ratings_count', 'log_reviews_count', 'author_book_count',
            'author_encoded'
        ]
def main():
    """Exercise the full preprocessing pipeline end-to-end as a smoke test."""
    pre = BookDataPreprocessor()

    # Load the raw CSV; bail out early when the file is absent.
    frame = pre.load_data()
    if frame is None:
        return

    print(f"\nOriginal data shape: {frame.shape}")
    print("\nFirst few rows:")
    print(frame.head())

    # Cleaning pass: drop missing targets, fill counts, dedupe.
    cleaned = pre.clean_data(frame)
    print(f"\nCleaned data shape: {cleaned.shape}")

    # Derived features plus top-author bookkeeping.
    enriched = pre.engineer_features(cleaned)
    print(f"\nEngineered data shape: {enriched.shape}")
    print(f"\nTop 20 authors: {pre.top_authors}")

    # Encode, scale, and assemble the modeling matrix.
    features, target, _ = pre.prepare_features(enriched, fit=True)
    print(f"\nFinal feature matrix shape: {features.shape}")
    print(f"Target variable shape: {target.shape}")
    print(f"\nFeature columns: {pre.get_feature_names()}")

    # Hold out a test split, mirroring what the training script does.
    X_train, X_test, y_train, y_test = pre.split_data(features, target)
    print(f"\nTrain set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")

    # Persist the fitted components for the web app / inference path.
    pre.save_preprocessor()
    print("\nPreprocessing pipeline completed successfully!")


if __name__ == "__main__":
    main()