# Final_ML_Project / src / preprocessing.py
# (Uploaded by Bardi-ya — "Upload 51 files", commit c296592, verified)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import missingno as msno
import ast
warnings.filterwarnings('ignore')
class Preprocessing:
    """End-to-end preprocessing for the Movies dataset.

    Loads the raw CSV files, analyses and handles missing values,
    flattens the stringified-JSON columns, removes duplicates, merges
    the tables, and writes interim/processed CSV outputs.

    Typical usage: ``Preprocessing().run_all()``.
    """

    def __init__(self, base_dir="D:/Uni/Term 6/Machine Learning/HomeWork/6/"):
        """Build every input/output path from *base_dir*.

        Parameters
        ----------
        base_dir : str
            Project root containing ``data/raw``, ``data/interim``,
            ``data/processed`` and ``report/images``.  Defaults to the
            original hard-coded location so existing callers are
            unaffected.
        """
        # Paths stay plain strings because the rest of the class joins
        # them with '+' concatenation.
        self.main_path = base_dir + "data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = base_dir + "report/images/"
        self.interim_path = base_dir + "data/interim/"
        self.proceed_path = base_dir + "data/processed/"

    def load_data(self):
        """Read every raw CSV into its own DataFrame attribute."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Analyse and handle missing values in ``self.df``.

        Saves a missingno matrix plot, buckets columns by missing
        percentage, then applies the handling: drop high-missing
        columns, placeholder-fill 'overview', median-impute low-missing
        numerics, 'Unknown'-fill the rest, and filter bad 'adult' rows.

        Returns
        -------
        tuple[pd.DataFrame, dict]
            ``(missing_df_info, handling_strategy)`` — the per-column
            missing stats and the chosen per-column strategy.  (This
            dict was previously built but never used; it is now
            returned for reporting.  Callers that ignore the return
            value are unaffected.)
        """
        # Save the missing-value matrix picture for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)
        # Per-column missing counts and percentages, worst first.
        missing_df = self.df.isnull().sum().sort_values(ascending=False)
        missing_df_percent = (missing_df / len(self.df)) * 100
        missing_df_info = pd.DataFrame({'Missing Count': missing_df, 'Missing Percentage (%)': missing_df_percent})
        # Bucket columns by severity of missingness.
        high_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] > 50].index.tolist()
        moderate_missing_cols = missing_df_info[(missing_df_info['Missing Percentage (%)'] <= 50) & (missing_df_info['Missing Percentage (%)'] > 1)].index.tolist()
        low_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] <= 1].index.tolist()
        # Record the chosen strategy per column (returned for reporting).
        handling_strategy = {}
        for col in high_missing_cols:
            handling_strategy[col] = "Drop column due to high missing percentage"
        if 'overview' in moderate_missing_cols:
            handling_strategy['overview'] = "Consider dropping or using a placeholder for text data"
        numerical_low_missing = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in low_missing_cols:
            if col in numerical_low_missing:
                handling_strategy[col] = "Impute with mean or median"
            elif col in ['status', 'release_date', 'imdb_id', 'original_language', 'title', 'video', 'spoken_languages', 'production_countries', 'production_companies', 'poster_path']:
                handling_strategy[col] = "Consider imputation (e.g., mode, placeholder) or dropping rows"
        handling_strategy['adult'] = "Investigate and potentially remove incorrect entries (e.g., 'R')"
        # 1. Drop columns with high missing percentages.
        #    errors='ignore' makes this safe to re-run / safe if a
        #    column is already absent.
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop, errors='ignore')
        # 2. Fill missing values in 'overview' with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')
        # 'popularity' arrives with non-numeric junk; coerce it to NaN
        # so it can be median-imputed below.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')
        # 3. Median-impute the low-missing numeric columns.
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                median_val = self.df[col].median()
                self.df[col] = self.df[col].fillna(median_val)
        # 4. Everything else still missing gets a text placeholder
        #    ('adult' is handled separately below).
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        cols_to_fill_unknown = [col for col in remaining_missing_cols if col not in numerical_cols_to_impute and col != 'adult']
        for col in cols_to_fill_unknown:
            self.df[col] = self.df[col].fillna('Unknown')
        # 5. Keep only rows whose 'adult' flag is a valid boolean
        #    string; the raw file contains shifted rows with junk here
        #    (e.g. 'R').
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]
        return missing_df_info, handling_strategy

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Empty lists become NaN; unparsable input becomes ''.  Kept for
        backward compatibility; previously this duplicated
        :meth:`extract_names_from_list` line for line, so it now simply
        delegates to it.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract *key* values from a string representation of a list of dicts.

        Parameters
        ----------
        json_list_string : str
            Python-literal text such as ``"[{'name': 'Action'}, ...]"``.
        key : str
            Dictionary key to extract from each element.

        Returns
        -------
        str or float
            Comma-separated values, ``np.nan`` for an empty list, or
            ``''`` for anything that does not parse as a list.
        """
        if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):
            try:
                data_list = ast.literal_eval(json_list_string)
                if isinstance(data_list, list):
                    if not data_list:
                        # Empty list -> NaN so callers can fillna later.
                        return np.nan
                    names = [item.get(key, '') for item in data_list if isinstance(item, dict) and key in item]
                    return ', '.join(names)
            except (ValueError, SyntaxError):
                # Not valid Python literal syntax; fall through to ''.
                pass
        return ''

    def clean_data(self):
        """Normalise dtypes, flatten JSON-like columns, drop duplicates.

        Cleans ``df``, ``credits_df``, ``keywords_df``, ``links_df`` and
        ``ratings_df`` in place (attribute-wise).
        """
        # links: tmdbId must exist and be an integer key for merging.
        # (The original dropped these NaNs twice; once is enough.)
        self.links_df = self.links_df.dropna(subset=['tmdbId'])
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)
        # movies: 'id' contains corrupt non-numeric rows; coerce & drop.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)
        # Flatten the stringified-JSON columns into comma-separated
        # names; empty lists became NaN and are filled with 'Unknown'.
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].apply(self.extract_names_and_handle_empty)
                self.df[col] = self.df[col].fillna('Unknown')
        # credits: 'cast'/'crew' hold lists of dicts; keep just names.
        self.credits_df['cast'] = self.credits_df['cast'].apply(self.extract_names_from_list, key='name')
        self.credits_df['crew'] = self.credits_df['crew'].apply(self.extract_names_from_list, key='name')
        self.credits_df['cast'] = self.credits_df['cast'].fillna('Unknown')
        self.credits_df['crew'] = self.credits_df['crew'].fillna('Unknown')
        # keywords: same treatment.
        self.keywords_df['keywords'] = self.keywords_df['keywords'].apply(self.extract_names_from_list, key='name')
        self.keywords_df['keywords'] = self.keywords_df['keywords'].fillna('Unknown')
        # Remove duplicates on each table's key columns.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-join movies with credits, keywords and links into ``merged_df``."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # links_df keys movies by tmdbId, which matches metadata 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Ratings are intentionally NOT merged here — that join happens
        # only at the modelling step.

    def generate_interim_va_proceed_csv(self):
        """Write cleaned tables to interim CSVs and the merged table to processed."""
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Run the full pipeline and return all resulting DataFrames.

        Returns
        -------
        dict[str, pd.DataFrame]
            Keys: 'df', 'credits_df', 'keywords_df', 'links_df',
            'ratings_df', 'merged_df'.
        """
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df,
        }