Spaces:
Sleeping
Sleeping
File size: 10,078 Bytes
c296592 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import missingno as msno
import ast
warnings.filterwarnings('ignore')
class Preprocessing:
    """End-to-end preprocessing pipeline for the TMDB/MovieLens dataset.

    Loads the raw CSV files, visualises and handles missing values, parses
    the JSON-like string columns (genres, cast, crew, keywords, ...),
    removes duplicates, merges the tables on movie id, and writes the
    cleaned interim and processed CSV files.
    """

    def __init__(self):
        # NOTE(review): hard-coded absolute Windows paths — consider taking
        # these as constructor parameters or reading them from a config file.
        self.main_path = "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = "D:/Uni/Term 6/Machine Learning/HomeWork/6/report/images/"
        self.interim_path = "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"
        self.proceed_path = "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/processed/"

    def load_data(self):
        """Read all five raw CSV files into DataFrame attributes."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Visualise missing values in ``self.df`` and handle them in place.

        Strategy (derived from the missing-percentage analysis):
          * >50% missing  -> drop the column entirely;
          * 'overview'    -> fill with a text placeholder;
          * numeric cols  -> impute with the column median;
          * other cols    -> fill with the string 'Unknown';
          * 'adult'       -> keep only rows with a valid boolean flag
            (the raw file contains shifted/corrupted rows with other values).
        """
        # Save the missingno matrix plot for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)

        # 1. Drop columns with high missing percentages (>50%).
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop)

        # 2. Fill missing values in 'overview' with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')

        # 3. 'popularity' is read as object because of malformed rows;
        #    coerce to numeric first so bad values become NaN.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')

        # 4. Median-impute the low-missing numeric columns.
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                self.df[col] = self.df[col].fillna(self.df[col].median())

        # 5. Everything else still missing (except 'adult') gets 'Unknown'.
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        for col in remaining_missing_cols:
            if col not in numerical_cols_to_impute and col != 'adult':
                self.df[col] = self.df[col].fillna('Unknown')

        # 6. Keep only rows whose 'adult' flag is a valid boolean string;
        #    corrupted rows carry stray text here (e.g. fragments of other columns).
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Kept for backward compatibility; it is exactly
        ``extract_names_from_list(json_list_string, key='name')``.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract *key* values from a string representation of a list of dicts.

        Returns:
            A comma-separated string of the extracted values,
            ``np.nan`` for an empty list (so callers can ``fillna`` it later),
            or ``''`` for anything that is not a parseable list string.
        """
        if not (isinstance(json_list_string, str)
                and json_list_string.startswith('[')
                and json_list_string.endswith(']')):
            return ''
        try:
            # literal_eval (not eval): the columns use Python-literal syntax
            # with single quotes, and literal_eval is safe on untrusted text.
            data_list = ast.literal_eval(json_list_string)
        except (ValueError, SyntaxError):
            return ''
        if not isinstance(data_list, list):
            return ''
        if not data_list:
            # Empty list -> NaN so it can be distinguished and filled later.
            return np.nan
        values = [item.get(key, '') for item in data_list
                  if isinstance(item, dict) and key in item]
        # str() guards against non-string values (e.g. integer ids), which
        # would otherwise make str.join raise TypeError.
        return ', '.join(str(v) for v in values)

    def clean_data(self):
        """Fix key dtypes, parse JSON-like columns, and drop duplicates."""
        # Links: tmdbId must be a non-null integer to serve as a merge key.
        self.links_df = self.links_df.dropna(subset=['tmdbId'])
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)

        # Metadata: some ids are malformed strings; coerce, drop, then cast.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)

        # Columns containing stringified lists of dicts in movies metadata.
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].apply(self.extract_names_and_handle_empty)
                # NaN here means the original value was an empty list.
                self.df[col] = self.df[col].fillna('Unknown')

        # credits_df: 'cast' and 'crew' hold lists of dicts -> extract names.
        self.credits_df['cast'] = self.credits_df['cast'].apply(self.extract_names_from_list, key='name')
        self.credits_df['crew'] = self.credits_df['crew'].apply(self.extract_names_from_list, key='name')
        self.credits_df['cast'] = self.credits_df['cast'].fillna('Unknown')
        self.credits_df['crew'] = self.credits_df['crew'].fillna('Unknown')

        # keywords_df: same treatment for the 'keywords' column.
        self.keywords_df['keywords'] = self.keywords_df['keywords'].apply(self.extract_names_from_list, key='name')
        self.keywords_df['keywords'] = self.keywords_df['keywords'].fillna('Unknown')

        # Remove duplicates on each table's natural key.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-join metadata, credits, keywords, and links into ``merged_df``."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # links_df keys on tmdbId, which corresponds to metadata's 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Do NOT merge with ratings_df here! Only merge for modeling step.

    def generate_interim_va_proceed_csv(self):
        """Persist cleaned tables to interim/ and the merged table to processed/."""
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Run the whole pipeline and return every resulting DataFrame."""
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df
        }