Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import numpy as np | |
| import warnings | |
| import missingno as msno | |
| import ast | |
| warnings.filterwarnings('ignore') | |
class Preprocessing:
    """Preprocessing pipeline for the TMDB/MovieLens movie dataset.

    Loads the raw CSVs, analyzes and repairs missing values, normalizes the
    JSON-like list columns (genres, cast, keywords, ...), deduplicates,
    merges the tables on movie id, and writes interim/processed CSVs.

    Call :meth:`run_all` to execute the full pipeline.
    """

    def __init__(self, main_path=None, img_path=None, interim_path=None, proceed_path=None):
        """Set up file-system locations for inputs and outputs.

        All paths default to the original hard-coded locations so existing
        callers keep working; pass alternatives to relocate the data.

        Args:
            main_path: Directory containing the raw CSV files.
            img_path: Directory where report images are saved.
            interim_path: Directory for per-table cleaned CSVs.
            proceed_path: Directory for the final merged CSV.
        """
        self.main_path = main_path or "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = img_path or "D:/Uni/Term 6/Machine Learning/HomeWork/6/report/images/"
        self.interim_path = interim_path or "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"
        self.proceed_path = proceed_path or "D:/Uni/Term 6/Machine Learning/HomeWork/6/data/processed/"

    def load_data(self):
        """Read all five raw CSV files into DataFrame attributes."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Visualize, categorize, and repair missing values in the metadata frame.

        Saves a missingno matrix plot, records the chosen strategy per column
        in ``self.handling_strategy``, then applies the repairs: drops
        high-missing columns, fills text/categorical gaps with placeholders,
        median-imputes numeric columns, and removes malformed 'adult' rows.
        """
        # Save the missing-value matrix picture for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Rank columns by missing percentage and bucket them.
        missing_df = self.df.isnull().sum().sort_values(ascending=False)
        missing_df_percent = (missing_df / len(self.df)) * 100
        missing_df_info = pd.DataFrame({'Missing Count': missing_df, 'Missing Percentage (%)': missing_df_percent})
        high_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] > 50].index.tolist()
        moderate_missing_cols = missing_df_info[(missing_df_info['Missing Percentage (%)'] <= 50) & (missing_df_info['Missing Percentage (%)'] > 1)].index.tolist()
        low_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] <= 1].index.tolist()

        # Record the per-column strategy (previously computed and discarded;
        # now kept on the instance so callers/reports can inspect it).
        handling_strategy = {}
        for col in high_missing_cols:
            handling_strategy[col] = "Drop column due to high missing percentage"
        if 'overview' in moderate_missing_cols:
            handling_strategy['overview'] = "Consider dropping or using a placeholder for text data"
        numerical_low_missing = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in low_missing_cols:
            if col in numerical_low_missing:
                handling_strategy[col] = "Impute with mean or median"
            elif col in ['status', 'release_date', 'imdb_id', 'original_language', 'title', 'video', 'spoken_languages', 'production_countries', 'production_companies', 'poster_path']:
                handling_strategy[col] = "Consider imputation (e.g., mode, placeholder) or dropping rows"
        handling_strategy['adult'] = "Investigate and potentially remove incorrect entries (e.g., 'R')"
        self.handling_strategy = handling_strategy

        # 1. Drop columns with high missing percentages.
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop)

        # 2. Fill missing 'overview' text with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')

        # 3. Median-impute the low-missing numeric columns. 'popularity' is
        # coerced first because the raw file contains non-numeric junk rows.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                median_val = self.df[col].median()
                self.df[col] = self.df[col].fillna(median_val)

        # 4. Everything else still missing (except 'adult') gets 'Unknown'.
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        cols_to_fill_unknown = [col for col in remaining_missing_cols if col not in numerical_cols_to_impute and col != 'adult']
        for col in cols_to_fill_unknown:
            self.df[col] = self.df[col].fillna('Unknown')

        # 5. Keep only well-formed 'adult' rows; the raw dump contains shifted
        # rows where this column holds stray text instead of True/False.
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Kept as a thin wrapper for backward compatibility; identical to
        ``extract_names_from_list`` with ``key='name'``.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract values for ``key`` from a stringified list of dicts.

        Returns:
            A comma-separated string of the extracted values,
            ``np.nan`` for an empty list (so it can be caught by fillna), or
            ``''`` when the input is not a parseable list string.
        """
        if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):
            try:
                # literal_eval (not eval) safely parses the Python-literal syntax
                # these columns use (single-quoted dicts, so json.loads fails).
                data_list = ast.literal_eval(json_list_string)
                if isinstance(data_list, list):
                    if not data_list:
                        return np.nan
                    names = [item.get(key, '') for item in data_list if isinstance(item, dict) and key in item]
                    return ', '.join(names)
            except (ValueError, SyntaxError):
                pass
        return ''

    def clean_data(self):
        """Normalize dtypes, flatten JSON-like columns, and drop duplicates."""
        # Coerce ids to numeric; malformed metadata rows become NaN and are dropped.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)

        # Fix: the original dropped tmdbId nulls twice; once is enough.
        self.links_df.dropna(subset=['tmdbId'], inplace=True)
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)

        # Flatten the JSON-like columns of the metadata frame to name lists.
        copy_df = self.df.copy()
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in copy_df.columns:
                copy_df[col] = copy_df[col].apply(self.extract_names_and_handle_empty)
                # NaN here means the original cell was an empty list.
                copy_df[col] = copy_df[col].fillna('Unknown')
        self.df = copy_df.copy()

        # 'cast'/'crew' in credits_df are lists of dicts; extract the names.
        self.credits_df['cast'] = self.credits_df['cast'].apply(self.extract_names_from_list, key='name')
        self.credits_df['crew'] = self.credits_df['crew'].apply(self.extract_names_from_list, key='name')
        self.credits_df['cast'] = self.credits_df['cast'].fillna('Unknown')
        self.credits_df['crew'] = self.credits_df['crew'].fillna('Unknown')

        # Same treatment for the 'keywords' column.
        self.keywords_df['keywords'] = self.keywords_df['keywords'].apply(self.extract_names_from_list, key='name')
        self.keywords_df['keywords'] = self.keywords_df['keywords'].fillna('Unknown')

        # Deduplicate every table on its natural key.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-join metadata, credits, keywords, and links into ``self.merged_df``."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # links_df keys on tmdbId, which corresponds to the metadata 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Ratings are intentionally NOT merged here; that happens at modeling time.

    def generate_interim_va_proceed_csv(self):
        """Write cleaned per-table CSVs (interim) and the merged CSV (processed).

        Note: the method name's 'va'/'proceed' spelling is kept for
        backward compatibility with existing callers.
        """
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Execute the full pipeline and return the resulting DataFrames."""
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df
        }