# Final_ML_Project / src / preprocessing.py
# (Uploaded by Bardi-ya — "Upload 51 files", commit c296592, verified)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import missingno as msno
import ast
warnings.filterwarnings('ignore')
class Preprocessing:
    """End-to-end preprocessing for the Movies dataset.

    Loads the raw CSV files, analyses and handles missing values,
    flattens the stringified-JSON columns, removes duplicates, merges
    the tables, and writes interim/processed CSV outputs.

    Typical usage: ``Preprocessing().run_all()``.
    """

    def __init__(self, base_dir="D:/Uni/Term 6/Machine Learning/HomeWork/6/"):
        """Build every input/output path from *base_dir*.

        Parameters
        ----------
        base_dir : str
            Project root containing ``data/raw``, ``data/interim``,
            ``data/processed`` and ``report/images``.  Defaults to the
            original hard-coded location so existing callers are
            unaffected.
        """
        # Paths stay plain strings because the rest of the class joins
        # them with '+' concatenation.
        self.main_path = base_dir + "data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = base_dir + "report/images/"
        self.interim_path = base_dir + "data/interim/"
        self.proceed_path = base_dir + "data/processed/"

    def load_data(self):
        """Read every raw CSV into its own DataFrame attribute."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Analyse and handle missing values in ``self.df``.

        Saves a missingno matrix plot, buckets columns by missing
        percentage, then applies the handling: drop high-missing
        columns, placeholder-fill 'overview', median-impute low-missing
        numerics, 'Unknown'-fill the rest, and filter bad 'adult' rows.

        Returns
        -------
        tuple[pd.DataFrame, dict]
            ``(missing_df_info, handling_strategy)`` — the per-column
            missing stats and the chosen per-column strategy.  (This
            dict was previously built but never used; it is now
            returned for reporting.  Callers that ignore the return
            value are unaffected.)
        """
        # Save the missing-value matrix picture for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)
        # Per-column missing counts and percentages, worst first.
        missing_df = self.df.isnull().sum().sort_values(ascending=False)
        missing_df_percent = (missing_df / len(self.df)) * 100
        missing_df_info = pd.DataFrame({'Missing Count': missing_df, 'Missing Percentage (%)': missing_df_percent})
        # Bucket columns by severity of missingness.
        high_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] > 50].index.tolist()
        moderate_missing_cols = missing_df_info[(missing_df_info['Missing Percentage (%)'] <= 50) & (missing_df_info['Missing Percentage (%)'] > 1)].index.tolist()
        low_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] <= 1].index.tolist()
        # Record the chosen strategy per column (returned for reporting).
        handling_strategy = {}
        for col in high_missing_cols:
            handling_strategy[col] = "Drop column due to high missing percentage"
        if 'overview' in moderate_missing_cols:
            handling_strategy['overview'] = "Consider dropping or using a placeholder for text data"
        numerical_low_missing = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in low_missing_cols:
            if col in numerical_low_missing:
                handling_strategy[col] = "Impute with mean or median"
            elif col in ['status', 'release_date', 'imdb_id', 'original_language', 'title', 'video', 'spoken_languages', 'production_countries', 'production_companies', 'poster_path']:
                handling_strategy[col] = "Consider imputation (e.g., mode, placeholder) or dropping rows"
        handling_strategy['adult'] = "Investigate and potentially remove incorrect entries (e.g., 'R')"
        # 1. Drop columns with high missing percentages.
        #    errors='ignore' makes this safe to re-run / safe if a
        #    column is already absent.
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop, errors='ignore')
        # 2. Fill missing values in 'overview' with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')
        # 'popularity' arrives with non-numeric junk; coerce it to NaN
        # so it can be median-imputed below.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')
        # 3. Median-impute the low-missing numeric columns.
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                median_val = self.df[col].median()
                self.df[col] = self.df[col].fillna(median_val)
        # 4. Everything else still missing gets a text placeholder
        #    ('adult' is handled separately below).
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        cols_to_fill_unknown = [col for col in remaining_missing_cols if col not in numerical_cols_to_impute and col != 'adult']
        for col in cols_to_fill_unknown:
            self.df[col] = self.df[col].fillna('Unknown')
        # 5. Keep only rows whose 'adult' flag is a valid boolean
        #    string; the raw file contains shifted rows with junk here
        #    (e.g. 'R').
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]
        return missing_df_info, handling_strategy

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Empty lists become NaN; unparsable input becomes ''.  Kept for
        backward compatibility; previously this duplicated
        :meth:`extract_names_from_list` line for line, so it now simply
        delegates to it.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract *key* values from a string representation of a list of dicts.

        Parameters
        ----------
        json_list_string : str
            Python-literal text such as ``"[{'name': 'Action'}, ...]"``.
        key : str
            Dictionary key to extract from each element.

        Returns
        -------
        str or float
            Comma-separated values, ``np.nan`` for an empty list, or
            ``''`` for anything that does not parse as a list.
        """
        if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):
            try:
                data_list = ast.literal_eval(json_list_string)
                if isinstance(data_list, list):
                    if not data_list:
                        # Empty list -> NaN so callers can fillna later.
                        return np.nan
                    names = [item.get(key, '') for item in data_list if isinstance(item, dict) and key in item]
                    return ', '.join(names)
            except (ValueError, SyntaxError):
                # Not valid Python literal syntax; fall through to ''.
                pass
        return ''

    def clean_data(self):
        """Normalise dtypes, flatten JSON-like columns, drop duplicates.

        Cleans ``df``, ``credits_df``, ``keywords_df``, ``links_df`` and
        ``ratings_df`` in place (attribute-wise).
        """
        # links: tmdbId must exist and be an integer key for merging.
        # (The original dropped these NaNs twice; once is enough.)
        self.links_df = self.links_df.dropna(subset=['tmdbId'])
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)
        # movies: 'id' contains corrupt non-numeric rows; coerce & drop.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)
        # Flatten the stringified-JSON columns into comma-separated
        # names; empty lists became NaN and are filled with 'Unknown'.
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].apply(self.extract_names_and_handle_empty)
                self.df[col] = self.df[col].fillna('Unknown')
        # credits: 'cast'/'crew' hold lists of dicts; keep just names.
        self.credits_df['cast'] = self.credits_df['cast'].apply(self.extract_names_from_list, key='name')
        self.credits_df['crew'] = self.credits_df['crew'].apply(self.extract_names_from_list, key='name')
        self.credits_df['cast'] = self.credits_df['cast'].fillna('Unknown')
        self.credits_df['crew'] = self.credits_df['crew'].fillna('Unknown')
        # keywords: same treatment.
        self.keywords_df['keywords'] = self.keywords_df['keywords'].apply(self.extract_names_from_list, key='name')
        self.keywords_df['keywords'] = self.keywords_df['keywords'].fillna('Unknown')
        # Remove duplicates on each table's key columns.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-join movies with credits, keywords and links into ``merged_df``."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # links_df keys movies by tmdbId, which matches metadata 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Ratings are intentionally NOT merged here — that join happens
        # only at the modelling step.

    def generate_interim_va_proceed_csv(self):
        """Write cleaned tables to interim CSVs and the merged table to processed."""
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Run the full pipeline and return all resulting DataFrames.

        Returns
        -------
        dict[str, pd.DataFrame]
            Keys: 'df', 'credits_df', 'keywords_df', 'links_df',
            'ratings_df', 'merged_df'.
        """
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df,
        }