File size: 10,078 Bytes
c296592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import missingno as msno
import ast
warnings.filterwarnings('ignore')

class Preprocessing:
    """Load, clean, and merge the raw movies dataset CSVs.

    Pipeline (see run_all): load raw CSVs -> handle missing values in the
    metadata frame -> clean/normalize all frames -> inner-merge them ->
    write interim (per-table) and processed (merged) CSVs.
    """

    def __init__(self, base_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/"):
        """Derive all file paths from a single project root.

        Parameters
        ----------
        base_path : str
            Project root directory (trailing slash expected). The default
            reproduces the original hard-coded paths, so existing callers
            are unaffected.
        """
        self.main_path = base_path + "data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = base_path + "report/images/"
        self.interim_path = base_path + "data/interim/"
        self.proceed_path = base_path + "data/processed/"

    def load_data(self):
        """Read the five raw CSV files into DataFrame attributes."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Visualize and resolve missing values in the metadata frame.

        Saves a missingno matrix plot, drops high-missing columns, imputes
        numeric columns with their median, fills remaining object columns
        with placeholders, and removes rows with corrupt 'adult' values.
        """
        # Save the missing-value matrix picture for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)

        # 1. Drop columns with high (>50%) missing percentages.
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop)

        # 2. Fill missing 'overview' text with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')

        # 'popularity' contains stray non-numeric strings; coerce them to NaN
        # so they are caught by the median imputation below.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')

        # 3. Median-impute the low-missing numeric columns.
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                self.df[col] = self.df[col].fillna(self.df[col].median())

        # 4. Fill every other still-missing (object) column with 'Unknown';
        #    'adult' is handled separately by row filtering below.
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        cols_to_fill_unknown = [col for col in remaining_missing_cols
                                if col not in numerical_cols_to_impute and col != 'adult']
        for col in cols_to_fill_unknown:
            self.df[col] = self.df[col].fillna('Unknown')

        # 5. 'adult' contains corrupt entries (e.g. stray text) from shifted
        #    rows; keep only well-formed boolean strings.
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Backward-compatible wrapper kept for existing callers; delegates to
        extract_names_from_list with key='name'.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract values for *key* from a stringified list of dicts.

        Returns a comma-separated string of the values, NaN for an empty
        list (so callers can fillna a placeholder), and '' for anything
        unparseable or non-string input.
        """
        if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):
            try:
                data_list = ast.literal_eval(json_list_string)
            except (ValueError, SyntaxError):
                return ''
            if isinstance(data_list, list):
                if not data_list:
                    # Empty list -> NaN, filled with 'Unknown' downstream.
                    return np.nan
                # str() guards against non-string values (e.g. integer ids),
                # which would otherwise make join() raise TypeError.
                return ', '.join(str(item[key]) for item in data_list
                                 if isinstance(item, dict) and key in item)
        return ''

    def clean_data(self):
        """Normalize dtypes, flatten JSON-ish columns, drop duplicates."""
        # Normalize id columns: coerce to numeric, drop unparseable rows,
        # then cast to int so the later merges key on the same dtype.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)

        # Single dropna is sufficient (the original dropped tmdbId twice).
        self.links_df.dropna(subset=['tmdbId'], inplace=True)
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)

        # Flatten the stringified-JSON columns into comma-separated names;
        # empty lists become NaN and are then filled with 'Unknown'.
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].apply(self.extract_names_and_handle_empty).fillna('Unknown')

        # Same treatment for the credits and keywords frames.
        for col in ('cast', 'crew'):
            self.credits_df[col] = (self.credits_df[col]
                                    .apply(self.extract_names_from_list, key='name')
                                    .fillna('Unknown'))
        self.keywords_df['keywords'] = (self.keywords_df['keywords']
                                        .apply(self.extract_names_from_list, key='name')
                                        .fillna('Unknown'))

        # Remove duplicates on each frame's key columns.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-merge metadata, credits, keywords, and links into merged_df."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # Links are keyed by tmdbId, which corresponds to metadata 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Do NOT merge with ratings_df here! Only merge for modeling step.

    def generate_interim_va_proceed_csv(self):
        """Write the cleaned per-table CSVs (interim) and the merged CSV (processed)."""
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Run the full pipeline and return every resulting DataFrame."""
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df
        }