File size: 13,232 Bytes
c296592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import TruncatedSVD

class FeatureEngineering:
    """Derive engineered movie features from a merged metadata/ratings frame.

    ``run_all`` executes the full pipeline: column ordering, outlier
    trimming, numeric / categorical / text feature creation, TF-IDF of the
    overview text and a per-movie TruncatedSVD compression.

    NOTE(review): the list-like columns (genres, keywords, cast, crew,
    production companies/countries) are treated as comma-separated strings
    throughout -- confirm against the upstream loader.
    """

    # Columns that identify rows or carry the target; never used as features.
    _NON_FEATURE_COLUMNS = ('rating', 'movieId', 'userId', 'timestamp', 'release_year')

    def __init__(self, dfs, interim_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"):
        """Store the input frames and make sure the interim directory exists.

        Args:
            dfs: Mapping that provides the ``"merged_df"`` and ``"ratings_df"``
                DataFrames.
            interim_path: Directory created (if missing) at construction time.
                The default is a machine-specific absolute path; pass an
                explicit value when running elsewhere.
        """
        self.merged_df = dfs["merged_df"]
        self.ratings_df = dfs["ratings_df"]
        self.interim_path = interim_path
        os.makedirs(self.interim_path, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _split_tokens(self, col):
        """Return a Series holding, per row, the stripped non-empty tokens of ``col``."""
        return self.merged_df[col].fillna('').astype(str).apply(
            lambda text: [part.strip() for part in text.split(',') if part.strip()]
        )

    @staticmethod
    def _token_counts(tokens):
        """Count token occurrences over all rows, most frequent first."""
        flat = [token for items in tokens for token in items]
        # Explicit dtype keeps the empty case free of dtype-inference warnings.
        return pd.Series(flat, dtype=object).value_counts()

    @staticmethod
    def _contains_token(tokens, value):
        """Return a boolean Series marking rows whose token list holds ``value`` exactly."""
        return tokens.apply(lambda items: value in items).astype(bool)

    def _numeric(self, col):
        """Return ``col`` coerced to numbers; unparseable entries become NaN."""
        return pd.to_numeric(self.merged_df[col], errors='coerce')

    @staticmethod
    def _fill_numeric_gaps(df, columns):
        """Fill NaNs in ``columns`` of ``df`` in place: column median, or 0 if no median exists."""
        for col in columns:
            series = df[col]
            if series.isnull().any():
                median_val = series.median()
                # An all-NaN column has a NaN median; fall back to 0 so that
                # downstream estimators never receive missing values.
                df[col] = series.fillna(0 if pd.isna(median_val) else median_val)

    def _add_top_multi_hot(self, col, prefix, top_n):
        """Append 0/1 columns ``{prefix}_{value}`` for the ``top_n`` most frequent tokens of ``col``."""
        tokens = self._split_tokens(col)
        top_values = set(self._token_counts(tokens).head(top_n).index)
        filtered = tokens.apply(lambda items: [item for item in items if item in top_values])
        mlb = MultiLabelBinarizer()
        dummies = pd.DataFrame(
            mlb.fit_transform(filtered),
            columns=[f'{prefix}_{label}' for label in mlb.classes_],
            index=self.merged_df.index,
        )
        self.merged_df = pd.concat([self.merged_df, dummies], axis=1)

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------
    def ordering(self):
        """Drop redundant identifier columns and put the rest in a fixed order.

        ``reindex`` keeps only the listed columns and creates any missing one
        filled with NaN.
        """
        # errors='ignore': a column that is already absent must not abort the run.
        self.merged_df = self.merged_df.drop(
            columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'],
            errors='ignore',
        )
        desired_column_order = [
            'movieId',
            'title',
            'release_date',
            'runtime',
            'status',
            'adult',
            'budget',
            'revenue',
            'popularity',
            'vote_average',
            'vote_count',
            'overview',
            'genres',
            'keywords',
            'cast',
            'crew',
            'production_companies',
            'production_countries',
            'original_language',
            'userId',
            'rating',
        ]

        self.merged_df = self.merged_df.reindex(columns=desired_column_order)

    def outliers(self):
        """Remove rows with invalid runtime/budget/revenue and extreme budget/revenue.

        Budget and revenue are coerced to numbers (unparseable -> 0). Rows need
        ``runtime > 0`` and non-negative budget/revenue; afterwards rows above
        the 99.5th percentile of budget, then of revenue, are dropped.
        """
        df = self.merged_df
        df['budget'] = pd.to_numeric(df['budget'], errors='coerce').fillna(0)
        df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce').fillna(0)

        valid = (df['runtime'] > 0) & (df['budget'] >= 0) & (df['revenue'] >= 0)
        df = df[valid]

        # Sequential on purpose: the revenue cut-off is computed after the
        # budget outliers are gone.
        for col in ('budget', 'revenue'):
            upper = df[col].quantile(0.995)
            df = df[df[col] <= upper]

        # Detach from the filtered parent so later column assignments do not
        # raise SettingWithCopyWarning.
        self.merged_df = df.copy()

    def add_budget_to_revenue_ratio(self):
        """Add ``budget_to_revenue_ratio`` (budget / revenue, 0 where revenue <= 0)."""
        self.merged_df['budget'] = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        self.merged_df['revenue'] = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        revenue = self.merged_df['revenue']
        # Mask non-positive revenue first so the division never hits zero,
        # then map those rows to 0.
        ratio = self.merged_df['budget'].div(revenue.where(revenue > 0))
        self.merged_df['budget_to_revenue_ratio'] = ratio.fillna(0)

    def add_top_genre_onehot(self, top_n=5):
        """Add 0/1 columns ``genre_{name}`` for the ``top_n`` most common genres."""
        # Normalise through the shared tokenizer so "A,B" and "A, B" are
        # treated alike, as in every other method of this class.
        normalised = self._split_tokens('genres').apply(lambda items: '|'.join(items))
        genre_dummies = normalised.str.get_dummies(sep='|')
        top_genres = genre_dummies.sum().sort_values(ascending=False).head(top_n).index
        for genre in top_genres:
            self.merged_df[f"genre_{genre}"] = genre_dummies[genre]

    def add_log_features(self):
        """Add ``log_{col}`` = log1p(col) for budget, revenue, popularity and vote_count."""
        for col in ['budget', 'revenue', 'popularity', 'vote_count']:
            # Coerce first: a text-typed column would otherwise make log1p fail.
            self.merged_df[f'log_{col}'] = np.log1p(self._numeric(col))

    def add_interaction_features(self):
        """Add budget x popularity and budget x vote_count products."""
        budget = self._numeric('budget')
        self.merged_df['budget_x_popularity'] = budget * self._numeric('popularity')
        self.merged_df['budget_x_vote_count'] = budget * self._numeric('vote_count')

    def add_count_features(self):
        """Add the number of comma-separated entries in genres, keywords, cast and crew."""
        for target, source in (
            ('num_genres', 'genres'),
            ('num_keywords', 'keywords'),
            ('num_cast', 'cast'),
            ('num_crew', 'crew'),
        ):
            self.merged_df[target] = self._split_tokens(source).apply(len)

    def add_text_length_features(self):
        """Add character lengths of the overview and the title (missing -> 0)."""
        self.merged_df['overview_length'] = self.merged_df['overview'].fillna('').apply(len)
        self.merged_df['title_length'] = self.merged_df['title'].fillna('').apply(len)

    def add_genre_mean_encoding(self, top_n=10):
        """Add ``genre_{name}_mean_vote`` for the first ``top_n`` genres encountered.

        Rows that carry the genre receive the mean ``vote_average`` of all rows
        with that genre; other rows receive NaN. The placeholder genre
        ``'Unknown'`` is skipped.
        """
        tokens = self._split_tokens('genres')
        # dict.fromkeys keeps first-appearance order while de-duplicating.
        ordered_genres = [
            genre
            for genre in dict.fromkeys(g for items in tokens for g in items)
            if genre != 'Unknown'
        ]
        for genre in ordered_genres[:top_n]:
            # Exact token match: no regex escaping issues, no substring hits,
            # and missing genres simply yield False.
            mask = self._contains_token(tokens, genre).to_numpy()
            mean_vote = self.merged_df.loc[mask, 'vote_average'].mean()
            self.merged_df[f'genre_{genre}_mean_vote'] = np.where(mask, mean_vote, np.nan)

    def add_release_date_features(self):
        """Replace ``release_date`` with a numeric ``release_year`` column."""
        release_date = pd.to_datetime(self.merged_df['release_date'], errors='coerce')
        self.merged_df['release_year'] = release_date.dt.year
        self.merged_df = self.merged_df.drop(columns=['release_date'])

    def add_adult_flag(self):
        """Replace ``adult`` with a 1/0 ``is_adult`` column.

        Accepts real booleans as well as the strings 'True'/'False' (any case);
        anything else becomes NaN. Does nothing when ``adult`` is absent.
        """
        if 'adult' not in self.merged_df.columns:
            return
        normalised = self.merged_df['adult'].astype(str).str.strip().str.lower()
        self.merged_df['is_adult'] = normalised.map({'true': 1, 'false': 0})
        self.merged_df = self.merged_df.drop(columns=['adult'])

    def add_multi_hot_keywords(self, top_n=20):
        """Add ``kw_{keyword}`` indicator columns for the ``top_n`` most frequent keywords."""
        self._add_top_multi_hot('keywords', 'kw', top_n)

    def add_cast_crew_features(self, top_n_cast=5, top_n_crew=5):
        """Add ``cast_{name}`` / ``crew_{name}`` indicators for the most frequent people."""
        self._add_top_multi_hot('cast', 'cast', top_n_cast)
        self._add_top_multi_hot('crew', 'crew', top_n_crew)

    def add_company_country_features(self, top_n_company=5, top_n_country=5):
        """Add ``company_{name}`` / ``country_{name}`` indicators for the most frequent values."""
        self._add_top_multi_hot('production_companies', 'company', top_n_company)
        self._add_top_multi_hot('production_countries', 'country', top_n_country)

    def add_target_encoding(self, col, target='vote_average', top_n=10):
        """Add ``{col}_{value}_mean_{target}`` for the ``top_n`` most frequent values of ``col``.

        Rows containing the value receive the mean of ``target`` over all rows
        containing it; other rows receive 0.

        NOTE(review): the means are computed on the same rows they are written
        to, so the target leaks into the feature; fit on a training split if
        that matters for the evaluation.
        """
        tokens = self._split_tokens(col)
        top_values = self._token_counts(tokens).head(top_n).index
        for value in top_values:
            # Exact token membership. The previous pattern r'\b{value}\b' never
            # matched values ending in punctuation (e.g. "Warner Bros.") and
            # produced an all-NaN column.
            mask = self._contains_token(tokens, value)
            mean_val = self.merged_df.loc[mask.to_numpy(), target].mean()
            self.merged_df[f'{col}_{value}_mean_{target}'] = mask.astype(int) * mean_val

    def coding(self):
        """Apply target encoding to genres and production companies."""
        self.add_target_encoding(col='genres')
        self.add_target_encoding(col='production_companies')

    def Tfidf(self):
        """Build ``self.tfidf_overview_df``: TF-IDF of the overview text (max 2100 English terms).

        The sparse matrix is densified, so memory grows with rows x terms.
        """
        tfidf_overview_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english')
        tfidf_overview_matrix = tfidf_overview_vectorizer.fit_transform(self.merged_df['overview'].fillna(''))
        self.tfidf_overview_df = pd.DataFrame(
            tfidf_overview_matrix.toarray(),
            columns=[f'overview_tfidf_{col}' for col in tfidf_overview_vectorizer.get_feature_names_out()],
            index=self.merged_df.index,
        )

    def merging_Tfidf(self):
        """Combine the engineered frame with the TF-IDF columns into ``merged_df_with_tfidf``."""
        self.merged_df_with_tfidf = pd.concat([self.merged_df, self.tfidf_overview_df], axis=1)

    def presvd(self):
        """Impute missing values in the numeric feature columns of ``merged_df_with_tfidf``.

        Each column is filled with its median, or with 0 when it has no valid
        value at all. Identifier/target columns and ``release_year`` are left
        untouched.
        """
        numeric_cols = self.merged_df_with_tfidf.select_dtypes(include=np.number).columns
        feature_cols = [col for col in numeric_cols if col not in self._NON_FEATURE_COLUMNS]
        self._fill_numeric_gaps(self.merged_df_with_tfidf, feature_cols)

    def svd(self, n_components=150):
        """Compress per-movie numeric features with TruncatedSVD into ``unique_movies_reduced``.

        One row per ``movieId`` is kept (first non-null value per column), the
        numeric feature columns are imputed, reduced to ``svd_1..svd_k`` and
        replaced by those components. ``vote_average`` and ``vote_count`` are
        excluded from the reduction and retained as-is.

        Args:
            n_components: Requested number of components. It is capped at the
                number of feature columns, the upper limit TruncatedSVD accepts.
        """
        unique_movies_df = self.merged_df_with_tfidf.groupby('movieId').first().reset_index()
        excluded = set(self._NON_FEATURE_COLUMNS) | {'vote_average', 'vote_count'}
        numeric_cols = unique_movies_df.select_dtypes(include=np.number).columns
        feature_cols = [col for col in numeric_cols if col not in excluded]

        self._fill_numeric_gaps(unique_movies_df, feature_cols)

        n_components = min(n_components, len(feature_cols))
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        reduced = reducer.fit_transform(unique_movies_df[feature_cols])
        # Name the columns after what was actually produced, so tiny inputs
        # cannot cause a shape mismatch.
        svd_df_unique = pd.DataFrame(
            reduced,
            columns=[f'svd_{i + 1}' for i in range(reduced.shape[1])],
            index=unique_movies_df.index,
        )
        self.unique_movies_reduced = pd.concat(
            [unique_movies_df.drop(columns=feature_cols), svd_df_unique], axis=1
        )

    def run_all(self):
        """Run every pipeline step in order and return the resulting frames.

        Returns:
            dict with ``"merged_df"`` (engineered features),
            ``"merged_df_with_tfidf"`` (plus TF-IDF columns, imputed) and
            ``"unique_movies_reduced"`` (one row per movie with SVD components).
        """
        self.ordering()
        self.outliers()
        self.add_budget_to_revenue_ratio()
        self.add_top_genre_onehot()
        self.add_log_features()
        self.add_interaction_features()
        self.add_count_features()
        self.add_text_length_features()
        self.add_genre_mean_encoding()
        self.add_release_date_features()
        self.add_adult_flag()
        self.add_multi_hot_keywords()
        self.add_cast_crew_features()
        self.add_company_country_features()
        self.coding()
        self.Tfidf()
        self.merging_Tfidf()
        self.presvd()
        self.svd()

        return {
            "merged_df": self.merged_df,
            "merged_df_with_tfidf": self.merged_df_with_tfidf,
            "unique_movies_reduced": self.unique_movies_reduced
        }