Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import os | |
| from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler | |
| from sklearn.decomposition import TruncatedSVD | |
class FeatureEngineering:
    """Derive model features from a merged movie-metadata / ratings frame.

    Each step mutates ``self.merged_df`` (see ``run_all`` for the order in
    which the steps are meant to run).  The list-like columns (``genres``,
    ``keywords``, ``cast``, ``crew``, ``production_companies`` and
    ``production_countries``) are handled as comma-separated strings; tokens
    are stripped of surrounding whitespace and empty tokens are ignored.
    """

    # Numeric columns that are identifiers/targets rather than features.
    _NON_FEATURE_COLUMNS = ('rating', 'movieId', 'userId', 'timestamp', 'release_year')

    def __init__(self, dfs, interim_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"):
        """Store the input frames and make sure ``interim_path`` exists.

        Args:
            dfs: Mapping that must contain the keys ``"merged_df"`` and
                ``"ratings_df"``.
            interim_path: Directory created on construction if missing.
                NOTE(review): the default is a machine-specific absolute
                path; callers on other machines should pass their own.
        """
        self.merged_df = dfs["merged_df"]
        self.ratings_df = dfs["ratings_df"]
        self.interim_path = interim_path
        os.makedirs(self.interim_path, exist_ok=True)

    # ------------------------------------------------------------------ #
    # Private helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _split_tokens(series):
        """Return a Series of lists holding the stripped, non-empty tokens
        of each comma-separated cell (missing cells give an empty list)."""
        def split_cell(cell):
            return [tok.strip() for tok in cell.split(',') if tok.strip()]

        return series.fillna('').astype(str).apply(split_cell)

    @staticmethod
    def _delimit(token_lists):
        """Re-join token lists as ``",a,b,"`` so that whole-token membership
        can be tested with a literal substring search for ``",token,"``."""
        return token_lists.apply(lambda toks: ',' + ','.join(toks) + ',')

    @staticmethod
    def _top_tokens(token_lists, top_n):
        """Return the ``top_n`` most frequent tokens (most frequent first)."""
        return token_lists.explode().dropna().value_counts().head(top_n).index

    def _multi_hot(self, col, top_n, prefix):
        """Build a 0/1 indicator frame for the ``top_n`` most frequent tokens
        of ``col``; columns are named ``f"{prefix}_{token}"``."""
        token_lists = self._split_tokens(self.merged_df[col])
        keep = set(self._top_tokens(token_lists, top_n))
        filtered = token_lists.apply(lambda toks: [t for t in toks if t in keep])
        binarizer = MultiLabelBinarizer()
        return pd.DataFrame(
            binarizer.fit_transform(filtered),
            columns=[f'{prefix}_{label}' for label in binarizer.classes_],
            index=self.merged_df.index,
        )

    @staticmethod
    def _fill_numeric_gaps(frame, columns):
        """Fill NaNs in ``columns`` of ``frame`` in place: column median
        first, then 0 for columns that are entirely NaN (median is NaN)."""
        if not columns:
            return
        has_nan = frame[columns].isna().any()
        nan_cols = has_nan[has_nan].index.tolist()
        if nan_cols:
            block = frame[nan_cols]
            frame[nan_cols] = block.fillna(block.median()).fillna(0)

    # ------------------------------------------------------------------ #
    # Pipeline steps
    # ------------------------------------------------------------------ #
    def ordering(self):
        """Drop redundant identifier columns and fix the column order.

        Columns not listed in the desired order are discarded by the
        reindex; listed columns that are absent are created as all-NaN.
        """
        # errors='ignore': a missing identifier column is not a reason to
        # fail, the reindex below defines the resulting schema anyway.
        self.merged_df = self.merged_df.drop(
            columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'],
            errors='ignore',
        )
        desired_column_order = [
            'movieId',
            'title',
            'release_date',
            'runtime',
            'status',
            'adult',
            'budget',
            'revenue',
            'popularity',
            'vote_average',
            'vote_count',
            'overview',
            'genres',
            'keywords',
            'cast',
            'crew',
            'production_companies',
            'production_countries',
            'original_language',
            'userId',
            'rating',
        ]
        self.merged_df = self.merged_df.reindex(columns=desired_column_order)

    def outliers(self, upper_quantile=0.995):
        """Coerce budget/revenue to numbers and drop implausible rows.

        Rows are removed when runtime is not strictly positive (missing or
        unparsable runtime counts as not positive), when budget or revenue is
        negative, or when budget / revenue lies above ``upper_quantile`` of
        the remaining rows (budget is filtered first, then revenue).

        Args:
            upper_quantile: Quantile used as the upper cut-off.
        """
        df = self.merged_df
        for col in ('budget', 'revenue'):
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

        # Runtime is only coerced for the comparison; the column is untouched.
        runtime = pd.to_numeric(df['runtime'], errors='coerce')
        df = df[(runtime > 0) & (df['budget'] >= 0) & (df['revenue'] >= 0)]

        for col in ('budget', 'revenue'):
            upper = df[col].quantile(upper_quantile)
            df = df[df[col] <= upper]

        # Later steps add columns; copy so they do not write into a slice.
        self.merged_df = df.copy()

    def add_budget_to_revenue_ratio(self):
        """Add ``budget_to_revenue_ratio`` = budget / revenue, or 0 where
        revenue is not positive.  Budget and revenue are coerced to numbers
        (unparsable or missing values become 0)."""
        budget = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        revenue = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        self.merged_df['budget'] = budget
        self.merged_df['revenue'] = revenue

        positive = revenue > 0
        # Divide by NaN instead of 0 for non-positive revenue, then overwrite
        # those rows with 0.
        ratio = (budget / revenue.where(positive)).where(positive, 0)
        self.merged_df['budget_to_revenue_ratio'] = ratio

    def add_top_genre_onehot(self, top_n=5):
        """Add a 0/1 ``genre_<name>`` column for the ``top_n`` genres that
        occur in the most rows."""
        # Normalise to "a,b,c" first so that "a, b" and "a,b" split alike,
        # consistent with every other list-column step of this class.
        normalised = self._split_tokens(self.merged_df['genres']).apply(','.join)
        genre_dummies = normalised.str.get_dummies(sep=',')
        top_genres = (
            genre_dummies.sum()
            .sort_values(ascending=False, kind='stable')
            .head(top_n)
            .index
        )
        for genre in top_genres:
            self.merged_df[f"genre_{genre}"] = genre_dummies[genre]

    def add_log_features(self):
        """Add ``log_<col>`` = log(1 + col) for budget, revenue, popularity
        and vote_count (values are coerced to numbers first)."""
        for col in ('budget', 'revenue', 'popularity', 'vote_count'):
            values = pd.to_numeric(self.merged_df[col], errors='coerce')
            self.merged_df[f'log_{col}'] = np.log1p(values)

    def add_interaction_features(self):
        """Add the products budget x popularity and budget x vote_count."""
        budget = pd.to_numeric(self.merged_df['budget'], errors='coerce')
        for other in ('popularity', 'vote_count'):
            factor = pd.to_numeric(self.merged_df[other], errors='coerce')
            self.merged_df[f'budget_x_{other}'] = budget * factor

    def add_count_features(self):
        """Add the number of tokens in the genres, keywords, cast and crew
        columns as ``num_genres``, ``num_keywords``, ``num_cast``, ``num_crew``."""
        for col in ('genres', 'keywords', 'cast', 'crew'):
            self.merged_df[f'num_{col}'] = self._split_tokens(self.merged_df[col]).apply(len)

    def add_text_length_features(self):
        """Add the character length of overview and title (0 when missing)."""
        for col in ('overview', 'title'):
            self.merged_df[f'{col}_length'] = (
                self.merged_df[col].fillna('').astype(str).str.len()
            )

    def add_genre_mean_encoding(self):
        """Add ``genre_<name>_mean_vote`` for the first 10 genres encountered
        (excluding ``'Unknown'``).

        Rows that carry the genre get the mean ``vote_average`` of all rows
        carrying it; all other rows get NaN.
        """
        token_lists = self._split_tokens(self.merged_df['genres'])
        haystack = self._delimit(token_lists)
        genres = [g for g in token_lists.explode().dropna().unique() if g != 'Unknown']

        for genre in genres[:10]:
            # Literal whole-token match: no regex, so genre names need no
            # escaping and rows with missing genres are simply non-members.
            member = haystack.str.contains(f',{genre},', regex=False)
            mean_vote = self.merged_df.loc[member, 'vote_average'].mean()
            self.merged_df[f'genre_{genre}_mean_vote'] = np.where(member, mean_vote, np.nan)

    def add_release_date_features(self):
        """Replace ``release_date`` by ``release_year`` (NaN if unparsable)."""
        parsed = pd.to_datetime(self.merged_df['release_date'], errors='coerce')
        self.merged_df['release_year'] = parsed.dt.year
        self.merged_df = self.merged_df.drop(columns=['release_date'])

    def add_adult_flag(self):
        """Replace ``adult`` by a 1/0 ``is_adult`` column.

        Accepts booleans as well as the strings ``'True'`` / ``'False'``;
        anything else becomes NaN.  Does nothing if ``adult`` is absent.
        """
        if 'adult' not in self.merged_df.columns:
            return
        # Going through str makes real booleans and their string spellings
        # map identically.
        as_text = self.merged_df['adult'].astype(str).str.strip().str.lower()
        self.merged_df['is_adult'] = as_text.map({'true': 1, 'false': 0})
        self.merged_df = self.merged_df.drop(columns=['adult'])

    def add_multi_hot_keywords(self, top_n=20):
        """Add 0/1 ``kw_<keyword>`` columns for the ``top_n`` most frequent
        keywords."""
        keyword_dummies = self._multi_hot('keywords', top_n, 'kw')
        self.merged_df = pd.concat([self.merged_df, keyword_dummies], axis=1)

    def add_cast_crew_features(self, top_n_cast=5, top_n_crew=5):
        """Add 0/1 ``cast_<name>`` / ``crew_<name>`` columns for the most
        frequent cast and crew entries."""
        cast_dummies = self._multi_hot('cast', top_n_cast, 'cast')
        crew_dummies = self._multi_hot('crew', top_n_crew, 'crew')
        self.merged_df = pd.concat([self.merged_df, cast_dummies, crew_dummies], axis=1)

    def add_company_country_features(self, top_n_company=5, top_n_country=5):
        """Add 0/1 ``company_<name>`` / ``country_<name>`` columns for the
        most frequent production companies and countries."""
        company_dummies = self._multi_hot('production_companies', top_n_company, 'company')
        country_dummies = self._multi_hot('production_countries', top_n_country, 'country')
        self.merged_df = pd.concat([self.merged_df, company_dummies, country_dummies], axis=1)

    def add_target_encoding(self, col, target='vote_average', top_n=10):
        """Add ``<col>_<value>_mean_<target>`` for the ``top_n`` most frequent
        values of the comma-separated column ``col``.

        Rows containing the value get the mean of ``target`` over all rows
        containing it; other rows get 0.

        Args:
            col: Name of the comma-separated column to encode.
            target: Numeric column whose mean is used as the encoding.
            top_n: Number of most frequent values to encode.
        """
        token_lists = self._split_tokens(self.merged_df[col])
        haystack = self._delimit(token_lists)

        for value in self._top_tokens(token_lists, top_n):
            # Values are matched literally as whole tokens.  Interpolating
            # them into a regex breaks on names such as "Warner Bros." or
            # "Metro-Goldwyn-Mayer (MGM)".
            member = haystack.str.contains(f',{value},', regex=False)
            mean_val = self.merged_df.loc[member, target].mean()
            self.merged_df[f'{col}_{value}_mean_{target}'] = member.astype(int) * mean_val

    def coding(self):
        """Target-encode genres and production companies."""
        self.add_target_encoding(col='genres')
        self.add_target_encoding(col='production_companies')

    def Tfidf(self, max_features=2100):
        """Fit a TF-IDF model on ``overview`` and store the dense result in
        ``self.tfidf_overview_df`` (columns ``overview_tfidf_<term>``).

        Args:
            max_features: Vocabulary size limit passed to the vectorizer.
        """
        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
        matrix = vectorizer.fit_transform(self.merged_df['overview'].fillna(''))
        # NOTE: densifying costs rows x max_features floats of memory.
        self.tfidf_overview_df = pd.DataFrame(
            matrix.toarray(),
            columns=[f'overview_tfidf_{term}' for term in vectorizer.get_feature_names_out()],
            index=self.merged_df.index,
        )

    def merging_Tfidf(self):
        """Combine the feature frame with the TF-IDF features column-wise
        into ``self.merged_df_with_tfidf``."""
        self.merged_df_with_tfidf = pd.concat([self.merged_df, self.tfidf_overview_df], axis=1)

    def presvd(self):
        """Fill NaNs in the numeric feature columns of
        ``self.merged_df_with_tfidf``: column median first, 0 for columns
        that contain no values at all.  Identifier/target columns
        (rating, movieId, userId, timestamp, release_year) are left as is."""
        frame = self.merged_df_with_tfidf
        feature_cols = [
            col for col in frame.select_dtypes(include=np.number).columns
            if col not in self._NON_FEATURE_COLUMNS
        ]
        self._fill_numeric_gaps(frame, feature_cols)

    def svd(self, n_components=150):
        """Reduce the per-movie numeric features with truncated SVD.

        Takes the first non-null value of every column per ``movieId``,
        imputes missing numeric features (median, then 0), projects them
        onto SVD components and stores the result in
        ``self.unique_movies_reduced``: the non-feature columns plus
        ``svd_1`` ... ``svd_k``.

        Args:
            n_components: Requested number of components.  It is capped at
                the number of available feature columns.

        Raises:
            ValueError: If there is no numeric feature column to decompose.
        """
        unique_movies_df = self.merged_df_with_tfidf.groupby('movieId').first().reset_index()

        excluded = set(self._NON_FEATURE_COLUMNS) | {'vote_average', 'vote_count'}
        feature_cols = [
            col for col in unique_movies_df.select_dtypes(include=np.number).columns
            if col not in excluded
        ]
        if not feature_cols:
            raise ValueError("no numeric feature columns available for SVD")

        self._fill_numeric_gaps(unique_movies_df, feature_cols)

        # TruncatedSVD cannot extract more components than there are features.
        n_used = min(n_components, len(feature_cols))
        reducer = TruncatedSVD(n_components=n_used, random_state=42)
        components = reducer.fit_transform(unique_movies_df[feature_cols])
        svd_df_unique = pd.DataFrame(
            components,
            columns=[f'svd_{i + 1}' for i in range(components.shape[1])],
            index=unique_movies_df.index,
        )

        # The raw features are replaced by their SVD projection.
        reduced = unique_movies_df.drop(columns=feature_cols)
        self.unique_movies_reduced = pd.concat([reduced, svd_df_unique], axis=1)

    def run_all(self):
        """Run every step in order and return the resulting frames.

        Returns:
            dict with the keys ``"merged_df"``, ``"merged_df_with_tfidf"``
            and ``"unique_movies_reduced"``.
        """
        steps = (
            self.ordering,
            self.outliers,
            self.add_budget_to_revenue_ratio,
            self.add_top_genre_onehot,
            self.add_log_features,
            self.add_interaction_features,
            self.add_count_features,
            self.add_text_length_features,
            self.add_genre_mean_encoding,
            self.add_release_date_features,
            self.add_adult_flag,
            self.add_multi_hot_keywords,
            self.add_cast_crew_features,
            self.add_company_country_features,
            self.coding,
            self.Tfidf,
            self.merging_Tfidf,
            self.presvd,
            self.svd,
        )
        for step in steps:
            step()
        return {
            "merged_df": self.merged_df,
            "merged_df_with_tfidf": self.merged_df_with_tfidf,
            "unique_movies_reduced": self.unique_movies_reduced
        }