# Source: Final_ML_Project/src/feature_engineering.py
# Hosting-page residue kept for provenance: "Bardi-ya's picture"
# Upload 51 files
# Commit c296592 (verified)
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import TruncatedSVD
class FeatureEngineering:
    """Derive model features from a merged movie/rating DataFrame.

    The pipeline (see :meth:`run_all`) reorders columns, filters outliers,
    adds numeric / categorical / text features, builds a TF-IDF matrix of the
    ``overview`` column and finally compresses the numeric features of each
    unique ``movieId`` with TruncatedSVD.

    Multi-valued columns (``genres``, ``keywords``, ``cast``, ``crew``,
    ``production_companies``, ``production_countries``) are treated as
    comma-separated strings; every method tokenizes them the same way
    (split on ``,``, strip whitespace, drop empty tokens).
    """

    def __init__(self, dfs, interim_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"):
        """Store the input frames and make sure the interim directory exists.

        Args:
            dfs: Mapping with at least the keys ``"merged_df"`` and
                ``"ratings_df"``.
            interim_path: Directory created on construction if missing.
                NOTE(review): the default is a machine-specific absolute
                path; callers on other machines should pass their own.
        """
        self.merged_df = dfs["merged_df"]
        self.ratings_df = dfs["ratings_df"]
        self.interim_path = interim_path
        os.makedirs(self.interim_path, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _split_tokens(series):
        """Return a Series of token lists parsed from comma-separated text.

        Non-string cells (NaN, None, numbers) yield an empty list.
        """
        def tokenize(value):
            if not isinstance(value, str):
                return []
            return [part.strip() for part in value.split(',') if part.strip()]

        return series.apply(tokenize)

    @staticmethod
    def _top_values(tokens, top_n):
        """Return the ``top_n`` most frequent tokens across all rows."""
        flat = pd.Series([tok for items in tokens for tok in items], dtype=object)
        return flat.value_counts().head(top_n).index.tolist()

    @staticmethod
    def _contains(tokens, value):
        """Return a boolean Series: does each row's token list hold ``value``?"""
        return tokens.apply(lambda items: value in items).astype(bool)

    def _numeric(self, col):
        """Return ``self.merged_df[col]`` coerced to numbers (bad cells -> NaN)."""
        return pd.to_numeric(self.merged_df[col], errors='coerce')

    def _top_multi_hot(self, col, prefix, top_n):
        """Build multi-hot indicator columns for the ``top_n`` tokens of ``col``.

        Returns a DataFrame indexed like ``self.merged_df`` with one
        ``{prefix}_{token}`` column per retained token.
        """
        tokens = self._split_tokens(self.merged_df[col])
        keep = set(self._top_values(tokens, top_n))
        filtered = tokens.apply(lambda items: [tok for tok in items if tok in keep])
        mlb = MultiLabelBinarizer()
        return pd.DataFrame(
            mlb.fit_transform(filtered),
            columns=[f'{prefix}_{label}' for label in mlb.classes_],
            index=self.merged_df.index,
        )

    # ------------------------------------------------------------------
    # Cleaning
    # ------------------------------------------------------------------
    def ordering(self):
        """Drop redundant identifier columns and fix the column order.

        Columns listed in the desired order but absent from the frame are
        created as all-NaN by ``reindex``; columns not listed are discarded.
        """
        # errors='ignore' so a frame that already lacks some of these
        # identifier columns does not abort the pipeline.
        self.merged_df = self.merged_df.drop(
            columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'],
            errors='ignore',
        )
        desired_column_order = [
            'movieId',
            'title',
            'release_date',
            'runtime',
            'status',
            'adult',
            'budget',
            'revenue',
            'popularity',
            'vote_average',
            'vote_count',
            'overview',
            'genres',
            'keywords',
            'cast',
            'crew',
            'production_companies',
            'production_countries',
            'original_language',
            'userId',
            'rating',
        ]
        self.merged_df = self.merged_df.reindex(columns=desired_column_order)

    def outliers(self):
        """Coerce budget/revenue to numbers and remove implausible rows.

        Keeps rows with ``runtime > 0`` and non-negative budget/revenue, then
        trims each of budget and revenue at its 99.5th percentile (revenue's
        percentile is computed after the budget trim).
        """
        df = self.merged_df
        df['budget'] = pd.to_numeric(df['budget'], errors='coerce').fillna(0)
        df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce').fillna(0)
        df = df[(df['runtime'] > 0) & (df['budget'] >= 0) & (df['revenue'] >= 0)]
        for col in ('budget', 'revenue'):
            upper = df[col].quantile(0.995)
            df = df[df[col] <= upper]
        # Detach from the filtered view so later column assignments do not
        # raise SettingWithCopy warnings.
        self.merged_df = df.copy()

    # ------------------------------------------------------------------
    # Numeric features
    # ------------------------------------------------------------------
    def add_budget_to_revenue_ratio(self):
        """Add ``budget_to_revenue_ratio`` (0 where revenue is not positive)."""
        budget = self._numeric('budget').fillna(0)
        revenue = self._numeric('revenue').fillna(0)
        self.merged_df['budget'] = budget
        self.merged_df['revenue'] = revenue
        # Mask non-positive revenue before dividing so no inf/NaN leaks out.
        safe_revenue = revenue.where(revenue > 0)
        self.merged_df['budget_to_revenue_ratio'] = (budget / safe_revenue).fillna(0)

    def add_top_genre_onehot(self, top_n=5):
        """Add a 0/1 ``genre_<name>`` column for each of the ``top_n`` genres."""
        tokens = self._split_tokens(self.merged_df['genres'])
        for genre in self._top_values(tokens, top_n):
            self.merged_df[f"genre_{genre}"] = self._contains(tokens, genre).astype(int)

    def add_log_features(self):
        """Add ``log_<col>`` = log1p(col) for budget, revenue, popularity, vote_count."""
        for col in ['budget', 'revenue', 'popularity', 'vote_count']:
            self.merged_df[f'log_{col}'] = np.log1p(self._numeric(col))

    def add_interaction_features(self):
        """Add products of budget with popularity and with vote_count."""
        budget = self._numeric('budget')
        self.merged_df['budget_x_popularity'] = budget * self._numeric('popularity')
        self.merged_df['budget_x_vote_count'] = budget * self._numeric('vote_count')

    def add_count_features(self):
        """Add the number of tokens in genres, keywords, cast and crew."""
        sources = {
            'num_genres': 'genres',
            'num_keywords': 'keywords',
            'num_cast': 'cast',
            'num_crew': 'crew',
        }
        for new_col, source_col in sources.items():
            self.merged_df[new_col] = self._split_tokens(self.merged_df[source_col]).apply(len)

    def add_text_length_features(self):
        """Add character lengths of ``overview`` and ``title`` (0 when missing)."""
        def text_len(value):
            return len(value) if isinstance(value, str) else 0

        self.merged_df['overview_length'] = self.merged_df['overview'].apply(text_len)
        self.merged_df['title_length'] = self.merged_df['title'].apply(text_len)

    def add_genre_mean_encoding(self):
        """Add ``genre_<name>_mean_vote`` for the first 10 genres encountered.

        Each column holds the mean ``vote_average`` of all rows tagged with
        that genre on those rows, and NaN elsewhere. The literal genre
        ``'Unknown'`` is skipped.
        """
        tokens = self._split_tokens(self.merged_df['genres'])
        # Order of first appearance; empty token lists explode to NaN.
        seen = [g for g in tokens.explode().dropna().unique() if g != 'Unknown']
        for genre in seen[:10]:
            mask = self._contains(tokens, genre).to_numpy()
            mean_vote = self.merged_df.loc[mask, 'vote_average'].mean()
            self.merged_df[f'genre_{genre}_mean_vote'] = np.where(mask, mean_vote, np.nan)

    def add_release_date_features(self):
        """Replace ``release_date`` with a numeric ``release_year`` column."""
        release = pd.to_datetime(self.merged_df['release_date'], errors='coerce')
        self.merged_df['release_year'] = release.dt.year
        self.merged_df = self.merged_df.drop(columns=['release_date'])

    def add_adult_flag(self):
        """Replace ``adult`` with ``is_adult`` (1/0; NaN for unrecognised values)."""
        if 'adult' in self.merged_df.columns:
            # Normalise through str so both booleans and the strings
            # 'True'/'False' are recognised.
            normalised = self.merged_df['adult'].astype(str).str.strip().str.lower()
            self.merged_df['is_adult'] = normalised.map({'true': 1, 'false': 0})
            self.merged_df = self.merged_df.drop(columns=['adult'])

    # ------------------------------------------------------------------
    # Multi-hot categorical features
    # ------------------------------------------------------------------
    def add_multi_hot_keywords(self, top_n=20):
        """Add ``kw_<keyword>`` indicator columns for the ``top_n`` keywords."""
        keyword_dummies = self._top_multi_hot('keywords', 'kw', top_n)
        self.merged_df = pd.concat([self.merged_df, keyword_dummies], axis=1)

    def add_cast_crew_features(self, top_n_cast=5, top_n_crew=5):
        """Add ``cast_<name>`` / ``crew_<name>`` indicators for the most frequent people."""
        cast_dummies = self._top_multi_hot('cast', 'cast', top_n_cast)
        crew_dummies = self._top_multi_hot('crew', 'crew', top_n_crew)
        self.merged_df = pd.concat([self.merged_df, cast_dummies, crew_dummies], axis=1)

    def add_company_country_features(self, top_n_company=5, top_n_country=5):
        """Add ``company_<name>`` / ``country_<name>`` indicators for the most frequent values."""
        company_dummies = self._top_multi_hot('production_companies', 'company', top_n_company)
        country_dummies = self._top_multi_hot('production_countries', 'country', top_n_country)
        self.merged_df = pd.concat([self.merged_df, company_dummies, country_dummies], axis=1)

    def add_target_encoding(self, col, target='vote_average', top_n=10):
        """Add ``<col>_<value>_mean_<target>`` for the ``top_n`` values of ``col``.

        Each new column equals the mean of ``target`` over rows containing the
        value on those rows, and 0 elsewhere.

        Membership is an exact token match. Building a regex from the value
        breaks on names with regex metacharacters or a trailing non-word
        character (e.g. ``"Warner Bros."``) and also matches longer names that
        merely start with the value.
        """
        tokens = self._split_tokens(self.merged_df[col])
        for value in self._top_values(tokens, top_n):
            mask = self._contains(tokens, value)
            mean_val = self.merged_df.loc[mask.to_numpy(), target].mean()
            self.merged_df[f'{col}_{value}_mean_{target}'] = mask.astype(int) * mean_val

    def coding(self):
        """Target-encode ``genres`` and ``production_companies``."""
        self.add_target_encoding(col='genres')
        self.add_target_encoding(col='production_companies')

    # ------------------------------------------------------------------
    # Text features and dimensionality reduction
    # ------------------------------------------------------------------
    def Tfidf(self):
        """Fit TF-IDF on ``overview`` and store it as ``self.tfidf_overview_df``.

        At most 2100 terms are kept and English stop words are removed.
        ``toarray()`` materialises a dense rows x vocabulary matrix.
        """
        tfidf_overview_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english')
        tfidf_overview_matrix = tfidf_overview_vectorizer.fit_transform(self.merged_df['overview'].fillna(''))
        self.tfidf_overview_df = pd.DataFrame(
            tfidf_overview_matrix.toarray(),
            columns=[f'overview_tfidf_{col}' for col in tfidf_overview_vectorizer.get_feature_names_out()],
            index=self.merged_df.index,
        )

    def merging_Tfidf(self):
        """Store ``merged_df`` joined column-wise with the TF-IDF features."""
        self.merged_df_with_tfidf = pd.concat([self.merged_df, self.tfidf_overview_df], axis=1)

    def presvd(self):
        """Impute missing numeric feature values in ``merged_df_with_tfidf``.

        Identifier/target columns are left alone. Each remaining numeric
        column is filled with its median; columns that are entirely NaN (and
        so have no median) fall back to 0.
        """
        excluded = {'rating', 'movieId', 'userId', 'timestamp', 'release_year'}
        numeric_cols = self.merged_df_with_tfidf.select_dtypes(include=np.number).columns
        feature_cols = [col for col in numeric_cols if col not in excluded]
        if not feature_cols:
            return
        block = self.merged_df_with_tfidf[feature_cols]
        self.merged_df_with_tfidf[feature_cols] = block.fillna(block.median()).fillna(0)

    def svd(self, n_components=150):
        """Reduce per-movie numeric features with TruncatedSVD.

        Takes the first non-null value of every column per ``movieId``,
        replaces the numeric feature columns by ``svd_1..svd_k`` components
        and stores the result in ``self.unique_movies_reduced``.

        Args:
            n_components: Requested number of components; clamped to the
                number of available feature columns.

        Raises:
            ValueError: If there are no numeric feature columns to decompose.
        """
        unique_movies_df = self.merged_df_with_tfidf.groupby('movieId').first().reset_index()
        excluded = {'rating', 'movieId', 'userId', 'timestamp', 'release_year', 'vote_average', 'vote_count'}
        numeric_cols = unique_movies_df.select_dtypes(include=np.number).columns
        feature_cols = [col for col in numeric_cols if col not in excluded]
        if not feature_cols:
            raise ValueError("svd() found no numeric feature columns to decompose")

        features = unique_movies_df[feature_cols]
        # Median imputation, then 0 for columns with no median at all.
        features = features.fillna(features.median()).fillna(0)

        # The randomized solver needs n_components <= number of features.
        n_components = min(n_components, len(feature_cols))
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        components = reducer.fit_transform(features)
        svd_df_unique = pd.DataFrame(
            components,
            columns=[f'svd_{i + 1}' for i in range(components.shape[1])],
            index=unique_movies_df.index,
        )
        reduced = unique_movies_df.drop(columns=feature_cols)
        self.unique_movies_reduced = pd.concat([reduced, svd_df_unique], axis=1)

    def run_all(self):
        """Run every step in order and return the resulting frames.

        Returns:
            dict with keys ``"merged_df"``, ``"merged_df_with_tfidf"`` and
            ``"unique_movies_reduced"``.
        """
        self.ordering()
        self.outliers()
        self.add_budget_to_revenue_ratio()
        self.add_top_genre_onehot()
        self.add_log_features()
        self.add_interaction_features()
        self.add_count_features()
        self.add_text_length_features()
        self.add_genre_mean_encoding()
        self.add_release_date_features()
        self.add_adult_flag()
        self.add_multi_hot_keywords()
        self.add_cast_crew_features()
        self.add_company_country_features()
        self.coding()
        self.Tfidf()
        self.merging_Tfidf()
        self.presvd()
        self.svd()
        return {
            "merged_df": self.merged_df,
            "merged_df_with_tfidf": self.merged_df_with_tfidf,
            "unique_movies_reduced": self.unique_movies_reduced
        }