"""Exploratory-data-analysis plots for a collection of movie data frames."""

import logging
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
import pycountry
import seaborn as sns
from wordcloud import STOPWORDS, WordCloud

logger = logging.getLogger(__name__)

# Directory used when the caller does not pass ``img_path`` explicitly.
DEFAULT_IMG_PATH = "D:/Uni/Term 6/Machine Learning/HomeWork/6/report/images/"


class EDA:
    """Render exploratory plots and save them as files under ``img_path``.

    Every public ``plot_*`` method writes one or more files and returns
    ``None``; :meth:`run_all` calls all of them in a fixed order.
    """

    def __init__(self, dfs, img_path=DEFAULT_IMG_PATH):
        """Store the data frames and make sure the output directory exists.

        Args:
            dfs: Mapping with the keys ``"df"``, ``"credits_df"``,
                ``"keywords_df"``, ``"links_df"``, ``"ratings_df"`` and
                ``"merged_df"``. A missing key raises ``KeyError``.
            img_path: Directory the plots are written to. It is created if
                it does not exist. Defaults to ``DEFAULT_IMG_PATH``.
        """
        self.df = dfs["df"]
        self.credits_df = dfs["credits_df"]
        self.keywords_df = dfs["keywords_df"]
        self.links_df = dfs["links_df"]
        self.ratings_df = dfs["ratings_df"]
        self.merged_df = dfs["merged_df"]
        self.img_path = img_path
        os.makedirs(self.img_path, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _save_current_figure(self, filename):
        """Save the active matplotlib figure as *filename* and close it."""
        try:
            plt.savefig(os.path.join(self.img_path, filename), bbox_inches='tight')
        finally:
            # Close even when saving fails so figures do not pile up in memory.
            plt.close()

    def _save_histogram(self, values, *, bins, title, xlabel, filename,
                        ylabel='Frequency', figsize=(10, 6)):
        """Draw a histogram of *values* and save it as *filename*."""
        plt.figure(figsize=figsize)
        sns.histplot(values, bins=bins, kde=False)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        self._save_current_figure(filename)

    def _save_scatter(self, data, *, x, y, title, xlabel, ylabel, filename):
        """Draw a scatter plot of columns *x* and *y* of *data* and save it."""
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=data, x=x, y=y)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        self._save_current_figure(filename)

    @staticmethod
    def _count_delimited(values, exclude=()):
        """Count comma-separated tokens in the string entries of *values*.

        Missing and non-string entries are skipped. Tokens are stripped of
        surrounding whitespace; empty tokens and tokens listed in *exclude*
        are not counted.

        Returns:
            A ``collections.Counter`` mapping token -> number of occurrences.
        """
        skipped = set(exclude)
        counts = Counter()
        for entry in values.dropna():
            if not isinstance(entry, str):
                continue
            for token in entry.split(','):
                token = token.strip()
                if token and token not in skipped:
                    counts[token] += 1
        return counts

    def _save_top_counts_bar(self, counts, *, top_n, title, xlabel, filename,
                             palette='viridis', figsize=(12, 8)):
        """Save a bar chart of the *top_n* most frequent keys of *counts*."""
        if not counts:
            logger.warning("No values to plot for %s; skipping.", filename)
            return
        top = pd.Series(dict(counts.most_common(top_n)), dtype="int64")
        plt.figure(figsize=figsize)
        sns.barplot(x=top.index, y=top.values, palette=palette)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        self._save_current_figure(filename)

    def _save_wordcloud(self, texts, filename):
        """Build a word cloud from the Series *texts* and save it."""
        # Drop missing values first: casting NaN to str would add the
        # literal word "nan" to the corpus.
        corpus = ' '.join(texts.dropna().astype(str))
        if not corpus.strip():
            logger.warning("No text available for %s; skipping.", filename)
            return
        cloud = WordCloud(stopwords=STOPWORDS, background_color='white',
                          height=2000, width=4000).generate(corpus)
        plt.figure(figsize=(16, 8))
        plt.imshow(cloud)
        plt.axis('off')
        plt.tight_layout()
        self._save_current_figure(filename)

    def _save_plotly_figure(self, fig, stem):
        """Save *fig* as ``<stem>.png``, falling back to ``<stem>.html``.

        Best effort: a failure is logged, never raised.
        """
        try:
            # Static image export needs an image engine such as kaleido.
            pio.write_image(fig, os.path.join(self.img_path, f"{stem}.png"))
            return
        except Exception as exc:  # exception type depends on the export engine
            logger.warning("Static export of %s failed (%s); saving HTML instead.",
                           stem, exc)
        try:
            fig.write_html(os.path.join(self.img_path, f"{stem}.html"))
        except Exception:
            logger.exception("Could not save %s as HTML either.", stem)

    @staticmethod
    def _to_iso3(country_name):
        """Return the ISO 3166-1 alpha-3 code for *country_name*, or ``None``."""
        try:
            return pycountry.countries.lookup(country_name).alpha_3
        except (LookupError, AttributeError, TypeError):
            return None

    # ------------------------------------------------------------------
    # Plots
    # ------------------------------------------------------------------

    def plot_rating_distribution(self):
        """Save a histogram of ``merged_df['rating']``."""
        self._save_histogram(
            self.merged_df['rating'], bins=10,
            title='Distribution of Movie Ratings', xlabel='Rating',
            filename="rating_distribution.png",
        )

    def plot_release_year_distribution(self):
        """Save a histogram of the release years parsed from ``release_date``."""
        # Unparseable dates become NaT and are dropped.
        years = pd.to_datetime(self.merged_df['release_date'], errors='coerce').dt.year
        self._save_histogram(
            years.dropna(), bins=50, figsize=(12, 6),
            title='Distribution of Movie Release Years', xlabel='Release Year',
            ylabel='Number of Movies', filename="release_year_distribution.png",
        )

    def plot_budget_vs_revenue(self):
        """Save budget-vs-revenue scatter plots (raw and filtered).

        NOTE: as a side effect, ``merged_df['budget']`` and
        ``merged_df['revenue']`` are converted to numeric **in place**, with
        unparseable or missing values replaced by 0.
        """
        self._save_scatter(
            self.merged_df, x='budget', y='revenue',
            title='Relationship between Movie Budget and Revenue',
            xlabel='Budget', ylabel='Revenue', filename="budget_vs_revenue.png",
        )

        # Coerce to numbers; anything unparseable or missing is treated as 0
        # so that it can be removed by the filter below.
        for column in ('budget', 'revenue'):
            self.merged_df[column] = pd.to_numeric(
                self.merged_df[column], errors='coerce'
            ).fillna(0)

        # Keep rows with at least one positive value, i.e. drop movies whose
        # budget AND revenue are both zero.
        has_money = (self.merged_df['budget'] > 0) | (self.merged_df['revenue'] > 0)
        self._save_scatter(
            self.merged_df[has_money], x='budget', y='revenue',
            title='Relationship between Movie Budget and Revenue (Filtered)',
            xlabel='Budget', ylabel='Revenue',
            filename="budget_vs_revenue_filtered.png",
        )

    def plot_genre_counts(self):
        """Save a bar chart of the 15 most frequent genres in ``df['genres']``."""
        self._save_top_counts_bar(
            self._count_delimited(self.df['genres']), top_n=15,
            title='Top Movie Genres by Frequency', xlabel='Genre',
            filename="top_genres.png",
        )

    def plot_popularity_distribution(self):
        """Save popularity histograms: all values, below 100 and below 10."""
        popularity = self.merged_df['popularity']
        self._save_histogram(
            popularity, bins=50,
            title='Distribution of Movie Popularity', xlabel='Popularity',
            filename="popularity_distribution.png",
        )
        # The thresholded plots show the bulk of the values in more detail.
        for limit in (100, 10):
            self._save_histogram(
                popularity[popularity < limit], bins=50,
                title=f'Distribution of Movie Popularity (Popularity < {limit})',
                xlabel='Popularity',
                filename=f"popularity_distribution_lt{limit}.png",
            )

    def plot_runtime_distribution(self):
        """Save a histogram of the non-missing ``merged_df['runtime']`` values."""
        self._save_histogram(
            self.merged_df['runtime'].dropna(), bins=50,
            title='Distribution of Movie Runtimes', xlabel='Runtime (minutes)',
            filename="runtime_distribution.png",
        )

    def plot_production_company_counts(self):
        """Save a bar chart of the 15 most frequent production companies."""
        top_n_companies = 15
        counts = self._count_delimited(
            self.merged_df['production_companies'], exclude=('Unknown',)
        )
        self._save_top_counts_bar(
            counts, top_n=top_n_companies, figsize=(14, 8),
            title=f'Top {top_n_companies} Production Companies',
            xlabel='Production Company', filename="top_production_companies.png",
        )

    def plot_production_country_counts(self):
        """Save a bar chart of the 15 most frequent production countries."""
        top_n_countries = 15
        counts = self._count_delimited(
            self.merged_df['production_countries'], exclude=('Unknown',)
        )
        self._save_top_counts_bar(
            counts, top_n=top_n_countries, figsize=(14, 8), palette='magma',
            title=f'Top {top_n_countries} Production Countries',
            xlabel='Production Country', filename="top_production_countries.png",
        )

    def plot_language_counts(self):
        """Save a bar chart of the 15 most frequent spoken languages."""
        counts = self._count_delimited(
            self.merged_df['spoken_languages'], exclude=('Unknown',)
        )
        self._save_top_counts_bar(
            counts, top_n=15, title='Top 15 Spoken Languages',
            xlabel='Language', filename="top_languages.png",
        )

    def plot_vote_count_distribution(self):
        """Save a histogram of ``merged_df['vote_count']``."""
        self._save_histogram(
            self.merged_df['vote_count'], bins=50,
            title='Distribution of Movie Vote Counts', xlabel='Vote Count',
            filename="vote_count_distribution.png",
        )

    def plot_vote_average_distribution(self):
        """Save a histogram of ``merged_df['vote_average']``."""
        self._save_histogram(
            self.merged_df['vote_average'], bins=20,
            title='Distribution of Movie Vote Averages', xlabel='Vote Average',
            filename="vote_average_distribution.png",
        )

    def plot_vote_count_vs_average(self):
        """Save a scatter plot of vote count against vote average."""
        self._save_scatter(
            self.merged_df, x='vote_count', y='vote_average',
            title='Relationship between Vote Count and Vote Average',
            xlabel='Vote Count', ylabel='Vote Average',
            filename="vote_count_vs_average.png",
        )

    def plot_wordclouds(self):
        """Save word clouds built from ``df['title']`` and ``df['overview']``."""
        self._save_wordcloud(self.df['title'], "wordcloud_title.png")
        self._save_wordcloud(self.df['overview'], "wordcloud_overview.png")

    def plot_world_production_map(self):
        """Save a choropleth of movies per production country, excluding the US.

        Countries are counted per comma-separated entry of
        ``df['production_countries']`` (the same convention as
        :meth:`plot_production_country_counts`), so a movie with several
        production countries contributes to each of them. Names that cannot
        be resolved to an ISO alpha-3 code are left out of the map.
        """
        counts = self._count_delimited(self.df['production_countries'])
        # The title promises "Apart from US".
        counts.pop("United States of America", None)

        country_counts = pd.DataFrame(
            counts.most_common(), columns=['country', 'num_movies']
        )
        country_counts['iso_alpha'] = country_counts['country'].apply(self._to_iso3)
        country_counts = country_counts.dropna(subset=['iso_alpha'])

        data = [go.Choropleth(
            locations=country_counts['iso_alpha'],
            z=country_counts['num_movies'],
            text=country_counts['country'],
            colorscale=[[0, 'rgb(255,255,255)'], [1, 'rgb(255,0,0)']],
            autocolorscale=False,
            reversescale=False,
            marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
            colorbar=dict(title='Production Countries'),
        )]
        layout = dict(
            title='Production Countries for the MovieLens Movies (Apart from US)',
            geo=dict(
                showframe=False,
                showcoastlines=False,
                projection=dict(type='mercator'),
            ),
        )
        fig = go.Figure(data=data, layout=layout)
        self._save_plotly_figure(fig, "world_production_map")

    def plot_decade_pie(self):
        """Save a pie chart of the number of movies per release decade."""
        import plotly.express as px

        years = pd.to_datetime(self.df['release_date'], errors='coerce').dt.year
        decades = (years // 10) * 10
        # value_counts() drops the missing decades, so the int cast is safe.
        decade_counts = decades.value_counts().sort_index().reset_index()
        decade_counts.columns = ['decade', 'num_movies']
        decade_counts['decade'] = decade_counts['decade'].astype(int).astype(str) + "s"

        fig = px.pie(
            decade_counts,
            names='decade',
            values='num_movies',
            title="Movies Distribution by Decade (Release Date)",
            color_discrete_sequence=px.colors.qualitative.Set3,
        )
        self._save_plotly_figure(fig, "movies_by_decade_pie")

    def run_all(self):
        """Generate every plot, in a fixed order."""
        self.plot_rating_distribution()
        self.plot_release_year_distribution()
        self.plot_budget_vs_revenue()
        self.plot_genre_counts()
        self.plot_popularity_distribution()
        self.plot_runtime_distribution()
        self.plot_production_company_counts()
        self.plot_production_country_counts()
        self.plot_language_counts()
        self.plot_vote_count_distribution()
        self.plot_vote_average_distribution()
        self.plot_vote_count_vs_average()
        self.plot_wordclouds()
        self.plot_world_production_map()
        self.plot_decade_pie()