# Spaces: Sleeping
# -*- coding: utf-8 -*-
"""
In-depth EDA of "Newspaper and Broadcasting" reader data (monthly analysis
with more readable figures and trends).

Strengthens the monthly dynamic-trend analysis: every chart is annotated with
exact values, and month-over-month growth rates are shown explicitly so that
trends are easier to identify.
"""
| # 1. λΌμ΄λΈλ¬λ¦¬ μν¬νΈ (κΈ°μ‘΄κ³Ό λμΌ) | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from datetime import datetime | |
| import warnings | |
| import os | |
# Suppress every warning category for the whole process.
# NOTE(review): this also hides library deprecation warnings (for example from
# pandas), so consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# --- Visualization helper functions ---
def _is_finite_number(value):
    """Return False for NaN/+-inf, True for finite numbers and untestable values."""
    try:
        return bool(np.isfinite(value))
    except TypeError:
        # Non-numeric value: let the caller try to label it, exactly as before.
        return True


def add_value_labels(ax, is_bar=True, fmt="{:.0f}"):
    """Write the numeric value next to every bar or every line vertex of *ax*.

    Args:
        ax: Axes-like object. For bars its ``patches`` are read (``get_x``,
            ``get_width``, ``get_height``) and ``annotate`` is called; for
            lines its ``lines`` are read (``get_xdata``, ``get_ydata``) and
            ``text`` is called.
        is_bar: True to label ``ax.patches``, False to label ``ax.lines``.
        fmt: ``str.format`` template applied to each value.

    Values that are NaN or infinite are skipped so the chart is not littered
    with "nan"/"inf" labels (e.g. the first element of a pct_change series).
    """
    if is_bar:
        for patch in ax.patches:
            height = patch.get_height()
            if not _is_finite_number(height):
                continue
            ax.annotate(
                fmt.format(height),
                # Anchor at the horizontal centre of the bar, at its top edge.
                (patch.get_x() + patch.get_width() / 2., height),
                ha='center', va='center',
                xytext=(0, 9),  # nudge the label 9 points above the anchor
                textcoords='offset points',
                fontsize=9,
                color='dimgray',
            )
        return

    for line in ax.lines:
        for x_value, y_value in zip(line.get_xdata(), line.get_ydata()):
            if not _is_finite_number(y_value):
                continue
            ax.text(
                x_value, y_value, fmt.format(y_value),
                ha='center', va='bottom',
                fontsize=9,
                color='dimgray',
            )
# 2. Basic settings and global configuration
def setup_environment():
    """Prepare the output folder and plotting defaults.

    Returns:
        tuple[str, str]: ``(data_dir, output_dir)`` - the folder the CSV
        inputs are read from and the folder results are written to.
    """
    # Build the input path with the platform separator; a hard-coded
    # backslash is only a path separator on Windows.
    data_dir = os.path.join('Broadcast_paper', 'data_csv')
    output_dir = r'./output_analysis_v4'  # results folder

    if not os.path.exists(output_dir):
        # exist_ok avoids a crash if the folder appears between check and create.
        os.makedirs(output_dir, exist_ok=True)
        print(f"'{output_dir}' ν΄λλ₯Ό μμ±νμ΅λλ€.")

    # NOTE(review): the 'Malgun Gothic' font must be available to matplotlib
    # on the host - confirm on non-Windows machines.
    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    # The font/minus settings are passed again here because sns.set is given
    # its own font and rc values.
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("λΆμ νκ²½ μ€μ μλ£!")
    return data_dir, output_dir
# 3. Data loading and preprocessing
def load_and_preprocess_data(data_dir):
    """Read the four CSV inputs and derive the tables used by the analysis.

    Args:
        data_dir: Folder containing ``article_metrics_monthly.csv``,
            ``contents.csv``, ``demographics_merged.csv`` and ``referrer.csv``.

    Returns:
        dict with keys ``"metrics"``, ``"contents"``, ``"demo"``,
        ``"referrer"`` and ``"merged"``. ``"merged"`` is the contents table
        left-joined with per-article totals of views, likes and comments,
        plus an ``engagement_rate`` column in percent.
    """
    print("\n[λ¨κ³ 1] λ°μ΄ν° λ‘λ λ° μ μ²λ¦¬ μμ...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
    df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')

    # Normalise every date column to a monthly Period.
    df_metrics['period'] = pd.to_datetime(df_metrics['period']).dt.to_period('M')
    df_contents['publish_month'] = pd.to_datetime(df_contents['date']).dt.to_period('M')
    df_demo['period'] = pd.to_datetime(df_demo['period']).dt.to_period('M')
    df_referrer['period'] = pd.to_datetime(df_referrer['period']).dt.to_period('M')

    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated and leaves
    # df_metrics unchanged when pandas Copy-on-Write is active.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)

    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['content_length'] = df_contents['content'].str.len()

    # NOTE(review): this literal looks mis-encoded in this copy of the file;
    # it presumably names the aggregate ("all ages") row - confirm against
    # the values in demographics_merged.csv.
    df_demo_filtered = df_demo[df_demo['age_group'] != 'μ 체'].copy()

    # Lifetime totals per article, summed over all months.
    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
    }).reset_index()

    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    # Articles without any metric rows get zero totals.
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
    # Zero views become NaN so the rate is NaN rather than a division by zero.
    df_merged['engagement_rate'] = (
        (df_merged['likes'] + df_merged['comments'])
        / df_merged['views_total'].replace(0, np.nan)
    ) * 100

    print("λ°μ΄ν° λ‘λ λ° μ μ²λ¦¬ μλ£!")
    return {
        "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
        "referrer": df_referrer, "merged": df_merged
    }
# ==============================================================================
# Monthly analysis functions focused on readable figures and trends
# ==============================================================================
def _plot_monthly_performance(data, output_dir):
    """Chart monthly totals and their month-over-month growth; return the table."""
    monthly_metrics = data['metrics'].groupby('period').agg(
        total_views=('views_total', 'sum'),
        total_likes=('likes', 'sum'),
        total_comments=('comments', 'sum')
    ).sort_index()

    # Month-over-month (MoM) growth in percent for each of the three totals.
    for col in monthly_metrics.columns:
        monthly_metrics[f'{col}_mom'] = monthly_metrics[col].pct_change() * 100

    # Periods are converted to timestamps so matplotlib draws a date axis.
    monthly_metrics.index = monthly_metrics.index.to_timestamp()

    fig, axes = plt.subplots(2, 1, figsize=(18, 14), sharex=True)
    fig.suptitle('μλ³ μ±κ³Ό μ§ν λ° μ μ λλΉ μ±μ₯λ₯ (MoM) μΆμ΄', fontsize=20, y=1.0)

    # Top chart: absolute values (views as bars, likes as a line).
    ax1 = axes[0]
    # On a date axis the bar width is expressed in days; the default of 0.8
    # would draw each monthly bar as a sliver less than one day wide.
    bar_width_days = 20
    ax1.bar(monthly_metrics.index, monthly_metrics['total_views'],
            width=bar_width_days, color='lightgray', label='μ΄ μ‘°νμ')
    add_value_labels(ax1, is_bar=True, fmt="{:,.0f}")
    ax1.set_ylabel('μ΄ μ‘°νμ', fontsize=12)

    ax1_twin = ax1.twinx()
    ax1_twin.plot(monthly_metrics.index, monthly_metrics['total_likes'],
                  marker='o', color='coral', label='μ΄ μ’μμ')
    add_value_labels(ax1_twin, is_bar=False, fmt="{:.0f}")
    ax1_twin.set_ylabel('μ΄ μ’μμ', fontsize=12)

    # Merge the legends of the two y-axes into one box.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax1_twin.get_legend_handles_labels()
    ax1_twin.legend(lines + lines2, labels + labels2, loc='upper left')
    ax1.set_title('μλ³ μ΄ μ‘°νμ λ° μ’μμ', fontsize=16)

    # Bottom chart: growth rates (%), with a reference line at zero.
    ax2 = axes[1]
    ax2.plot(monthly_metrics.index, monthly_metrics['total_views_mom'],
             marker='s', linestyle='--', label='μ‘°νμ μ±μ₯λ₯ (%)')
    ax2.plot(monthly_metrics.index, monthly_metrics['total_likes_mom'],
             marker='^', linestyle='--', label='μ’μμ μ±μ₯λ₯ (%)')
    ax2.axhline(0, color='red', linewidth=1, linestyle=':')
    ax2.set_ylabel('μ μ λλΉ μ±μ₯λ₯ (%)', fontsize=12)
    ax2.legend()
    ax2.set_title('μλ³ μ£Όμ μ§ν μ±μ₯λ₯ (MoM)', fontsize=16)

    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_performance_and_growth.png')
    plt.close()
    return monthly_metrics


def _plot_monthly_category_share(data, output_dir):
    """Chart the monthly publication share per category; return the rounded table."""
    monthly_category_dist = (
        data['merged']
        .groupby(['publish_month', 'category'])['article_id']
        .count()
        .unstack()
        .fillna(0)
    )
    # Row-wise percentage: each month sums to 100.
    monthly_category_prop = monthly_category_dist.div(monthly_category_dist.sum(axis=1), axis=0) * 100

    # Keep the seven most frequent categories and fold the rest into one column.
    top_categories = data['merged']['category'].value_counts().nlargest(7).index
    other_categories = monthly_category_prop.columns.difference(top_categories)
    monthly_category_prop['κΈ°ν'] = monthly_category_prop[other_categories].sum(axis=1)
    shown_columns = top_categories.tolist() + ['κΈ°ν']

    monthly_category_prop[shown_columns].plot(
        kind='bar', stacked=True, figsize=(16, 8), colormap='tab20c'
    )
    plt.title('μλ³ μ½ν μΈ μΉ΄ν κ³ λ¦¬ λ°ν λΉμ€ λ³ν (%)', fontsize=18)
    plt.xlabel('κΈ°κ° (μ)')
    plt.ylabel('μΉ΄ν κ³ λ¦¬ λΉμ€ (%)')
    plt.xticks(rotation=45)
    plt.legend(title='Category', bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_category_distribution_with_values.png')
    plt.close()
    return monthly_category_prop[shown_columns].round(1)


def _plot_monthly_age_share(data, output_dir):
    """Chart each age group's monthly share of views; return the rounded table."""
    monthly_age_views = data['demo'].groupby(['period', 'age_group'])['views'].sum().unstack().fillna(0)
    monthly_age_prop = (monthly_age_views.div(monthly_age_views.sum(axis=1), axis=0) * 100).round(1)

    monthly_age_prop.plot(kind='line', marker='o', figsize=(18, 9), colormap='viridis', ms=4)
    plt.title('μλ³ μ‘°νμμ λν μ°λ Ήλλ³ κΈ°μ¬λ λ³ν (%)', fontsize=18)
    plt.xlabel('κΈ°κ° (μ)')
    plt.ylabel('μ°λ Ήλλ³ μ‘°νμ λΉμ€ (%)')
    plt.xticks(rotation=45)
    plt.legend(title='Age Group', bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.grid(which='major', linestyle='--', linewidth='0.5')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/monthly_age_contribution_line.png')
    plt.close()
    return monthly_age_prop


def analyze_enhanced_monthly_trends(data, output_dir):
    """Analyse how the key metrics change month by month and chart them.

    Three PNG files are written to *output_dir*: monthly performance with
    month-over-month growth, monthly category share, and monthly age-group
    share of views. The category and age tables are also printed.

    Args:
        data: dict holding DataFrames under ``'metrics'`` (columns ``period``,
            ``views_total``, ``likes``, ``comments``), ``'merged'`` (columns
            ``publish_month``, ``category``, ``article_id``) and ``'demo'``
            (columns ``period``, ``age_group``, ``views``).
        output_dir: Existing folder the PNG files are saved into.

    Returns:
        dict with ``"monthly_metrics"``, ``"category_table"`` and
        ``"age_table"`` DataFrames for use in the report.
    """
    print("\n[μ κ· λΆμ 4] μλ³ λμ νΈλ λ μ¬μΈ΅ λΆμ (μμΉ κ°ν)...")

    # --- 1. Monthly performance metrics and growth rates ---
    monthly_metrics = _plot_monthly_performance(data, output_dir)
    print(" - μλ³ μ±κ³Ό λ° μ±μ₯λ₯ λΆμ μλ£. (monthly_performance_and_growth.png μ μ₯)")

    # --- 2. Monthly publication share by category (chart + data table) ---
    category_table_data = _plot_monthly_category_share(data, output_dir)
    print("\n--- μλ³ μμ μΉ΄ν κ³ λ¦¬ λ°ν λΉμ€ (%) λ°μ΄ν° ---")
    print(category_table_data)
    print(" - μλ³ μΉ΄ν κ³ λ¦¬ λΉμ€ λΆμ μλ£. (monthly_category_distribution_with_values.png μ μ₯ λ° ν μ΄λΈ μΆλ ₯)")

    # --- 3. Monthly share of views by reader age group (chart + data table) ---
    monthly_age_prop = _plot_monthly_age_share(data, output_dir)
    print("\n--- μλ³ μ°λ Ήλ κΈ°μ¬λ (%) λ°μ΄ν° ---")
    print(monthly_age_prop)
    print(" - μλ³ ν΅μ¬ λ μμΈ΅ λ³ν λΆμ μλ£. (monthly_age_contribution_line.png μ μ₯ λ° ν μ΄λΈ μΆλ ₯)")

    # Data handed on to the report writer.
    return {
        "monthly_metrics": monthly_metrics,
        "category_table": category_table_data,
        "age_table": monthly_age_prop
    }
# 5. Insight report generation
def generate_insights_report(monthly_data, output_dir):
    """Write the text report that embeds the monthly category and age tables.

    Args:
        monthly_data: dict whose ``'category_table'`` and ``'age_table'``
            entries provide ``to_string()`` (the tables returned by the
            monthly trend analysis).
        output_dir: Folder the report file is written into.

    The narrative figures in the report body are literal text in the
    template; only the timestamp and the two tables are filled in at run
    time.
    """
    print("\n[λ¨κ³ 6] μ’ ν© μΈμ¬μ΄νΈ λ³΄κ³ μ μμ± (μλ³ λΆμ μμΉ κ°ν)...")

    destination = f'{output_dir}/comprehensive_analysis_report_with_enhanced_trends.txt'

    # Render both tables as plain text so they can be pasted into the report.
    category_block = monthly_data['category_table'].to_string()
    age_block = monthly_data['age_table'].to_string()
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    report_text = f"""
# μ λ¬Έκ³Όλ°©μ‘ λ μ λ°μ΄ν° μ¬μΈ΅ λΆμ λ³΄κ³ μ (μλ³ νΈλ λ μμΉ κ°ν)
μμ±μΌ: {generated_at}
(κΈ°μ‘΄ 1 ~ 4 μΉμ λ΄μ© μλ΅)
...
## 5. β μμΉλ‘ 보λ μλ³ λμ νΈλ λ λΆμ β
μκ°μ νλ¦μ λ°λ₯Έ μ±κ³Ό, μ λ΅, λ μμΈ΅μ λ³νλ₯Ό μμΉ μ€μ¬μΌλ‘ λΆμν κ²°κ³Ό, λ€μκ³Ό κ°μ ꡬ체μ μΈ μΈμ¬μ΄νΈλ₯Ό λμΆνμ΅λλ€.
### 5.1. μ±κ³Όμ λ³λμ±κ³Ό μ±μ₯ λͺ¨λ©ν
- **μ±κ³Ό μΆμ΄**: 2024λ 4μ, μ΄ μ‘°νμλ 21,015νλ₯Ό κΈ°λ‘νλ©° μ μ λλΉ **16.2%μ λμ μ±μ₯λ₯ **μ 보μμ΅λλ€. νΉν ν΄λΉ μμ μ’μμ μλ 290κ°λ‘, **μ μ λλΉ 161.3%λΌλ νλ°μ μΈ μ¦κ°**λ₯Ό κΈ°λ‘νμ΅λλ€. μ΄λ νΉμ κΈ°ν κΈ°μ¬κ° λ μλ€μκ² ν° νΈμμ μ»μμμ μλ―Έν©λλ€. (monthly_performance_and_growth.png μ°Έκ³ )
- **μ±μ₯κ³Ό νλ½**: λ°λ©΄, 2025λ 1μμ μ‘°νμ(-25.5%)μ μ’μμ(-61.6%) λͺ¨λ ν° νμΌλ‘ νλ½νλ λͺ¨μ΅μ 보μμ΅λλ€. μ΄μ²λΌ μλ³ μ±κ³Ό λ³λμ±μ΄ ν¬λ―λ‘, **μ±κ³΅ μμ μμΈμ λΆμνμ¬ νλ½ μμ μ μ©νλ μ λ΅**μ΄ μκΈν©λλ€.
### 5.2. λ°μ΄ν°λ‘ μ μ¦λ μ½ν μΈ μ λ΅μ μ§ν
- **μ λ΅ λ³ν**: μλ λ°μ΄ν° ν μ΄λΈμμ λ³Ό μ μλ―μ΄, 2024λ νλ°λΆν° 'λ―Έλμ΄Β·AIνΈλ λ' μΉ΄ν κ³ λ¦¬μ λ°ν λΉμ€μ΄ κΎΈμ€ν μ¦κ°νμ¬ μ΅κ·Ό μμλ **μ 체 μ½ν μΈ μ μ½ 5%**λ₯Ό μ°¨μ§νλ μ£Όμ μΉ΄ν κ³ λ¦¬λ‘ μ리 μ‘μμ΅λλ€.
- **κ²°κ³Ό**: μ΄ μ λ΅μ μ±κ³΅μ μ΄μμ΅λλ€. 'λ―Έλμ΄Β·AIνΈλ λ'λ νκ· μ‘°νμ λ° μ°Έμ¬λκ° λμ μΉ΄ν κ³ λ¦¬μ΄λ©°, μ΄λ¬ν μ½ν μΈ μ μ¦κ°λ μλ‘μ΄ μ λ¬Έ λ μμΈ΅ μ μ μ κΈ°μ¬νμ΅λλ€.
(monthly_category_distribution_with_values.png μ°Έκ³ )
--- μλ³ μμ μΉ΄ν κ³ λ¦¬ λ°ν λΉμ€ (%) λ°μ΄ν° ---
{category_block}
---------------------------------------------
### 5.3. ν΅μ¬ λ μμΈ΅μ μΈλκ΅μ²΄ μ‘°μ§
- **ν΅μ¬ λ μμΈ΅**: 19-24μΈ κ·Έλ£Ήμ΄ μ¬μ ν κ°μ₯ ν° λΉμ€(νκ· μ½ 20~25%)μ μ°¨μ§νλ ν΅μ¬ λ μμΈ΅μ λλ€.
- **μ£Όλͺ©ν λ³ν**: νμ§λ§ μλ λ°μ΄ν°μμ λͺ νν 보μ΄λ―μ΄, 2025λ λ€μ΄ **30-34μΈ λ μμΈ΅μ κΈ°μ¬λκ° 12.1%μμ 14.5%λ‘ κΎΈμ€ν μμΉ**νλ νΈλ λκ° λνλ¬μ΅λλ€. μ΄λ μλ‘μ΄ μ±μ₯ λλ ₯μ΄ λ μ μλ λ§€μ° κΈμ μ μΈ μ νΈμ λλ€. λ°λ©΄, 13-18μΈ λ μμΈ΅μ λΉμ€μ μν κ°μνλ μΆμΈμ λλ€.
(monthly_age_contribution_line.png μ°Έκ³ )
--- μλ³ μ°λ Ήλ κΈ°μ¬λ (%) λ°μ΄ν° ---
{age_block}
---------------------------------------------
## 6. μ΅μ’ μ λ΅ μ μΈ (μμΉ κΈ°λ°)
1. **μ±μ₯λ₯ κΈ°λ° μ±κ³Ό κ΄λ¦¬**: λ§€μ λ§, 'μλ³ μ±κ³Ό λ° μ±μ₯λ₯ ' λμ보λλ₯Ό 리뷰νμ¬ **μ±μ₯λ₯ μ΄ κΈλ±/κΈλ½ν μμΈμ λΆμνκ³ λ€μ λ¬ μ½ν μΈ κΈ°νμ μ¦μ λ°μ**νλ νλ‘μΈμ€λ₯Ό μ 립ν΄μΌ ν©λλ€.
2. **λ°μ΄ν° κΈ°λ° μΉ΄ν κ³ λ¦¬ λΉμ€ μ‘°μ **: μ±κ³΅μ΄ μ μ¦λ 'λ―Έλμ΄Β·AIνΈλ λ'μ λΉμ€μ **νμ¬ 5%μμ 8~10% μμ€κΉμ§ μ μ§μ μΌλ‘ νλ**νκ³ , λ°μμ΄ μ μ‘°ν μΌλΆ μΉ΄ν κ³ λ¦¬μ λΉμ€μ μΆμνλ 'μ νκ³Ό μ§μ€'μ μ€νν΄μΌ ν©λλ€.
3. **30λ λ μμΈ΅ μ§μ€ 곡λ΅**: κΈ°μ¬λκ° κΎΈμ€ν μμΉνλ 30λ λ μλ₯Ό **'ν΅μ¬ μ±μ₯ νκ²'**μΌλ‘ 곡μ μ§μ νκ³ , μ΄λ€μ κ΄μ¬μ¬μΈ '컀리μ΄', 'λ―Έλμ΄ μ°μ λν₯', 'λΉμ¦λμ€ λͺ¨λΈ' κ΄λ ¨ μ½ν μΈ λ₯Ό μ μ€νμ¬ μ΄λ€μ μ μ μ κ°μνν΄μΌ ν©λλ€.
"""

    with open(destination, 'w', encoding='utf-8') as report_file:
        report_file.write(report_text)
    print(f"\n - μ’ ν© μΈμ¬μ΄νΈ λ³΄κ³ μ μμ± μλ£. ({destination} μ μ₯)")
# 6. Main entry point
def main():
    """Run the pipeline end to end: configure, load, analyse, then report."""
    print("===== μ λ¬Έκ³Όλ°©μ‘ λ μ λ°μ΄ν° μ¬μΈ΅ λΆμ (μλ³ νΈλ λ μμΉ κ°ν) =====")

    source_dir, results_dir = setup_environment()
    datasets = load_and_preprocess_data(source_dir)

    # Monthly analysis with value labels and growth rates, then the report
    # that embeds the resulting tables.
    trend_tables = analyze_enhanced_monthly_trends(datasets, results_dir)
    generate_insights_report(trend_tables, results_dir)

    print("\n===== λͺ¨λ λΆμμ΄ μ±κ³΅μ μΌλ‘ μλ£λμμ΅λλ€. =====")
    print(f"κ²°κ³Όλ¬Όμ '{results_dir}' ν΄λμμ νμΈνμ€ μ μμ΅λλ€.")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()