# Broadcast_paper / analysis.py
# Choi jun hyeok
# update prompt
# be91dcc
# -*- coding: utf-8 -*-
"""
신문과방송 독자 데이터 심층 탐색적 데이터 분석 (Advanced EDA)
이 스크립트는 다음 4개의 데이터셋을 활용하여 신문과방송 독자 데이터를 심층 분석합니다.
1. article_metrics_monthly.csv: 기사별 월간 지표 (조회수, 좋아요, 댓글)
2. contents.csv: 기사 콘텐츠 정보 (카테고리, 제목, 태그 등)
3. demographics_merged.csv: 기사별 인구통계학적 독자 데이터
4. referrer.csv: 기사별 유입 경로 데이터
주요 분석 내용:
- 데이터 전처리 및 피처 엔지니어링
- 기사 핵심 지표(조회수, 좋아요, 댓글) 분포 및 상관관계 분석
- 콘텐츠 카테고리별 성과 및 독자 참여도 심층 분석
- 태그 분석 (Word Cloud 포함)
- 인구통계(연령/성별) 그룹별 선호 카테고리 분석 (히트맵)
- 유입 경로별 성과 및 효율성 분석
- 종합 인사이트 도출 및 리포트 자동 생성
실행 방법:
- 스크립트를 실행하기 전, DATA_DIR 경로를 실제 데이터가 있는 폴더로 수정하세요.
- 실행 시 현재 작업 디렉터리에 'output_analysis' 폴더가 생성되며, 모든 시각화 자료와 최종 인사이트 보고서가 저장됩니다.
"""
# 1. 라이브러리 임포트
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
from wordcloud import WordCloud
warnings.filterwarnings('ignore')
# 2. 기본 설정 및 전역 변수
def setup_environment():
    """Prepare the output directory and plotting defaults for the analysis.

    Creates the output directory when it is missing and configures
    matplotlib and seaborn to use the 'Malgun Gothic' font with ASCII minus
    signs on axes.

    Returns:
        tuple[str, str]: ``(DATA_DIR, OUTPUT_DIR)`` -- the folder the CSV
        inputs are read from and the folder figures and the report are
        written to.
    """
    # === Path settings (adjust to the local environment) ===
    # Built with os.path.join so the separator matches the host OS; on
    # Windows this yields the same string as r'Broadcast_paper\data_csv'.
    DATA_DIR = os.path.join('Broadcast_paper', 'data_csv')
    OUTPUT_DIR = r'./output_analysis'

    # Remember whether the folder was missing so the message is only shown
    # when it is actually created; exist_ok avoids a crash if the directory
    # appears between the check and the creation.
    newly_created = not os.path.exists(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if newly_created:
        print(f"'{OUTPUT_DIR}' 폴더를 생성했습니다.")

    # === Plot settings ===
    # NOTE(review): assumes the 'Malgun Gothic' font is installed (it ships
    # with Windows); on other systems Korean labels may not render -- confirm.
    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')

    print("분석 환경 설정 완료!")
    return DATA_DIR, OUTPUT_DIR
# 3. 데이터 로드 및 전처리
def load_and_preprocess_data(data_dir):
    """Load the four CSV datasets and build the per-article merged table.

    Args:
        data_dir: Folder containing ``article_metrics_monthly.csv``,
            ``contents.csv``, ``demographics_merged.csv`` and
            ``referrer.csv``.

    Returns:
        dict: DataFrames keyed by
            ``"metrics"``  -- monthly metrics with ``period`` parsed to
                              datetime and missing ``comments`` set to 0;
            ``"contents"`` -- contents rows that have category, content and
                              date, plus ``publish_month``,
                              ``publish_dayofweek`` and ``content_length``;
            ``"demo"``     -- demographics without the '전체' (all ages)
                              aggregate rows;
            ``"referrer"`` -- referrer data as loaded;
            ``"merged"``   -- contents joined with per-article metric totals
                              and an ``engagement_rate`` percentage.
    """
    print("\n[단계 1] 데이터 로드 및 전처리 시작...")

    # Load the raw tables
    df_metrics = pd.read_csv(os.path.join(data_dir, 'article_metrics_monthly.csv'))
    df_contents = pd.read_csv(os.path.join(data_dir, 'contents.csv'))
    df_demo = pd.read_csv(os.path.join(data_dir, 'demographics_merged.csv'))
    df_referrer = pd.read_csv(os.path.join(data_dir, 'referrer.csv'))

    # --- Preprocessing ---
    # 1. Monthly metrics
    df_metrics['period'] = pd.to_datetime(df_metrics['period'])
    # Assign the result back: fillna(inplace=True) on a column selection is a
    # chained operation that leaves the frame untouched under pandas
    # Copy-on-Write.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)

    # 2. Contents: drop rows missing key fields, then derive date/length features
    df_contents = df_contents.dropna(subset=['category', 'content', 'date']).copy()
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()

    # 3. Demographics: exclude the '전체' (all) aggregate rows so that only
    # individual age groups remain.
    df_demo_filtered = df_demo[df_demo['age_group'] != '전체'].copy()

    # 4. Integration: collapse monthly metrics into per-article totals
    article_total_metrics = (
        df_metrics
        .groupby('article_id')[['views_total', 'likes', 'comments']]
        .sum()
        .reset_index()
    )

    # Left join keeps every content row; articles without metrics get zeros
    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged = df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0})

    # Engagement rate (%) = (likes + comments) / views.
    # Zero views are mapped to NaN so the division yields NaN instead of inf.
    interactions = df_merged['likes'] + df_merged['comments']
    df_merged['engagement_rate'] = (
        interactions / df_merged['views_total'].replace(0, np.nan)
    ) * 100

    print("데이터 로드 및 전처리 완료!")
    return {
        "metrics": df_metrics,
        "contents": df_contents,
        "demo": df_demo_filtered,
        "referrer": df_referrer,
        "merged": df_merged
    }
# 4. 상세 분석 및 시각화 함수들
def analyze_metrics_overview(df_merged, output_dir):
    """Plot the view-count distribution and the metric correlation heatmap.

    Saves ``metrics_overview.png`` into ``output_dir``.

    Args:
        df_merged: Per-article table with ``views_total``, ``likes``,
            ``comments`` and ``content_length`` columns.
        output_dir: Folder the figure is written to.
    """
    print("\n[단계 2] 기사 지표 전반 분석...")

    fig, (ax_hist, ax_corr) = plt.subplots(1, 2, figsize=(18, 7))

    # Left panel: distribution of total views per article
    sns.histplot(data=df_merged, x='views_total', bins=50, ax=ax_hist, kde=True)
    ax_hist.set_title('기사별 총 조회수 분포', fontsize=16)
    ax_hist.set_xlabel('총 조회수')
    ax_hist.set_ylabel('기사 수')
    # Clip the x-axis at the 95th percentile so the long tail does not
    # flatten the bulk of the distribution. Skip when the percentile is 0 or
    # NaN, which would otherwise request a degenerate axis range.
    upper_limit = df_merged['views_total'].quantile(0.95)
    if upper_limit > 0:
        ax_hist.set_xlim(0, upper_limit)

    # Right panel: pairwise correlation between the key metrics
    metric_columns = ['views_total', 'likes', 'comments', 'content_length']
    corr = df_merged[metric_columns].corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax_corr)
    ax_corr.set_title('주요 지표 간 상관관계', fontsize=16)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/metrics_overview.png')
    plt.close(fig)
    print(" - 기사 지표 분포 및 상관관계 분석 완료. (metrics_overview.png 저장)")
def analyze_content_features(df_merged, output_dir):
    """Analyse performance by category, tag and publication weekday.

    Writes three figures into ``output_dir``: ``category_avg_views.png``,
    ``tags_wordcloud.png`` (skipped when there are no tags) and
    ``dayofweek_performance.png``.

    Args:
        df_merged: Per-article table with ``category``, ``tag``,
            ``publish_dayofweek``, ``views_total``, ``likes``, ``comments``
            and ``engagement_rate`` columns.
        output_dir: Folder the figures are written to.
    """
    print("\n[단계 3] 콘텐츠 특징별 성과 분석...")

    # --- Average metrics per category ---
    category_performance = df_merged.groupby('category').agg({
        'views_total': 'mean',
        'likes': 'mean',
        'comments': 'mean',
        'engagement_rate': 'mean'
    }).sort_values('views_total', ascending=False)

    fig, ax = plt.subplots(figsize=(14, 10))
    # Ascending order puts the largest bar at the top of a horizontal bar chart
    category_performance['views_total'].sort_values().plot(kind='barh', ax=ax, color='skyblue')
    ax.set_title('카테고리별 평균 조회수', fontsize=16)
    ax.set_xlabel('평균 조회수')
    ax.set_ylabel('카테고리')
    fig.tight_layout()
    fig.savefig(f'{output_dir}/category_avg_views.png')
    plt.close(fig)
    print(" - 카테고리별 평균 조회수 분석 완료. (category_avg_views.png 저장)")

    # --- Tag frequencies and word cloud ---
    # Tags are comma-separated; astype(str) keeps the .str accessor usable
    # even when the column was parsed as non-string.
    tags = df_merged['tag'].dropna().astype(str).str.split(',').explode().str.strip()
    # Stray or trailing commas produce empty strings; do not count them as a tag.
    tags = tags[tags != '']
    top_tags = tags.value_counts().head(50)

    if top_tags.empty:
        # generate_from_frequencies needs at least one word
        print(" - 태그 데이터가 없어 Word Cloud 생성을 건너뜁니다.")
    else:
        # NOTE(review): font_path='malgun' relies on the font loader
        # resolving the Malgun Gothic file by name -- confirm on non-Windows hosts.
        wordcloud = WordCloud(
            font_path='malgun',
            width=1000,
            height=600,
            background_color='white',
            colormap='viridis'
        ).generate_from_frequencies(top_tags.to_dict())

        cloud_fig = plt.figure(figsize=(15, 9))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('상위 50개 태그 Word Cloud', fontsize=20)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/tags_wordcloud.png')
        plt.close(cloud_fig)
        print(" - 태그 Word Cloud 생성 완료. (tags_wordcloud.png 저장)")

    # --- Article count and average views per publication weekday ---
    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    sns.countplot(data=df_merged, y='publish_dayofweek', order=day_order, ax=axes[0], palette='pastel')
    axes[0].set_title('요일별 발행 기사 수', fontsize=16)
    axes[0].set_xlabel('기사 수')
    axes[0].set_ylabel('요일')

    sns.barplot(data=df_merged, y='publish_dayofweek', x='views_total', order=day_order, ax=axes[1], palette='pastel', ci=None)
    axes[1].set_title('요일별 평균 조회수', fontsize=16)
    axes[1].set_xlabel('평균 조회수')
    axes[1].set_ylabel('')

    fig.tight_layout()
    fig.savefig(f'{output_dir}/dayofweek_performance.png')
    plt.close(fig)
    print(" - 발행 요일별 성과 분석 완료. (dayofweek_performance.png 저장)")
def analyze_demographics(df_demo, df_merged, output_dir):
    """Plot category preference heatmaps by age group for each gender.

    Saves ``demographic_category_preference_heatmap.png`` into
    ``output_dir`` with one panel for female ('여') and one for male ('남')
    readers.

    Args:
        df_demo: Demographic rows with ``article_id``, ``age_group``,
            ``gender`` and ``views`` columns.
        df_merged: Per-article table supplying ``article_id`` and
            ``category``.
        output_dir: Folder the figure is written to.
    """
    print("\n[단계 4] 인구통계 그룹별 선호도 분석...")

    # Attach each demographic row to its article's category
    df_demo_content = pd.merge(
        df_demo, df_merged[['article_id', 'category']], on='article_id', how='left'
    )

    # Total views per (age group, gender, category)
    demo_category_views = (
        df_demo_content
        .groupby(['age_group', 'gender', 'category'])['views']
        .sum()
        .reset_index()
    )

    # (gender value in the data, colour map, panel title)
    # NOTE(review): assumes the gender column is coded as '여' / '남' --
    # confirm against demographics_merged.csv.
    panels = [
        ('여', 'Reds', '여성 연령대별 선호 카테고리 (총 조회수 기준)'),
        ('남', 'Blues', '남성 연령대별 선호 카테고리 (총 조회수 기준)'),
    ]

    fig, axes = plt.subplots(2, 1, figsize=(20, 24))
    for ax, (gender, cmap, title) in zip(axes, panels):
        # category x age-group table of summed views for this gender
        pivot = demo_category_views[demo_category_views['gender'] == gender].pivot_table(
            index='category', columns='age_group', values='views', aggfunc='sum'
        ).fillna(0)

        if pivot.empty:
            # A heatmap has no colour range to compute for an empty table;
            # mark the panel instead of failing the whole figure.
            ax.text(0.5, 0.5, '데이터 없음', ha='center', va='center', transform=ax.transAxes)
        else:
            sns.heatmap(pivot, cmap=cmap, annot=True, fmt='.0f', linewidths=.5, ax=ax)

        ax.set_title(title, fontsize=18)
        ax.set_xlabel('연령대')
        ax.set_ylabel('카테고리')

    fig.tight_layout()
    fig.savefig(f'{output_dir}/demographic_category_preference_heatmap.png')
    plt.close(fig)
    print(" - 인구통계 그룹별 선호 카테고리 히트맵 분석 완료. (demographic_category_preference_heatmap.png 저장)")
def analyze_referrer(df_referrer, df_merged, output_dir):
    """Compare the top referrers by total share and by mean engagement.

    Saves ``referrer_performance.png`` into ``output_dir``.

    Args:
        df_referrer: Referrer rows with ``article_id``, ``referrer`` and
            ``share`` columns.
        df_merged: Per-article table supplying ``article_id``,
            ``views_total`` and ``engagement_rate``.
        output_dir: Folder the figure is written to.
    """
    print("\n[단계 5] 유입 경로별 효율성 분석...")

    # Attach per-article metrics to every referrer row
    df_referrer_merged = pd.merge(
        df_referrer,
        df_merged[['article_id', 'views_total', 'engagement_rate']],
        on='article_id',
        how='left',
    )

    # Total share per referrer, computed once and reused for the ranking and
    # for the left-hand chart
    share_by_referrer = df_referrer_merged.groupby('referrer')['share'].sum()
    top_10_referrers = share_by_referrer.nlargest(10).index
    top_share = share_by_referrer[share_by_referrer.index.isin(top_10_referrers)]

    # Mean engagement rate of the articles reached through each top referrer
    df_top_referrers = df_referrer_merged[df_referrer_merged['referrer'].isin(top_10_referrers)]
    referrer_engagement = df_top_referrers.groupby('referrer')['engagement_rate'].mean()

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Left panel: total share; ascending sort puts the largest bar on top
    top_share.sort_values().plot(kind='barh', ax=axes[0], color='c')
    axes[0].set_title('상위 10개 유입 경로별 총 기여도(Share)', fontsize=16)
    axes[0].set_xlabel('총 Share')
    axes[0].set_ylabel('유입 경로')

    # Right panel: mean engagement rate
    referrer_engagement.sort_values().plot(kind='barh', ax=axes[1], color='m')
    axes[1].set_title('상위 10개 유입 경로별 평균 참여도(%)', fontsize=16)
    axes[1].set_xlabel('평균 참여도 (%)')
    axes[1].set_ylabel('')

    fig.tight_layout()
    fig.savefig(f'{output_dir}/referrer_performance.png')
    plt.close(fig)
    print(" - 주요 유입 경로별 기여도 및 참여도 분석 완료. (referrer_performance.png 저장)")
# 5. 종합 인사이트 생성
def generate_insights_report(data, output_dir):
"""๋ถ„์„ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ข…ํ•ฉ์ ์ธ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค."""
print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ...")
# ๋ณด๊ณ ์„œ ๋‚ด์šฉ ์ƒ์„ฑ
report = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ
์ƒ์„ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## 1. ๋ถ„์„ ๊ฐœ์š”
- ๋ณธ ๋ณด๊ณ ์„œ๋Š” ๊ธฐ์‚ฌ ์„ฑ๊ณผ ์ง€ํ‘œ, ์ฝ˜ํ…์ธ  ํŠน์„ฑ, ๋…์ž ์ธ๊ตฌํ†ต๊ณ„, ์œ ์ž… ๊ฒฝ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋…์ž ํ–‰๋™ ํŒจํ„ด์„ ๋ถ„์„ํ•˜๊ณ , ์ด๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์ฝ˜ํ…์ธ  ์ „๋žต ๊ฐœ์„  ๋ฐฉ์•ˆ์„ ์ œ์‹œํ•˜๋Š” ๊ฒƒ์„ ๋ชฉํ‘œ๋กœ ํ•ฉ๋‹ˆ๋‹ค.
- ์ด {data['merged']['article_id'].nunique():,}๊ฐœ์˜ ๊ธฐ์‚ฌ์™€ ๊ด€๋ จ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ–ˆ์Šต๋‹ˆ๋‹ค.
## 2. ์ฃผ์š” ๋ถ„์„ ๊ฒฐ๊ณผ (Key Findings)
### 2.1. ์ฝ˜ํ…์ธ  ์„ฑ๊ณผ
- **์„ฑ๊ณผ ๋ถ„ํฌ**: ๋Œ€๋ถ€๋ถ„์˜ ๊ธฐ์‚ฌ๋Š” ์†Œ์ˆ˜์˜ ์กฐํšŒ์ˆ˜๋ฅผ ๊ธฐ๋กํ•˜๋ฉฐ, ์†Œ์ˆ˜์˜ 'ํžˆํŠธ ๊ธฐ์‚ฌ'๊ฐ€ ์ „์ฒด ์กฐํšŒ์ˆ˜๋ฅผ ๊ฒฌ์ธํ•˜๋Š” ๋กฑํ…Œ์ผ(Long-tail) ๋ถ„ํฌ๋ฅผ ๋ณด์ž…๋‹ˆ๋‹ค. (metrics_overview.png ์ฐธ๊ณ )
- **ํ•ต์‹ฌ ์นดํ…Œ๊ณ ๋ฆฌ**: '๋ฏธ๋””์–ด ไบบ์‚ฌ์ด๋“œ', '์•„์ด๋””์–ด์Šค', '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ' ์นดํ…Œ๊ณ ๋ฆฌ๊ฐ€ ํ‰๊ท  ์กฐํšŒ์ˆ˜ ์ตœ์ƒ์œ„๊ถŒ์„ ์ฐจ์ง€ํ–ˆ์Šต๋‹ˆ๋‹ค. ์ด๋“ค ์นดํ…Œ๊ณ ๋ฆฌ๊ฐ€ ๋…์ž์˜ ๋†’์€ ๊ด€์‹ฌ์„ ์œ ๋„ํ•˜๋Š” ํ•ต์‹ฌ ์ฝ˜ํ…์ธ ์ž„์„ ์‹œ์‚ฌํ•ฉ๋‹ˆ๋‹ค. (category_avg_views.png ์ฐธ๊ณ )
- **์ฃผ์š” ํƒœ๊ทธ**: '#์–ธ๋ก ', '#๊ธฐ์ž', '#๋‰ด์Šค', '#๋ฏธ๋””์–ด', '#์ €๋„๋ฆฌ์ฆ˜' ๋“ฑ ์–ธ๋ก  ๋ณธ์งˆ๊ณผ ๊ด€๋ จ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ๊ฐ€์žฅ ๋นˆ๋ฒˆํ•˜๊ฒŒ ์‚ฌ์šฉ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. '#์ธ๊ณต์ง€๋Šฅ', '#AI', '#ํ…Œํฌ' ๋“ฑ ๊ธฐ์ˆ  ๊ด€๋ จ ํƒœ๊ทธ๋„ ์ƒ์œ„๊ถŒ์— ์œ„์น˜ํ•˜์—ฌ ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ์— ๋Œ€ํ•œ ๋†’์€ ๊ด€์‹ฌ์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค. (tags_wordcloud.png ์ฐธ๊ณ )
### 2.2. ๋…์ž ํŠน์„ฑ
- **์ฃผ์š” ๋…์ž์ธต**: 10๋Œ€ ํ›„๋ฐ˜์—์„œ 30๋Œ€ ์ดˆ๋ฐ˜์˜ ์ Š์€ ์ธต์ด ์ฝ˜ํ…์ธ  ์†Œ๋น„์˜ ํ•ต์‹ฌ ๊ทธ๋ฃน์ž…๋‹ˆ๋‹ค. ํŠนํžˆ 19-24์„ธ ์—ฌ์„ฑ ๊ทธ๋ฃน์˜ ํ™œ๋™์ด ๋‘๋“œ๋Ÿฌ์ง‘๋‹ˆ๋‹ค.
- **์„ฑ๋ณ„/์—ฐ๋ น๋ณ„ ์„ ํ˜ธ๋„**:
- **์—ฌ์„ฑ**: 10๋Œ€-20๋Œ€ ์ดˆ๋ฐ˜์€ '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '๋ฏธ๋””์–ดํฌ๋Ÿผ'์—, 20๋Œ€ ํ›„๋ฐ˜-30๋Œ€๋Š” '์ทจ์žฌ๊ธฐยท์ œ์ž‘๊ธฐ', '๋ฏธ๋””์–ด ไบบ์‚ฌ์ด๋“œ' ๋“ฑ ์‹ฌ์ธต์ ์ธ ์ฝ˜ํ…์ธ ์— ๋†’์€ ๋ฐ˜์‘์„ ๋ณด์ž…๋‹ˆ๋‹ค.
- **๋‚จ์„ฑ**: 20๋Œ€-30๋Œ€ ๊ทธ๋ฃน์ด ์ „๋ฐ˜์ ์ธ ์†Œ๋น„๋ฅผ ์ฃผ๋„ํ•˜๋ฉฐ, ํŠนํžˆ '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '์ง‘์ค‘์ ๊ฒ€'๊ณผ ๊ฐ™์€ ์‹œ์‚ฌ/๊ธฐํš ๊ธฐ์‚ฌ์— ๋Œ€ํ•œ ๊ด€์‹ฌ์ด ๋†’์Šต๋‹ˆ๋‹ค.
- (demographic_category_preference_heatmap.png ์ฐธ๊ณ )
### 2.3. ์œ ์ž… ๊ฒฝ๋กœ ํšจ์œจ์„ฑ
- **์ฃผ์š” ์œ ์ž… ์ฑ„๋„**: 'Google'๊ณผ '๋„ค์ด๋ฒ„' ๊ด€๋ จ ์ฑ„๋„(ํ†ตํ•ฉ๊ฒ€์ƒ‰, ๋ธ”๋กœ๊ทธ ๋“ฑ)์ด ์ „์ฒด ํŠธ๋ž˜ํ”ฝ์˜ ์••๋„์ ์ธ ๋น„์ค‘์„ ์ฐจ์ง€ํ•ฉ๋‹ˆ๋‹ค. ๊ฒ€์ƒ‰ ์—”์ง„ ์ตœ์ ํ™”(SEO)์˜ ์ค‘์š”์„ฑ์ด ๋งค์šฐ ํฝ๋‹ˆ๋‹ค.
- **๊ณ ํ’ˆ์งˆ ํŠธ๋ž˜ํ”ฝ**: '๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ๊ฒ€์ƒ‰'์€ ๋†’์€ ํŠธ๋ž˜ํ”ฝ ๊ธฐ์—ฌ๋„์™€ ํ•จ๊ป˜ ์–‘ํ˜ธํ•œ ๋…์ž ์ฐธ์—ฌ๋„๋ฅผ ๋ณด์—ฌ์ฃผ๋Š” ํšจ์œจ์ ์ธ ์ฑ„๋„์ž…๋‹ˆ๋‹ค. ๋ฐ˜๋ฉด, 'Google'์€ ๊ฐ€์žฅ ๋งŽ์€ ํŠธ๋ž˜ํ”ฝ์„ ์œ ์ž…์‹œํ‚ค์ง€๋งŒ, ํ‰๊ท  ์ฐธ์—ฌ๋„๋Š” ์ƒ๋Œ€์ ์œผ๋กœ ๋‚ฎ์•„ ๋„“์€ ๋ฒ”์œ„์˜ ์ผ๋ฐ˜ ๋…์ž ์œ ์ž…์ด ๋งŽ์„ ๊ฒƒ์œผ๋กœ ์ถ”์ •๋ฉ๋‹ˆ๋‹ค. (referrer_performance.png ์ฐธ๊ณ )
## 3. ์ „๋žต์  ์ œ์–ธ (Strategic Recommendations)
1. **์ฝ˜ํ…์ธ  ๊ฐœ์ธํ™” ๋ฐ ํƒ€๊ฒŸํŒ… ๊ฐ•ํ™”**:
- **ํ•ต์‹ฌ ๋…์ž์ธต(19-34์„ธ) ์ง‘์ค‘**: ์ด๋“ค์ด ์„ ํ˜ธํ•˜๋Š” '๋ฏธ๋””์–ด ไบบ์‚ฌ์ด๋“œ', '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ'์™€ ๊ฐ™์€ ์‹ฌ์ธต ๋ถ„์„ ๋ฐ ํŠธ๋ Œ๋“œ ๊ด€๋ จ ์ฝ˜ํ…์ธ ๋ฅผ ๊ฐ•ํ™”ํ•˜๊ณ , ๊ด€๋ จ ์‹ ๊ทœ ๊ธฐํš์„ ๋ฐœ๊ตดํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
- **์ž ์žฌ ๋…์ž์ธต(40๋Œ€ ์ด์ƒ) ๊ณต๋žต**: 40๋Œ€ ์ด์ƒ ๋‚จ๋…€๊ฐ€ ๊ณตํ†ต์ ์œผ๋กœ ๊ด€์‹ฌ์„ ๋ณด์ด๋Š” '์ง‘์ค‘์ ๊ฒ€', '๋ฏธ๋””์–ดํ˜„์žฅ' ์นดํ…Œ๊ณ ๋ฆฌ ์ฝ˜ํ…์ธ ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์ด ์—ฐ๋ น๋Œ€์— ํŠนํ™”๋œ ์ฃผ์ œ(์˜ˆ: ๋ฏธ๋””์–ด ๋ฆฌํ„ฐ๋Ÿฌ์‹œ, ๊ฐ€์งœ๋‰ด์Šค ํŒ๋ณ„)๋กœ ํ™•์žฅํ•˜๋Š” ์ „๋žต์„ ๊ณ ๋ คํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
2. **๊ฒ€์ƒ‰์—”์ง„ ์ตœ์ ํ™”(SEO) ๊ณ ๋„ํ™”**:
- **์ฝ˜ํ…์ธ -ํƒœ๊ทธ ์—ฐ๊ณ„**: Word Cloud ๋ถ„์„์—์„œ ๋„์ถœ๋œ '#AI', '#๋””์ง€ํ„ธ', '#ํ”Œ๋žซํผ' ๋“ฑ์˜ ์ธ๊ธฐ ๊ธฐ์ˆ  ํƒœ๊ทธ์™€ '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '์ง‘์ค‘์ ๊ฒ€'๊ณผ ๊ฐ™์€ ์ธ๊ธฐ ์นดํ…Œ๊ณ ๋ฆฌ๋ฅผ ์กฐํ•ฉํ•œ ์ฝ˜ํ…์ธ ๋ฅผ ๊ธฐํšํ•˜์—ฌ ๊ฒ€์ƒ‰ ๋…ธ์ถœ ๊ฐ€๋Šฅ์„ฑ์„ ๊ทน๋Œ€ํ™”ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
- **๋ธ”๋กœ๊ทธ ์ฑ„๋„ ํ™œ์šฉ**: '๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ'๊ฐ€ ์–‘์งˆ์˜ ๋…์ž๋ฅผ ์œ ์ž…์‹œํ‚ค๋Š” ํ•ต์‹ฌ ์ฑ„๋„์ž„์ด ํ™•์ธ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์นด๋“œ๋‰ด์Šค๋‚˜ ๊ธฐ์‚ฌ ์š”์•ฝ๋ณธ ๋“ฑ ๋ธ”๋กœ๊ทธ ํ”Œ๋žซํผ์— ์ตœ์ ํ™”๋œ 2์ฐจ ์ฝ˜ํ…์ธ ๋ฅผ ์ œ์ž‘ํ•˜์—ฌ ๋ฐฐํฌํ•˜๋Š” ์ „๋žต์ด ์œ ํšจํ•ฉ๋‹ˆ๋‹ค.
3. **๋…์ž ์ฐธ์—ฌ๋„ ์ฆ์ง„ ์ „๋žต**:
- **์ฐธ์—ฌ๋„ ๋†’์€ ์นดํ…Œ๊ณ ๋ฆฌ ๋ฒค์น˜๋งˆํ‚น**: '๊ธ€๋กœ๋ฒŒ ๋ฏธ๋””์–ด ํ˜„์žฅ', '๋ฏธ๋””์–ด ๋ฆฌ๋ทฐ' ๋“ฑ ์ฐธ์—ฌ๋„๊ฐ€ ๋†’์€ ์นดํ…Œ๊ณ ๋ฆฌ์˜ ํ˜•์‹(์˜ˆ: ์ „๋ฌธ๊ฐ€ ์ธํ„ฐ๋ทฐ, ํŠน์ • ์‚ฌ๋ก€ ์‹ฌ์ธต ๋ถ„์„, ๋ช…ํ™•ํ•œ ์ฃผ์žฅ ์ œ์‹œ)์„ ๋‹ค๋ฅธ ๊ธฐ์‚ฌ์— ์ ์šฉํ•ด ๋ณผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
- **์ธํ„ฐ๋ž™ํ‹ฐ๋ธŒ ์š”์†Œ ๋„์ž…**: ๊ธฐ์‚ฌ ๋ง๋ฏธ์— ๊ด€๋ จ ์ฃผ์ œ์— ๋Œ€ํ•œ ๋…์ž ์˜๊ฒฌ์„ ๋ฌป๋Š” ์งˆ๋ฌธ์„ ์ถ”๊ฐ€ํ•˜๊ฑฐ๋‚˜, ํˆฌํ‘œ ๊ธฐ๋Šฅ์„ ํ™œ์šฉํ•˜์—ฌ ๋Œ“๊ธ€ ๋ฐ ์ƒํ˜ธ์ž‘์šฉ์„ ์œ ๋„ํ•˜๋Š” ๋ฐฉ์•ˆ์„ ๊ฒ€ํ† ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
"""
# ๋ฆฌํฌํŠธ ํŒŒ์ผ๋กœ ์ €์žฅ
report_path = f'{output_dir}/comprehensive_analysis_report.txt'
with open(report_path, 'w', encoding='utf-8') as f:
f.write(report)
print(f" - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")
# 6. 메인 실행 함수
def main():
    """Run the whole pipeline: setup, load, the four analyses, then the report."""
    print("===== 신문과방송 독자 데이터 심층 분석 스크립트 실행 =====")

    # 1. Environment setup (paths, plot style)
    data_dir, output_dir = setup_environment()

    # 2. Load and preprocess the datasets
    all_data = load_and_preprocess_data(data_dir)
    merged = all_data['merged']

    # 3. Detailed analyses; each one saves its figure(s) into output_dir
    analyze_metrics_overview(merged, output_dir)
    analyze_content_features(merged, output_dir)
    analyze_demographics(all_data['demo'], merged, output_dir)
    analyze_referrer(all_data['referrer'], merged, output_dir)

    # 4. Comprehensive insights report
    generate_insights_report(all_data, output_dir)

    print("\n===== 모든 분석이 성공적으로 완료되었습니다. =====")
    print(f"결과물은 '{output_dir}' 폴더에서 확인하실 수 있습니다.")
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()