# Broadcast_paper / analysis2.py
# Choi jun hyeok
# update prompt
# be91dcc
# -*- coding: utf-8 -*-
"""
In-depth EDA of "Newspaper and Broadcasting" reader data (with an added AI-model validation perspective).

On top of the existing analyses, this script adds analyses intended to show, with data,
the need for and the validity of AI title/description generation and a RAG-based
performance prediction model.

Additional analyses:
- Structural characteristics of successful article titles (length, keyword inclusion, etc.)
- "Success rate per topic cluster", to provide a rationale for the RAG model
"""
# 1. Library imports (same as the previous version)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
from wordcloud import WordCloud
warnings.filterwarnings('ignore')
# 2. Basic settings and global variables (same as the previous version)
def setup_environment():
    """Prepare the output folder and the plotting defaults for the analysis.

    Creates the output directory when it is missing, then configures
    matplotlib and seaborn to use the 'Malgun Gothic' font with the
    'whitegrid' style and ASCII minus signs.

    Returns:
        tuple[str, str]: ``(data_dir, output_dir)`` — the relative path of the
        input CSV folder and the relative path of the results folder.
    """
    # Build the path with the platform separator; a hard-coded backslash only
    # resolves on Windows because callers append file names with '/'.
    data_dir = os.path.join('Broadcast_paper', 'data_csv')
    output_dir = './output_analysis_v2'  # results folder for this version of the analysis

    # Only announce the folder when this call actually creates it.
    # exist_ok=True keeps a concurrent creation from raising.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"'{output_dir}' ํด๋”๋ฅผ ์ƒ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.")

    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    # sns.set() rewrites rcParams, so the font and minus-sign settings are passed again here.
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("๋ถ„์„ ํ™˜๊ฒฝ ์„ค์ • ์™„๋ฃŒ!")
    return data_dir, output_dir
# 3. Data loading and preprocessing (same as the previous version)
def load_and_preprocess_data(data_dir):
    """Load the four input CSV files and build the per-article analysis table.

    Reads ``article_metrics_monthly.csv``, ``contents.csv``,
    ``demographics_merged.csv`` and ``referrer.csv`` from *data_dir*.

    Args:
        data_dir: Folder that contains the CSV files.

    Returns:
        dict: DataFrames under the keys ``"metrics"``, ``"contents"``,
        ``"demo"``, ``"referrer"`` and ``"merged"``. ``"merged"`` holds one row
        per content row, joined with the summed views/likes/comments and an
        ``engagement_rate`` column (percent; NaN when the article has no views).
    """
    print("\n[๋‹จ๊ณ„ 1] ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์‹œ์ž‘...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    df_demo = pd.read_csv(f'{data_dir}/demographics_merged.csv')
    df_referrer = pd.read_csv(f'{data_dir}/referrer.csv')

    df_metrics['period'] = pd.to_datetime(df_metrics['period'])
    # Assign the result back: an in-place fillna on a selected column is a
    # chained operation that does not update the frame under Copy-on-Write.
    df_metrics['comments'] = df_metrics['comments'].fillna(0)

    # Rows without a category, body text or date cannot be used downstream.
    df_contents = df_contents.dropna(subset=['category', 'content', 'date'])
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_month'] = df_contents['date'].dt.to_period('M')
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()

    # Exclude the rows whose age_group equals the literal below.
    df_demo_filtered = df_demo[df_demo['age_group'] != '์ „์ฒด'].copy()

    # Collapse the monthly metrics into lifetime totals per article.
    article_total_metrics = (
        df_metrics.groupby('article_id')
        .agg({'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'})
        .reset_index()
    )

    # Left join keeps articles that have no metrics; their totals become 0.
    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged = df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0})

    # Zero views are mapped to NaN so the division yields NaN instead of inf.
    interactions = df_merged['likes'] + df_merged['comments']
    df_merged['engagement_rate'] = (interactions / df_merged['views_total'].replace(0, np.nan)) * 100

    print("๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
    return {
        "metrics": df_metrics, "contents": df_contents, "demo": df_demo_filtered,
        "referrer": df_referrer, "merged": df_merged
    }
# 4. Detailed analysis and visualization functions
# (analyze_metrics_overview, analyze_content_features, analyze_demographics and analyze_referrer are kept the same as in the previous version)
# ==============================================================================
# ***** New analysis functions for validating the AI models *****
# ==============================================================================
def _add_title_features(df_merged):
    """Return a copy of *df_merged* with the title-derived feature columns added.

    Adds ``title_length``, ``has_keyword_in_title``, ``has_number_in_title``
    and ``is_question_title``. The input frame is not modified.
    """
    df_copy = df_merged.copy()
    df_copy['title_length'] = df_copy['title'].str.len()

    # Missing titles are treated as empty text for the boolean features so the
    # substring test below cannot hit a float NaN.
    titles = df_copy['title'].fillna('').astype(str)

    # The 20 most frequent tags are used as the "core keywords".
    tags = df_copy['tag'].dropna().astype(str).str.split(',').explode().str.strip()
    tags = tags[tags != '']  # "a,,b" or a trailing comma yields empty entries
    top_20_tags = [tag.replace('#', '') for tag in tags.value_counts().head(20).index]
    # An empty keyword is a substring of every title, so it must be discarded.
    top_20_tags = [tag for tag in top_20_tags if tag]

    df_copy['has_keyword_in_title'] = titles.apply(
        lambda title: any(tag in title for tag in top_20_tags)
    )
    df_copy['has_number_in_title'] = titles.str.contains(r'\d', regex=True)
    df_copy['is_question_title'] = titles.str.endswith('?')
    return df_copy


def analyze_title_performance(df_merged, output_dir):
    """Plot mean total views against four title characteristics.

    The characteristics are the title-length quartile, whether a top-20 tag
    appears in the title, whether the title contains a digit, and whether it
    ends with '?'. The 2x2 bar chart is saved as
    ``title_characteristics_performance.png`` inside *output_dir*.

    Args:
        df_merged: Article table with ``title``, ``tag`` and ``views_total`` columns.
        output_dir: Folder the PNG is written to.

    Raises:
        ValueError: From ``pd.qcut`` when the title-length quartile edges are
            not unique.
    """
    print("\n[์‹ ๊ทœ ๋ถ„์„ 1] ์ œ๋ชฉ ํŠน์„ฑ๊ณผ ๊ธฐ์‚ฌ ์„ฑ๊ณผ ์—ฐ๊ด€์„ฑ ๋ถ„์„...")

    # 1. Feature engineering
    df_copy = _add_title_features(df_merged)
    df_copy['title_len_group'] = pd.qcut(
        df_copy['title_length'], q=4, labels=['๋งค์šฐ ์งง์Œ', '์งง์Œ', '๊น€', '๋งค์šฐ ๊น€']
    )

    # 2. Visualization
    fig, axes = plt.subplots(2, 2, figsize=(20, 14))
    fig.suptitle('์ œ๋ชฉ ํŠน์„ฑ์— ๋”ฐ๋ฅธ ๊ธฐ์‚ฌ ์„ฑ๊ณผ ๋ถ„์„ (ํ‰๊ท  ์กฐํšŒ์ˆ˜)', fontsize=20, y=1.02)

    # (axes, feature column, palette, panel title, x label, y label)
    panels = [
        (axes[0, 0], 'title_len_group', 'viridis',
         '์ œ๋ชฉ ๊ธธ์ด๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', '์ œ๋ชฉ ๊ธธ์ด ๊ทธ๋ฃน', 'ํ‰๊ท  ์กฐํšŒ์ˆ˜'),
        (axes[0, 1], 'has_keyword_in_title', 'plasma',
         '์ œ๋ชฉ ๋‚ด ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', 'ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€', ''),
        (axes[1, 0], 'has_number_in_title', 'magma',
         '์ œ๋ชฉ ๋‚ด ์ˆซ์ž ํฌํ•จ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', '์ˆซ์ž ํฌํ•จ ์—ฌ๋ถ€', 'ํ‰๊ท  ์กฐํšŒ์ˆ˜'),
        (axes[1, 1], 'is_question_title', 'cividis',
         '์งˆ๋ฌธ ํ˜•์‹ ์ œ๋ชฉ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', '์งˆ๋ฌธ ํ˜•์‹ ์—ฌ๋ถ€', ''),
    ]
    for ax, column, palette, panel_title, x_label, y_label in panels:
        sns.barplot(data=df_copy, x=column, y='views_total', ax=ax, palette=palette, ci=None)
        ax.set_title(panel_title, fontsize=16)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)

    plt.tight_layout()
    # The suptitle sits at y=1.02, i.e. above the figure canvas; a tight
    # bounding box is needed so it is not cropped out of the saved image.
    fig.savefig(f'{output_dir}/title_characteristics_performance.png', bbox_inches='tight')
    plt.close(fig)
    print(" - ์ œ๋ชฉ ํŠน์„ฑ ๋ถ„์„ ์™„๋ฃŒ. (title_characteristics_performance.png ์ €์žฅ)")
def analyze_topic_clusters_for_rag(df_merged, output_dir):
    """Chart, per category, the share of articles in the top 20% by total views.

    An article counts as high-performing when its ``views_total`` is at or
    above the 80th percentile of all articles. The horizontal bar chart is
    saved as ``topic_cluster_success_rate.png`` inside *output_dir*.

    Args:
        df_merged: Article table with ``article_id``, ``category`` and
            ``views_total`` columns.
        output_dir: Folder the PNG is written to.
    """
    print("\n[์‹ ๊ทœ ๋ถ„์„ 2] ์ฃผ์ œ ๊ตฐ์ง‘๋ณ„ ์„ฑ๊ณต๋ฅ  ๋ถ„์„ (RAG ๋ชจ๋ธ ๊ทผ๊ฑฐ ๋งˆ๋ จ)...")

    # 1. Flag the high-performing articles (80th percentile of views and above).
    articles = df_merged.copy()
    views_cutoff = articles['views_total'].quantile(0.8)
    articles['is_high_performing'] = articles['views_total'] >= views_cutoff

    # 2. Per category: number of articles and number of high performers.
    grouped = articles.groupby('category')
    per_category = grouped.agg(
        total_articles=('article_id', 'count'),
        high_performing_articles=('is_high_performing', 'sum'),
    )
    per_category = per_category.reset_index()

    # 3. Success rate in percent, best category first.
    hits = per_category['high_performing_articles']
    totals = per_category['total_articles']
    per_category['success_rate'] = (hits / totals) * 100
    per_category = per_category.sort_values('success_rate', ascending=False)

    # 4. Visualization
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.barplot(data=per_category, y='category', x='success_rate', palette='coolwarm', ax=ax)
    ax.set_title('์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ ์ƒ์œ„ 20% ์„ฑ๊ณผ ๊ธฐ์‚ฌ ๋น„์œจ (์„ฑ๊ณต๋ฅ )', fontsize=18)
    ax.set_xlabel('์„ฑ๊ณต๋ฅ  (%)')
    ax.set_ylabel('์นดํ…Œ๊ณ ๋ฆฌ')
    # Fixed reference line at 20%.
    ax.axvline(x=20, color='red', linestyle='--', label='์ „์ฒด ํ‰๊ท  ์„ฑ๊ณต๋ฅ  (20%)')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'{output_dir}/topic_cluster_success_rate.png')
    plt.close(fig)
    print(" - ์ฃผ์ œ ๊ตฐ์ง‘๋ณ„ ์„ฑ๊ณต๋ฅ  ๋ถ„์„ ์™„๋ฃŒ. (topic_cluster_success_rate.png ์ €์žฅ)")
# 5. Consolidated insight generation (report content updated)
def generate_insights_report(data, output_dir):
    """Write the narrative insights report as a UTF-8 text file.

    The report body is a fixed text template; only the generation timestamp is
    filled in at run time. The file is saved as
    ``comprehensive_analysis_report_for_ai_validation.txt`` inside *output_dir*.

    Args:
        data: Accepted for interface compatibility; not read by this function.
        output_dir: Folder the report is written to.
    """
    print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ (AI ๋ชจ๋ธ ๊ฒ€์ฆ ๋‚ด์šฉ ์ถ”๊ฐ€)...")
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The template lines start at column 0 on purpose: they are file content.
    report_text = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ (AI ๋ชจ๋ธ ๋„์ž… ํƒ€๋‹น์„ฑ ์ค‘์‹ฌ)
์ƒ์„ฑ์ผ: {generated_at}
## 1. ๋ถ„์„ ๊ฐœ์š”
- ๋ณธ ๋ณด๊ณ ์„œ๋Š” ๊ธฐ์‚ฌ ์„ฑ๊ณผ, ๋…์ž ํŠน์„ฑ, ์œ ์ž… ๊ฒฝ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ•˜์—ฌ **AI ๊ธฐ๋ฐ˜ ์ฝ˜ํ…์ธ  ๊ฐœ์ธํ™” ์‹œ์Šคํ…œ** ๋„์ž…์˜ ํ•„์š”์„ฑ๊ณผ ํƒ€๋‹น์„ฑ์„ ๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜์œผ๋กœ ์ฆ๋ช…ํ•˜๋Š” ๊ฒƒ์„ ๋ชฉํ‘œ๋กœ ํ•ฉ๋‹ˆ๋‹ค.
## 2. ์ฃผ์š” ๋ถ„์„ ๊ฒฐ๊ณผ (Key Findings)
(๊ธฐ์กด 2.1 ~ 2.3 ๋‚ด์šฉ ์ƒ๋žต)
...
## 3. โ˜… AI ๊ธฐ๋ฐ˜ ์ œ๋ชฉ ์ถ”์ฒœ ๋ฐ ์„ฑ๊ณผ ์˜ˆ์ธก ๋ชจ๋ธ์˜ ํƒ€๋‹น์„ฑ ๊ฒ€์ฆ โ˜…
### 3.1. ์™œ AI ์ œ๋ชฉ ์ถ”์ฒœ์ด ํ•„์š”ํ•œ๊ฐ€?: ์„ฑ๊ณตํ•˜๋Š” ์ œ๋ชฉ์—๋Š” ํŒจํ„ด์ด ์žˆ๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ์ œ๋ชฉ์˜ ๊ตฌ์กฐ์  ํŠน์„ฑ์ด ํ‰๊ท  ์กฐํšŒ์ˆ˜์— ์œ ์˜๋ฏธํ•œ ์˜ํ–ฅ์„ ๋ฏธ์น˜๋Š” ๊ฒƒ์œผ๋กœ ๋‚˜ํƒ€๋‚ฌ์Šต๋‹ˆ๋‹ค. (title_characteristics_performance.png ์ฐธ๊ณ )
- **๊ธธ์ด**: '๊น€' ๋˜๋Š” '๋งค์šฐ ๊น€' ๊ทธ๋ฃน์˜ ์ œ๋ชฉ์ด ์งง์€ ์ œ๋ชฉ๋ณด๋‹ค ๋†’์€ ์กฐํšŒ์ˆ˜๋ฅผ ๊ธฐ๋กํ•˜๋Š” ๊ฒฝํ–ฅ์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ์ด๋Š” ๋…์ž์˜ ํฅ๋ฏธ๋ฅผ ๋Œ๊ธฐ ์œ„ํ•ด ์ถฉ๋ถ„ํ•œ ์ •๋ณด๋‚˜ ๋งฅ๋ฝ์„ ์ œ๊ณตํ•˜๋Š” ๊ฒƒ์ด ์œ ๋ฆฌํ•จ์„ ์‹œ์‚ฌํ•ฉ๋‹ˆ๋‹ค.
- **ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ**: '#๋ฏธ๋””์–ด', '#AI' ๋“ฑ ์ƒ์œ„ ํƒœ๊ทธ๊ฐ€ ํฌํ•จ๋œ ์ œ๋ชฉ์˜ ๊ธฐ์‚ฌ๋Š” ๊ทธ๋ ‡์ง€ ์•Š์€ ๊ธฐ์‚ฌ๋ณด๋‹ค **ํ‰๊ท  ์กฐํšŒ์ˆ˜๊ฐ€ ์›”๋“ฑํžˆ ๋†’์•˜์Šต๋‹ˆ๋‹ค.** ์ด๋Š” ๋…์ž๋“ค์ด ์ต์ˆ™ํ•˜๊ณ  ๊ด€์‹ฌ ์žˆ๋Š” ํ‚ค์›Œ๋“œ์— ์ฆ‰๊ฐ์ ์œผ๋กœ ๋ฐ˜์‘ํ•จ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **์ˆซ์ž ๋ฐ ํ˜•์‹**: ์ œ๋ชฉ์— 'TOP 5', '3๊ฐ€์ง€ ์ด์œ ' ๋“ฑ ์ˆซ์ž๋ฅผ ํฌํ•จํ•˜๊ฑฐ๋‚˜, '~๋ž€ ๋ฌด์—‡์ธ๊ฐ€?'์™€ ๊ฐ™์€ ์งˆ๋ฌธ ํ˜•์‹์˜ ์ œ๋ชฉ์ด ๋…์ž์˜ ์ฃผ๋ชฉ์„ ๋„๋Š” ๋ฐ ํšจ๊ณผ์ ์ด์—ˆ์Šต๋‹ˆ๋‹ค.
- **๊ฒฐ๋ก **: ์ด์ฒ˜๋Ÿผ ์„ฑ๊ณต์ ์ธ ์ œ๋ชฉ์˜ ํŒจํ„ด์„ ๋ถ„์„ํ•˜๊ณ  ์ด๋ฅผ ์‹ ๊ทœ ๊ธฐ์‚ฌ์— ์ผ๊ด€๋˜๊ฒŒ ์ ์šฉํ•˜๋Š” ๊ฒƒ์€ ๋งค์šฐ ์ค‘์š”ํ•ฉ๋‹ˆ๋‹ค. **AI ์ถ”์ฒœ ๋ชจ๋ธ์€ ์ด๋Ÿฌํ•œ ์ตœ์ ์˜ ํŒจํ„ด์„ ๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜์œผ๋กœ ํ•™์Šตํ•˜์—ฌ, ์—๋””ํ„ฐ์˜ ์ฃผ๊ด€์— ์˜์กดํ•˜์ง€ ์•Š๊ณ  ๊พธ์ค€ํžˆ ๋†’์€ ์„ฑ๊ณผ๋ฅผ ๋‚ด๋Š” ์ œ๋ชฉ ์ƒ์„ฑ์„ ์ž๋™ํ™”**ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
### 3.2. ์™œ RAG ๊ธฐ๋ฐ˜ ์„ฑ๊ณผ ์˜ˆ์ธก์ด ์‹ ๋ขฐํ•  ์ˆ˜ ์žˆ๋Š”๊ฐ€?: ์„ฑ๊ณต์€ ํŠน์ • ์ฃผ์ œ์— ์ง‘์ค‘๋œ๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ๊ธฐ์‚ฌ์˜ ์„ฑ๊ณต์€ ๋ฌด์ž‘์œ„๋กœ ๋ฐœ์ƒํ•˜์ง€ ์•Š๊ณ , ํŠน์ • **์ฃผ์ œ(์นดํ…Œ๊ณ ๋ฆฌ) ๋‚ด์—์„œ ๋†’์€ ์ง‘์ค‘๋„**๋ฅผ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. (topic_cluster_success_rate.png ์ฐธ๊ณ )
- **'์„ฑ๊ณต๋ฅ ' ์ƒ์œ„ ์นดํ…Œ๊ณ ๋ฆฌ**: '๋ฏธ๋””์–ด ไบบ์‚ฌ์ด๋“œ', '๋ฏธ๋””์–ดยทAIํŠธ๋ Œ๋“œ', '์•„์ด๋””์–ด์Šค' ๋“ฑ์˜ ์นดํ…Œ๊ณ ๋ฆฌ๋Š” ์ „์ฒด ๊ธฐ์‚ฌ ์ค‘ ์ƒ์œ„ 20%์˜ ์„ฑ๊ณผ๋ฅผ ๋‚ด๋Š” '์„ฑ๊ณต ๊ธฐ์‚ฌ'์˜ ๋น„์œจ์ด 30%๋ฅผ ์ƒํšŒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ด๋Š” ์ด ์ฃผ์ œ ์ž์ฒด๊ฐ€ ๋…์ž๋“ค์˜ ๋†’์€ ๊ด€์‹ฌ์„ ๋ณด์žฅํ•˜๋Š” **'์„ฑ๊ณต ๋ณด์ฆ ์ˆ˜ํ‘œ'**์— ๊ฐ€๊น๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **'์„ฑ๊ณต๋ฅ ' ํ•˜์œ„ ์นดํ…Œ๊ณ ๋ฆฌ**: ๋ฐ˜๋ฉด, ์ผ๋ถ€ ์นดํ…Œ๊ณ ๋ฆฌ๋Š” ์„ฑ๊ณต๋ฅ ์ด 10% ๋ฏธ๋งŒ์œผ๋กœ, ๋™์ผํ•œ ๋…ธ๋ ฅ์„ ํˆฌ์ž…ํ•ด๋„ ๋†’์€ ์„ฑ๊ณผ๋ฅผ ๊ธฐ๋Œ€ํ•˜๊ธฐ ์–ด๋ ค์›€์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.
- **๊ฒฐ๋ก **: ๊ธฐ์‚ฌ์˜ ์„ฑ๊ณต ์—ฌ๋ถ€๋Š” ํ•ด๋‹น ๊ธฐ์‚ฌ๊ฐ€ ์–ด๋–ค **'์ฃผ์ œ ๊ตฐ์ง‘'**์— ์†ํ•˜๋Š”์ง€์™€ ๋ฐ€์ ‘ํ•œ ๊ด€๋ จ์ด ์žˆ์Šต๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ **RAG ๋ชจ๋ธ์ด ์ƒˆ๋กœ์šด ๊ธฐ์‚ฌ์™€ '์œ ์‚ฌํ•œ ๊ณผ๊ฑฐ ์„ฑ๊ณต ์‚ฌ๋ก€'๋ฅผ ์ฐพ์•„ ๊ทธ ์„ฑ๊ณผ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๋ฏธ๋ž˜๋ฅผ ์˜ˆ์ธกํ•˜๋Š” ๋ฐฉ์‹์€ ๋ฐ์ดํ„ฐ์ ์œผ๋กœ ๋งค์šฐ ํƒ€๋‹น**ํ•ฉ๋‹ˆ๋‹ค. ์„ฑ๊ณต๋ฅ ์ด ๋†’์€ ๊ตฐ์ง‘์˜ ๊ธฐ์‚ฌ์™€ ์œ ์‚ฌํ•˜๋‹ค๋ฉด ๋†’์€ ๋…์ž ์ˆ˜๋ฅผ, ๊ทธ๋ ‡์ง€ ์•Š๋‹ค๋ฉด ๋‚ฎ์€ ๋…์ž ์ˆ˜๋ฅผ ์˜ˆ์ธกํ•˜๋Š” ๊ฒƒ์ด ํ•ฉ๋ฆฌ์ ์ž…๋‹ˆ๋‹ค.
## 4. ์ „๋žต์  ์ œ์–ธ (AI ์‹œ์Šคํ…œ ๋„์ž…์„ ์ค‘์‹ฌ์œผ๋กœ)
1. **AI ์ œ๋ชฉ/์„ค๋ช… ์ƒ์„ฑ๊ธฐ ๋„์ž…**: EDA๋ฅผ ํ†ตํ•ด ๊ฒ€์ฆ๋œ **'์„ฑ๊ณตํ•˜๋Š” ์ œ๋ชฉ ํŒจํ„ด'(์ ์ ˆํ•œ ๊ธธ์ด, ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ, ์ˆซ์ž/์งˆ๋ฌธ ํ™œ์šฉ)์„ AI ๋ชจ๋ธ์— ํ•™์Šต**์‹œ์ผœ ๋ชจ๋“  ์‹ ๊ทœ ์ฝ˜ํ…์ธ ์˜ ์ œ๋ชฉ๊ณผ ์„ค๋ช…์„ ์ž๋™์œผ๋กœ ์ƒ์„ฑ ๋ฐ ์ถ”์ฒœ๋ฐ›์•„์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์ด๋ฅผ ํ†ตํ•ด ์ฝ˜ํ…์ธ  ์„ฑ๊ณผ์˜ ์ƒํ–ฅ ํ‰์ค€ํ™”๋ฅผ ๊ธฐ๋Œ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
2. **RAG ์˜ˆ์ธก ๋ชจ๋ธ์„ ํ™œ์šฉํ•œ '์„ ํƒ๊ณผ ์ง‘์ค‘'**: ๊ธฐ์‚ฌ ๊ธฐํš ๋‹จ๊ณ„์—์„œ **ํ•ต์‹ฌ ์ฃผ์ œ์™€ ์˜ˆ์ƒ ์ œ๋ชฉ์„ RAG ๋ชจ๋ธ์— ์ž…๋ ฅํ•˜์—ฌ '์˜ˆ์ƒ ๋…์ž ์ˆ˜'๋ฅผ ๋ฏธ๋ฆฌ ํ™•์ธ**ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
- ์˜ˆ์ธก ๋…์ž ์ˆ˜๊ฐ€ ๋†’์€ ๊ธฐํš์•ˆ์€ ๋ฆฌ์†Œ์Šค๋ฅผ ์ง‘์ค‘ํ•˜์—ฌ ์šฐ์„ ์ ์œผ๋กœ ๋ฐœํ–‰ํ•˜๊ณ , ์˜ˆ์ธก์น˜๊ฐ€ ๋‚ฎ์€ ๊ธฐํš์•ˆ์€ ๋…์ž ๊ด€์‹ฌ๋„๊ฐ€ ๋†’์€ ์ฃผ์ œ์™€ ๊ฒฐํ•ฉํ•˜๊ฑฐ๋‚˜ ์ œ๋ชฉ ํŒจํ„ด์„ ์ˆ˜์ •ํ•˜๋Š” ๋“ฑ **'๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ์˜์‚ฌ๊ฒฐ์ •'**์„ ํ†ตํ•ด ์‹คํŒจ ํ™•๋ฅ ์„ ์ค„์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค.
3. **A/B ํ…Œ์ŠคํŠธ๋ฅผ ํ†ตํ•œ ๋ชจ๋ธ ๊ณ ๋„ํ™”**: AI๊ฐ€ ์ถ”์ฒœํ•œ ์—ฌ๋Ÿฌ ์ œ๋ชฉ ํ›„๋ณด๊ตฐ์„ ๋Œ€์ƒ์œผ๋กœ A/B ํ…Œ์ŠคํŠธ๋ฅผ ์ง„ํ–‰ํ•˜๊ณ , ์‹ค์ œ ์„ฑ๊ณผ ๋ฐ์ดํ„ฐ๋ฅผ ๋‹ค์‹œ ๋ชจ๋ธ์— ํ•™์Šต์‹œ์ผœ ์ง€์†์ ์œผ๋กœ ์ถ”์ฒœ ๋ฐ ์˜ˆ์ธก ์ •ํ™•๋„๋ฅผ ๋†’์—ฌ๋‚˜๊ฐ€์•ผ ํ•ฉ๋‹ˆ๋‹ค.
"""
    destination = f'{output_dir}/comprehensive_analysis_report_for_ai_validation.txt'
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(report_text)
    print(f" - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ ์™„๋ฃŒ. ({destination} ์ €์žฅ)")
# 6. Main entry function
def main():
    """Run the pipeline: set up the environment, load the data, run the two new analyses, write the report."""
    print("===== ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ (AI ๋ชจ๋ธ ๊ฒ€์ฆ ๊ด€์ ) =====")
    source_dir, output_dir = setup_environment()
    datasets = load_and_preprocess_data(source_dir)
    merged = datasets['merged']

    # --- Earlier analyses (uncomment when needed) ---
    # analyze_metrics_overview(merged, output_dir)
    # analyze_content_features(merged, output_dir)
    # analyze_demographics(datasets['demo'], merged, output_dir)
    # analyze_referrer(datasets['referrer'], merged, output_dir)

    # --- New analyses ---
    analyze_title_performance(merged, output_dir)
    analyze_topic_clusters_for_rag(merged, output_dir)
    generate_insights_report(datasets, output_dir)

    print("\n===== ๋ชจ๋“  ๋ถ„์„์ด ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. =====")
    print(f"๊ฒฐ๊ณผ๋ฌผ์€ '{output_dir}' ํด๋”์—์„œ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")


# Run the pipeline only when the file is executed as a script.
if __name__ == '__main__':
    main()