File size: 11,000 Bytes
be91dcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
"""
์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต EDA (์กฐํšŒ์ˆ˜ ์ค‘์‹ฌ ์„ฑ๊ณต ๊ณต์‹ ๋„์ถœ - v2)

- ์˜ค๋ฅ˜ ์ˆ˜์ •: tick_params ha ๊ด€๋ จ ์˜ค๋ฅ˜ ํ•ด๊ฒฐ
- ๋ถ„์„ ์‹ฌํ™”: TOP 20 ๊ธฐ์‚ฌ ๋ฆฌ์ŠคํŠธ์—์„œ ๋ฐœ๊ฒฌ๋œ ์งˆ์  ์ธ์‚ฌ์ดํŠธ(๋ง๋จธ๋ฆฌ, ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ)๋ฅผ
             ์ •๋Ÿ‰์ ์œผ๋กœ ๊ฒ€์ฆํ•˜๋Š” ๋ถ„์„ ๋กœ์ง ์ถ”๊ฐ€
"""

# 1. ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ž„ํฌํŠธ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
import re

warnings.filterwarnings('ignore')

# 2. ๊ธฐ๋ณธ ์„ค์ • ๋ฐ ์ „์—ญ ๋ณ€์ˆ˜
def setup_environment():
    DATA_DIR = r'Broadcast_paper\data_csv'
    OUTPUT_DIR = r'./output_analysis_v6' # ๊ฒฐ๊ณผ ์ €์žฅ ํด๋” ๋ณ€๊ฒฝ
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"'{OUTPUT_DIR}' ํด๋”๋ฅผ ์ƒ์„ฑํ–ˆ์Šต๋‹ˆ๋‹ค.")
    plt.rc('font', family='Malgun Gothic')
    plt.rcParams['axes.unicode_minus'] = False
    sns.set(font='Malgun Gothic', rc={'axes.unicode_minus': False}, style='whitegrid')
    print("๋ถ„์„ ํ™˜๊ฒฝ ์„ค์ • ์™„๋ฃŒ!")
    return DATA_DIR, OUTPUT_DIR

# 3. ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
def load_and_preprocess_data(data_dir):
    print("\n[๋‹จ๊ณ„ 1] ๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์‹œ์ž‘...")
    df_metrics = pd.read_csv(f'{data_dir}/article_metrics_monthly.csv')
    df_contents = pd.read_csv(f'{data_dir}/contents.csv')
    
    df_metrics['comments'].fillna(0, inplace=True)
    df_contents.dropna(subset=['category', 'content', 'date'], inplace=True)
    df_contents['date'] = pd.to_datetime(df_contents['date'])
    df_contents['publish_dayofweek'] = df_contents['date'].dt.day_name()
    df_contents['content_length'] = df_contents['content'].str.len()
    df_contents['title_length'] = df_contents['title'].str.len()
    
    article_total_metrics = df_metrics.groupby('article_id').agg({
        'views_total': 'sum', 'likes': 'sum', 'comments': 'sum'
    }).reset_index()
    
    df_merged = pd.merge(df_contents, article_total_metrics, on='article_id', how='left')
    df_merged.fillna({'views_total': 0, 'likes': 0, 'comments': 0}, inplace=True)
    
    print("๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
    return df_merged

# ==============================================================================
# ★★★★★ Deep-dive analysis of the views TOP 10% "hit" articles (hardened) ★★★★★
# ==============================================================================
def analyze_high_view_articles_v2(df_merged, output_dir):
    """Analyze the top 10% of articles by total views to derive success factors.

    Side effects:
        - Adds 'group', 'has_bracket_prefix' and 'has_trend_keyword' columns
          to *df_merged* in place.
        - Saves 'high_view_article_characteristics_v2.png' to *output_dir*.

    Args:
        df_merged: merged article DataFrame; must contain 'views_total',
            'title', 'category', 'content_length', 'title_length',
            'publish_dayofweek', 'likes' and 'comments'.
        output_dir: destination folder for the comparison figure.

    Returns:
        tuple: (top_20_table, cat_comp_df) consumed by the report generator.
    """
    print("\n[ํ•ต์‹ฌ ๋ถ„์„] ์กฐํšŒ์ˆ˜ TOP 10% ํžˆํŠธ ๊ธฐ์‚ฌ ์‹ฌ์ธต ๋ถ„์„ (v2)...")
    
    # --- 1. Define 'hit article' and split the data ---
    view_threshold = df_merged['views_total'].quantile(0.9)
    print(f"  - ์กฐํšŒ์ˆ˜ ์ƒ์œ„ 10% ๊ธฐ์ค€: {view_threshold:,.0f} ํšŒ ์ด์ƒ")
    
    df_merged['group'] = np.where(df_merged['views_total'] >= view_threshold, 'TOP 10%', '๋‚˜๋จธ์ง€ 90%')

    # --- 2. Which articles drew the most views? (TOP 20 list) ---
    top_20_list = df_merged.sort_values('views_total', ascending=False).head(20)
    top_20_table = top_20_list[['title', 'category', 'views_total', 'likes', 'comments']].reset_index(drop=True)
    print("\n--- ์กฐํšŒ์ˆ˜ TOP 20 ๊ธฐ์‚ฌ ๋ฆฌ์ŠคํŠธ ---")
    print(top_20_table)
    
    # --- 3. ★ Quantify qualitative traits (new features) ★ ---
    # Bug fix: upstream dropna() does not cover 'title', so a missing title
    # arrives as a float NaN and re.match / `in` would raise TypeError.
    # The isinstance guard makes missing titles simply count as False.
    df_merged['has_bracket_prefix'] = df_merged['title'].apply(
        lambda x: isinstance(x, str) and bool(re.match(r'^\[.+\]', x))
    )
    trend_keywords = ['์ˆํผ', 'MZ', '์•Œ๊ณ ๋ฆฌ์ฆ˜', '์ฑ—GPT', 'AI', '์ธ๊ณต์ง€๋Šฅ']
    df_merged['has_trend_keyword'] = df_merged['title'].apply(
        lambda x: isinstance(x, str) and any(keyword in x for keyword in trend_keywords)
    )

    # --- 4. Characterize hit articles and visualize ---
    fig, axes = plt.subplots(3, 2, figsize=(20, 24))
    fig.suptitle(f"์กฐํšŒ์ˆ˜ TOP 10% ๊ธฐ์‚ฌ vs ๋‚˜๋จธ์ง€ ๊ธฐ์‚ฌ ๋น„๊ต ๋ถ„์„ (๊ธฐ์ค€: {view_threshold:,.0f}ํšŒ)", fontsize=22, y=1.01)

    # (1) Category distribution (share of each group's articles, %)
    # NOTE(review): assumes both groups are non-empty; with a degenerate
    # views distribution the 'TOP 10%' column could be missing — confirm.
    cat_comp_df = df_merged.groupby('group')['category'].value_counts(normalize=True).mul(100).unstack().T
    cat_comp_df = cat_comp_df.sort_values('TOP 10%', ascending=False).head(10)
    cat_comp_df.plot(kind='bar', ax=axes[0, 0], rot=45)
    axes[0, 0].set_title('ํžˆํŠธ ๊ธฐ์‚ฌ์˜ ์นดํ…Œ๊ณ ๋ฆฌ ๋ถ„ํฌ', fontsize=16)
    axes[0, 0].set_ylabel('๋น„์ค‘ (%)')
    # ha= is not a valid tick_params kwarg; alignment must be set on the
    # tick-label Text objects themselves (this was the v1 crash).
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')

    # (2) Body length (y-axis clipped at the 95th percentile for readability)
    sns.boxplot(data=df_merged, x='group', y='content_length', ax=axes[0, 1], order=['TOP 10%', '๋‚˜๋จธ์ง€ 90%'])
    axes[0, 1].set_title('๋ณธ๋ฌธ ๊ธธ์ด ๋น„๊ต', fontsize=16); axes[0, 1].set_ylabel('๊ธ€์ž ์ˆ˜')
    axes[0, 1].set_ylim(0, df_merged['content_length'].quantile(0.95))

    # (3) Title length
    sns.boxplot(data=df_merged, x='group', y='title_length', ax=axes[1, 0], order=['TOP 10%', '๋‚˜๋จธ์ง€ 90%'])
    axes[1, 0].set_title('์ œ๋ชฉ ๊ธธ์ด ๋น„๊ต', fontsize=16); axes[1, 0].set_ylabel('๊ธ€์ž ์ˆ˜')

    # (4) Publishing day of week
    day_comp_df = df_merged.groupby('group')['publish_dayofweek'].value_counts(normalize=True).mul(100).unstack().T
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_comp_df.reindex(day_order).plot(kind='bar', ax=axes[1, 1], rot=0)
    axes[1, 1].set_title('๋ฐœํ–‰ ์š”์ผ๋ณ„ ๋ถ„ํฌ', fontsize=16); axes[1, 1].set_ylabel('๋น„์ค‘ (%)')
    
    # ★★★ (5) Mean views by bracket-prefix ([OO]) usage (new analysis) ★★★
    # NOTE(review): ci=None is deprecated in seaborn>=0.12 (use errorbar=None);
    # kept as-is for compatibility with the installed seaborn — confirm version.
    sns.barplot(data=df_merged, x='has_bracket_prefix', y='views_total', ax=axes[2, 0], ci=None)
    axes[2, 0].set_title('์ œ๋ชฉ ๋ง๋จธ๋ฆฌ([OO]) ์‚ฌ์šฉ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', fontsize=16)
    axes[2, 0].set_xlabel('๋ง๋จธ๋ฆฌ ์‚ฌ์šฉ ์—ฌ๋ถ€'); axes[2, 0].set_ylabel('ํ‰๊ท  ์กฐํšŒ์ˆ˜')
    
    # ★★★ (6) Mean views by trend-keyword presence in the title (new analysis) ★★★
    sns.barplot(data=df_merged, x='has_trend_keyword', y='views_total', ax=axes[2, 1], ci=None)
    axes[2, 1].set_title('์ œ๋ชฉ ๋‚ด ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€๋ณ„ ํ‰๊ท  ์กฐํšŒ์ˆ˜', fontsize=16)
    axes[2, 1].set_xlabel('ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ํฌํ•จ ์—ฌ๋ถ€'); axes[2, 1].set_ylabel('ํ‰๊ท  ์กฐํšŒ์ˆ˜')

    plt.tight_layout()
    plt.savefig(f'{output_dir}/high_view_article_characteristics_v2.png')
    plt.close()
    
    print("\n  - ํžˆํŠธ ๊ธฐ์‚ฌ ํŠน์ง• ๋น„๊ต ๋ถ„์„(v2) ์™„๋ฃŒ. (high_view_article_characteristics_v2.png ์ €์žฅ)")
    
    return top_20_table, cat_comp_df

# 4. ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ์ƒ์„ฑ (๋ณด๊ณ ์„œ ๋‚ด์šฉ ๊ฐ•ํ™”)
def generate_insights_report_v2(top_20_table, cat_comp_df, output_dir):
    print("\n[๋‹จ๊ณ„ 6] ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ ์ƒ์„ฑ (์„ฑ๊ณต ๊ณต์‹ ๊ฐ•ํ™”)...")
    
    top_20_str = top_20_table.to_string()
    cat_comp_str = cat_comp_df.head(5).round(1).to_string()
    
    report = f"""
# ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ ๋ณด๊ณ ์„œ (์กฐํšŒ์ˆ˜ ์ค‘์‹ฌ ์„ฑ๊ณต ๊ณต์‹ v2)
์ƒ์„ฑ์ผ: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 1. ๋ถ„์„ ๋ชฉํ‘œ
- 'ํžˆํŠธ ๊ธฐ์‚ฌ'์˜ ๊ณตํ†ต์ ์„ ์ •๋Ÿ‰์ , ์ •์„ฑ์ ์œผ๋กœ ๋ถ„์„ํ•˜์—ฌ **๋”ฐ๋ผ ํ•  ์ˆ˜ ์žˆ๋Š”(Actionable) ์„ฑ๊ณต ๊ณต์‹**์„ ๋„์ถœํ•ฉ๋‹ˆ๋‹ค.

## 2. ์กฐํšŒ์ˆ˜ TOP 20 'ํžˆํŠธ ๊ธฐ์‚ฌ' ๋ฆฌ์ŠคํŠธ
{top_20_str}

## 3. โ˜… ์กฐํšŒ์ˆ˜ '๋Œ€๋ฐ•' ๊ธฐ์‚ฌ์˜ ๊ฐ•ํ™”๋œ ์„ฑ๊ณต ๊ณต์‹ โ˜…

(high_view_article_characteristics_v2.png ์ฐธ๊ณ )

### ๊ณต์‹ 1: 'ํžˆํŠธ ํŒฉํ† ๋ฆฌ' ์นดํ…Œ๊ณ ๋ฆฌ์— ์ง‘์ค‘ํ•˜๋ผ.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '๋ฏธ๋””์–ดํ˜„์žฅ', '์ทจ์žฌ๊ธฐยท์ œ์ž‘๊ธฐ' 3๊ฐœ ์นดํ…Œ๊ณ ๋ฆฌ์—์„œ ํžˆํŠธ ๊ธฐ์‚ฌ์˜ 60% ์ด์ƒ์ด ๋ฐฐ์ถœ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์ด ์นดํ…Œ๊ณ ๋ฆฌ๋“ค์€ ๊ฒ€์ฆ๋œ ์„ฑ๊ณต ์˜์—ญ์ž…๋‹ˆ๋‹ค.

### ๊ณต์‹ 2: ์ œ๋ชฉ์œผ๋กœ ๋ชจ๋“  ๊ฒƒ์„ ๋งํ•˜๋ผ.
- **(์‹ ๊ทœ ๋ฐœ๊ฒฌ) ๋ง๋จธ๋ฆฌ ํšจ๊ณผ**: ์ œ๋ชฉ์— **'[์ค‘๊ตญ]', '[์•Œ๊ณ ๋ฆฌ์ฆ˜]'๊ณผ ๊ฐ™์ด ์ฃผ์ œ๋ฅผ ์š”์•ฝํ•˜๋Š” ๋ง๋จธ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•œ ๊ธฐ์‚ฌ์˜ ํ‰๊ท  ์กฐํšŒ์ˆ˜๋Š” ๊ทธ๋ ‡์ง€ ์•Š์€ ๊ธฐ์‚ฌ๋ณด๋‹ค ํ˜„์ €ํžˆ ๋†’์•˜์Šต๋‹ˆ๋‹ค.** ์ด๋Š” ๋…์ž๋“ค์ด ์ œ๋ชฉ๋งŒ ๋ณด๊ณ ๋„ ๊ธฐ์‚ฌ์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ๋น ๋ฅด๊ฒŒ ํŒŒ์•…ํ•  ์ˆ˜ ์žˆ์„ ๋•Œ ํด๋ฆญํ•  ํ™•๋ฅ ์ด ๋†’๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•ฉ๋‹ˆ๋‹ค.
- **(์‹ ๊ทœ ๋ฐœ๊ฒฌ) ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ ์„ ์ **: '์ˆํผ', 'MZ', 'AI' ๋“ฑ **์‹œ์˜์„ฑ ์žˆ๋Š” ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ๋ฅผ ์ œ๋ชฉ์— ํฌํ•จํ•œ ๊ธฐ์‚ฌ๋“ค์ด ์••๋„์ ์œผ๋กœ ๋†’์€ ํ‰๊ท  ์กฐํšŒ์ˆ˜**๋ฅผ ๊ธฐ๋กํ–ˆ์Šต๋‹ˆ๋‹ค. ๋…์ž๋“ค์€ ์ตœ์‹  ์ด์Šˆ์— ๋ฏผ๊ฐํ•˜๊ฒŒ ๋ฐ˜์‘ํ•ฉ๋‹ˆ๋‹ค.

### ๊ณต์‹ 3: ๊ธธ๊ณ  ๊นŠ์ด ์žˆ๋Š” ์ฝ˜ํ…์ธ ๊ฐ€ ์ด๊ธด๋‹ค.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ํžˆํŠธ ๊ธฐ์‚ฌ๋“ค์€ ์ผ๋ฐ˜ ๊ธฐ์‚ฌ๋“ค๋ณด๋‹ค **๋ณธ๋ฌธ ๊ธธ์ด๊ฐ€ ํ›จ์”ฌ ๊ธด ๊ฒฝํ–ฅ**์„ ๋ณด์˜€์Šต๋‹ˆ๋‹ค. ๋…์ž๋“ค์€ ๊นŠ์ด ์žˆ๋Š” ๋กฑํผ ์ฝ˜ํ…์ธ ์— ๋” ๋†’์€ ๊ฐ€์น˜๋ฅผ ๋ถ€์—ฌํ•ฉ๋‹ˆ๋‹ค.

### ๊ณต์‹ 4: ์ฃผ์ดˆ(์›”/ํ™”)์— ์Šน๋ถ€์ˆ˜๋ฅผ ๋„์›Œ๋ผ.
- **๋ฐ์ดํ„ฐ ์ฆ๊ฑฐ**: ํžˆํŠธ ๊ธฐ์‚ฌ์˜ ์ƒ๋‹น์ˆ˜๊ฐ€ **์›”์š”์ผ๊ณผ ํ™”์š”์ผ์— ๋ฐœํ–‰**๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ์ฃผ์ดˆ์— ๋…์ž๋“ค์˜ ์ฝ˜ํ…์ธ  ์†Œ๋น„ ์š•๊ตฌ๊ฐ€ ๊ฐ€์žฅ ๋†’์Šต๋‹ˆ๋‹ค.

## 4. ์‹คํ–‰์„ ์œ„ํ•œ '์„ฑ๊ณต ๊ณต์‹' ์ฒดํฌ๋ฆฌ์ŠคํŠธ
- ์‹ ๊ทœ ๊ธฐ์‚ฌ ๊ธฐํš ๋ฐ ๋ฐœํ–‰ ์‹œ, ์•„๋ž˜ ์ฒดํฌ๋ฆฌ์ŠคํŠธ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์„ฑ๊ณต ํ™•๋ฅ ์„ ๊ทน๋Œ€ํ™”ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.

| ์ฒดํฌ ํ•ญ๋ชฉ                                      | ์ „๋žต                                                               |
| ---------------------------------------------- | ------------------------------------------------------------------ |
| **1. ์นดํ…Œ๊ณ ๋ฆฌ ์„ ์ •**                           | '์ปค๋ฒ„์Šคํ† ๋ฆฌ', '๋ฏธ๋””์–ดํ˜„์žฅ' ๋“ฑ ๊ฒ€์ฆ๋œ ์นดํ…Œ๊ณ ๋ฆฌ์ธ๊ฐ€?                  |
| **2. ์ œ๋ชฉ - ๋ง๋จธ๋ฆฌ ํ™œ์šฉ**                      | ๋…์ž์˜ ๋ˆˆ๊ธธ์„ ๋„๋Š” ๋ช…ํ™•ํ•œ [๋ง๋จธ๋ฆฌ]๋ฅผ ์‚ฌ์šฉํ–ˆ๋Š”๊ฐ€?                    |
| **3. ์ œ๋ชฉ - ํ‚ค์›Œ๋“œ ํฌํ•จ**                      | ์ง€๊ธˆ ๊ฐ€์žฅ ๋œจ๊ฑฐ์šด 'ํŠธ๋ Œ๋“œ ํ‚ค์›Œ๋“œ'๋ฅผ ์ œ๋ชฉ์— ํฌํ•จํ–ˆ๋Š”๊ฐ€?               |
| **4. ์ฝ˜ํ…์ธ  ๊นŠ์ด**                             | ๋…์ž๊ฐ€ ์‹œ๊ฐ„์„ ํˆฌ์žํ•  ๋งŒํ•œ ๊นŠ์ด์™€ ์ „๋ฌธ์„ฑ์„ ๊ฐ–์ถ˜ ๋กฑํผ ์ฝ˜ํ…์ธ ์ธ๊ฐ€?     |
| **5. ๋ฐœํ–‰ ์‹œ์ **                               | ๊ฐ€์žฅ ์ค‘์š”ํ•œ ๊ธฐ์‚ฌ๋ฅผ 'ํ”„๋ผ์ž„ ํƒ€์ž„'์ธ ์›”์š”์ผ ์˜ค์ „์— ๋ฐœํ–‰ํ•˜๋Š”๊ฐ€?        |
"""
    report_path = f'{output_dir}/high_view_focused_analysis_report_v2.txt'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\n  - ์ข…ํ•ฉ ์ธ์‚ฌ์ดํŠธ ๋ณด๊ณ ์„œ(v2) ์ƒ์„ฑ ์™„๋ฃŒ. ({report_path} ์ €์žฅ)")

# 5. ๋ฉ”์ธ ์‹คํ–‰ ํ•จ์ˆ˜
def main():
    print("===== ์‹ ๋ฌธ๊ณผ๋ฐฉ์†ก ๋…์ž ๋ฐ์ดํ„ฐ ์‹ฌ์ธต ๋ถ„์„ (์กฐํšŒ์ˆ˜ ์ค‘์‹ฌ ์„ฑ๊ณต ๊ณต์‹ v2) =====")
    
    data_dir, output_dir = setup_environment()
    df_merged = load_and_preprocess_data(data_dir)
    
    top_20, cat_comp = analyze_high_view_articles_v2(df_merged, output_dir)
    
    generate_insights_report_v2(top_20, cat_comp, output_dir)
    
    print("\n===== ๋ชจ๋“  ๋ถ„์„์ด ์„ฑ๊ณต์ ์œผ๋กœ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. =====")
    print(f"๊ฒฐ๊ณผ๋ฌผ์€ '{output_dir}' ํด๋”์—์„œ ํ™•์ธํ•˜์‹ค ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")

# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()