File size: 9,021 Bytes
198ccb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/env python3
"""Prepare CSV files for analytics dashboard from original dataset.

This script generates the required CSV formats for:
1. Predictive Intervals: sentiment counts per news item
2. Category Analytics: news articles with categories
3. Thread Analysis: comments with news IDs
"""

import sys
import argparse
from pathlib import Path
import pandas as pd
import logging

# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from analysis.sentiment_analyzer import SentimentAnalyzer
from data.data_loader import load_data

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def prepare_category_analytics_csv(
    ria_path: str,
    output_path: str = "data/analytics_category_data.csv",
    max_samples: int = None
):
    """
    Prepare CSV for Category Analytics tab.

    Required columns: category, text

    Args:
        ria_path: Path to a tab-separated news file with at least 'title'
            and 'tags' columns ('snippet' is optional).
        output_path: Destination CSV path.
        max_samples: If given, keep only the first N rows (for testing).

    Returns:
        The output_path that was written.
    """
    logger.info(f"Loading RIA news from {ria_path}")
    df = pd.read_csv(ria_path, sep='\t')
    df = df[~df.tags.isnull()]

    if max_samples:
        df = df.head(max_samples)

    # Build text as "title snippet", vectorized. fillna('') is essential:
    # the previous row-wise `row.get('snippet', '')` only defaulted when the
    # column was missing entirely, so a NaN snippet leaked the literal
    # string "nan" into the text.
    title = df['title'].fillna('').astype(str)
    if 'snippet' in df.columns:
        snippet = df['snippet'].fillna('').astype(str)
    else:
        snippet = ''  # broadcasts over the title Series
    df['text'] = (title + ' ' + snippet).str.strip()

    # Use first tag as category. Nulls were filtered above, but keep the
    # notna guard defensive in case the filter changes.
    df['category'] = df['tags'].apply(
        lambda x: str(x).split(',')[0].strip() if pd.notna(x) else 'unknown'
    )

    # Select only required columns
    output_df = df[['category', 'text']].copy()

    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created category analytics CSV: {output_path} ({len(output_df)} rows)")

    return output_path


def prepare_thread_analysis_csv(
    vk_comments_path: str,
    output_path: str = "data/analytics_thread_data.csv",
    max_samples: int = None
):
    """
    Prepare CSV for Thread Analysis tab.

    Required columns: news_id, text

    Args:
        vk_comments_path: Tab-separated comments file with 'post_id' and
            'text' columns.
        output_path: Destination CSV path.
        max_samples: If given, keep only the first N rows (for testing).

    Returns:
        The output_path that was written.
    """
    logger.info(f"Loading VK comments from {vk_comments_path}")
    comments = pd.read_csv(vk_comments_path, sep='\t')
    comments = comments[comments['text'].notna()]

    if max_samples:
        comments = comments.head(max_samples)

    # The dashboard keys threads by news_id; the VK post id plays that role.
    output_df = pd.DataFrame({
        'news_id': comments['post_id'].astype(str),
        'text': comments['text'],
    })

    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created thread analysis CSV: {output_path} ({len(output_df)} rows)")

    return output_path


def prepare_predictive_intervals_csv(
    vk_comments_path: str,
    vk_news_path: str = None,
    output_path: str = "data/analytics_sentiment_counts.csv",
    max_news_items: int = 50,
    max_comments_per_item: int = 1000
):
    """
    Prepare CSV for Predictive Intervals tab.

    Required columns: id, positive_count, negative_count, neutral_count

    Runs sentiment analysis over each news item's comments and writes the
    per-item label counts.

    Args:
        vk_comments_path: Tab-separated comments file with 'post_id' and
            'text' columns.
        vk_news_path: Accepted for interface compatibility; currently unused.
        output_path: Destination CSV path.
        max_news_items: Cap on distinct news items analyzed (sentiment
            analysis is slow).
        max_comments_per_item: Cap on comments analyzed per news item.

    Returns:
        The output_path that was written.
    """
    logger.info(f"Loading VK comments from {vk_comments_path}")
    df_comments = pd.read_csv(vk_comments_path, sep='\t')
    df_comments = df_comments[~df_comments.text.isnull()]

    # Limit comments per news item
    df_comments = df_comments.groupby('post_id').head(max_comments_per_item)

    # Get unique news items
    news_ids = df_comments['post_id'].unique()[:max_news_items]
    logger.info(f"Analyzing sentiment for {len(news_ids)} news items...")

    # Group comments once up front. The previous version re-filtered the
    # whole frame inside the loop (O(items x comments)); a single groupby
    # makes each lookup O(1).
    comments_by_id = df_comments.groupby('post_id')['text'].apply(list).to_dict()

    # Initialize sentiment analyzer
    analyzer = SentimentAnalyzer()

    batch_size = 50  # bound per-call load on the analyzer
    results = []
    for i, news_id in enumerate(news_ids):
        if (i + 1) % 10 == 0:
            logger.info(f"Processing {i+1}/{len(news_ids)}...")

        comments = comments_by_id.get(news_id, [])
        if not comments:
            continue

        # Analyze sentiment in batches
        sentiments = []
        for j in range(0, len(comments), batch_size):
            sentiments.extend(analyzer.analyze_batch(comments[j:j + batch_size]))

        # Tally labels; anything other than the three known labels simply
        # falls into no bucket, matching the original behavior.
        labels = [s.get('label') for s in sentiments]
        results.append({
            'id': str(news_id),
            'positive_count': labels.count('POSITIVE'),
            'negative_count': labels.count('NEGATIVE'),
            'neutral_count': labels.count('NEUTRAL'),
        })

    output_df = pd.DataFrame(results)
    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created predictive intervals CSV: {output_path} ({len(output_df)} rows)")

    return output_path


def main():
    """CLI entry point: build all analytics dashboard CSV files."""
    parser = argparse.ArgumentParser(description="Prepare CSV files for analytics dashboard")
    # Keep the whole CLI surface in one table; defaults mirror the repo layout.
    cli_options = [
        ("--ria-path", dict(
            type=str,
            default="data/news_data/ria_news.tsv",
            help="Path to RIA news TSV file",
        )),
        ("--vk-comments-path", dict(
            type=str,
            default="data/news_data/vk_comments.tsv",
            help="Path to VK comments TSV file",
        )),
        ("--vk-news-path", dict(
            type=str,
            default=None,
            help="Path to VK news TSV file (optional)",
        )),
        ("--output-dir", dict(
            type=str,
            default="data",
            help="Output directory for CSV files",
        )),
        ("--max-samples", dict(
            type=int,
            default=None,
            help="Maximum samples for category analytics (for testing)",
        )),
        ("--max-news-items", dict(
            type=int,
            default=50,
            help="Maximum news items for predictive intervals (sentiment analysis is slow)",
        )),
        ("--max-comments-per-item", dict(
            type=int,
            default=1000,
            help="Maximum comments per news item for sentiment analysis",
        )),
        ("--skip-sentiment", dict(
            action="store_true",
            help="Skip sentiment analysis (slow step)",
        )),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    logger.info(banner)
    logger.info("Preparing Analytics Dashboard CSV Files")
    logger.info(banner)

    # Step 1: category analytics (always runs).
    logger.info("\n1. Preparing Category Analytics CSV...")
    prepare_category_analytics_csv(
        args.ria_path,
        str(output_dir / "analytics_category_data.csv"),
        max_samples=args.max_samples,
    )

    # Step 2: thread analysis (needs the VK comments file).
    logger.info("\n2. Preparing Thread Analysis CSV...")
    if Path(args.vk_comments_path).exists():
        prepare_thread_analysis_csv(
            args.vk_comments_path,
            str(output_dir / "analytics_thread_data.csv"),
            max_samples=args.max_samples,
        )
    else:
        logger.warning(f"⚠️  VK comments file not found: {args.vk_comments_path}")
        logger.warning("   Skipping thread analysis CSV")

    # Step 3: sentiment counts — the slow step, so it is opt-out.
    if args.skip_sentiment:
        logger.info("\n3. Skipping Predictive Intervals CSV (--skip-sentiment flag)")
    else:
        logger.info("\n3. Preparing Predictive Intervals CSV (sentiment analysis)...")
        logger.info("   ⚠️  This step is slow - analyzing sentiment for comments...")
        if Path(args.vk_comments_path).exists():
            prepare_predictive_intervals_csv(
                args.vk_comments_path,
                args.vk_news_path,
                str(output_dir / "analytics_sentiment_counts.csv"),
                max_news_items=args.max_news_items,
                max_comments_per_item=args.max_comments_per_item,
            )
        else:
            logger.warning(f"⚠️  VK comments file not found: {args.vk_comments_path}")
            logger.warning("   Skipping predictive intervals CSV")

    logger.info("\n" + banner)
    logger.info("✅ All CSV files prepared!")
    logger.info(banner)

    # Point the user at the dashboard and the files just produced.
    logger.info("\nNext steps:")
    logger.info("1. Open Streamlit analytics dashboard:")
    logger.info("   streamlit run dashboards/analytics_dashboard.py")
    logger.info("2. Upload the generated CSV files in each tab:")
    logger.info(f"   - Category Analytics: {output_dir}/analytics_category_data.csv")
    if Path(args.vk_comments_path).exists():
        logger.info(f"   - Thread Analysis: {output_dir}/analytics_thread_data.csv")
        if not args.skip_sentiment:
            logger.info(f"   - Predictive Intervals: {output_dir}/analytics_sentiment_counts.csv")

if __name__ == "__main__":
    main()