|
|
|
|
|
"""Prepare CSV files for analytics dashboard from original dataset. |
|
|
|
|
|
This script generates the required CSV formats for: |
|
|
1. Predictive Intervals: sentiment counts per news item |
|
|
2. Category Analytics: news articles with categories |
|
|
3. Thread Analysis: comments with news IDs |
|
|
""" |
|
|
|
|
|
import argparse
import logging
import sys
from pathlib import Path
from typing import Optional

import pandas as pd
|
|
|
|
|
|
|
|
project_root = Path(__file__).parent.parent |
|
|
sys.path.insert(0, str(project_root)) |
|
|
|
|
|
from analysis.sentiment_analyzer import SentimentAnalyzer |
|
|
from data.data_loader import load_data |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
def prepare_category_analytics_csv(
    ria_path: str,
    output_path: str = "data/analytics_category_data.csv",
    max_samples: Optional[int] = None,
) -> str:
    """Prepare the CSV consumed by the Category Analytics tab.

    Reads the RIA news TSV, builds a free-text column from title + snippet,
    derives a single category from the first comma-separated tag, and writes
    a two-column CSV with columns ``category`` and ``text``.

    Args:
        ria_path: Path to the RIA news TSV file (tab-separated; must contain
            ``title`` and ``tags`` columns, ``snippet`` is optional).
        output_path: Destination path for the generated CSV.
        max_samples: If given, keep only the first N rows (useful for testing).

    Returns:
        The ``output_path`` that was written.
    """
    logger.info(f"Loading RIA news from {ria_path}")
    df = pd.read_csv(ria_path, sep='\t')
    # Rows without tags cannot be categorized; drop them up front.
    df = df[~df.tags.isnull()]

    if max_samples:
        df = df.head(max_samples)

    # Combine title and snippet into one text field. ``fillna('')`` prevents
    # a missing snippet value from rendering as the literal string "nan",
    # which the previous row-wise f-string approach produced.
    titles = df['title'].fillna('').astype(str)
    if 'snippet' in df.columns:
        snippets = df['snippet'].fillna('').astype(str)
    else:
        snippets = pd.Series('', index=df.index)
    df['text'] = (titles + ' ' + snippets).str.strip()

    # Use the first comma-separated tag as the category label.
    # (The notna guard is redundant after the dropna above, but kept as a
    # cheap safety net.)
    df['category'] = df['tags'].apply(
        lambda x: str(x).split(',')[0].strip() if pd.notna(x) else 'unknown'
    )

    output_df = df[['category', 'text']].copy()

    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created category analytics CSV: {output_path} ({len(output_df)} rows)")

    return output_path
|
|
|
|
|
|
|
|
def prepare_thread_analysis_csv(
    vk_comments_path: str,
    output_path: str = "data/analytics_thread_data.csv",
    max_samples: Optional[int] = None,
) -> str:
    """Prepare the CSV consumed by the Thread Analysis tab.

    Reads the VK comments TSV and writes a two-column CSV with columns
    ``news_id`` and ``text``.

    Args:
        vk_comments_path: Path to the VK comments TSV file (tab-separated;
            must contain ``post_id`` and ``text`` columns).
        output_path: Destination path for the generated CSV.
        max_samples: If given, keep only the first N rows (useful for testing).

    Returns:
        The ``output_path`` that was written.
    """
    logger.info(f"Loading VK comments from {vk_comments_path}")
    df = pd.read_csv(vk_comments_path, sep='\t')
    # Comments without text are useless for thread analysis.
    df = df[~df.text.isnull()]

    if max_samples:
        df = df.head(max_samples)

    # Normalize the news identifier to a string so downstream consumers
    # treat it as an opaque key rather than a number.
    df['news_id'] = df['post_id'].astype(str)

    output_df = df[['news_id', 'text']].copy()

    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created thread analysis CSV: {output_path} ({len(output_df)} rows)")

    return output_path
|
|
|
|
|
|
|
|
def prepare_predictive_intervals_csv(
    vk_comments_path: str,
    vk_news_path: Optional[str] = None,
    output_path: str = "data/analytics_sentiment_counts.csv",
    max_news_items: int = 50,
    max_comments_per_item: int = 1000,
) -> str:
    """Prepare the CSV consumed by the Predictive Intervals tab.

    Runs sentiment analysis over comments grouped by news item and writes
    per-item label counts with columns ``id``, ``positive_count``,
    ``negative_count`` and ``neutral_count``.

    Args:
        vk_comments_path: Path to the VK comments TSV file (tab-separated;
            must contain ``post_id`` and ``text`` columns).
        vk_news_path: Currently unused; accepted for interface compatibility
            with callers that pass it positionally.
        output_path: Destination path for the generated CSV.
        max_news_items: Cap on the number of news items analyzed
            (sentiment analysis is slow).
        max_comments_per_item: Cap on comments analyzed per news item.

    Returns:
        The ``output_path`` that was written.
    """
    logger.info(f"Loading VK comments from {vk_comments_path}")
    df_comments = pd.read_csv(vk_comments_path, sep='\t')
    df_comments = df_comments[~df_comments.text.isnull()]

    # Bound the per-item workload before anything else.
    df_comments = df_comments.groupby('post_id').head(max_comments_per_item)

    # ``unique()`` preserves first-appearance order, so the cap keeps the
    # earliest news items in the file.
    news_ids = df_comments['post_id'].unique()[:max_news_items]
    logger.info(f"Analyzing sentiment for {len(news_ids)} news items...")

    analyzer = SentimentAnalyzer()

    # Group once up front instead of re-filtering the whole DataFrame for
    # every news item (the previous loop did an O(rows) scan per id).
    comments_by_id = {
        post_id: texts.tolist()
        for post_id, texts in df_comments.groupby('post_id')['text']
    }

    results = []
    for i, news_id in enumerate(news_ids):
        if (i + 1) % 10 == 0:
            logger.info(f"Processing {i+1}/{len(news_ids)}...")

        comments = comments_by_id.get(news_id, [])

        if not comments:
            continue

        # Analyze in small batches to keep each model call bounded.
        sentiments = []
        batch_size = 50
        for j in range(0, len(comments), batch_size):
            batch = comments[j:j + batch_size]
            sentiments.extend(analyzer.analyze_batch(batch))

        # Count each sentiment label returned by the analyzer.
        positive_count = sum(1 for s in sentiments if s.get('label') == 'POSITIVE')
        negative_count = sum(1 for s in sentiments if s.get('label') == 'NEGATIVE')
        neutral_count = sum(1 for s in sentiments if s.get('label') == 'NEUTRAL')

        results.append({
            'id': str(news_id),
            'positive_count': positive_count,
            'negative_count': negative_count,
            'neutral_count': neutral_count
        })

    output_df = pd.DataFrame(results)
    output_df.to_csv(output_path, index=False)
    logger.info(f"✅ Created predictive intervals CSV: {output_path} ({len(output_df)} rows)")

    return output_path
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and generate every analytics CSV.

    Writes up to three CSVs (category analytics, thread analysis,
    predictive intervals) into ``--output-dir``, skipping any step whose
    input file is missing, then prints follow-up instructions.
    """
    parser = argparse.ArgumentParser(description="Prepare CSV files for analytics dashboard")
    parser.add_argument(
        "--ria-path",
        type=str,
        default="data/news_data/ria_news.tsv",
        help="Path to RIA news TSV file"
    )
    parser.add_argument(
        "--vk-comments-path",
        type=str,
        default="data/news_data/vk_comments.tsv",
        help="Path to VK comments TSV file"
    )
    parser.add_argument(
        "--vk-news-path",
        type=str,
        default=None,
        help="Path to VK news TSV file (optional)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data",
        help="Output directory for CSV files"
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        default=None,
        help="Maximum samples for category analytics (for testing)"
    )
    parser.add_argument(
        "--max-news-items",
        type=int,
        default=50,
        help="Maximum news items for predictive intervals (sentiment analysis is slow)"
    )
    parser.add_argument(
        "--max-comments-per-item",
        type=int,
        default=1000,
        help="Maximum comments per news item for sentiment analysis"
    )
    parser.add_argument(
        "--skip-sentiment",
        action="store_true",
        help="Skip sentiment analysis (slow step)"
    )

    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 60)
    logger.info("Preparing Analytics Dashboard CSV Files")
    logger.info("=" * 60)

    # Step 1: category analytics. Guard on file existence for consistency
    # with the VK-comments steps below — previously a missing RIA file
    # crashed inside pandas with an unhandled FileNotFoundError.
    logger.info("\n1. Preparing Category Analytics CSV...")
    if Path(args.ria_path).exists():
        category_path = output_dir / "analytics_category_data.csv"
        prepare_category_analytics_csv(
            args.ria_path,
            str(category_path),
            max_samples=args.max_samples
        )
    else:
        logger.warning(f"⚠️ RIA news file not found: {args.ria_path}")
        logger.warning("   Skipping category analytics CSV")

    # Step 2: thread analysis (requires the VK comments file).
    logger.info("\n2. Preparing Thread Analysis CSV...")
    if Path(args.vk_comments_path).exists():
        thread_path = output_dir / "analytics_thread_data.csv"
        prepare_thread_analysis_csv(
            args.vk_comments_path,
            str(thread_path),
            max_samples=args.max_samples
        )
    else:
        logger.warning(f"⚠️ VK comments file not found: {args.vk_comments_path}")
        logger.warning("   Skipping thread analysis CSV")

    # Step 3: predictive intervals (slow — runs sentiment analysis).
    if not args.skip_sentiment:
        logger.info("\n3. Preparing Predictive Intervals CSV (sentiment analysis)...")
        logger.info("   ⚠️ This step is slow - analyzing sentiment for comments...")
        if Path(args.vk_comments_path).exists():
            sentiment_path = output_dir / "analytics_sentiment_counts.csv"
            prepare_predictive_intervals_csv(
                args.vk_comments_path,
                args.vk_news_path,
                str(sentiment_path),
                max_news_items=args.max_news_items,
                max_comments_per_item=args.max_comments_per_item
            )
        else:
            logger.warning(f"⚠️ VK comments file not found: {args.vk_comments_path}")
            logger.warning("   Skipping predictive intervals CSV")
    else:
        logger.info("\n3. Skipping Predictive Intervals CSV (--skip-sentiment flag)")

    # Summary: only list the files that could actually have been produced.
    logger.info("\n" + "=" * 60)
    logger.info("✅ All CSV files prepared!")
    logger.info("=" * 60)
    logger.info("\nNext steps:")
    logger.info("1. Open Streamlit analytics dashboard:")
    logger.info("   streamlit run dashboards/analytics_dashboard.py")
    logger.info("2. Upload the generated CSV files in each tab:")
    if Path(args.ria_path).exists():
        logger.info(f"   - Category Analytics: {output_dir}/analytics_category_data.csv")
    if Path(args.vk_comments_path).exists():
        logger.info(f"   - Thread Analysis: {output_dir}/analytics_thread_data.csv")
    if not args.skip_sentiment:
        logger.info(f"   - Predictive Intervals: {output_dir}/analytics_sentiment_counts.csv")
|
|
# Script entry point: run everything when invoked directly
# (e.g. ``python prepare_analytics_csvs.py --skip-sentiment``).
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|