# visualization.py
"""Plot summary charts for the cleaned TikTok dataset.

Running this module reads ``tiktok_cleaned.csv`` from the current working
directory and writes ``tiktok_analysis_visualizations.png`` (and, when a
``duration`` column is present, ``duration_analysis.png``) next to it.
"""

import traceback

import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np  # noqa: F401  (not used below; kept as a file-level import)


def _plot_histogram(ax, values, *, bins, title, xlabel):
    """Draw a frequency histogram of *values* on *ax* with the shared styling."""
    ax.hist(values, bins=bins, alpha=0.7, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)


def _plot_scatter(ax, x_values, y_values, *, title, xlabel, ylabel):
    """Draw a scatter plot of *y_values* against *x_values* on *ax*."""
    ax.scatter(x_values, y_values, alpha=0.6)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)


def _mean_or_zero(series):
    """Return ``series.mean()``, or 0.0 when polars reports no mean (``None``)."""
    mean = series.mean()
    return 0.0 if mean is None else mean


def create_visualizations():
    """Create visualizations from the analyzed data.

    Loads ``tiktok_cleaned.csv``, draws a 2x2 overview figure (likes
    histogram, views histogram, views-vs-likes scatter, average engagement
    bars), saves it as ``tiktok_analysis_visualizations.png`` and shows it.
    If the data has a ``duration`` column, the duration figure is produced
    as well.

    Any exception raised along the way is caught, reported on stdout with a
    traceback, and not re-raised.
    """
    try:
        # Load the cleaned data
        df = pl.read_csv('tiktok_cleaned.csv')

        # Set up the plotting style
        plt.style.use('default')
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        try:
            fig.suptitle('TikTok Dataset Analysis', fontsize=16, fontweight='bold')

            likes_data = df['digg_count'].to_list()
            views_data = df['play_count'].to_list()

            # 1. Distribution of video likes (digg_count)
            _plot_histogram(
                axes[0, 0],
                likes_data,
                bins=50,
                title='Distribution of Video Likes (Digg Count)',
                xlabel='Number of Likes',
            )

            # 2. Distribution of video views (play_count)
            _plot_histogram(
                axes[0, 1],
                views_data,
                bins=50,
                title='Distribution of Video Views (Play Count)',
                xlabel='Number of Views',
            )

            # 3. Scatter plot: Views vs Likes
            _plot_scatter(
                axes[1, 0],
                views_data,
                likes_data,
                title='Views vs Likes Correlation',
                xlabel='Views (Play Count)',
                ylabel='Likes (Digg Count)',
            )

            # 4. Engagement metrics comparison
            engagement_metrics = (
                ('Likes', 'digg_count'),
                ('Comments', 'comment_count'),
                ('Shares', 'share_count'),
            )
            labels = [label for label, _ in engagement_metrics]
            # An empty or all-null column has no mean; plot it as a zero bar
            # instead of handing None to matplotlib.
            avg_engagement = [
                _mean_or_zero(df[column]) for _, column in engagement_metrics
            ]
            bars = axes[1, 1].bar(labels, avg_engagement)
            axes[1, 1].set_title('Average Engagement Metrics')
            axes[1, 1].set_ylabel('Average Count')

            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                axes[1, 1].text(
                    bar.get_x() + bar.get_width() / 2.,
                    height,
                    f'{height:,.0f}',
                    ha='center',
                    va='bottom',
                )

            # Use the figure handle rather than pyplot's implicit "current
            # figure" so the right figure is always the one laid out and saved.
            fig.tight_layout()
            fig.savefig(
                'tiktok_analysis_visualizations.png', dpi=300, bbox_inches='tight'
            )
            plt.show()
        finally:
            # Release the figure once it has been saved/shown; otherwise it
            # stays registered with pyplot for the life of the process.
            plt.close(fig)

        print("📊 Visualizations saved as 'tiktok_analysis_visualizations.png'")

        # Additional visualizations if duration data is available
        if 'duration' in df.columns:
            create_duration_visualizations(df)

    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"Error creating visualizations: {e}")
        traceback.print_exc()


def create_duration_visualizations(df):
    """Create visualizations related to video duration.

    Draws a duration histogram and a duration-vs-likes scatter side by side,
    saves the figure as ``duration_analysis.png`` and shows it.

    Args:
        df: Data frame providing ``duration`` and ``digg_count`` columns
            (each is read with ``df[...].to_list()``).
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    try:
        duration_data = df['duration'].to_list()

        # Duration distribution
        _plot_histogram(
            axes[0],
            duration_data,
            bins=30,
            title='Distribution of Video Duration',
            xlabel='Duration (seconds)',
        )

        # Duration vs Engagement
        _plot_scatter(
            axes[1],
            duration_data,
            df['digg_count'].to_list(),
            title='Duration vs Likes',
            xlabel='Duration (seconds)',
            ylabel='Likes (Digg Count)',
        )

        fig.tight_layout()
        fig.savefig('duration_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
    finally:
        # Release the figure once it has been saved/shown.
        plt.close(fig)

    print("📊 Duration visualizations saved as 'duration_analysis.png'")


if __name__ == "__main__":
    create_visualizations()