# TroglodyteDerivations's picture
# Upload 44 files
# 80d08c2 verified
# visualization.py
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def create_visualizations():
    """Create and save summary plots for the cleaned TikTok dataset.

    Reads ``tiktok_cleaned.csv`` from the current working directory and draws
    a 2x2 figure:

    * histogram of ``digg_count`` (likes),
    * histogram of ``play_count`` (views),
    * scatter plot of ``play_count`` against ``digg_count``,
    * bar chart of the mean ``digg_count``, ``comment_count`` and
      ``share_count``.

    The figure is saved as ``tiktok_analysis_visualizations.png``, shown, and
    then closed.  If the data has a ``duration`` column,
    ``create_duration_visualizations`` is called with the loaded frame.

    Null values are dropped before plotting (row-wise for the scatter plot so
    the x and y values stay aligned); a metric whose mean is undefined is
    drawn as ``0``.

    Any exception is caught and reported (message via ``print``, traceback via
    ``traceback.print_exc``) instead of being re-raised, so the function
    always returns ``None``.
    """
    try:
        # Load the cleaned data
        df = pl.read_csv('tiktok_cleaned.csv')

        # Set up the plotting style
        plt.style.use('default')
        sns.set_palette("husl")

        # Create subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        try:
            fig.suptitle('TikTok Dataset Analysis', fontsize=16, fontweight='bold')

            # 1. Distribution of video likes (digg_count)
            likes_data = df['digg_count'].drop_nulls().to_list()
            axes[0, 0].hist(likes_data, bins=50, alpha=0.7, edgecolor='black')
            axes[0, 0].set_title('Distribution of Video Likes (Digg Count)')
            axes[0, 0].set_xlabel('Number of Likes')
            axes[0, 0].set_ylabel('Frequency')
            axes[0, 0].grid(True, alpha=0.3)

            # 2. Distribution of video views (play_count)
            views_data = df['play_count'].drop_nulls().to_list()
            axes[0, 1].hist(views_data, bins=50, alpha=0.7, edgecolor='black')
            axes[0, 1].set_title('Distribution of Video Views (Play Count)')
            axes[0, 1].set_xlabel('Number of Views')
            axes[0, 1].set_ylabel('Frequency')
            axes[0, 1].grid(True, alpha=0.3)

            # 3. Scatter plot: Views vs Likes
            # Drop rows where either value is null so both lists keep the
            # same length and stay aligned point-for-point.
            paired = df.select(['play_count', 'digg_count']).drop_nulls()
            axes[1, 0].scatter(
                paired['play_count'].to_list(),
                paired['digg_count'].to_list(),
                alpha=0.6,
            )
            axes[1, 0].set_title('Views vs Likes Correlation')
            axes[1, 0].set_xlabel('Views (Play Count)')
            axes[1, 0].set_ylabel('Likes (Digg Count)')
            axes[1, 0].grid(True, alpha=0.3)

            # 4. Engagement metrics comparison
            engagement_metrics = ['digg_count', 'comment_count', 'share_count']
            raw_means = [df[metric].mean() for metric in engagement_metrics]
            # mean() yields None for an empty / all-null column; plot that as 0
            # rather than failing inside bar() or the label formatting below.
            avg_engagement = [0.0 if value is None else value for value in raw_means]
            bars = axes[1, 1].bar(['Likes', 'Comments', 'Shares'], avg_engagement)
            axes[1, 1].set_title('Average Engagement Metrics')
            axes[1, 1].set_ylabel('Average Count')

            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                axes[1, 1].text(
                    bar.get_x() + bar.get_width() / 2.,
                    height,
                    f'{height:,.0f}',
                    ha='center',
                    va='bottom',
                )

            plt.tight_layout()
            plt.savefig('tiktok_analysis_visualizations.png', dpi=300, bbox_inches='tight')
            plt.show()
        finally:
            # Release the figure even if plotting failed; otherwise pyplot
            # keeps it registered (and in memory) for the process lifetime.
            plt.close(fig)

        # "\U0001F4CA" is the bar-chart emoji, written as an escape so it
        # survives re-encoding of the source file.
        print("\U0001F4CA Visualizations saved as 'tiktok_analysis_visualizations.png'")

        # Additional visualizations if duration data is available
        if 'duration' in df.columns:
            create_duration_visualizations(df)

    except Exception as e:
        # Top-level boundary of the script: report the failure and carry on.
        print(f"Error creating visualizations: {e}")
        import traceback
        traceback.print_exc()
def create_duration_visualizations(df):
    """Create and save plots related to video duration.

    Draws a 1x2 figure: a histogram of the ``duration`` column and a scatter
    plot of ``duration`` against ``digg_count``.  The figure is saved as
    ``duration_analysis.png``, shown, and then closed.

    Null values are dropped before plotting (row-wise for the scatter plot so
    the x and y values stay aligned).

    Args:
        df: Data frame providing ``duration`` and ``digg_count`` columns; it
            is used through the polars API (``select``, ``drop_nulls``,
            ``to_list``).  The axis labels present duration as seconds.

    Exceptions raised while plotting or saving propagate to the caller; the
    figure is closed first.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    try:
        # Duration distribution
        duration_data = df['duration'].drop_nulls().to_list()
        axes[0].hist(duration_data, bins=30, alpha=0.7, edgecolor='black')
        axes[0].set_title('Distribution of Video Duration')
        axes[0].set_xlabel('Duration (seconds)')
        axes[0].set_ylabel('Frequency')
        axes[0].grid(True, alpha=0.3)

        # Duration vs Engagement
        # Drop rows where either value is null so both lists keep the same
        # length and stay aligned point-for-point.
        paired = df.select(['duration', 'digg_count']).drop_nulls()
        axes[1].scatter(
            paired['duration'].to_list(),
            paired['digg_count'].to_list(),
            alpha=0.6,
        )
        axes[1].set_title('Duration vs Likes')
        axes[1].set_xlabel('Duration (seconds)')
        axes[1].set_ylabel('Likes (Digg Count)')
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('duration_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
    finally:
        # Release the figure even if plotting failed; otherwise pyplot keeps
        # it registered (and in memory) for the process lifetime.
        plt.close(fig)

    # "\U0001F4CA" is the bar-chart emoji, written as an escape so it survives
    # re-encoding of the source file.
    print("\U0001F4CA Duration visualizations saved as 'duration_analysis.png'")
# Script entry point: generate the figures only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    create_visualizations()