# TroglodyteDerivations's picture
# Upload 44 files
# 80d08c2 verified
# final_visualizations.py
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
def _annotate_bars(ax, bars, fmt):
    """Write a bold, centred text label at the top of each bar.

    Args:
        ax: Matplotlib axes the labels are drawn on.
        bars: Iterable of bar patches, as returned by ``ax.bar``.
        fmt: Callable that turns a bar's height into the label text.
    """
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height, fmt(height),
                ha='center', va='bottom', fontweight='bold')


def create_comprehensive_visualizations():
    """Create the 2x3 overview figure from the cleaned TikTok data.

    Reads ``tiktok_cleaned.csv`` from the current working directory and draws
    six panels: likes histogram, average likes per duration category,
    per-author average likes/views, top-6 locations by average likes, hashtag
    impact, and a bar chart of hard-coded engagement rates. The figure is
    written to ``comprehensive_tiktok_analysis.png`` and shown, after which
    ``create_detailed_analysis_charts`` is called with the loaded frame.

    The code reads the columns ``digg_count``, ``play_count``,
    ``duration_category``, ``author_unique_id``, ``location_created`` and
    ``has_hashtags``.

    Returns:
        None. Any exception is caught and reported via ``print`` and
        ``traceback.print_exc()``; it is not re-raised.
    """
    try:
        # Load the cleaned data
        df = pl.read_csv('tiktok_cleaned.csv')

        # Set up the plotting style
        plt.style.use('default')
        sns.set_palette("husl")

        # Create a 2x3 grid of subplots
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle('TikTok Dataset: Comprehensive Performance Analysis',
                     fontsize=18, fontweight='bold')

        # 1. Distribution of video likes (log-scaled frequency axis)
        ax = axes[0, 0]
        likes_data = df['digg_count'].to_list()
        ax.hist(likes_data, bins=50, alpha=0.7, edgecolor='black', log=True)
        ax.set_title('Distribution of Video Likes\n(Log Scale)', fontweight='bold')
        ax.set_xlabel('Number of Likes')
        ax.set_ylabel('Frequency (Log Scale)')
        ax.grid(True, alpha=0.3)

        # 2. Engagement by duration category
        ax = axes[0, 1]
        duration_stats = df.group_by('duration_category').agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.len().alias('video_count')
        ]).sort('avg_likes', descending=True)
        categories = duration_stats['duration_category'].to_list()
        avg_likes = duration_stats['avg_likes'].to_list()
        bars = ax.bar(categories, avg_likes, alpha=0.7,
                      color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
        ax.set_title('Average Likes by Video Duration', fontweight='bold')
        ax.set_xlabel('Duration Category')
        ax.set_ylabel('Average Likes')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)
        # Bars hold raw like counts here, so scale to millions only in the label.
        _annotate_bars(ax, bars, lambda h: f'{h/1e6:.1f}M')

        # 3. Author performance comparison (grouped bars, both in millions)
        ax = axes[0, 2]
        author_stats = df.group_by('author_unique_id').agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.len().alias('video_count')
        ]).sort('avg_likes', descending=True)
        authors = author_stats['author_unique_id'].to_list()
        author_likes = author_stats['avg_likes'].to_list()
        author_views = author_stats['avg_views'].to_list()
        x_pos = np.arange(len(authors))
        width = 0.35
        ax.bar(x_pos - width/2, [l/1e6 for l in author_likes], width,
               label='Avg Likes (M)', alpha=0.7)
        ax.bar(x_pos + width/2, [v/1e6 for v in author_views], width,
               label='Avg Views (M)', alpha=0.7)
        ax.set_title('Author Performance Comparison', fontweight='bold')
        ax.set_xlabel('Authors')
        ax.set_ylabel('Count (Millions)')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(authors, rotation=45)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # 4. Location performance (rows without a location are excluded)
        ax = axes[1, 0]
        location_stats = df.filter(
            pl.col('location_created').is_not_null()
        ).group_by('location_created').agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.len().alias('video_count')
        ]).sort('avg_likes', descending=True).head(6)
        locations = location_stats['location_created'].to_list()
        location_likes = location_stats['avg_likes'].to_list()
        bars = ax.bar(locations, [l/1e6 for l in location_likes], alpha=0.7)
        ax.set_title('Average Likes by Location\n(Top 6 Countries)', fontweight='bold')
        ax.set_xlabel('Country Code')
        ax.set_ylabel('Average Likes (Millions)')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, bars, lambda h: f'{h:.1f}M')

        # 5. Hashtag impact analysis
        ax = axes[1, 1]
        hashtag_stats = df.group_by('has_hashtags').agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.len().alias('video_count')
        ])
        hashtag_labels = ['With Hashtags', 'Without Hashtags']
        # Look the two groups up by key instead of indexing a filtered frame:
        # if every video has (or lacks) hashtags one group is absent, and
        # indexing row 0 of an empty result would abort the whole figure.
        avg_likes_by_flag = dict(zip(hashtag_stats['has_hashtags'].to_list(),
                                     hashtag_stats['avg_likes'].to_list()))
        hashtag_likes = [(avg_likes_by_flag.get(flag) or 0.0) / 1e6
                         for flag in (True, False)]
        bars = ax.bar(hashtag_labels, hashtag_likes, alpha=0.7,
                      color=['#FF9999', '#66B2FF'])
        ax.set_title('Impact of Hashtags on Engagement', fontweight='bold')
        ax.set_ylabel('Average Likes (Millions)')
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, bars, lambda h: f'{h:.1f}M')

        # 6. Engagement rates comparison
        ax = axes[1, 2]
        # Like, Comment, Share rates from analysis (hard-coded, not derived from df)
        engagement_rates = [7.22, 0.11, 0.15]
        engagement_types = ['Like Rate', 'Comment Rate', 'Share Rate']
        bars = ax.bar(engagement_types, engagement_rates, alpha=0.7,
                      color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
        ax.set_title('Engagement Rate Comparison (%)', fontweight='bold')
        ax.set_ylabel('Engagement Rate (%)')
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, bars, lambda h: f'{h:.2f}%')

        plt.tight_layout()
        plt.savefig('comprehensive_tiktok_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
        print("📊 Comprehensive visualizations saved as 'comprehensive_tiktok_analysis.png'")

        # Create additional detailed visualizations
        create_detailed_analysis_charts(df)
    except Exception as e:
        # Top-level boundary of the script: report the failure and carry on so
        # the caller can still run the remaining steps.
        print(f"Error creating visualizations: {e}")
        import traceback
        traceback.print_exc()
def create_detailed_analysis_charts(df):
    """Create a 2x2 figure of detailed charts and save it as a PNG.

    Panels: pie of total likes per creator, bar chart of video count per
    creator, histogram of video duration, and a views-vs-likes scatter plot
    annotated with the correlation of the two columns. The figure is written
    to ``detailed_tiktok_analysis.png`` and shown, after which
    ``create_performance_summary_chart`` is called with the same frame.

    Args:
        df: polars DataFrame; the code reads the columns
            ``author_unique_id``, ``digg_count``, ``duration`` and
            ``play_count``.

    Returns:
        None.
    """
    plt.figure(figsize=(12, 8))

    # Subplot 1: Likes distribution by author
    plt.subplot(2, 2, 1)
    author_likes = df.group_by('author_unique_id').agg(
        pl.col('digg_count').sum().alias('total_likes')
    ).sort('total_likes', descending=True)
    plt.pie(author_likes['total_likes'].to_list(),
            labels=author_likes['author_unique_id'].to_list(),
            autopct='%1.1f%%', startangle=90)
    plt.title('Total Likes Distribution by Creator')

    # Subplot 2: Video count by author
    plt.subplot(2, 2, 2)
    author_counts = df.group_by('author_unique_id').agg(
        pl.len().alias('video_count')
    ).sort('video_count', descending=True)
    plt.bar(author_counts['author_unique_id'].to_list(),
            author_counts['video_count'].to_list(),
            alpha=0.7, color='skyblue')
    plt.title('Video Count by Creator')
    plt.xticks(rotation=45)

    # Subplot 3: Duration distribution
    plt.subplot(2, 2, 3)
    plt.hist(df['duration'].to_list(), bins=30, alpha=0.7, edgecolor='black')
    plt.title('Video Duration Distribution')
    plt.xlabel('Duration (seconds)')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)

    # Subplot 4: Views vs Likes scatter plot
    plt.subplot(2, 2, 4)
    plt.scatter(df['play_count'].to_list(), df['digg_count'].to_list(),
                alpha=0.6, s=20)
    plt.title('Views vs Likes Correlation')
    plt.xlabel('Views')
    plt.ylabel('Likes')
    plt.grid(True, alpha=0.3)

    # Add correlation coefficient
    correlation = df.select(pl.corr('play_count', 'digg_count')).item()
    # NOTE(review): guards against a null result (item() -> None), which would
    # make the ':.3f' format raise TypeError — confirm when polars yields null.
    correlation_text = 'n/a' if correlation is None else f'{correlation:.3f}'
    plt.text(0.05, 0.95, f'Correlation: {correlation_text}',
             transform=plt.gca().transAxes, fontsize=12,
             bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

    plt.tight_layout()
    plt.savefig('detailed_tiktok_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("📊 Detailed analysis charts saved as 'detailed_tiktok_analysis.png'")

    # Create performance summary chart
    create_performance_summary_chart(df)
def create_performance_summary_chart(df):
    """Create a bar chart of headline metrics with key insights overlaid.

    The four metric values and the insight texts are hard-coded in this
    function; nothing is computed from ``df``. The figure is written to
    ``tiktok_performance_summary.png`` and shown.

    Args:
        df: Unused. Kept so existing callers that pass the data frame keep
            working.

    Returns:
        None.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Key metrics from analysis; note the bars share one axis although the
    # first two are in millions and the last two are percentages.
    metrics = ['Avg Views', 'Avg Likes', 'Like Rate', 'Comment Rate']
    values = [21.7, 1.57, 7.22, 0.11]
    units = ['M', 'M', '%', '%']
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

    bars = ax.bar(metrics, values, color=colors, alpha=0.7)
    ax.set_title('TikTok Performance Summary', fontsize=16, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar, value, unit in zip(bars, values, units):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                f'{value} {unit}',
                ha='center', va='bottom', fontweight='bold')

    # Add insights as text (bullets and "≤" restored from mis-encoded bytes)
    insights = [
        "• Very short videos (≤15s) perform best",
        "• US content outperforms international",
        "• Hashtags boost engagement 1.7x",
        "• Top 3 creators = 76.4% of all likes",
    ]
    for i, insight in enumerate(insights):
        # Axes-fraction coordinates: stack the boxes downward from the top-left.
        ax.text(0.02, 0.95 - i*0.1, insight, transform=ax.transAxes,
                fontsize=10, bbox=dict(boxstyle="round,pad=0.3",
                                       facecolor="lightyellow", alpha=0.7))

    plt.tight_layout()
    plt.savefig('tiktok_performance_summary.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("📊 Performance summary saved as 'tiktok_performance_summary.png'")
def generate_insights_report():
    """Print the text-based key-insights report to stdout.

    The report is a fixed list of lines framed by two 70-character ``=``
    rules; it is not computed from any data. Emoji, bullet and "≤" characters
    are written as proper Unicode (they were previously stored as mis-decoded
    UTF-8 byte sequences and printed as garbage).

    Returns:
        None.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("📊 TIKTOK DATASET - KEY INSIGHTS REPORT")
    print(rule)

    insights = [
        "🎯 CONTENT STRATEGY INSIGHTS:",
        "• Very short videos (≤15s) generate 1.4x more likes than average",
        "• Optimal video length: 15-30 seconds for maximum engagement",
        "• Videos longer than 60s see significant drop in performance",
        "",
        "👥 CREATOR ECOSYSTEM:",
        "• Highly concentrated: Only 4 creators in entire dataset",
        "• Top 3 creators (zachking, mrbeast, addisonre) dominate:",
        " - Account for 76.4% of all likes",
        " - Generate highest average engagement rates",
        "",
        "🌍 GEOGRAPHIC PERFORMANCE:",
        "• US-based content performs 3.2x better than international",
        "• Indonesia has highest volume but lower engagement",
        "• Limited geographic diversity in dataset",
        "",
        "📊 ENGAGEMENT PATTERNS:",
        "• Strong correlation (0.65) between views and likes",
        "• Like rate: 7.22% (healthy engagement)",
        "• Comment rate: 0.11% (very low - viewers prefer liking)",
        "• Share rate: 0.15% (higher than comments)",
        "",
        "🔖 CONTENT OPTIMIZATION:",
        "• Videos with hashtags have 1.7x higher engagement",
        "• Average of 1.9 hashtags per video",
        "• Description length: ~44 characters on average",
        "",
        "📈 RECOMMENDATIONS:",
        "1. Focus on 15-30 second video format",
        "2. Always include relevant hashtags (1-3 optimal)",
        "3. Target US audience for maximum engagement",
        "4. Study top creators' content strategies",
        "5. Prioritize like-generating content over comments",
    ]
    # One print for the whole body; output is identical to printing each
    # line separately (empty entries become blank separator lines).
    print("\n".join(insights))

    print("\n" + rule)
if __name__ == "__main__":
    # Script entry point: render and save the charts first, then print the
    # text report. Order matters only for the sequence of console output.
    for step in (create_comprehensive_visualizations, generate_insights_report):
        step()