|
|
|
|
|
import traceback
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
|
|
|
|
|
def _annotate_bars(ax, bars, fmt, scale=1.0):
    """Write each bar's height, divided by ``scale``, centred on top of the bar.

    Args:
        ax: Matplotlib axes that owns ``bars``.
        bars: Bar container returned by ``ax.bar``.
        fmt: ``str.format`` template that receives the scaled height,
            e.g. ``'{:.1f}M'``.
        scale: Divisor applied to the height before formatting.
    """
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            fmt.format(height / scale),
            ha='center',
            va='bottom',
            fontweight='bold',
        )


def create_comprehensive_visualizations(csv_path='tiktok_cleaned.csv'):
    """Render the six-panel overview figure and save it as a PNG.

    Loads the CSV at ``csv_path`` with polars and draws a 2x3 grid:

    1. histogram of ``digg_count`` (log-scaled frequency axis),
    2. mean ``digg_count`` per ``duration_category``,
    3. mean ``digg_count`` and ``play_count`` per ``author_unique_id``,
    4. the six ``location_created`` values with the highest mean likes,
    5. mean likes for rows with and without ``has_hashtags``,
    6. a bar chart of three fixed engagement-rate figures.

    The figure is written to ``comprehensive_tiktok_analysis.png``, shown,
    and closed; the DataFrame is then passed on to
    ``create_detailed_analysis_charts``.

    Any exception raised along the way is printed to stdout together with
    its traceback instead of propagating to the caller.

    Args:
        csv_path: Path of the CSV file to load. Defaults to the file name
            that was previously hard-coded.
    """
    try:
        df = pl.read_csv(csv_path)

        plt.style.use('default')
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        fig.suptitle(
            'TikTok Dataset: Comprehensive Performance Analysis',
            fontsize=18,
            fontweight='bold',
        )

        # --- Panel 1: distribution of likes ---------------------------------
        ax = axes[0, 0]
        ax.hist(
            df['digg_count'].to_list(),
            bins=50,
            alpha=0.7,
            edgecolor='black',
            log=True,
        )
        ax.set_title('Distribution of Video Likes\n(Log Scale)', fontweight='bold')
        ax.set_xlabel('Number of Likes')
        ax.set_ylabel('Frequency (Log Scale)')
        ax.grid(True, alpha=0.3)

        # --- Panel 2: average likes per duration category -------------------
        duration_stats = (
            df.group_by('duration_category')
            .agg(pl.col('digg_count').mean().alias('avg_likes'))
            .sort('avg_likes', descending=True)
        )
        ax = axes[0, 1]
        duration_bars = ax.bar(
            duration_stats['duration_category'].to_list(),
            duration_stats['avg_likes'].to_list(),
            alpha=0.7,
            color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'],
        )
        ax.set_title('Average Likes by Video Duration', fontweight='bold')
        ax.set_xlabel('Duration Category')
        ax.set_ylabel('Average Likes')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)
        # Heights are raw like counts here, so scale them to millions.
        _annotate_bars(ax, duration_bars, '{:.1f}M', scale=1e6)

        # --- Panel 3: per-author likes vs. views ----------------------------
        author_stats = (
            df.group_by('author_unique_id')
            .agg([
                pl.col('digg_count').mean().alias('avg_likes'),
                pl.col('play_count').mean().alias('avg_views'),
            ])
            .sort('avg_likes', descending=True)
        )
        authors = author_stats['author_unique_id'].to_list()
        likes_in_millions = [v / 1e6 for v in author_stats['avg_likes'].to_list()]
        views_in_millions = [v / 1e6 for v in author_stats['avg_views'].to_list()]

        x_pos = np.arange(len(authors))
        width = 0.35  # two bars per author, side by side

        ax = axes[0, 2]
        ax.bar(x_pos - width / 2, likes_in_millions, width,
               label='Avg Likes (M)', alpha=0.7)
        ax.bar(x_pos + width / 2, views_in_millions, width,
               label='Avg Views (M)', alpha=0.7)
        ax.set_title('Author Performance Comparison', fontweight='bold')
        ax.set_xlabel('Authors')
        ax.set_ylabel('Count (Millions)')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(authors, rotation=45)
        ax.legend()
        ax.grid(True, alpha=0.3)

        # --- Panel 4: top six locations by average likes --------------------
        location_stats = (
            df.filter(pl.col('location_created').is_not_null())
            .group_by('location_created')
            .agg(pl.col('digg_count').mean().alias('avg_likes'))
            .sort('avg_likes', descending=True)
            .head(6)
        )
        ax = axes[1, 0]
        location_bars = ax.bar(
            location_stats['location_created'].to_list(),
            [v / 1e6 for v in location_stats['avg_likes'].to_list()],
            alpha=0.7,
        )
        ax.set_title('Average Likes by Location\n(Top 6 Countries)', fontweight='bold')
        ax.set_xlabel('Country Code')
        ax.set_ylabel('Average Likes (Millions)')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, location_bars, '{:.1f}M')

        # --- Panel 5: hashtag impact ----------------------------------------
        hashtag_stats = df.group_by('has_hashtags').agg(
            pl.col('digg_count').mean().alias('avg_likes')
        )
        avg_likes_by_flag = dict(zip(
            hashtag_stats['has_hashtags'].to_list(),
            hashtag_stats['avg_likes'].to_list(),
        ))
        # Plot only the groups that exist: indexing row 0 of an empty filter
        # result raised IndexError when every video had (or lacked) hashtags.
        hashtag_labels = []
        hashtag_likes = []
        for flag, label in ((True, 'With Hashtags'), (False, 'Without Hashtags')):
            mean_likes = avg_likes_by_flag.get(flag)
            if mean_likes is not None:
                hashtag_labels.append(label)
                hashtag_likes.append(mean_likes / 1e6)

        ax = axes[1, 1]
        hashtag_bars = ax.bar(hashtag_labels, hashtag_likes, alpha=0.7,
                              color=['#FF9999', '#66B2FF'])
        ax.set_title('Impact of Hashtags on Engagement', fontweight='bold')
        ax.set_ylabel('Average Likes (Millions)')
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, hashtag_bars, '{:.1f}M')

        # --- Panel 6: engagement rates --------------------------------------
        # NOTE(review): these percentages are fixed literals, not derived from
        # `df`; presumably copied from an earlier analysis run - confirm they
        # still match the data being plotted.
        engagement_rates = [7.22, 0.11, 0.15]
        engagement_types = ['Like Rate', 'Comment Rate', 'Share Rate']

        ax = axes[1, 2]
        rate_bars = ax.bar(engagement_types, engagement_rates, alpha=0.7,
                           color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
        ax.set_title('Engagement Rate Comparison (%)', fontweight='bold')
        ax.set_ylabel('Engagement Rate (%)')
        ax.grid(True, alpha=0.3)
        _annotate_bars(ax, rate_bars, '{:.2f}%')

        plt.tight_layout()
        plt.savefig('comprehensive_tiktok_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so the follow-up charts do not pile up in memory.
        plt.close(fig)

        # NOTE(review): the leading 'π' looks like a mis-decoded emoji; the
        # original glyph cannot be recovered from this file, so the text is
        # left untouched.
        print("π Comprehensive visualizations saved as 'comprehensive_tiktok_analysis.png'")

        create_detailed_analysis_charts(df)

    except Exception as e:
        # Best-effort script entry point: report the failure, do not re-raise.
        print(f"Error creating visualizations: {e}")
        traceback.print_exc()
|
|
|
|
|
def create_detailed_analysis_charts(df):
    """Render the 2x2 detail figure and save it as a PNG.

    Panels, in reading order:

    1. pie chart of summed ``digg_count`` per ``author_unique_id``,
    2. bar chart of row counts per ``author_unique_id``,
    3. histogram of ``duration``,
    4. scatter of ``play_count`` against ``digg_count``, annotated with
       the correlation polars reports for the two columns.

    The figure is written to ``detailed_tiktok_analysis.png``, shown, and
    closed; ``df`` is then passed on to ``create_performance_summary_chart``.

    Args:
        df: polars DataFrame providing the columns named above.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # --- Panel 1: share of total likes per creator --------------------------
    likes_by_author = (
        df.group_by('author_unique_id')
        .agg(pl.col('digg_count').sum().alias('total_likes'))
        .sort('total_likes', descending=True)
    )
    ax = axes[0, 0]
    ax.pie(
        likes_by_author['total_likes'].to_list(),
        labels=likes_by_author['author_unique_id'].to_list(),
        autopct='%1.1f%%',
        startangle=90,
    )
    ax.set_title('Total Likes Distribution by Creator')

    # --- Panel 2: number of videos per creator ------------------------------
    videos_by_author = (
        df.group_by('author_unique_id')
        .agg(pl.len().alias('video_count'))
        .sort('video_count', descending=True)
    )
    ax = axes[0, 1]
    ax.bar(
        videos_by_author['author_unique_id'].to_list(),
        videos_by_author['video_count'].to_list(),
        alpha=0.7,
        color='skyblue',
    )
    ax.set_title('Video Count by Creator')
    ax.tick_params(axis='x', rotation=45)

    # --- Panel 3: duration histogram ----------------------------------------
    ax = axes[1, 0]
    ax.hist(df['duration'].to_list(), bins=30, alpha=0.7, edgecolor='black')
    ax.set_title('Video Duration Distribution')
    ax.set_xlabel('Duration (seconds)')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

    # --- Panel 4: views vs. likes -------------------------------------------
    ax = axes[1, 1]
    ax.scatter(df['play_count'].to_list(), df['digg_count'].to_list(),
               alpha=0.6, s=20)
    ax.set_title('Views vs Likes Correlation')
    ax.set_xlabel('Views')
    ax.set_ylabel('Likes')
    ax.grid(True, alpha=0.3)

    correlation = df.select(pl.corr('play_count', 'digg_count')).item()
    # Formatting None with ':.3f' raises TypeError, so fall back to a
    # placeholder if polars yields a null correlation.
    corr_label = 'n/a' if correlation is None else f'{correlation:.3f}'
    ax.text(
        0.05, 0.95, f'Correlation: {corr_label}',
        transform=ax.transAxes,
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
    )

    fig.tight_layout()
    fig.savefig('detailed_tiktok_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure once it has been saved and displayed.
    plt.close(fig)

    # NOTE(review): the leading 'π' looks like a mis-decoded emoji; the
    # original glyph cannot be recovered from this file, so the text is left
    # untouched.
    print("π Detailed analysis charts saved as 'detailed_tiktok_analysis.png'")

    create_performance_summary_chart(df)
|
|
|
|
|
def create_performance_summary_chart(df):
    """Draw the headline-metrics bar chart and save it as a PNG.

    Plots four fixed metric values as bars, labels each bar with its value
    and unit, overlays four fixed insight captions in the upper-left of the
    axes, then writes ``tiktok_performance_summary.png``, shows the figure,
    and closes it.

    Args:
        df: Accepted for call-site compatibility; this function does not
            read it. NOTE(review): every number and caption below is a
            literal, presumably copied from an earlier analysis run -
            confirm they still match the data.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    metrics = ['Avg Views', 'Avg Likes', 'Like Rate', 'Comment Rate']
    values = [21.7, 1.57, 7.22, 0.11]
    units = ['M', 'M', '%', '%']
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

    bars = ax.bar(metrics, values, color=colors, alpha=0.7)

    ax.set_title('TikTok Performance Summary', fontsize=16, fontweight='bold')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3, axis='y')

    # Label every bar with its value and unit, centred on the bar top.
    for bar, value, unit in zip(bars, values, units):
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            bar.get_height(),
            f'{value} {unit}',
            ha='center',
            va='bottom',
            fontweight='bold',
        )

    # The bullet and "≤" characters were stored as mojibake ("β’", "β€"),
    # which appears to be UTF-8 decoded as ISO-8859-7; restored here.
    insights = [
        "• Very short videos (≤15s) perform best",
        "• US content outperforms international",
        "• Hashtags boost engagement 1.7x",
        "• Top 3 creators = 76.4% of all likes",
    ]

    # Stack the captions downwards from the top-left corner (axes coordinates).
    for row, insight in enumerate(insights):
        ax.text(
            0.02, 0.95 - row * 0.1, insight,
            transform=ax.transAxes,
            fontsize=10,
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow", alpha=0.7),
        )

    plt.tight_layout()
    plt.savefig('tiktok_performance_summary.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure once it has been saved and displayed.
    plt.close(fig)

    # NOTE(review): the leading 'π' looks like a mis-decoded emoji; the
    # original glyph cannot be recovered from this file, so the text is left
    # untouched.
    print("π Performance summary saved as 'tiktok_performance_summary.png'")
|
|
|
|
|
def generate_insights_report():
    """Print the fixed, text-based key-insights report to stdout.

    The report is a banner (a blank line, a 70-character ``=`` rule, the
    title, another rule), the insight lines below, and a closing blank line
    plus rule. Nothing is computed from data; every line is a literal.

    Returns:
        None.
    """
    rule = "=" * 70

    print("\n" + rule)
    # NOTE(review): a lone leading 'π' (here and in several section headers
    # below) looks like a mis-decoded emoji whose remaining bytes were lost;
    # the original glyph cannot be recovered, so those are left untouched.
    print("π TIKTOK DATASET - KEY INSIGHTS REPORT")
    print(rule)

    # Bullets ("β’"), "≤" ("β€") and two header emoji ("π―", "π₯") were stored
    # as mojibake, which appears to be UTF-8 decoded as ISO-8859-7; the
    # sequences that decode unambiguously are restored here.
    insights = [
        "🎯 CONTENT STRATEGY INSIGHTS:",
        "• Very short videos (≤15s) generate 1.4x more likes than average",
        "• Optimal video length: 15-30 seconds for maximum engagement",
        "• Videos longer than 60s see significant drop in performance",
        "",
        "👥 CREATOR ECOSYSTEM:",
        "• Highly concentrated: Only 4 creators in entire dataset",
        "• Top 3 creators (zachking, mrbeast, addisonre) dominate:",
        "  - Account for 76.4% of all likes",
        "  - Generate highest average engagement rates",
        "",
        "π GEOGRAPHIC PERFORMANCE:",
        "• US-based content performs 3.2x better than international",
        "• Indonesia has highest volume but lower engagement",
        "• Limited geographic diversity in dataset",
        "",
        "π ENGAGEMENT PATTERNS:",
        "• Strong correlation (0.65) between views and likes",
        "• Like rate: 7.22% (healthy engagement)",
        "• Comment rate: 0.11% (very low - viewers prefer liking)",
        "• Share rate: 0.15% (higher than comments)",
        "",
        "π CONTENT OPTIMIZATION:",
        "• Videos with hashtags have 1.7x higher engagement",
        "• Average of 1.9 hashtags per video",
        "• Description length: ~44 characters on average",
        "",
        "π RECOMMENDATIONS:",
        "1. Focus on 15-30 second video format",
        "2. Always include relevant hashtags (1-3 optimal)",
        "3. Target US audience for maximum engagement",
        "4. Study top creators' content strategies",
        "5. Prioritize like-generating content over comments",
    ]

    # One print per entry; empty strings produce the blank separator lines.
    for line in insights:
        print(line)

    print("\n" + rule)
|
|
|
|
|
def main():
    """Build and save all charts, then print the insights report."""
    create_comprehensive_visualizations()
    generate_insights_report()


# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()