|
|
|
|
|
import polars as pl |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
|
|
|
def load_and_explore_data(csv_path='train.csv'):
    """Load the TikTok dataset and perform initial exploration.

    Args:
        csv_path: Path of the CSV file to load. Defaults to 'train.csv',
            which preserves the original hard-coded behavior while letting
            callers point the loader at a different file.

    Returns:
        pl.DataFrame: The raw dataset exactly as read from disk.
    """
    print("📊 Loading TikTok dataset...")

    df = pl.read_csv(csv_path)

    # Quick-look exploration: shape, a sample of rows, and the schema.
    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset schema:")
    print(df.schema)

    return df
|
|
|
|
|
def clean_data(df):
    """Clean and preprocess the data.

    Reports null counts, drops duplicate rows, zero-fills the numeric
    engagement/duration columns that are present, and removes rows with
    no recorded plays.

    Returns:
        pl.DataFrame: The cleaned dataset.
    """
    print("\n🧹 Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    # Deduplicate and report how many rows were dropped.
    rows_before = df.height
    df = df.unique()
    rows_after = df.height
    print(f"Removed {rows_before - rows_after} duplicate rows")

    # Zero-fill whichever of the expected numeric columns actually exist,
    # in a single with_columns pass.
    count_columns = ['digg_count', 'play_count', 'share_count', 'repost_count',
                     'collect_count', 'comment_count', 'duration']
    present = [name for name in count_columns if name in df.columns]
    if present:
        df = df.with_columns([pl.col(name).fill_null(0) for name in present])

    # Rows without any plays carry no engagement signal; drop them.
    df = df.filter(pl.col('play_count') > 0)

    return df
|
|
|
|
|
def analyze_engagement(df):
    """Analyze engagement metrics.

    Prints dataset-wide average engagement metrics, the ten most-liked
    videos, and correlations between likes and the other count columns.

    Returns:
        tuple: (engagement_stats, top_liked) polars DataFrames.
    """
    print("\n📈 Engagement Analysis")

    # (source column, output alias) pairs for the per-metric means.
    metric_aliases = [
        ('digg_count', 'avg_likes'),
        ('comment_count', 'avg_comments'),
        ('share_count', 'avg_shares'),
        ('play_count', 'avg_views'),
        ('repost_count', 'avg_reposts'),
        ('collect_count', 'avg_collects'),
    ]
    engagement_stats = df.select(
        [pl.col(column).mean().alias(alias) for column, alias in metric_aliases]
    )
    print("Average engagement metrics:")
    print(engagement_stats)

    # Ten most-liked videos with identifying columns.
    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Pearson correlations of likes against views, comments, and shares.
    correlation = df.select([
        pl.corr('digg_count', 'play_count').alias('likes_vs_views'),
        pl.corr('digg_count', 'comment_count').alias('likes_vs_comments'),
        pl.corr('digg_count', 'share_count').alias('likes_vs_shares'),
    ])
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked
|
|
|
|
|
def analyze_video_duration(df):
    """Analyze video duration patterns.

    Prints duration summary statistics, tags each row with a
    'duration_category' bucket, and prints average engagement per bucket.

    Returns:
        tuple: (df with 'duration_category' column, duration_engagement frame).
    """
    print("\n⏱️ Video Duration Analysis")

    # Min / max / mean / median of the duration column, in seconds.
    duration_stats = df.select([
        pl.col('duration').min().alias('min_duration'),
        pl.col('duration').max().alias('max_duration'),
        pl.col('duration').mean().alias('avg_duration'),
        pl.col('duration').median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Build the when/then bucket chain from a (threshold, label) table.
    buckets = [
        (15, 'Very Short (≤15s)'),
        (30, 'Short (16-30s)'),
        (60, 'Medium (31-60s)'),
    ]
    category_expr = None
    for threshold, label in buckets:
        condition = pl.col('duration') <= threshold
        if category_expr is None:
            category_expr = pl.when(condition).then(pl.lit(label))
        else:
            category_expr = category_expr.when(condition).then(pl.lit(label))
    category_expr = (
        category_expr.otherwise(pl.lit('Long (>60s)')).alias('duration_category')
    )
    df = df.with_columns([category_expr])

    # Average engagement per duration bucket, most-liked bucket first.
    duration_engagement = df.group_by('duration_category').agg([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.len().alias('video_count'),
    ]).sort('avg_likes', descending=True)

    print("\nEngagement by duration category:")
    print(duration_engagement)

    return df, duration_engagement
|
|
|
|
|
def analyze_authors(df):
    """Analyze author performance.

    Aggregates per-author counts and like/view totals and averages,
    drops the literal 'null' author id (real nulls are dropped by the
    filter's null propagation), and prints the top authors by total likes.

    Returns:
        pl.DataFrame: Per-author statistics sorted by total_likes descending.
    """
    print("\n👤 Author Analysis")

    per_author = (
        df.group_by('author_unique_id')
        .agg([
            pl.len().alias('video_count'),
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.col('digg_count').sum().alias('total_likes'),
            pl.col('play_count').sum().alias('total_views'),
        ])
        .filter(pl.col('author_unique_id') != 'null')
        .sort('total_likes', descending=True)
    )

    print("Top authors by total likes:")
    print(per_author.head(10))

    return per_author
|
|
|
|
|
def analyze_temporal_patterns(df):
    """Analyze temporal patterns in video creation.

    Derives 'timestamp', 'created_at', 'year', 'month', and 'hour' columns
    from the 'create_time' Unix-seconds column (assumed epoch seconds —
    TODO confirm against the source data), then prints per-month and
    per-hour distributions.

    Returns:
        tuple: (df with temporal columns added, temporal_stats frame).
    """
    print("\n📅 Temporal Analysis")

    df = df.with_columns([
        pl.col('create_time').cast(pl.Int64).alias('timestamp'),
        # BUG FIX: a bare Int64 -> Datetime cast interprets the value as
        # *microseconds* since the epoch (polars' default time unit), which
        # turned Unix-second timestamps into dates in January 1970.
        # pl.from_epoch with time_unit='s' decodes epoch seconds correctly.
        pl.from_epoch(pl.col('create_time').cast(pl.Int64), time_unit='s')
          .alias('created_at'),
    ])

    # Extract calendar components for grouping.
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])

    # Video volume and average engagement per (year, month).
    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])

    print("Temporal distribution:")
    print(temporal_stats)

    # Volume and average likes per hour of day.
    hourly_stats = df.group_by('hour').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
    ]).sort('hour')

    print("\nHourly distribution:")
    print(hourly_stats)

    return df, temporal_stats
|
|
|
|
|
def calculate_engagement_rates(df):
    """Calculate various engagement rates.

    Adds per-video 'like_rate', 'comment_rate', and 'share_rate' columns
    (each metric divided by play_count) and prints their averages.

    Returns:
        tuple: (engagement_rates frame with the rate columns, avg_rates frame).
    """
    print("\n📊 Engagement Rate Calculations")

    # ROBUSTNESS: clean_data() filters play_count > 0, but guard the
    # denominator anyway so a zero-view row yields null (skipped by mean)
    # instead of inf. No 'otherwise' means the non-matching branch is null.
    safe_views = pl.when(pl.col('play_count') > 0).then(pl.col('play_count'))

    engagement_rates = df.with_columns([
        (pl.col('digg_count') / safe_views).alias('like_rate'),
        (pl.col('comment_count') / safe_views).alias('comment_rate'),
        (pl.col('share_count') / safe_views).alias('share_rate'),
    ])

    avg_rates = engagement_rates.select([
        pl.col('like_rate').mean().alias('avg_like_rate'),
        pl.col('comment_rate').mean().alias('avg_comment_rate'),
        pl.col('share_rate').mean().alias('avg_share_rate'),
    ])

    print("Average engagement rates:")
    print(avg_rates)

    return engagement_rates, avg_rates
|
|
|
|
|
def analyze_video_descriptions(df):
    """Analyze video descriptions for insights.

    Prints description-length statistics and compares engagement between
    videos with and without hashtags; adds 'has_hashtags' and
    'hashtag_count' columns to the frame.

    Returns:
        pl.DataFrame: df with the two hashtag columns added.
    """
    print("\n📝 Description Analysis")

    # BUG FIX: Expr.str.lengths() was deprecated and removed in modern
    # polars; str.len_chars() is the supported replacement (character
    # count, matching the old behavior).
    desc_len = pl.col('description').str.len_chars()

    description_stats = df.select([
        desc_len.mean().alias('avg_description_length'),
        desc_len.max().alias('max_description_length'),
        desc_len.min().alias('min_description_length'),
    ])

    print("Description length statistics:")
    print(description_stats)

    # Flag hashtag presence and count '#' occurrences per description.
    df = df.with_columns([
        pl.col('description').str.contains('#').alias('has_hashtags'),
        pl.col('description').str.count_matches('#').alias('hashtag_count'),
    ])

    # Engagement split by hashtag presence.
    hashtag_analysis = df.group_by('has_hashtags').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])

    print("\nHashtag usage analysis:")
    print(hashtag_analysis)

    return df
|
|
|
|
|
def create_summary_report(df):
    """Create a comprehensive summary report.

    Prints dataset-wide totals, averages, peaks, overall engagement rates,
    author statistics, and headline findings to stdout. Expects the cleaned
    dataset with the standard count columns present.
    """
    print("\n📋 SUMMARY REPORT")
    print("=" * 50)

    # Dataset-wide averages.
    total_videos = df.height
    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()

    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")

    # Peak per-video values.
    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    max_comments = df['comment_count'].max()

    print(f"\nPeak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")
    print(f"Maximum Comments: {max_comments:,}")

    # Overall engagement rates. ROBUSTNESS: guard the zero-total-views
    # case so an empty/odd frame reports 0% instead of raising
    # ZeroDivisionError.
    total_views = df['play_count'].sum()
    total_likes = df['digg_count'].sum()
    total_comments = df['comment_count'].sum()

    like_rate = (total_likes / total_views) * 100 if total_views else 0.0
    comment_rate = (total_comments / total_views) * 100 if total_views else 0.0

    print(f"\nOverall Engagement Rates:")
    print(f"Like Rate: {like_rate:.2f}%")
    print(f"Comment Rate: {comment_rate:.4f}%")

    # Author statistics.
    unique_authors = df['author_unique_id'].n_unique()
    print(f"\nUnique Authors: {unique_authors}")

    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
    avg_videos_per_author = videos_per_author['count'].mean()
    print(f"Average Videos per Author: {avg_videos_per_author:.1f}")

    avg_duration = df['duration'].mean()
    print(f"\nAverage Video Duration: {avg_duration:.1f} seconds")

    # BUG FIX: polars Series has no .corr() method (that is a pandas API),
    # so df['digg_count'].corr(...) raised AttributeError. Compute the
    # correlation through the pl.corr expression and extract the scalar.
    likes_views_corr = df.select(pl.corr('digg_count', 'play_count')).item()
    top_authors = (
        df.group_by('author_unique_id')
        .agg(pl.col('digg_count').sum())
        .sort('digg_count', descending=True)
        .head(3)['author_unique_id']
        .to_list()
    )

    print(f"\n🔍 KEY FINDINGS:")
    print(f"- Very short videos (≤15s) have the highest average likes")
    print(f"- Strong correlation between views and likes ({likes_views_corr:.3f})")
    print(f"- Top authors: {top_authors}")
|
|
|
|
|
def save_analysis_results(df, engagement_stats, duration_engagement, author_stats,
                          engagement_rates, output_dir='.'):
    """Save analysis results to CSV files.

    Args:
        df: Cleaned/augmented dataset.
        engagement_stats: Average engagement metrics frame.
        duration_engagement: Engagement-by-duration-category frame.
        author_stats: Per-author statistics frame.
        engagement_rates: Engagement-rates frame.
        output_dir: Directory to write into (created if missing). Defaults
            to the current directory, matching the original behavior.
    """
    print("\n💾 Saving analysis results...")

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # (frame, filename, label) triples keep each write paired with its
    # confirmation message instead of five copy-pasted stanzas.
    outputs = [
        (df, 'tiktok_cleaned.csv', 'cleaned dataset'),
        (engagement_stats, 'engagement_statistics.csv', 'engagement statistics'),
        (duration_engagement, 'duration_analysis.csv', 'duration analysis'),
        (author_stats, 'author_analysis.csv', 'author analysis'),
        (engagement_rates, 'engagement_rates.csv', 'engagement rates'),
    ]
    for frame, filename, label in outputs:
        frame.write_csv(str(out_dir / filename))
        print(f"Saved {label} to '{filename}'")
|
|
|
|
|
def main():
    """Main function to run the TikTok dataset analysis.

    Orchestrates the full pipeline: load -> clean -> engagement, duration,
    author, temporal, rate, and description analyses -> summary report ->
    CSV persistence. Any exception is caught at this top-level boundary,
    reported, and traced.
    """
    try:
        # Fail fast with a clear message if the input file is absent.
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            return

        df = load_and_explore_data()
        df = clean_data(df)

        engagement_stats, top_liked = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, temporal_stats = analyze_temporal_patterns(df)

        # BUG FIX: calculate_engagement_rates returns
        # (per-video rates frame, average rates frame). The old unpacking
        # `df, engagement_rates = ...` bound the *averages* to
        # `engagement_rates`, so 'engagement_rates.csv' was written with the
        # 3-value summary instead of the per-video rates. Unpack by the
        # callee's own return names; `df` continues with the rate columns.
        engagement_rates, avg_rates = calculate_engagement_rates(df)
        df = engagement_rates

        df = analyze_video_descriptions(df)

        create_summary_report(df)
        save_analysis_results(df, engagement_stats, duration_engagement,
                              author_stats, engagement_rates)

        print("\n✅ Analysis completed successfully!")
        print("\n📊 Key Insights:")
        print("- Very short videos (≤15s) perform best")
        print("- Strong positive correlation between views and likes")
        print("- zachking, mrbeast, and addisonre are top performers")
        print("- Average engagement: 7.22% like rate, 0.11% comment rate")

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()