|
|
|
|
|
import polars as pl |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
|
|
|
def load_and_explore_data(path='train.csv'):
    """Load the TikTok dataset and print a first look at it.

    Args:
        path: Location of the CSV file to read. Defaults to ``'train.csv'``
            (resolved against the current working directory), which is the
            file this function previously hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        The DataFrame produced by ``pl.read_csv(path)``.
    """
    print("π Loading TikTok dataset...")

    df = pl.read_csv(path)

    # Quick orientation: dimensions, a sample of rows, and inferred dtypes.
    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset schema:")
    print(df.schema)

    return df
|
|
|
|
|
def clean_data(df):
    """Clean and preprocess the data.

    Steps, in order:
      1. Print the per-column null counts.
      2. Drop fully duplicated rows and report how many were removed.
      3. Replace nulls with 0 in the known numeric columns that are present.
      4. Keep only rows whose ``play_count`` is greater than 0.

    Args:
        df: The raw DataFrame. It must contain a ``play_count`` column.

    Returns:
        The cleaned DataFrame.
    """
    print("\nπ§Ή Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    initial_count = df.height
    # maintain_order=True keeps the surviving rows in input order. Without it
    # Polars gives no ordering guarantee, so the cleaned output (and any
    # tie-breaking in later sorts) could differ from run to run.
    df = df.unique(maintain_order=True)
    final_count = df.height
    print(f"Removed {initial_count - final_count} duplicate rows")

    numeric_columns = ['digg_count', 'play_count', 'share_count', 'repost_count',
                       'collect_count', 'comment_count', 'duration']

    # Fill every numeric column that exists in a single pass instead of
    # rebuilding the frame once per column.
    present_columns = [col for col in numeric_columns if col in df.columns]
    if present_columns:
        df = df.with_columns(pl.col(present_columns).fill_null(0))

    # Rows with a null play_count were filled with 0 above, so they are
    # removed here together with genuine zero-view rows.
    df = df.filter(pl.col('play_count') > 0)

    return df
|
|
|
|
|
def analyze_engagement(df):
    """Summarise engagement: metric means, most-liked videos, correlations.

    Prints each result as it is computed.

    Args:
        df: DataFrame with the count columns used below plus ``url`` and
            ``author_unique_id``.

    Returns:
        A tuple ``(engagement_stats, top_liked, correlation)``: a one-row
        frame of column means, the ten rows with the highest ``digg_count``,
        and a one-row frame of correlations between ``digg_count`` and the
        play, comment and share counts.
    """
    print("\nπ Engagement Analysis")

    # (source column, output alias) pairs, in the order they are reported.
    mean_columns = (
        ('digg_count', 'avg_likes'),
        ('comment_count', 'avg_comments'),
        ('share_count', 'avg_shares'),
        ('play_count', 'avg_views'),
        ('repost_count', 'avg_reposts'),
        ('collect_count', 'avg_collects'),
    )
    engagement_stats = df.select(
        [pl.col(source).mean().alias(label) for source, label in mean_columns]
    )
    print("Average engagement metrics:")
    print(engagement_stats)

    ranked_by_likes = df.sort('digg_count', descending=True)
    top_liked = ranked_by_likes.head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Each entry is correlated against digg_count.
    like_pairings = (
        ('play_count', 'likes_vs_views'),
        ('comment_count', 'likes_vs_comments'),
        ('share_count', 'likes_vs_shares'),
    )
    correlation = df.select(
        [pl.corr('digg_count', other).alias(label) for other, label in like_pairings]
    )
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked, correlation
|
|
|
|
|
def analyze_video_duration(df):
    """Report duration statistics and engagement per duration bucket.

    Args:
        df: DataFrame with ``duration`` and the engagement count columns.

    Returns:
        A tuple ``(df, duration_engagement)``: the input frame with an added
        ``duration_category`` column, and the per-category averages sorted by
        ``avg_likes`` in descending order.
    """
    print("\nβ±οΈ Video Duration Analysis")

    length = pl.col('duration')

    duration_stats = df.select([
        length.min().alias('min_duration'),
        length.max().alias('max_duration'),
        length.mean().alias('avg_duration'),
        length.median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Branches are tested top to bottom, so each threshold only sees values
    # that exceeded the previous one; anything left over is labelled as long.
    bucket = (
        pl.when(length <= 15)
        .then(pl.lit('Very Short (β€15s)'))
        .when(length <= 30)
        .then(pl.lit('Short (16-30s)'))
        .when(length <= 60)
        .then(pl.lit('Medium (31-60s)'))
        .otherwise(pl.lit('Long (>60s)'))
        .alias('duration_category')
    )
    df = df.with_columns([bucket])

    per_bucket_metrics = [
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.len().alias('video_count'),
    ]
    grouped = df.group_by('duration_category').agg(per_bucket_metrics)
    duration_engagement = grouped.sort('avg_likes', descending=True)

    print("\nEngagement by duration category:")
    print(duration_engagement)

    return df, duration_engagement
|
|
|
|
|
def analyze_authors(df):
    """Aggregate per-author video counts, likes and views.

    Args:
        df: DataFrame with ``author_unique_id``, ``digg_count`` and
            ``play_count`` columns.

    Returns:
        One row per author, sorted by ``total_likes`` in descending order.
        The group whose id is the literal string ``'null'`` is excluded.
    """
    print("\nπ€ Author Analysis")

    per_author_metrics = [
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('digg_count').sum().alias('total_likes'),
        pl.col('play_count').sum().alias('total_views'),
    ]
    grouped = df.group_by('author_unique_id').agg(per_author_metrics)

    # The filter runs on the aggregated frame, i.e. on one row per author.
    named_authors = grouped.filter(pl.col('author_unique_id') != 'null')
    author_stats = named_authors.sort('total_likes', descending=True)

    print("Top authors by total likes:")
    print(author_stats.head(10))

    return author_stats
|
|
|
|
|
def analyze_temporal_patterns(df, time_unit='s'):
    """Analyze temporal patterns in video creation.

    Adds ``timestamp`` (``create_time`` as Int64), ``created_at`` (a
    datetime) and ``year`` / ``month`` / ``hour`` columns, then prints
    per-month and per-hour aggregates.

    Args:
        df: DataFrame with ``create_time``, ``digg_count`` and
            ``play_count`` columns.
        time_unit: Unit of the integer epoch stored in ``create_time``;
            one of ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``.
            NOTE(review): the default assumes Unix epoch seconds - confirm
            against the dataset and pass another unit if that is wrong.

    Returns:
        A tuple ``(df, temporal_stats)``: the frame with the added columns,
        and the per ``(year, month)`` aggregates sorted chronologically.
    """
    # This literal had been split across two source lines, which is a
    # syntax error; it is rejoined here.
    print("\nπ Temporal Analysis")

    epoch = pl.col('create_time').cast(pl.Int64)

    # A bare cast from Int64 to pl.Datetime interprets the integer in the
    # default microsecond unit, which would place epoch-second values a few
    # minutes after 1970-01-01. from_epoch applies the stated unit instead.
    df = df.with_columns([
        epoch.alias('timestamp'),
        pl.from_epoch(epoch, time_unit=time_unit).alias('created_at'),
    ])

    # No time zone is attached, so these parts are read straight off the
    # converted epoch value.
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])

    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])

    print("Temporal distribution:")
    print(temporal_stats)

    hourly_stats = df.group_by('hour').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
    ]).sort('hour')

    print("\nHourly distribution:")
    print(hourly_stats)

    return df, temporal_stats
|
|
|
|
|
def calculate_engagement_rates(df):
    """Compute per-video and overall like, comment and share rates.

    Every rate is the corresponding count divided by ``play_count``.

    Args:
        df: DataFrame with ``digg_count``, ``comment_count``,
            ``share_count`` and ``play_count`` columns.

    Returns:
        A tuple ``(engagement_rates, avg_rates)``: the input frame with
        ``like_rate`` / ``comment_rate`` / ``share_rate`` columns added, and
        a one-row frame holding the mean of each of those columns. The
        overall percentage rates are printed but not returned.
    """
    print("\nπ Engagement Rate Calculations")

    # (count column, rate name) pairs; the rate name is used to build the
    # output column aliases below.
    metrics = (
        ('digg_count', 'like'),
        ('comment_count', 'comment'),
        ('share_count', 'share'),
    )
    views = pl.col('play_count')

    per_video = [
        (pl.col(count) / views).alias(f'{name}_rate') for count, name in metrics
    ]
    engagement_rates = df.with_columns(per_video)

    rate_means = [
        pl.col(f'{name}_rate').mean().alias(f'avg_{name}_rate')
        for _, name in metrics
    ]
    avg_rates = engagement_rates.select(rate_means)

    print("Average engagement rates:")
    print(avg_rates)

    # Ratio of totals rather than the mean of per-video ratios.
    overall = [
        (pl.col(count).sum() / views.sum() * 100).alias(f'overall_{name}_rate_percent')
        for count, name in metrics
    ]
    avg_rates_percent = engagement_rates.select(overall)

    print("\nOverall engagement rates (%):")
    print(avg_rates_percent)

    return engagement_rates, avg_rates
|
|
|
|
|
def analyze_video_descriptions(df):
    """Analyze video descriptions for insights.

    Prints description-length statistics, compares engagement between
    videos with and without a ``#`` in the description, and reports
    hashtag-count statistics for videos that have at least one.

    Args:
        df: DataFrame with ``description``, ``digg_count`` and
            ``play_count`` columns.

    Returns:
        The input frame with ``has_hashtags`` (bool) and ``hashtag_count``
        columns added. Both are non-null: a missing description counts as
        having no hashtags.
    """
    print("\nπ Description Analysis")

    text = pl.col('description')

    # Length statistics are taken over the descriptions that are present.
    description_stats = df.select([
        text.str.len_chars().mean().alias('avg_description_length'),
        text.str.len_chars().max().alias('max_description_length'),
        text.str.len_chars().min().alias('min_description_length'),
    ])

    print("Description length statistics (characters):")
    print(description_stats)

    # A null description would otherwise give a null flag and a null count,
    # so those videos would fall into a separate null group instead of being
    # counted as "no hashtags". Polars' CSV reader loads empty fields as
    # null by default, so this is the common case for blank descriptions.
    text_or_empty = text.fill_null('')
    df = df.with_columns([
        text_or_empty.str.contains('#').alias('has_hashtags'),
        text_or_empty.str.count_matches('#').alias('hashtag_count'),
    ])

    hashtag_analysis = df.group_by('has_hashtags').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])

    print("\nHashtag usage analysis:")
    print(hashtag_analysis)

    # Restricted to videos that actually use hashtags.
    hashtag_count_analysis = df.filter(pl.col('hashtag_count') > 0).select([
        pl.col('hashtag_count').mean().alias('avg_hashtags_per_video'),
        pl.col('hashtag_count').max().alias('max_hashtags'),
        pl.corr('hashtag_count', 'digg_count').alias('hashtags_vs_likes_correlation'),
    ])

    print("\nHashtag count analysis:")
    print(hashtag_count_analysis)

    return df
|
|
|
|
|
def analyze_location_data(df):
    """Summarise video counts and engagement per creation location.

    Args:
        df: DataFrame that may contain a ``location_created`` column.

    Returns:
        Per-location aggregates sorted by ``video_count`` in descending
        order, or ``None`` when the frame has no ``location_created`` column.
    """
    print("\nπ Location Analysis")

    # Nothing to do without the column; report it and bail out early.
    if 'location_created' not in df.columns:
        print("No location data available")
        return None

    located = df.filter(pl.col('location_created').is_not_null())
    per_location = located.group_by('location_created').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])
    location_stats = per_location.sort('video_count', descending=True)

    print("Location-based statistics:")
    print(location_stats.head(10))

    return location_stats
|
|
|
|
|
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or None when it is undefined."""
    if numerator is None or denominator is None or denominator == 0:
        return None
    return numerator / denominator


def create_summary_report(df, correlation):
    """Create a comprehensive summary report.

    Prints headline averages, peak values, overall engagement rates,
    creator statistics and a list of key insights. Insights that depend on
    a column produced by another analysis step (``duration_category``,
    ``has_hashtags``) or on an optional column (``location_created``) are
    skipped when that column is absent, and any ratio whose inputs are
    missing or zero is omitted instead of raising.

    Args:
        df: DataFrame with ``play_count``, ``digg_count``,
            ``comment_count``, ``share_count``, ``duration`` and
            ``author_unique_id`` columns.
        correlation: Frame with a ``likes_vs_views`` column; its first
            value is reported.

    Returns:
        None. The report is written to standard output.
    """
    print("\nπ SUMMARY REPORT")
    print("=" * 60)

    total_videos = df.height
    if total_videos == 0:
        # Every mean/max below would be None and could not be formatted.
        print(f"Total Videos Analyzed: {total_videos:,}")
        return

    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()
    avg_duration = df['duration'].mean()

    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")
    print(f"Average Video Duration: {avg_duration:.1f} seconds")

    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    max_comments = df['comment_count'].max()

    print("\nπ― Peak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")
    print(f"Maximum Comments: {max_comments:,}")

    total_views = df['play_count'].sum()
    total_likes = df['digg_count'].sum()
    total_comments = df['comment_count'].sum()
    total_shares = df['share_count'].sum()

    print("\nπ Overall Engagement Rates:")
    # Rates are undefined without any views; skip them rather than divide
    # by zero.
    if total_views:
        like_rate = (total_likes / total_views) * 100
        comment_rate = (total_comments / total_views) * 100
        share_rate = (total_shares / total_views) * 100
        print(f"Like Rate: {like_rate:.2f}%")
        print(f"Comment Rate: {comment_rate:.4f}%")
        print(f"Share Rate: {share_rate:.4f}%")

    unique_authors = df['author_unique_id'].n_unique()
    print("\nπ₯ Creator Statistics:")
    print(f"Unique Authors: {unique_authors}")

    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
    avg_videos_per_author = videos_per_author['count'].mean()
    print(f"Average Videos per Author: {avg_videos_per_author:.1f}")

    has_duration_category = 'duration_category' in df.columns
    if has_duration_category:
        duration_categories = (
            df.group_by('duration_category')
            .agg(pl.len().alias('count'))
            .sort('count', descending=True)
        )
        most_common_duration = duration_categories[0, 'duration_category']
        print(f"Most Common Video Length: {most_common_duration}")

    likes_vs_views_corr = correlation['likes_vs_views'][0]

    # NOTE(review): the insight wording below ("higher", "Strong",
    # "better") is fixed text; it is printed whatever the computed value
    # turns out to be.
    print("\nπ KEY INSIGHTS:")

    if has_duration_category:
        short_videos_avg_likes = df.filter(
            pl.col('duration_category') == 'Very Short (β€15s)'
        )['digg_count'].mean()
        performance_multiplier = _safe_ratio(short_videos_avg_likes, avg_likes)
        if performance_multiplier is not None:
            print(f"β’ Very short videos (β€15s) have {performance_multiplier:.1f}x higher average likes")

    if likes_vs_views_corr is not None:
        print(f"β’ Strong correlation between views and likes: {likes_vs_views_corr:.3f}")

    # Take the three authors with the most likes from the data itself, so
    # the "Top 3" figure is true for whatever dataset is loaded. Missing
    # ids and the literal 'null' placeholder are not counted as creators.
    author_id = pl.col('author_unique_id')
    likes_by_author = (
        df.filter(author_id.is_not_null() & (author_id != 'null'))
        .group_by('author_unique_id')
        .agg(pl.col('digg_count').sum().alias('total_likes'))
        .sort('total_likes', descending=True)
    )
    top_creator_likes = likes_by_author.head(3)['total_likes'].sum()
    top_creator_share = _safe_ratio(top_creator_likes, total_likes)
    if top_creator_share is not None:
        top_creator_percentage = top_creator_share * 100
        print(f"β’ Top 3 creators account for {top_creator_percentage:.1f}% of all likes")

    if 'has_hashtags' in df.columns:
        tagged_avg = df.filter(pl.col('has_hashtags'))['digg_count'].mean()
        untagged_avg = df.filter(~pl.col('has_hashtags'))['digg_count'].mean()
        hashtag_multiplier = _safe_ratio(tagged_avg, untagged_avg)
        if hashtag_multiplier is not None:
            print(f"β’ Videos with hashtags have {hashtag_multiplier:.1f}x higher engagement")

    if 'location_created' in df.columns:
        us_avg = df.filter(pl.col('location_created') == 'US')['digg_count'].mean()
        other_avg = df.filter(pl.col('location_created') != 'US')['digg_count'].mean()
        location_multiplier = _safe_ratio(us_avg, other_avg)
        if location_multiplier is not None:
            print(f"β’ US-based videos perform {location_multiplier:.1f}x better than international videos")
|
|
|
|
|
def save_analysis_results(df, engagement_stats, duration_engagement, author_stats, engagement_rates, location_stats=None):
    """Write each analysis result to its own CSV file.

    Files are written one after another using relative file names, and a
    confirmation line is printed after each write.

    Args:
        df: Frame written to ``tiktok_cleaned.csv``.
        engagement_stats: Frame written to ``engagement_statistics.csv``.
        duration_engagement: Frame written to ``duration_analysis.csv``.
        author_stats: Frame written to ``author_analysis.csv``.
        engagement_rates: Frame written to ``engagement_rates.csv``.
        location_stats: Optional frame written to ``location_analysis.csv``;
            skipped when ``None``.
    """
    print("\nπΎ Saving analysis results...")

    # (frame, destination file, confirmation message), in write order.
    exports = [
        (df, 'tiktok_cleaned.csv',
         "β Cleaned dataset β 'tiktok_cleaned.csv'"),
        (engagement_stats, 'engagement_statistics.csv',
         "β Engagement statistics β 'engagement_statistics.csv'"),
        (duration_engagement, 'duration_analysis.csv',
         "β Duration analysis β 'duration_analysis.csv'"),
        (author_stats, 'author_analysis.csv',
         "β Author analysis β 'author_analysis.csv'"),
        (engagement_rates, 'engagement_rates.csv',
         "β Engagement rates β 'engagement_rates.csv'"),
    ]
    if location_stats is not None:
        exports.append(
            (location_stats, 'location_analysis.csv',
             "β Location analysis β 'location_analysis.csv'")
        )

    for frame, filename, confirmation in exports:
        frame.write_csv(filename)
        print(confirmation)
|
|
|
|
|
def main():
    """Main function to run the TikTok dataset analysis.

    Checks that ``train.csv`` exists, then runs every step in order: load,
    clean, the individual analyses, the summary report, and the CSV export.
    Any exception raised along the way is reported with its traceback
    instead of propagating.
    """
    try:
        if not Path('train.csv').exists():
            print("β Error: train.csv not found in current directory")
            return

        print("π Starting TikTok Dataset Analysis")
        print("=" * 50)

        df = load_and_explore_data()
        df = clean_data(df)

        # Results that are only printed by their producer are bound to
        # underscore-prefixed names.
        engagement_stats, _top_liked, correlation = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, _temporal_stats = analyze_temporal_patterns(df)

        # The first value returned here is the frame with per-video rate
        # columns added; the second is the one-row frame of average rates,
        # and that is what ends up in engagement_rates.csv.
        df, engagement_rates = calculate_engagement_rates(df)

        df = analyze_video_descriptions(df)
        location_stats = analyze_location_data(df)

        create_summary_report(df, correlation)

        save_analysis_results(df, engagement_stats, duration_engagement, author_stats, engagement_rates, location_stats)

        # This literal had been split across two source lines, which is a
        # syntax error; it is rejoined here.
        print("\nβ Analysis completed successfully!")

        # NOTE(review): the findings below are static text, not values
        # computed during this run; revisit them if the dataset changes.
        print("\nπ KEY FINDINGS SUMMARY:")
        print("β’ Very short videos (β€15s) perform best")
        print("β’ Strong positive correlation between views and likes")
        print("β’ zachking, mrbeast, and addisonre dominate engagement")
        print("β’ Average engagement: ~7.2% like rate")
        print("β’ Videos with hashtags perform better")
        print("β’ US-based content outperforms international content")

    except Exception as e:
        # Top-level boundary: report the failure with full context.
        print(f"β Error during analysis: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()