|
|
import polars as pl |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
|
|
|
def load_and_explore_data(csv_path='train.csv'):
    """Load the TikTok dataset and perform initial exploration.

    Prints the dataset's shape, first rows, schema, and a numbered list of
    column names.

    Args:
        csv_path: Path of the CSV file to load. Defaults to 'train.csv' so
            existing callers keep working; parameterized so other dataset
            files can be analyzed with the same code.

    Returns:
        pl.DataFrame: The loaded dataset.
    """
    print("📊 Loading TikTok dataset...")

    df = pl.read_csv(csv_path)

    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    print("\nDataset schema:")
    print(df.schema)

    print("\nColumn names:")
    # enumerate from 1 so the printed list is 1-based, as before.
    for i, col in enumerate(df.columns, start=1):
        print(f"{i}. {col}")

    return df
|
|
|
|
|
def clean_data(df):
    """Clean and preprocess the data.

    Reports null counts, drops exact duplicate rows, and fills nulls in the
    known numeric metric columns with 0.

    Args:
        df: Raw dataset as a polars DataFrame.

    Returns:
        pl.DataFrame: The cleaned DataFrame.
    """
    print("\n🧹 Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    # Drop exact duplicate rows and report how many were removed.
    initial_count = df.height
    df = df.unique()
    final_count = df.height
    print(f"Removed {initial_count - final_count} duplicate rows")

    numeric_columns = ['digg_count', 'play_count', 'share_count', 'repost_count',
                       'collect_count', 'comment_count', 'duration']

    # Fill all numeric nulls in a single with_columns pass instead of one
    # call per column — each with_columns call materializes a new frame.
    fill_exprs = [pl.col(c).fill_null(0) for c in numeric_columns if c in df.columns]
    if fill_exprs:
        df = df.with_columns(fill_exprs)

    return df
|
|
|
|
|
def analyze_engagement(df):
    """Analyze engagement metrics.

    Prints dataset-wide average engagement statistics, the ten most-liked
    videos, and the correlation of likes against views/comments/shares.

    Args:
        df: Cleaned dataset as a polars DataFrame.

    Returns:
        tuple[pl.DataFrame, pl.DataFrame]: (engagement_stats, top_liked).
    """
    print("\n📈 Engagement Analysis")

    engagement_stats = df.select([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('repost_count').mean().alias('avg_reposts'),
        pl.col('collect_count').mean().alias('avg_collects')
    ])
    print("Average engagement metrics:")
    print(engagement_stats)

    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    # Only select columns that actually exist: 'url' and 'author_unique_id'
    # are optional in this dataset (other functions in this file guard for
    # them), and a hard-coded select would raise ColumnNotFoundError.
    display_cols = [c for c in ('url', 'digg_count', 'play_count', 'author_unique_id')
                    if c in df.columns]
    print(top_liked.select(display_cols))

    correlation = df.select([
        pl.corr('digg_count', 'play_count').alias('likes_vs_views'),
        pl.corr('digg_count', 'comment_count').alias('likes_vs_comments'),
        pl.corr('digg_count', 'share_count').alias('likes_vs_shares')
    ])
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked
|
|
|
|
|
def analyze_video_duration(df):
    """Analyze video duration patterns.

    Prints min/max/mean/median duration, buckets videos into length bands,
    and prints mean engagement per band.

    Args:
        df: Cleaned dataset as a polars DataFrame.

    Returns:
        tuple: (df with a 'duration_category' column, per-category
        engagement DataFrame), or (df, None) if 'duration' is missing.
    """
    print("\n⏱️ Video Duration Analysis")

    # Bail out early when the dataset has no duration information.
    if 'duration' not in df.columns:
        print("No 'duration' column found in dataset")
        return df, None

    duration = pl.col('duration')
    stats = df.select([
        duration.min().alias('min_duration'),
        duration.max().alias('max_duration'),
        duration.mean().alias('avg_duration'),
        duration.median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(stats)

    # Bucket each video into a human-readable length band.
    category = (
        pl.when(duration <= 15).then(pl.lit('Very Short (≤15s)'))
        .when(duration <= 30).then(pl.lit('Short (16-30s)'))
        .when(duration <= 60).then(pl.lit('Medium (31-60s)'))
        .otherwise(pl.lit('Long (>60s)'))
    )
    df = df.with_columns(category.alias('duration_category'))

    by_category = (
        df.group_by('duration_category')
        .agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.col('comment_count').mean().alias('avg_comments'),
            pl.col('share_count').mean().alias('avg_shares'),
            pl.count().alias('video_count'),
        ])
        .sort('avg_likes', descending=True)
    )

    print("\nEngagement by duration category:")
    print(by_category)

    return df, by_category
|
|
|
|
|
def analyze_authors(df):
    """Analyze author performance.

    Aggregates per-author video counts and like/view statistics, then
    prints the ten authors with the highest total likes.

    Args:
        df: Cleaned dataset as a polars DataFrame.

    Returns:
        pl.DataFrame | None: Per-author statistics sorted by total likes,
        or None when the dataset has no 'author_unique_id' column.
    """
    print("\n👤 Author Analysis")

    # Nothing to aggregate without an author identifier.
    if 'author_unique_id' not in df.columns:
        print("No 'author_unique_id' column found")
        return None

    author_stats = (
        df.group_by('author_unique_id')
        .agg([
            pl.count().alias('video_count'),
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.col('digg_count').sum().alias('total_likes'),
            pl.col('play_count').sum().alias('total_views'),
        ])
        .sort('total_likes', descending=True)
    )

    print("Top 10 authors by total likes:")
    print(author_stats.head(10))

    return author_stats
|
|
|
|
|
def analyze_temporal_patterns(df):
    """Analyze temporal patterns in video creation.

    Derives a datetime column from the raw 'create_time' epoch value, adds
    year/month/hour columns, and prints video counts and average
    likes/views per (year, month).

    Args:
        df: Cleaned dataset as a polars DataFrame.

    Returns:
        tuple: (df with timestamp/created_at/year/month/hour columns,
        per-month stats DataFrame), or (df, None) if 'create_time' is
        missing.
    """
    print("\n📅 Temporal Analysis")

    if 'create_time' in df.columns:
        # BUG FIX: the previous conversion, (create_time / 1000).cast(pl.Datetime),
        # interpreted the result as *microseconds* since epoch (pl.Datetime's
        # default unit), which pins every date near 1970 whether the source is
        # in seconds or milliseconds.  from_epoch() converts correctly.
        # Assumes 'create_time' is a Unix timestamp in seconds — TODO confirm
        # against the dataset.
        df = df.with_columns([
            pl.col('create_time').cast(pl.Int64).alias('timestamp'),
            pl.from_epoch(pl.col('create_time').cast(pl.Int64),
                          time_unit='s').alias('created_at')
        ])

        # Break the datetime into the components used for grouping.
        df = df.with_columns([
            pl.col('created_at').dt.year().alias('year'),
            pl.col('created_at').dt.month().alias('month'),
            pl.col('created_at').dt.hour().alias('hour')
        ])

        temporal_stats = df.group_by(['year', 'month']).agg([
            pl.count().alias('video_count'),
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views')
        ]).sort(['year', 'month'])

        print("Temporal distribution:")
        print(temporal_stats)

        return df, temporal_stats
    else:
        print("No 'create_time' column found")
        return df, None
|
|
|
|
|
def calculate_engagement_rates(df):
    """Calculate various engagement rates.

    Computes per-video like/comment/share rates (metric divided by
    play_count) and prints the dataset-wide mean of each.

    Args:
        df: Cleaned dataset as a polars DataFrame.

    Returns:
        pl.DataFrame: One row with avg_like_rate, avg_comment_rate and
        avg_share_rate columns.
    """
    print("\n📊 Engagement Rate Calculations")

    # play_count can be 0 (clean_data fills nulls with 0); dividing by it
    # injects inf/NaN into the rate columns and corrupts the means.
    # Substitute null for zero views so those rows are skipped by mean().
    views = pl.when(pl.col('play_count') > 0).then(pl.col('play_count'))

    engagement_rates = df.with_columns([
        (pl.col('digg_count') / views).alias('like_rate'),
        (pl.col('comment_count') / views).alias('comment_rate'),
        (pl.col('share_count') / views).alias('share_rate')
    ]).select([
        pl.col('like_rate').mean().alias('avg_like_rate'),
        pl.col('comment_rate').mean().alias('avg_comment_rate'),
        pl.col('share_rate').mean().alias('avg_share_rate')
    ])

    print("Average engagement rates:")
    print(engagement_rates)

    return engagement_rates
|
|
|
|
|
def create_summary_report(df):
    """Print a comprehensive summary report of the dataset.

    Covers totals, per-video averages, peak views/likes, overall
    like/comment rates, and (when available) author counts.

    Args:
        df: Cleaned dataset as a polars DataFrame.
    """
    print("\n📋 SUMMARY REPORT")
    print("=" * 50)

    # Pull the metric series once and reuse them for every statistic.
    views = df.get_column('play_count')
    likes = df.get_column('digg_count')
    comments = df.get_column('comment_count')
    shares = df.get_column('share_count')

    print(f"Total Videos Analyzed: {df.height:,}")
    print(f"Average Views per Video: {views.mean():,.0f}")
    print(f"Average Likes (Diggs) per Video: {likes.mean():,.0f}")
    print(f"Average Comments per Video: {comments.mean():,.0f}")
    print(f"Average Shares per Video: {shares.mean():,.0f}")

    print(f"\nPeak Performance:")
    print(f"Maximum Views: {views.max():,}")
    print(f"Maximum Likes: {likes.max():,}")

    print(f"\nOverall Engagement Rates:")
    print(f"Like Rate: {(likes.sum() / views.sum()) * 100:.2f}%")
    print(f"Comment Rate: {(comments.sum() / views.sum()) * 100:.2f}%")

    if 'author_unique_id' in df.columns:
        print(f"\nUnique Authors: {df.get_column('author_unique_id').n_unique()}")

        per_author = df.group_by('author_unique_id').agg(pl.count().alias('count'))
        print(f"Average Videos per Author: {per_author.get_column('count').mean():.1f}")
|
|
|
|
|
def save_analysis_results(df, engagement_stats, duration_engagement, author_stats):
    """Save analysis results to CSV files in the working directory.

    Args:
        df: Cleaned dataset to persist.
        engagement_stats: Engagement statistics DataFrame.
        duration_engagement: Per-duration-category DataFrame, or None.
        author_stats: Per-author DataFrame, or None.
    """
    print("\n💾 Saving analysis results...")

    df.write_csv('tiktok_cleaned.csv')
    print("Saved cleaned dataset to 'tiktok_cleaned.csv'")

    engagement_stats.write_csv('engagement_statistics.csv')
    print("Saved engagement statistics to 'engagement_statistics.csv'")

    # These two results are optional — their producers return None when the
    # corresponding column was missing from the dataset.
    optional_outputs = (
        (duration_engagement, 'duration_analysis.csv', 'duration analysis'),
        (author_stats, 'author_analysis.csv', 'author analysis'),
    )
    for frame, filename, label in optional_outputs:
        if frame is not None:
            frame.write_csv(filename)
            print(f"Saved {label} to '{filename}'")
|
|
|
|
|
def main():
    """Run the full TikTok dataset analysis pipeline end to end.

    Loads train.csv, cleans it, runs each analysis step in order, prints a
    summary report, and writes the result CSVs. Any exception is caught,
    reported, and traced rather than propagated.
    """
    try:
        # Refuse to start without the input file present.
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            print("Please make sure the dataset is downloaded and in the correct location")
            return

        df = load_and_explore_data()
        df = clean_data(df)

        # Each analysis step; some return an augmented df that feeds the next.
        engagement_stats, top_liked = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, temporal_stats = analyze_temporal_patterns(df)
        engagement_rates = calculate_engagement_rates(df)

        create_summary_report(df)
        save_analysis_results(df, engagement_stats, duration_engagement, author_stats)

        print("\n✅ Analysis completed successfully!")
        print("\nGenerated files:")
        for description in (
            "- tiktok_cleaned.csv: Cleaned dataset",
            "- engagement_statistics.csv: Engagement metrics",
            "- duration_analysis.csv: Duration-based analysis",
            "- author_analysis.csv: Author performance analysis",
        ):
            print(description)

    except Exception as exc:
        # Top-level boundary: report and trace rather than crash the script.
        print(f"❌ Error during analysis: {exc}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
# Entry point: run the analysis only when executed as a script, not on import.
if __name__ == "__main__":


    main()