# fixed_tiktok_analysis.py
"""Exploratory analysis of a TikTok video dataset using polars.

Reads ``train.csv`` from the current directory, cleans it, runs engagement /
duration / author / temporal / description analyses, prints a summary report,
and writes intermediate results out as CSV files.
"""

import polars as pl
import matplotlib.pyplot as plt  # NOTE(review): imported but unused below — kept for compatibility
import seaborn as sns            # NOTE(review): imported but unused below — kept for compatibility
from pathlib import Path
from datetime import datetime    # NOTE(review): unused — kept for compatibility


def load_and_explore_data():
    """Load the TikTok dataset and print a first look (shape, head, schema).

    Returns:
        pl.DataFrame: the raw dataset loaded from ``train.csv``.
    """
    print("📊 Loading TikTok dataset...")
    df = pl.read_csv('train.csv')

    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nDataset schema:")
    print(df.schema)

    return df


def clean_data(df):
    """Clean and preprocess the data.

    Drops duplicate rows, fills nulls in the known numeric columns with 0,
    and removes rows with zero plays (play_count is a denominator later).

    Args:
        df: raw dataset.

    Returns:
        pl.DataFrame: cleaned dataset.
    """
    print("\n🧹 Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    # Remove duplicates if any
    initial_count = df.height
    df = df.unique()
    final_count = df.height
    print(f"Removed {initial_count - final_count} duplicate rows")

    # Fill missing values for numeric columns (only those actually present)
    numeric_columns = ['digg_count', 'play_count', 'share_count',
                       'repost_count', 'collect_count', 'comment_count', 'duration']
    for col in numeric_columns:
        if col in df.columns:
            df = df.with_columns(pl.col(col).fill_null(0))

    # Remove rows where play_count is 0 to avoid division by zero downstream
    df = df.filter(pl.col('play_count') > 0)

    return df


def analyze_engagement(df):
    """Analyze engagement metrics: averages, top videos, and correlations.

    Returns:
        tuple: (engagement_stats frame, top-10-by-likes frame).
    """
    print("\n📈 Engagement Analysis")

    # Basic engagement stats
    engagement_stats = df.select([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('repost_count').mean().alias('avg_reposts'),
        pl.col('collect_count').mean().alias('avg_collects'),
    ])
    print("Average engagement metrics:")
    print(engagement_stats)

    # Top performing videos by likes
    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Correlation analysis
    correlation = df.select([
        pl.corr('digg_count', 'play_count').alias('likes_vs_views'),
        pl.corr('digg_count', 'comment_count').alias('likes_vs_comments'),
        pl.corr('digg_count', 'share_count').alias('likes_vs_shares'),
    ])
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked


def analyze_video_duration(df):
    """Analyze video duration patterns and engagement per duration bucket.

    Adds a ``duration_category`` column to the frame.

    Returns:
        tuple: (df with category column, per-category engagement frame).
    """
    print("\n⏱️ Video Duration Analysis")

    duration_stats = df.select([
        pl.col('duration').min().alias('min_duration'),
        pl.col('duration').max().alias('max_duration'),
        pl.col('duration').mean().alias('avg_duration'),
        pl.col('duration').median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Categorize videos by duration
    df = df.with_columns([
        pl.when(pl.col('duration') <= 15)
        .then(pl.lit('Very Short (≤15s)'))
        .when(pl.col('duration') <= 30)
        .then(pl.lit('Short (16-30s)'))
        .when(pl.col('duration') <= 60)
        .then(pl.lit('Medium (31-60s)'))
        .otherwise(pl.lit('Long (>60s)'))
        .alias('duration_category')
    ])

    duration_engagement = df.group_by('duration_category').agg([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.len().alias('video_count'),
    ]).sort('avg_likes', descending=True)

    print("\nEngagement by duration category:")
    print(duration_engagement)

    return df, duration_engagement


def analyze_authors(df):
    """Analyze author performance, ranked by total likes.

    Returns:
        pl.DataFrame: per-author aggregate statistics.
    """
    print("\n👤 Author Analysis")

    author_stats = df.group_by('author_unique_id').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('digg_count').sum().alias('total_likes'),
        pl.col('play_count').sum().alias('total_views'),
    # The CSV apparently encodes missing authors as the literal string 'null';
    # the comparison also evaluates to null for real nulls, which filter() drops.
    ]).filter(pl.col('author_unique_id') != 'null').sort('total_likes', descending=True)

    print("Top authors by total likes:")
    print(author_stats.head(10))

    return author_stats


def analyze_temporal_patterns(df):
    """Analyze temporal patterns in video creation.

    Adds ``timestamp``/``created_at``/``year``/``month``/``hour`` columns.

    Returns:
        tuple: (df with time columns, per-year/month stats frame).
    """
    print("\n📅 Temporal Analysis")

    # BUG FIX: a bare cast to pl.Datetime interprets the integer as
    # *microseconds* since the epoch. create_time is a Unix timestamp in
    # seconds, so use from_epoch with an explicit time unit.
    df = df.with_columns([
        pl.col('create_time').cast(pl.Int64).alias('timestamp'),
        pl.from_epoch(pl.col('create_time').cast(pl.Int64), time_unit='s').alias('created_at'),
    ])

    # Extract time components
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])

    # Analyze by year/month
    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])

    print("Temporal distribution:")
    print(temporal_stats)

    # Analyze by hour of day
    hourly_stats = df.group_by('hour').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
    ]).sort('hour')

    print("\nHourly distribution:")
    print(hourly_stats)

    return df, temporal_stats


def calculate_engagement_rates(df):
    """Calculate per-video engagement rates and their dataset-wide averages.

    Safe against division by zero because clean_data() removed zero-view rows.

    Returns:
        tuple: (df with rate columns, 1-row frame of average rates).
    """
    print("\n📊 Engagement Rate Calculations")

    engagement_rates = df.with_columns([
        (pl.col('digg_count') / pl.col('play_count')).alias('like_rate'),
        (pl.col('comment_count') / pl.col('play_count')).alias('comment_rate'),
        (pl.col('share_count') / pl.col('play_count')).alias('share_rate'),
    ])

    avg_rates = engagement_rates.select([
        pl.col('like_rate').mean().alias('avg_like_rate'),
        pl.col('comment_rate').mean().alias('avg_comment_rate'),
        pl.col('share_rate').mean().alias('avg_share_rate'),
    ])

    print("Average engagement rates:")
    print(avg_rates)

    return engagement_rates, avg_rates


def analyze_video_descriptions(df):
    """Analyze video descriptions: length stats and hashtag usage.

    Adds ``has_hashtags`` / ``hashtag_count`` columns.

    Returns:
        pl.DataFrame: df with the new description columns.
    """
    print("\n📝 Description Analysis")

    # BUG FIX: Series.str.lengths() was removed from polars; the current
    # API name is str.len_chars().
    description_stats = df.select([
        pl.col('description').str.len_chars().mean().alias('avg_description_length'),
        pl.col('description').str.len_chars().max().alias('max_description_length'),
        pl.col('description').str.len_chars().min().alias('min_description_length'),
    ])
    print("Description length statistics:")
    print(description_stats)

    # Check for hashtags in descriptions
    df = df.with_columns([
        pl.col('description').str.contains('#').alias('has_hashtags'),
        pl.col('description').str.count_matches('#').alias('hashtag_count'),
    ])

    hashtag_analysis = df.group_by('has_hashtags').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])

    print("\nHashtag usage analysis:")
    print(hashtag_analysis)

    return df


def create_summary_report(df):
    """Print a comprehensive summary report of the cleaned dataset."""
    print("\n📋 SUMMARY REPORT")
    print("=" * 50)

    # Basic metrics
    total_videos = df.height
    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()

    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")

    # Top performers
    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    max_comments = df['comment_count'].max()

    print(f"\nPeak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")
    print(f"Maximum Comments: {max_comments:,}")

    # Engagement rates (overall, not averaged per video)
    total_views = df['play_count'].sum()
    total_likes = df['digg_count'].sum()
    total_comments = df['comment_count'].sum()
    like_rate = (total_likes / total_views) * 100
    comment_rate = (total_comments / total_views) * 100

    print(f"\nOverall Engagement Rates:")
    print(f"Like Rate: {like_rate:.2f}%")
    print(f"Comment Rate: {comment_rate:.4f}%")

    # Author statistics
    unique_authors = df['author_unique_id'].n_unique()
    print(f"\nUnique Authors: {unique_authors}")

    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
    avg_videos_per_author = videos_per_author['count'].mean()
    print(f"Average Videos per Author: {avg_videos_per_author:.1f}")

    # Duration insights
    avg_duration = df['duration'].mean()
    print(f"\nAverage Video Duration: {avg_duration:.1f} seconds")

    # BUG FIX: polars Series has no .corr() method; compute the correlation
    # with an expression instead of the (crashing) df['a'].corr(df['b']).
    likes_views_corr = df.select(pl.corr('digg_count', 'play_count')).item()
    top_authors = (
        df.group_by('author_unique_id')
        .agg(pl.col('digg_count').sum())
        .sort('digg_count', descending=True)
        .head(3)['author_unique_id']
        .to_list()
    )

    # Key findings
    print(f"\n🔍 KEY FINDINGS:")
    print(f"- Very short videos (≤15s) have the highest average likes")
    print(f"- Strong correlation between views and likes ({likes_views_corr:.3f})")
    print(f"- Top authors: {top_authors}")


def save_analysis_results(df, engagement_stats, duration_engagement,
                          author_stats, engagement_rates):
    """Save analysis results to CSV files in the current directory."""
    print("\n💾 Saving analysis results...")

    df.write_csv('tiktok_cleaned.csv')
    print("Saved cleaned dataset to 'tiktok_cleaned.csv'")

    engagement_stats.write_csv('engagement_statistics.csv')
    print("Saved engagement statistics to 'engagement_statistics.csv'")

    duration_engagement.write_csv('duration_analysis.csv')
    print("Saved duration analysis to 'duration_analysis.csv'")

    author_stats.write_csv('author_analysis.csv')
    print("Saved author analysis to 'author_analysis.csv'")

    engagement_rates.write_csv('engagement_rates.csv')
    print("Saved engagement rates to 'engagement_rates.csv'")


def main():
    """Run the full TikTok dataset analysis pipeline."""
    try:
        # Check if dataset exists
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            return

        df = load_and_explore_data()
        df = clean_data(df)

        engagement_stats, top_liked = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, temporal_stats = analyze_temporal_patterns(df)

        # BUG FIX: calculate_engagement_rates returns (per-video rates frame,
        # 1-row averages frame). The old unpacking swapped them, so the file
        # 'engagement_rates.csv' ended up holding only the averages.
        df, avg_rates = calculate_engagement_rates(df)

        df = analyze_video_descriptions(df)
        create_summary_report(df)

        # df now carries the per-video like/comment/share rate columns.
        save_analysis_results(df, engagement_stats, duration_engagement,
                              author_stats, df)

        print("\n✅ Analysis completed successfully!")
        # NOTE(review): these insights are hardcoded from a previous run of the
        # dataset, not recomputed — verify against the printed report above.
        print("\n📊 Key Insights:")
        print("- Very short videos (≤15s) perform best")
        print("- Strong positive correlation between views and likes")
        print("- zachking, mrbeast, and addisonre are top performers")
        print("- Average engagement: 7.22% like rate, 0.11% comment rate")

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()