import polars as pl
import matplotlib.pyplot as plt  # NOTE(review): imported but never used in this script
import seaborn as sns  # NOTE(review): imported but never used in this script
from pathlib import Path
from datetime import datetime  # NOTE(review): imported but never used in this script


def load_and_explore_data():
    """Load the TikTok dataset from 'train.csv' and print an initial overview.

    Returns:
        pl.DataFrame: the raw dataset as read from disk.
    """
    print("📊 Loading TikTok dataset...")

    df = pl.read_csv('train.csv')

    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nDataset schema:")
    print(df.schema)
    print("\nColumn names:")
    for i, col in enumerate(df.columns):
        print(f"{i+1}. {col}")

    return df


def clean_data(df):
    """Deduplicate rows and zero-fill nulls in the known numeric metric columns.

    Args:
        df: raw dataset.

    Returns:
        pl.DataFrame: cleaned dataset.
    """
    print("\n🧹 Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    initial_count = df.height
    df = df.unique()
    final_count = df.height
    print(f"Removed {initial_count - final_count} duplicate rows")

    # Zero-fill nulls only for metric columns actually present; a single
    # with_columns call replaces the previous one-call-per-column loop.
    numeric_columns = ['digg_count', 'play_count', 'share_count',
                       'repost_count', 'collect_count', 'comment_count', 'duration']
    present = [c for c in numeric_columns if c in df.columns]
    if present:
        df = df.with_columns([pl.col(c).fill_null(0) for c in present])

    return df


def analyze_engagement(df):
    """Print average engagement metrics, top-liked videos, and correlations.

    Args:
        df: cleaned dataset with the engagement count columns.

    Returns:
        tuple: (engagement_stats DataFrame, top_liked DataFrame).
    """
    print("\n📈 Engagement Analysis")

    # Basic engagement stats - using actual column names
    engagement_stats = df.select([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('repost_count').mean().alias('avg_reposts'),
        pl.col('collect_count').mean().alias('avg_collects'),
    ])
    print("Average engagement metrics:")
    print(engagement_stats)

    # Top performing videos by likes (digg_count)
    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Correlation analysis between likes and the other engagement signals
    correlation = df.select([
        pl.corr('digg_count', 'play_count').alias('likes_vs_views'),
        pl.corr('digg_count', 'comment_count').alias('likes_vs_comments'),
        pl.corr('digg_count', 'share_count').alias('likes_vs_shares'),
    ])
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked


def analyze_video_duration(df):
    """Summarize duration stats and engagement per duration bucket.

    Args:
        df: cleaned dataset; must contain a 'duration' column (seconds)
            for the analysis to run.

    Returns:
        tuple: (df with added 'duration_category' column, duration_engagement
        DataFrame) — or (df unchanged, None) when 'duration' is absent.
    """
    print("\n⏱️ Video Duration Analysis")

    if 'duration' not in df.columns:
        print("No 'duration' column found in dataset")
        return df, None

    duration_stats = df.select([
        pl.col('duration').min().alias('min_duration'),
        pl.col('duration').max().alias('max_duration'),
        pl.col('duration').mean().alias('avg_duration'),
        pl.col('duration').median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Bucket videos by length; buckets follow common short-video cutoffs.
    df = df.with_columns([
        pl.when(pl.col('duration') <= 15)
          .then(pl.lit('Very Short (≤15s)'))
          .when(pl.col('duration') <= 30)
          .then(pl.lit('Short (16-30s)'))
          .when(pl.col('duration') <= 60)
          .then(pl.lit('Medium (31-60s)'))
          .otherwise(pl.lit('Long (>60s)'))
          .alias('duration_category')
    ])

    duration_engagement = df.group_by('duration_category').agg([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.len().alias('video_count'),  # pl.count() is deprecated in modern polars
    ]).sort('avg_likes', descending=True)

    print("\nEngagement by duration category:")
    print(duration_engagement)

    return df, duration_engagement


def analyze_authors(df):
    """Aggregate per-author video counts and engagement totals.

    Args:
        df: cleaned dataset; needs an 'author_unique_id' column.

    Returns:
        pl.DataFrame | None: per-author stats sorted by total likes, or None
        when the author column is missing.
    """
    print("\n👤 Author Analysis")

    if 'author_unique_id' not in df.columns:
        print("No 'author_unique_id' column found")
        return None

    author_stats = df.group_by('author_unique_id').agg([
        pl.len().alias('video_count'),  # pl.count() is deprecated in modern polars
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('digg_count').sum().alias('total_likes'),
        pl.col('play_count').sum().alias('total_views'),
    ]).sort('total_likes', descending=True)

    print("Top 10 authors by total likes:")
    print(author_stats.head(10))
    return author_stats


def analyze_temporal_patterns(df):
    """Derive datetime columns from 'create_time' and summarize by year/month.

    Args:
        df: cleaned dataset; needs a 'create_time' Unix-timestamp column.

    Returns:
        tuple: (df with timestamp/created_at/year/month/hour columns added,
        temporal_stats DataFrame) — or (df unchanged, None) when
        'create_time' is absent.
    """
    print("\n📅 Temporal Analysis")

    if 'create_time' not in df.columns:
        print("No 'create_time' column found")
        return df, None

    # BUGFIX: the previous code did (create_time / 1000).cast(pl.Datetime),
    # which interprets the integer as MICROSECONDS since epoch and produced
    # wildly wrong dates. create_time is assumed to be Unix seconds
    # (TODO confirm against the data); from_epoch makes the unit explicit.
    df = df.with_columns([
        pl.col('create_time').cast(pl.Int64).alias('timestamp'),
        pl.from_epoch(pl.col('create_time').cast(pl.Int64), time_unit='s')
          .alias('created_at'),
    ])

    # Extract calendar components for grouping.
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])

    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),  # pl.count() is deprecated in modern polars
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])

    print("Temporal distribution:")
    print(temporal_stats)

    return df, temporal_stats


def calculate_engagement_rates(df):
    """Compute average like/comment/share rates relative to views.

    Videos with zero plays are excluded from each rate (null denominator)
    instead of contributing +inf to the mean.

    Args:
        df: cleaned dataset with play/digg/comment/share count columns.

    Returns:
        pl.DataFrame: one row with avg_like_rate / avg_comment_rate /
        avg_share_rate.
    """
    print("\n📊 Engagement Rate Calculations")

    # BUGFIX: clean_data fills nulls with 0, so play_count can be 0;
    # dividing by it yields inf/NaN that poisons .mean(). Null out zero
    # denominators — .mean() ignores nulls.
    plays = pl.when(pl.col('play_count') > 0).then(pl.col('play_count'))

    engagement_rates = df.with_columns([
        (pl.col('digg_count') / plays).alias('like_rate'),
        (pl.col('comment_count') / plays).alias('comment_rate'),
        (pl.col('share_count') / plays).alias('share_rate'),
    ]).select([
        pl.col('like_rate').mean().alias('avg_like_rate'),
        pl.col('comment_rate').mean().alias('avg_comment_rate'),
        pl.col('share_rate').mean().alias('avg_share_rate'),
    ])

    print("Average engagement rates:")
    print(engagement_rates)
    return engagement_rates


def create_summary_report(df):
    """Print a human-readable summary of the dataset's key metrics.

    Args:
        df: cleaned dataset (possibly enriched by earlier analysis steps).
    """
    print("\n📋 SUMMARY REPORT")
    print("=" * 50)

    # Basic metrics
    total_videos = df.height
    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()

    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")

    # Top performers
    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    print(f"\nPeak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")

    # Overall engagement rates — guard against a zero total view count
    # (e.g. an empty or fully-null dataset) before dividing.
    total_plays = df['play_count'].sum()
    if total_plays:
        like_rate = (df['digg_count'].sum() / total_plays) * 100
        comment_rate = (df['comment_count'].sum() / total_plays) * 100
        print(f"\nOverall Engagement Rates:")
        print(f"Like Rate: {like_rate:.2f}%")
        print(f"Comment Rate: {comment_rate:.2f}%")
    else:
        print("\nOverall Engagement Rates: unavailable (no recorded views)")

    # Author statistics
    if 'author_unique_id' in df.columns:
        unique_authors = df['author_unique_id'].n_unique()
        print(f"\nUnique Authors: {unique_authors}")

        videos_per_author = df.group_by('author_unique_id').agg(
            pl.len().alias('count')  # pl.count() is deprecated in modern polars
        )
        avg_videos_per_author = videos_per_author['count'].mean()
        print(f"Average Videos per Author: {avg_videos_per_author:.1f}")


def save_analysis_results(df, engagement_stats, duration_engagement, author_stats):
    """Write the cleaned dataset and analysis tables to CSV files.

    Args:
        df: cleaned (and enriched) dataset.
        engagement_stats: output of analyze_engagement.
        duration_engagement: output of analyze_video_duration (may be None).
        author_stats: output of analyze_authors (may be None).
    """
    print("\n💾 Saving analysis results...")

    df.write_csv('tiktok_cleaned.csv')
    print("Saved cleaned dataset to 'tiktok_cleaned.csv'")

    engagement_stats.write_csv('engagement_statistics.csv')
    print("Saved engagement statistics to 'engagement_statistics.csv'")

    # Optional outputs: only written when the corresponding analysis ran.
    if duration_engagement is not None:
        duration_engagement.write_csv('duration_analysis.csv')
        print("Saved duration analysis to 'duration_analysis.csv'")

    if author_stats is not None:
        author_stats.write_csv('author_analysis.csv')
        print("Saved author analysis to 'author_analysis.csv'")


def main():
    """Run the full TikTok dataset analysis pipeline end to end."""
    try:
        # Check if dataset exists before doing any work.
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            print("Please make sure the dataset is downloaded and in the correct location")
            return

        df = load_and_explore_data()
        df = clean_data(df)

        engagement_stats, top_liked = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, temporal_stats = analyze_temporal_patterns(df)
        engagement_rates = calculate_engagement_rates(df)

        create_summary_report(df)
        save_analysis_results(df, engagement_stats, duration_engagement, author_stats)

        print("\n✅ Analysis completed successfully!")
        print("\nGenerated files:")
        print("- tiktok_cleaned.csv: Cleaned dataset")
        print("- engagement_statistics.csv: Engagement metrics")
        print("- duration_analysis.csv: Duration-based analysis")
        print("- author_analysis.csv: Author performance analysis")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback rather
        # than letting the script die silently.
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()