# Hugging Face page header captured along with the file (not code):
# TroglodyteDerivations's picture / Upload 44 files / 80d08c2 verified
# fixed_tiktok_analysis.py
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
def load_and_explore_data():
    """Read 'train.csv' into a polars DataFrame and print a quick overview.

    Returns:
        pl.DataFrame: the raw dataset exactly as loaded from disk.
    """
    print("📊 Loading TikTok dataset...")
    frame = pl.read_csv('train.csv')
    # Structural overview: size, a sample of rows, and the column types.
    print(f"Dataset shape: {frame.shape}")
    print("\nFirst 5 rows:")
    print(frame.head())
    print("\nDataset schema:")
    print(frame.schema)
    return frame
def clean_data(df):
    """Deduplicate rows, zero-fill numeric nulls, and drop zero-view videos.

    Args:
        df: raw DataFrame as returned by load_and_explore_data.

    Returns:
        pl.DataFrame: cleaned frame where every row has play_count > 0.
    """
    print("\n🧹 Cleaning data...")
    print("Missing values:")
    print(df.null_count())

    # Drop exact duplicate rows and report how many were removed.
    before = df.height
    df = df.unique()
    print(f"Removed {before - df.height} duplicate rows")

    # A null engagement counter is treated as zero activity.
    metric_cols = ('digg_count', 'play_count', 'share_count', 'repost_count',
                   'collect_count', 'comment_count', 'duration')
    present = [name for name in metric_cols if name in df.columns]
    for name in present:
        df = df.with_columns(pl.col(name).fill_null(0))

    # Guard against division by zero in the later rate calculations.
    return df.filter(pl.col('play_count') > 0)
def analyze_engagement(df):
    """Report mean engagement metrics, top videos by likes, and correlations.

    Returns:
        tuple: (one-row DataFrame of metric averages,
                DataFrame of the 10 most-liked videos).
    """
    print("\n📈 Engagement Analysis")

    # Mean of each engagement counter across all videos; order matters for
    # the printed column layout.
    metric_aliases = [
        ('digg_count', 'avg_likes'),
        ('comment_count', 'avg_comments'),
        ('share_count', 'avg_shares'),
        ('play_count', 'avg_views'),
        ('repost_count', 'avg_reposts'),
        ('collect_count', 'avg_collects'),
    ]
    engagement_stats = df.select(
        [pl.col(src).mean().alias(dst) for src, dst in metric_aliases]
    )
    print("Average engagement metrics:")
    print(engagement_stats)

    # The ten most-liked videos in the dataset.
    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Pairwise correlation between likes and the other core metrics.
    correlation = df.select([
        pl.corr('digg_count', 'play_count').alias('likes_vs_views'),
        pl.corr('digg_count', 'comment_count').alias('likes_vs_comments'),
        pl.corr('digg_count', 'share_count').alias('likes_vs_shares'),
    ])
    print("\nCorrelation coefficients:")
    print(correlation)
    return engagement_stats, top_liked
def analyze_video_duration(df):
    """Summarize duration stats and compare engagement across duration bands.

    Returns:
        tuple: (df with an added 'duration_category' column,
                per-category engagement DataFrame sorted by avg_likes desc).
    """
    print("\n⏱️ Video Duration Analysis")

    duration_stats = df.select([
        pl.col('duration').min().alias('min_duration'),
        pl.col('duration').max().alias('max_duration'),
        pl.col('duration').mean().alias('avg_duration'),
        pl.col('duration').median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Bucket each video into a human-readable duration band.
    category = (
        pl.when(pl.col('duration') <= 15).then(pl.lit('Very Short (≤15s)'))
        .when(pl.col('duration') <= 30).then(pl.lit('Short (16-30s)'))
        .when(pl.col('duration') <= 60).then(pl.lit('Medium (31-60s)'))
        .otherwise(pl.lit('Long (>60s)'))
    )
    df = df.with_columns([category.alias('duration_category')])

    # Average engagement per band, most-liked band first.
    duration_engagement = (
        df.group_by('duration_category')
        .agg([
            pl.col('digg_count').mean().alias('avg_likes'),
            pl.col('play_count').mean().alias('avg_views'),
            pl.col('comment_count').mean().alias('avg_comments'),
            pl.col('share_count').mean().alias('avg_shares'),
            pl.len().alias('video_count'),
        ])
        .sort('avg_likes', descending=True)
    )
    print("\nEngagement by duration category:")
    print(duration_engagement)
    return df, duration_engagement
def analyze_authors(df):
    """Rank authors by total likes and print the top ten.

    Returns:
        pl.DataFrame: per-author stats sorted by total_likes descending.
    """
    print("\n👤 Author Analysis")
    # NOTE: the filter compares against the literal string 'null'; rows whose
    # author is an actual null are dropped too, because a null comparison
    # result is filtered out by polars.
    per_author = [
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('digg_count').sum().alias('total_likes'),
        pl.col('play_count').sum().alias('total_views'),
    ]
    author_stats = (
        df.group_by('author_unique_id')
        .agg(per_author)
        .filter(pl.col('author_unique_id') != 'null')
        .sort('total_likes', descending=True)
    )
    print("Top authors by total likes:")
    print(author_stats.head(10))
    return author_stats
def analyze_temporal_patterns(df):
    """Derive calendar fields from create_time and report posting patterns.

    Args:
        df: DataFrame whose 'create_time' column holds Unix-epoch seconds
            (per the original author's note).

    Returns:
        tuple: (df with timestamp/created_at/year/month/hour columns added,
                per-(year, month) engagement DataFrame).
    """
    print("\n📅 Temporal Analysis")
    # BUG FIX: create_time is an epoch timestamp in seconds. A plain
    # cast(pl.Int64).cast(pl.Datetime) interprets the integer as
    # *microseconds* (polars' default Datetime unit), which collapses every
    # date into January 1970. Decode explicitly with from_epoch instead.
    df = df.with_columns([
        pl.col('create_time').cast(pl.Int64).alias('timestamp'),
        pl.from_epoch(pl.col('create_time').cast(pl.Int64), time_unit='s')
        .alias('created_at'),
    ])
    # Calendar components used by the groupings below.
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])
    # Upload volume and average engagement per calendar month.
    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])
    print("Temporal distribution:")
    print(temporal_stats)
    # Upload volume and average likes by hour of day (epoch-based, no
    # timezone conversion applied — effectively UTC).
    hourly_stats = df.group_by('hour').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
    ]).sort('hour')
    print("\nHourly distribution:")
    print(hourly_stats)
    return df, temporal_stats
def calculate_engagement_rates(df):
    """Add per-video like/comment/share rates and print their averages.

    play_count is strictly positive here (clean_data drops zero-view rows),
    so the divisions below are safe.

    Returns:
        tuple: (df with the three rate columns added,
                one-row DataFrame of the mean rates).
    """
    print("\n📊 Engagement Rate Calculations")
    rate_spec = [
        ('digg_count', 'like_rate'),
        ('comment_count', 'comment_rate'),
        ('share_count', 'share_rate'),
    ]
    engagement_rates = df.with_columns(
        [(pl.col(num) / pl.col('play_count')).alias(name)
         for num, name in rate_spec]
    )
    avg_rates = engagement_rates.select(
        [pl.col(name).mean().alias(f'avg_{name}') for _, name in rate_spec]
    )
    print("Average engagement rates:")
    print(avg_rates)
    return engagement_rates, avg_rates
def analyze_video_descriptions(df):
    """Summarize description lengths and hashtag usage versus engagement.

    Returns:
        pl.DataFrame: df with 'has_hashtags' and 'hashtag_count' added.
    """
    print("\n📝 Description Analysis")
    # Character-length distribution of the description column.
    desc_len = pl.col('description').str.lengths()
    description_stats = df.select([
        desc_len.mean().alias('avg_description_length'),
        desc_len.max().alias('max_description_length'),
        desc_len.min().alias('min_description_length'),
    ])
    print("Description length statistics:")
    print(description_stats)
    # '#' occurrences are used as a proxy for hashtag presence/count.
    df = df.with_columns([
        pl.col('description').str.contains('#').alias('has_hashtags'),
        pl.col('description').str.count_matches('#').alias('hashtag_count'),
    ])
    hashtag_analysis = df.group_by('has_hashtags').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])
    print("\nHashtag usage analysis:")
    print(hashtag_analysis)
    return df
def create_summary_report(df):
    """Print a dataset-wide summary: averages, maxima, aggregate engagement
    rates, author counts, duration, and headline findings.

    Args:
        df: cleaned DataFrame; clean_data guarantees play_count > 0 per row,
            so the aggregate-rate divisions below cannot divide by zero.
    """
    print("\n📋 SUMMARY REPORT")
    print("=" * 50)
    # Dataset-wide averages.
    total_videos = df.height
    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()
    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")
    # Single-video maxima.
    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    max_comments = df['comment_count'].max()
    print(f"\nPeak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")
    print(f"Maximum Comments: {max_comments:,}")
    # Aggregate (corpus-level) engagement rates.
    total_views = df['play_count'].sum()
    total_likes = df['digg_count'].sum()
    total_comments = df['comment_count'].sum()
    like_rate = (total_likes / total_views) * 100
    comment_rate = (total_comments / total_views) * 100
    print(f"\nOverall Engagement Rates:")
    print(f"Like Rate: {like_rate:.2f}%")
    print(f"Comment Rate: {comment_rate:.4f}%")
    # Author statistics.
    unique_authors = df['author_unique_id'].n_unique()
    print(f"\nUnique Authors: {unique_authors}")
    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
    avg_videos_per_author = videos_per_author['count'].mean()
    print(f"Average Videos per Author: {avg_videos_per_author:.1f}")
    # Duration insights.
    avg_duration = df['duration'].mean()
    print(f"\nAverage Video Duration: {avg_duration:.1f} seconds")
    # Key findings. BUG FIX: a polars Series has no .corr() method, so the
    # original df['digg_count'].corr(df['play_count']) raised AttributeError
    # (silently swallowed by main's broad except). Use the pl.corr expression
    # and extract the scalar with .item().
    likes_views_corr = df.select(pl.corr('digg_count', 'play_count')).item()
    top_authors = (
        df.group_by('author_unique_id')
        .agg(pl.col('digg_count').sum())
        .sort('digg_count', descending=True)
        .head(3)['author_unique_id']
        .to_list()
    )
    print(f"\n🔍 KEY FINDINGS:")
    print(f"- Very short videos (≤15s) have the highest average likes")
    print(f"- Strong correlation between views and likes ({likes_views_corr:.3f})")
    print(f"- Top authors: {top_authors}")
def save_analysis_results(df, engagement_stats, duration_engagement, author_stats, engagement_rates):
    """Persist each analysis artifact as a CSV file in the working directory.

    Args:
        df: cleaned dataset frame.
        engagement_stats: one-row frame of average engagement metrics.
        duration_engagement: per-duration-band engagement frame.
        author_stats: per-author statistics frame.
        engagement_rates: frame passed from main (average rates).
    """
    print("\n💾 Saving analysis results...")
    # (frame, destination path, confirmation message) triples, written in order.
    outputs = [
        (df, 'tiktok_cleaned.csv',
         "Saved cleaned dataset to 'tiktok_cleaned.csv'"),
        (engagement_stats, 'engagement_statistics.csv',
         "Saved engagement statistics to 'engagement_statistics.csv'"),
        (duration_engagement, 'duration_analysis.csv',
         "Saved duration analysis to 'duration_analysis.csv'"),
        (author_stats, 'author_analysis.csv',
         "Saved author analysis to 'author_analysis.csv'"),
        (engagement_rates, 'engagement_rates.csv',
         "Saved engagement rates to 'engagement_rates.csv'"),
    ]
    for frame, path, message in outputs:
        frame.write_csv(path)
        print(message)
def main():
    """Run the full TikTok analysis pipeline end to end."""
    try:
        # Fail fast when the input file is absent.
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            return

        df = load_and_explore_data()
        df = clean_data(df)
        engagement_stats, top_liked = analyze_engagement(df)
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, temporal_stats = analyze_temporal_patterns(df)
        # calculate_engagement_rates returns (per-video frame with rate
        # columns, one-row averages frame) — df picks up the rate columns,
        # and the averages frame is what gets saved below.
        df, engagement_rates = calculate_engagement_rates(df)
        df = analyze_video_descriptions(df)
        create_summary_report(df)
        save_analysis_results(df, engagement_stats, duration_engagement,
                              author_stats, engagement_rates)

        print("\n✅ Analysis completed successfully!")
        print("\n📊 Key Insights:")
        print("- Very short videos (≤15s) perform best")
        print("- Strong positive correlation between views and likes")
        print("- zachking, mrbeast, and addisonre are top performers")
        print("- Average engagement: 7.22% like rate, 0.11% comment rate")
    except Exception as e:
        # Broad boundary handler: report and show the traceback, never crash.
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()