# Spaces: TroglodyteDerivations / Rick_and_Morty_Transcript_Analysis (Sleeping)
# File size: 16,416 Bytes
# 80d08c2

# final_tiktok_analysis.py
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

def load_and_explore_data():
    """Read ``train.csv`` into a Polars DataFrame and print a quick overview.

    The overview consists of the frame's shape, its first five rows and its
    schema, printed in that order.

    Returns:
        The DataFrame exactly as parsed by ``pl.read_csv``.
    """
    print("📊 Loading TikTok dataset...")

    dataset = pl.read_csv('train.csv')

    print(f"Dataset shape: {dataset.shape}")

    # Each remaining overview section is a heading followed by its payload.
    overview_sections = (
        ("\nFirst 5 rows:", dataset.head()),
        ("\nDataset schema:", dataset.schema),
    )
    for heading, payload in overview_sections:
        print(heading)
        print(payload)

    return dataset

def clean_data(df):
    """Deduplicate the frame, zero-fill numeric gaps and drop unplayed videos.

    Steps, in order:
      1. Print the per-column null counts.
      2. Drop fully duplicated rows and report how many were removed.
      3. Replace nulls with 0 in every known metric column that is present.
      4. Keep only rows whose ``play_count`` is positive, so later per-view
         ratios never divide by zero.

    Args:
        df: Raw Polars DataFrame.

    Returns:
        The cleaned DataFrame.
    """
    print("\n🧹 Cleaning data...")

    print("Missing values:")
    print(df.null_count())

    rows_before = df.height
    df = df.unique()
    print(f"Removed {rows_before - df.height} duplicate rows")

    metric_candidates = (
        'digg_count', 'play_count', 'share_count', 'repost_count',
        'collect_count', 'comment_count', 'duration',
    )
    # Only touch the metric columns that actually exist in this dataset.
    present_metrics = [name for name in metric_candidates if name in df.columns]
    if present_metrics:
        df = df.with_columns(
            [pl.col(name).fill_null(0) for name in present_metrics]
        )

    # Null play counts were just turned into 0, so they are removed here too.
    return df.filter(pl.col('play_count') > 0)

def analyze_engagement(df):
    """Print and return headline engagement numbers.

    Args:
        df: Polars DataFrame containing the count columns plus ``url`` and
            ``author_unique_id``.

    Returns:
        A 3-tuple ``(engagement_stats, top_liked, correlation)``:
        a one-row frame of column means, the ten rows with the highest
        ``digg_count``, and a one-row frame of correlations between
        ``digg_count`` and views/comments/shares.
    """
    print("\n📈 Engagement Analysis")

    # source column -> name of its mean in the output frame (order preserved)
    mean_labels = {
        'digg_count': 'avg_likes',
        'comment_count': 'avg_comments',
        'share_count': 'avg_shares',
        'play_count': 'avg_views',
        'repost_count': 'avg_reposts',
        'collect_count': 'avg_collects',
    }
    engagement_stats = df.select(
        [pl.col(source).mean().alias(label) for source, label in mean_labels.items()]
    )
    print("Average engagement metrics:")
    print(engagement_stats)

    top_liked = df.sort('digg_count', descending=True).head(10)
    print("\nTop 10 videos by likes (digg_count):")
    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

    # Every correlation is measured against likes.
    compared_with_likes = (
        ('play_count', 'likes_vs_views'),
        ('comment_count', 'likes_vs_comments'),
        ('share_count', 'likes_vs_shares'),
    )
    correlation = df.select(
        [pl.corr('digg_count', other).alias(label) for other, label in compared_with_likes]
    )
    print("\nCorrelation coefficients:")
    print(correlation)

    return engagement_stats, top_liked, correlation

def analyze_video_duration(df):
    """Summarize video lengths and compare engagement across length buckets.

    Adds a ``duration_category`` column with one of four labels
    (≤15, ≤30, ≤60, otherwise) and aggregates engagement per label.

    Args:
        df: Polars DataFrame with ``duration`` and the count columns.

    Returns:
        ``(df, duration_engagement)``: the input frame with
        ``duration_category`` added, and the per-category aggregate sorted by
        ``avg_likes`` in descending order.
    """
    print("\n⏱️ Video Duration Analysis")

    seconds = pl.col('duration')

    duration_stats = df.select([
        seconds.min().alias('min_duration'),
        seconds.max().alias('max_duration'),
        seconds.mean().alias('avg_duration'),
        seconds.median().alias('median_duration'),
    ])
    print("Video duration statistics (seconds):")
    print(duration_stats)

    # Branches are evaluated top-down, so each threshold only sees rows that
    # failed the previous one.
    length_bucket = (
        pl.when(seconds <= 15)
        .then(pl.lit('Very Short (≤15s)'))
        .when(seconds <= 30)
        .then(pl.lit('Short (16-30s)'))
        .when(seconds <= 60)
        .then(pl.lit('Medium (31-60s)'))
        .otherwise(pl.lit('Long (>60s)'))
    )
    df = df.with_columns([length_bucket.alias('duration_category')])

    per_bucket = df.group_by('duration_category').agg([
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
        pl.col('comment_count').mean().alias('avg_comments'),
        pl.col('share_count').mean().alias('avg_shares'),
        pl.len().alias('video_count'),
    ])
    duration_engagement = per_bucket.sort('avg_likes', descending=True)

    print("\nEngagement by duration category:")
    print(duration_engagement)

    return df, duration_engagement

def analyze_authors(df):
    """Aggregate likes and views per author and print the ten most liked.

    Args:
        df: Polars DataFrame with ``author_unique_id``, ``digg_count`` and
            ``play_count``.

    Returns:
        One row per author (video count, mean and total likes/views), sorted
        by ``total_likes`` in descending order.
    """
    print("\n👤 Author Analysis")

    likes = pl.col('digg_count')
    views = pl.col('play_count')

    per_author = df.group_by('author_unique_id').agg([
        pl.len().alias('video_count'),
        likes.mean().alias('avg_likes'),
        views.mean().alias('avg_views'),
        likes.sum().alias('total_likes'),
        views.sum().alias('total_views'),
    ])

    # Removes the group whose id is the literal string 'null'.
    # NOTE(review): a truly missing id presumably also fails this predicate
    # (comparison with null is not True) and is dropped as well — confirm.
    named_authors = per_author.filter(pl.col('author_unique_id') != 'null')
    author_stats = named_authors.sort('total_likes', descending=True)

    print("Top authors by total likes:")
    print(author_stats.head(10))

    return author_stats

def analyze_temporal_patterns(df):
    """Analyze when videos were created (by year/month and by hour of day).

    ``create_time`` is treated as a Unix timestamp in **seconds**. It is
    converted with ``pl.from_epoch(..., time_unit='s')``; a plain
    ``cast(pl.Datetime)`` would read the integer in the dtype's own time unit
    (microseconds by default) and place every video in January 1970.

    Args:
        df: Polars DataFrame with ``create_time``, ``digg_count`` and
            ``play_count``.

    Returns:
        ``(df, temporal_stats)``: the input frame extended with ``timestamp``,
        ``created_at``, ``year``, ``month`` and ``hour`` columns, and the
        per-(year, month) aggregate sorted chronologically. The hourly
        aggregate is printed but not returned.
    """
    print("\n📅 Temporal Analysis")

    epoch_seconds = pl.col('create_time').cast(pl.Int64)
    df = df.with_columns([
        epoch_seconds.alias('timestamp'),
        # Explicit unit: seconds since the epoch -> Datetime.
        pl.from_epoch(epoch_seconds, time_unit='s').alias('created_at'),
    ])

    # Calendar components; no time zone is applied to the converted values.
    df = df.with_columns([
        pl.col('created_at').dt.year().alias('year'),
        pl.col('created_at').dt.month().alias('month'),
        pl.col('created_at').dt.hour().alias('hour'),
    ])

    temporal_stats = df.group_by(['year', 'month']).agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ]).sort(['year', 'month'])

    print("Temporal distribution:")
    print(temporal_stats)

    hourly_stats = df.group_by('hour').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
    ]).sort('hour')

    print("\nHourly distribution:")
    print(hourly_stats)

    return df, temporal_stats

def calculate_engagement_rates(df):
    """Compute per-video and overall engagement rates relative to views.

    Per-video rates are ``metric / play_count``. Rows whose ``play_count`` is
    not positive get a null rate instead of inf/NaN, so they cannot distort
    the averages even if the frame was not filtered beforehand.

    Args:
        df: Polars DataFrame with ``digg_count``, ``comment_count``,
            ``share_count`` and ``play_count``.

    Returns:
        ``(engagement_rates, avg_rates)``: the input frame with ``like_rate``,
        ``comment_rate`` and ``share_rate`` columns added, and a one-row frame
        holding the mean of each rate. The overall percentage rates
        (sum of metric / sum of views) are printed but not returned.
    """
    print("\n📊 Engagement Rate Calculations")

    views = pl.col('play_count')

    def _per_view(metric):
        # Null (not inf/NaN) when the video has no recorded views.
        return pl.when(views > 0).then(pl.col(metric) / views)

    def _overall_percent(metric):
        # Null when the whole frame has no recorded views.
        return pl.when(views.sum() > 0).then(
            pl.col(metric).sum() / views.sum() * 100
        )

    engagement_rates = df.with_columns([
        _per_view('digg_count').alias('like_rate'),
        _per_view('comment_count').alias('comment_rate'),
        _per_view('share_count').alias('share_rate'),
    ])

    avg_rates = engagement_rates.select([
        pl.col('like_rate').mean().alias('avg_like_rate'),
        pl.col('comment_rate').mean().alias('avg_comment_rate'),
        pl.col('share_rate').mean().alias('avg_share_rate'),
    ])

    print("Average engagement rates:")
    print(avg_rates)

    # View-weighted rates: total interactions over total views, as percentages.
    overall_rates_percent = engagement_rates.select([
        _overall_percent('digg_count').alias('overall_like_rate_percent'),
        _overall_percent('comment_count').alias('overall_comment_rate_percent'),
        _overall_percent('share_count').alias('overall_share_rate_percent'),
    ])

    print("\nOverall engagement rates (%):")
    print(overall_rates_percent)

    return engagement_rates, avg_rates

def analyze_video_descriptions(df):
    """Print description-length statistics and hashtag usage figures.

    Args:
        df: Polars DataFrame with ``description``, ``digg_count`` and
            ``play_count``.

    Returns:
        The input frame with two added columns: ``has_hashtags`` (whether the
        description contains ``#``) and ``hashtag_count`` (number of ``#``
        matches).
    """
    print("\n📝 Description Analysis")

    text = pl.col('description')
    char_count = text.str.len_chars()

    description_stats = df.select([
        char_count.mean().alias('avg_description_length'),
        char_count.max().alias('max_description_length'),
        char_count.min().alias('min_description_length'),
    ])
    print("Description length statistics (characters):")
    print(description_stats)

    df = df.with_columns([
        text.str.contains('#').alias('has_hashtags'),
        text.str.count_matches('#').alias('hashtag_count'),
    ])

    hashtag_analysis = df.group_by('has_hashtags').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])
    print("\nHashtag usage analysis:")
    print(hashtag_analysis)

    # Count statistics only consider videos that use at least one hashtag.
    tagged_videos = df.filter(pl.col('hashtag_count') > 0)
    hashtag_count_analysis = tagged_videos.select([
        pl.col('hashtag_count').mean().alias('avg_hashtags_per_video'),
        pl.col('hashtag_count').max().alias('max_hashtags'),
        pl.corr('hashtag_count', 'digg_count').alias('hashtags_vs_likes_correlation'),
    ])
    print("\nHashtag count analysis:")
    print(hashtag_count_analysis)

    return df

def analyze_location_data(df):
    """Aggregate engagement per creation location, when that column exists.

    Args:
        df: Polars DataFrame; ``location_created`` is optional.

    Returns:
        A per-location frame (video count, mean likes, mean views) sorted by
        ``video_count`` in descending order, or ``None`` when the frame has no
        ``location_created`` column.
    """
    print("\n🌍 Location Analysis")

    # Guard clause: nothing to analyze without the column.
    if 'location_created' not in df.columns:
        print("No location data available")
        return None

    located_videos = df.filter(pl.col('location_created').is_not_null())
    per_location = located_videos.group_by('location_created').agg([
        pl.len().alias('video_count'),
        pl.col('digg_count').mean().alias('avg_likes'),
        pl.col('play_count').mean().alias('avg_views'),
    ])
    location_stats = per_location.sort('video_count', descending=True)

    print("Location-based statistics:")
    print(location_stats.head(10))

    return location_stats

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``None`` when it is undefined."""
    if numerator is None or denominator is None or denominator == 0:
        return None
    return numerator / denominator


def create_summary_report(df, correlation):
    """Print a summary report derived entirely from the data in ``df``.

    Every insight is computed from the frame; lines whose inputs are missing
    (an absent optional column, an empty subgroup, a zero denominator) are
    skipped instead of raising.

    Args:
        df: Polars DataFrame with the count columns, ``duration`` and
            ``author_unique_id``. ``duration_category``, ``has_hashtags`` and
            ``location_created`` are used when present.
        correlation: One-row frame with a ``likes_vs_views`` column.

    Returns:
        None. The report is written to stdout.
    """
    print("\n📋 SUMMARY REPORT")
    print("=" * 60)

    total_videos = df.height
    if total_videos == 0:
        # Means and maxima of an empty frame are None and cannot be formatted.
        print("No videos to summarize.")
        return

    # Basic metrics
    avg_views = df['play_count'].mean()
    avg_likes = df['digg_count'].mean()
    avg_comments = df['comment_count'].mean()
    avg_shares = df['share_count'].mean()
    avg_duration = df['duration'].mean()

    print(f"Total Videos Analyzed: {total_videos:,}")
    print(f"Average Views per Video: {avg_views:,.0f}")
    print(f"Average Likes (Diggs) per Video: {avg_likes:,.0f}")
    print(f"Average Comments per Video: {avg_comments:,.0f}")
    print(f"Average Shares per Video: {avg_shares:,.0f}")
    print(f"Average Video Duration: {avg_duration:.1f} seconds")

    # Top performers
    max_views = df['play_count'].max()
    max_likes = df['digg_count'].max()
    max_comments = df['comment_count'].max()

    print("\n🎯 Peak Performance:")
    print(f"Maximum Views: {max_views:,}")
    print(f"Maximum Likes: {max_likes:,}")
    print(f"Maximum Comments: {max_comments:,}")

    # Overall (view-weighted) engagement rates
    total_views = df['play_count'].sum()
    total_likes = df['digg_count'].sum()
    total_comments = df['comment_count'].sum()
    total_shares = df['share_count'].sum()

    like_rate = _safe_ratio(total_likes, total_views)
    comment_rate = _safe_ratio(total_comments, total_views)
    share_rate = _safe_ratio(total_shares, total_views)

    print("\n📊 Overall Engagement Rates:")
    if like_rate is not None:
        print(f"Like Rate: {like_rate * 100:.2f}%")
        print(f"Comment Rate: {comment_rate * 100:.4f}%")
        print(f"Share Rate: {share_rate * 100:.4f}%")
    else:
        print("No views recorded; engagement rates are undefined")

    # Author statistics
    unique_authors = df['author_unique_id'].n_unique()
    print("\n👥 Creator Statistics:")
    print(f"Unique Authors: {unique_authors}")

    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
    avg_videos_per_author = videos_per_author['count'].mean()
    print(f"Average Videos per Author: {avg_videos_per_author:.1f}")

    # Duration insights (the category column is added by an earlier step)
    has_duration_category = 'duration_category' in df.columns
    if has_duration_category:
        duration_categories = (
            df.group_by('duration_category')
            .agg(pl.len().alias('count'))
            .sort('count', descending=True)
        )
        most_common_duration = duration_categories[0, 'duration_category']
        print(f"Most Common Video Length: {most_common_duration}")

    print("\n🔍 KEY INSIGHTS:")

    if has_duration_category:
        short_videos_avg_likes = df.filter(
            pl.col('duration_category') == 'Very Short (≤15s)'
        )['digg_count'].mean()
        performance_multiplier = _safe_ratio(short_videos_avg_likes, avg_likes)
        if performance_multiplier is not None:
            print(f"• Very short videos (≤15s) average {performance_multiplier:.1f}x the overall average likes")

    likes_vs_views_corr = correlation['likes_vs_views'][0]
    if likes_vs_views_corr is not None:
        print(f"• Correlation between views and likes: {likes_vs_views_corr:.3f}")

    # Top creators are taken from the data rather than from a fixed name list.
    top_creator_likes = (
        df.filter(pl.col('author_unique_id').is_not_null())
        .group_by('author_unique_id')
        .agg(pl.col('digg_count').sum().alias('total_likes'))
        .sort('total_likes', descending=True)
        .head(3)['total_likes']
        .sum()
    )
    top_creator_share = _safe_ratio(top_creator_likes, total_likes)
    if top_creator_share is not None:
        print(f"• Top 3 creators account for {top_creator_share * 100:.1f}% of all likes")

    if 'has_hashtags' in df.columns:
        tagged_avg = df.filter(pl.col('has_hashtags'))['digg_count'].mean()
        untagged_avg = df.filter(~pl.col('has_hashtags'))['digg_count'].mean()
        hashtag_ratio = _safe_ratio(tagged_avg, untagged_avg)
        if hashtag_ratio is not None:
            print(f"• Videos with hashtags average {hashtag_ratio:.1f}x the likes of videos without hashtags")

    # location_created is optional, so check for it before filtering on it.
    if 'location_created' in df.columns:
        us_avg = df.filter(pl.col('location_created') == 'US')['digg_count'].mean()
        non_us_avg = df.filter(pl.col('location_created') != 'US')['digg_count'].mean()
        location_ratio = _safe_ratio(us_avg, non_us_avg)
        if location_ratio is not None:
            print(f"• US-based videos average {location_ratio:.1f}x the likes of international videos")

def save_analysis_results(df, engagement_stats, duration_engagement, author_stats, engagement_rates, location_stats=None):
    """Write each analysis frame to its own CSV file in the working directory.

    Args:
        df: Frame written to ``tiktok_cleaned.csv``.
        engagement_stats: Frame written to ``engagement_statistics.csv``.
        duration_engagement: Frame written to ``duration_analysis.csv``.
        author_stats: Frame written to ``author_analysis.csv``.
        engagement_rates: Frame written to ``engagement_rates.csv``.
        location_stats: Optional frame written to ``location_analysis.csv``;
            skipped when ``None``.

    Returns:
        None. A confirmation line is printed after each file is written.
    """
    print("\n💾 Saving analysis results...")

    # (label shown to the user, frame to export, destination file)
    exports = [
        ("Cleaned dataset", df, 'tiktok_cleaned.csv'),
        ("Engagement statistics", engagement_stats, 'engagement_statistics.csv'),
        ("Duration analysis", duration_engagement, 'duration_analysis.csv'),
        ("Author analysis", author_stats, 'author_analysis.csv'),
        ("Engagement rates", engagement_rates, 'engagement_rates.csv'),
    ]
    if location_stats is not None:
        exports.append(("Location analysis", location_stats, 'location_analysis.csv'))

    for label, frame, filename in exports:
        frame.write_csv(filename)
        print(f"✓ {label} → '{filename}'")

def main():
    """Run the full TikTok dataset analysis pipeline.

    Requires ``train.csv`` in the current directory; if it is missing an error
    is printed and the function returns. Each analysis step prints its own
    results, then the summary report is printed and the result frames are
    written to CSV. Any exception raised by a step is reported with its
    traceback rather than propagated.

    Returns:
        None.
    """
    try:
        if not Path('train.csv').exists():
            print("❌ Error: train.csv not found in current directory")
            return

        print("🚀 Starting TikTok Dataset Analysis")
        print("=" * 50)

        df = load_and_explore_data()
        df = clean_data(df)

        engagement_stats, _top_liked, correlation = analyze_engagement(df)

        # These steps return the frame extended with derived columns
        # (duration_category, created_at/year/month/hour).
        df, duration_engagement = analyze_video_duration(df)
        author_stats = analyze_authors(df)
        df, _temporal_stats = analyze_temporal_patterns(df)

        # First value: per-video frame with the rate columns added.
        # Second value: one-row frame of average rates.
        df, avg_rates = calculate_engagement_rates(df)

        df = analyze_video_descriptions(df)
        location_stats = analyze_location_data(df)

        create_summary_report(df, correlation)

        # The average rates are what ends up in 'engagement_rates.csv'; the
        # per-video rates are saved as part of the cleaned dataset.
        save_analysis_results(df, engagement_stats, duration_engagement, author_stats, avg_rates, location_stats)

        print("\n✅ Analysis completed successfully!")
        # Conclusions are not hard-coded here: they depend on the dataset and
        # are printed by the summary report.
        print("\n📈 Data-derived key findings are listed in the SUMMARY REPORT above.")

    except Exception as e:
        # Top-level boundary: report the failure with full context.
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()