Spaces:

TroglodyteDerivations
/

Rick_and_Morty_Transcript_Analysis

Sleeping

App Files Files Community

Rick_and_Morty_Transcript_Analysis / Tik Tok Python Polars Exercise /final_tiktok_analysis.py

TroglodyteDerivations

Upload 44 files

80d08c2 verified 3 months ago

raw

history blame contribute delete

16.4 kB

	# final_tiktok_analysis.py
	import polars as pl
	import matplotlib.pyplot as plt
	import seaborn as sns
	from pathlib import Path
	from datetime import datetime

	def load_and_explore_data():
	    """Read ``train.csv`` into a Polars DataFrame and print a quick overview.

	    The overview consists of the frame's shape, its first five rows and
	    its schema.

	    Returns:
	        The DataFrame exactly as loaded from the CSV file.
	    """
	    print("📊 Loading TikTok dataset...")

	    frame = pl.read_csv('train.csv')

	    print(f"Dataset shape: {frame.shape}")

	    # A sample of rows first, then the column dtypes.
	    preview = frame.head()
	    print("\nFirst 5 rows:")
	    print(preview)

	    column_types = frame.schema
	    print("\nDataset schema:")
	    print(column_types)

	    return frame

	def clean_data(df):
	    """Deduplicate rows, zero-fill numeric nulls and drop zero-view videos.

	    Args:
	        df: Raw DataFrame; must contain a ``play_count`` column.

	    Returns:
	        A DataFrame with duplicate rows removed, nulls in the known count
	        columns replaced by 0, and only rows with ``play_count > 0`` kept.
	    """
	    print("\n🧹 Cleaning data...")

	    print("Missing values:")
	    print(df.null_count())

	    # Drop exact duplicate rows and report how many disappeared.
	    rows_before = df.height
	    df = df.unique()
	    rows_after = df.height
	    print(f"Removed {rows_before - rows_after} duplicate rows")

	    # Only the count columns actually present in the frame are zero-filled.
	    count_columns = (
	        'digg_count', 'play_count', 'share_count', 'repost_count',
	        'collect_count', 'comment_count', 'duration',
	    )
	    fill_expressions = [
	        pl.col(name).fill_null(0)
	        for name in count_columns
	        if name in df.columns
	    ]
	    if fill_expressions:
	        df = df.with_columns(fill_expressions)

	    # Later rate calculations divide by play_count, so zero-view rows go.
	    has_views = pl.col('play_count') > 0
	    return df.filter(has_views)

	def analyze_engagement(df):
	    """Print and return headline engagement figures.

	    Args:
	        df: DataFrame with the count columns (digg, comment, share, play,
	            repost, collect) plus ``url`` and ``author_unique_id``.

	    Returns:
	        A tuple ``(engagement_stats, top_liked, correlation)``: the one-row
	        frame of column means, the ten rows with the highest ``digg_count``,
	        and the one-row frame of correlations between likes and
	        views/comments/shares.
	    """
	    print("\n📈 Engagement Analysis")

	    # Source column -> name of its mean in the result (order is preserved).
	    mean_aliases = {
	        'digg_count': 'avg_likes',
	        'comment_count': 'avg_comments',
	        'share_count': 'avg_shares',
	        'play_count': 'avg_views',
	        'repost_count': 'avg_reposts',
	        'collect_count': 'avg_collects',
	    }
	    engagement_stats = df.select([
	        pl.col(source).mean().alias(alias)
	        for source, alias in mean_aliases.items()
	    ])
	    print("Average engagement metrics:")
	    print(engagement_stats)

	    # Ten most-liked videos.
	    ranked_by_likes = df.sort('digg_count', descending=True)
	    top_liked = ranked_by_likes.head(10)
	    print("\nTop 10 videos by likes (digg_count):")
	    print(top_liked.select(['url', 'digg_count', 'play_count', 'author_unique_id']))

	    # Correlation of likes against each of the other engagement counts.
	    correlation_targets = (
	        ('play_count', 'likes_vs_views'),
	        ('comment_count', 'likes_vs_comments'),
	        ('share_count', 'likes_vs_shares'),
	    )
	    correlation = df.select([
	        pl.corr('digg_count', other).alias(alias)
	        for other, alias in correlation_targets
	    ])
	    print("\nCorrelation coefficients:")
	    print(correlation)

	    return engagement_stats, top_liked, correlation

	def analyze_video_duration(df):
	    """Summarise video durations and compare engagement across length buckets.

	    Args:
	        df: DataFrame with ``duration`` and the like/view/comment/share
	            count columns.

	    Returns:
	        A tuple ``(df, duration_engagement)`` where ``df`` carries a new
	        ``duration_category`` column and ``duration_engagement`` holds the
	        per-category averages and video counts, sorted by ``avg_likes``
	        in descending order.
	    """
	    print("\n⏱️ Video Duration Analysis")

	    seconds = pl.col('duration')

	    duration_stats = df.select([
	        seconds.min().alias('min_duration'),
	        seconds.max().alias('max_duration'),
	        seconds.mean().alias('avg_duration'),
	        seconds.median().alias('median_duration'),
	    ])
	    print("Video duration statistics (seconds):")
	    print(duration_stats)

	    # Bucket boundaries are inclusive upper bounds: 15, 30 and 60.
	    category = (
	        pl.when(seconds <= 15)
	        .then(pl.lit('Very Short (≤15s)'))
	        .when(seconds <= 30)
	        .then(pl.lit('Short (16-30s)'))
	        .when(seconds <= 60)
	        .then(pl.lit('Medium (31-60s)'))
	        .otherwise(pl.lit('Long (>60s)'))
	    )
	    df = df.with_columns([category.alias('duration_category')])

	    per_category = df.group_by('duration_category').agg([
	        pl.col('digg_count').mean().alias('avg_likes'),
	        pl.col('play_count').mean().alias('avg_views'),
	        pl.col('comment_count').mean().alias('avg_comments'),
	        pl.col('share_count').mean().alias('avg_shares'),
	        pl.len().alias('video_count'),
	    ])
	    duration_engagement = per_category.sort('avg_likes', descending=True)

	    print("\nEngagement by duration category:")
	    print(duration_engagement)

	    return df, duration_engagement

	def analyze_authors(df):
	    """Aggregate likes and views per author and print the top ten.

	    Args:
	        df: DataFrame with ``author_unique_id``, ``digg_count`` and
	            ``play_count`` columns.

	    Returns:
	        One row per author (video count, mean and total likes/views),
	        sorted by ``total_likes`` in descending order.
	    """
	    print("\n👤 Author Analysis")

	    per_author = df.group_by('author_unique_id').agg([
	        pl.len().alias('video_count'),
	        pl.col('digg_count').mean().alias('avg_likes'),
	        pl.col('play_count').mean().alias('avg_views'),
	        pl.col('digg_count').sum().alias('total_likes'),
	        pl.col('play_count').sum().alias('total_views'),
	    ])

	    # Drops the literal string 'null'. In Polars a comparison against a
	    # missing value evaluates to null and filter() only keeps rows where the
	    # predicate is true, so rows with a missing author id are dropped too.
	    named_authors = per_author.filter(pl.col('author_unique_id') != 'null')
	    author_stats = named_authors.sort('total_likes', descending=True)

	    print("Top authors by total likes:")
	    print(author_stats.head(10))

	    return author_stats

	def analyze_temporal_patterns(df):
	    """Derive creation timestamps and report posting patterns over time.

	    ``create_time`` is treated as a Unix timestamp in seconds (per the
	    original author's note). It is converted with ``pl.from_epoch`` using
	    ``time_unit='s'``. A plain ``cast(pl.Datetime)`` would read the integer
	    as microseconds and place every video in January 1970, which makes the
	    year/month/hour groupings meaningless.

	    Args:
	        df: DataFrame with ``create_time``, ``digg_count`` and
	            ``play_count`` columns.

	    Returns:
	        A tuple ``(df, temporal_stats)``. ``df`` gains the columns
	        ``timestamp`` (Int64 epoch seconds), ``created_at`` (Datetime),
	        ``year``, ``month`` and ``hour``. ``temporal_stats`` holds video
	        counts and average likes/views per (year, month), sorted
	        chronologically. The hourly breakdown is printed only.
	    """
	    print("\n📅 Temporal Analysis")

	    # Epoch seconds -> Datetime. The result carries no time zone, so the
	    # derived hour is presumably UTC (confirm against the data source).
	    epoch_seconds = pl.col('create_time').cast(pl.Int64)
	    df = df.with_columns([
	        epoch_seconds.alias('timestamp'),
	        pl.from_epoch(epoch_seconds, time_unit='s').alias('created_at'),
	    ])

	    # Extract time components
	    df = df.with_columns([
	        pl.col('created_at').dt.year().alias('year'),
	        pl.col('created_at').dt.month().alias('month'),
	        pl.col('created_at').dt.hour().alias('hour'),
	    ])

	    # Analyze by year/month
	    temporal_stats = df.group_by(['year', 'month']).agg([
	        pl.len().alias('video_count'),
	        pl.col('digg_count').mean().alias('avg_likes'),
	        pl.col('play_count').mean().alias('avg_views'),
	    ]).sort(['year', 'month'])

	    print("Temporal distribution:")
	    print(temporal_stats)

	    # Analyze by hour of day
	    hourly_stats = df.group_by('hour').agg([
	        pl.len().alias('video_count'),
	        pl.col('digg_count').mean().alias('avg_likes'),
	    ]).sort('hour')

	    print("\nHourly distribution:")
	    print(hourly_stats)

	    return df, temporal_stats

	def calculate_engagement_rates(df):
	    """Compute per-video and overall like/comment/share rates.

	    The per-video division is not guarded here. It relies on every row
	    having a non-zero ``play_count`` (``clean_data`` drops zero-view rows).

	    Args:
	        df: DataFrame with ``digg_count``, ``comment_count``,
	            ``share_count`` and ``play_count`` columns.

	    Returns:
	        A tuple ``(engagement_rates, avg_rates)``: the input frame extended
	        with ``like_rate``, ``comment_rate`` and ``share_rate`` columns, and
	        a one-row frame with the mean of each rate. The overall percentage
	        table is printed but not returned.
	    """
	    print("\n📊 Engagement Rate Calculations")

	    views = pl.col('play_count')
	    # (count column, short name) pairs drive every expression below.
	    metrics = (
	        ('digg_count', 'like'),
	        ('comment_count', 'comment'),
	        ('share_count', 'share'),
	    )

	    # Per-video rate: count divided by views.
	    engagement_rates = df.with_columns([
	        (pl.col(column) / views).alias(f'{short}_rate')
	        for column, short in metrics
	    ])

	    # Unweighted mean of the per-video rates.
	    avg_rates = engagement_rates.select([
	        pl.col(f'{short}_rate').mean().alias(f'avg_{short}_rate')
	        for _, short in metrics
	    ])

	    print("Average engagement rates:")
	    print(avg_rates)

	    # View-weighted overall rates, expressed as percentages.
	    avg_rates_percent = engagement_rates.select([
	        (pl.col(column).sum() / views.sum() * 100).alias(f'overall_{short}_rate_percent')
	        for column, short in metrics
	    ])

	    print("\nOverall engagement rates (%):")
	    print(avg_rates_percent)

	    return engagement_rates, avg_rates

	def analyze_video_descriptions(df):
	    """Report description lengths and how hashtag usage relates to likes.

	    Args:
	        df: DataFrame with ``description``, ``digg_count`` and
	            ``play_count`` columns.

	    Returns:
	        The input frame extended with ``has_hashtags`` (whether the
	        description contains '#') and ``hashtag_count`` (number of '#'
	        occurrences).
	    """
	    print("\n📝 Description Analysis")

	    description = pl.col('description')
	    char_length = description.str.len_chars()

	    description_stats = df.select([
	        char_length.mean().alias('avg_description_length'),
	        char_length.max().alias('max_description_length'),
	        char_length.min().alias('min_description_length'),
	    ])

	    print("Description length statistics (characters):")
	    print(description_stats)

	    # Flag and count '#' characters in each description.
	    df = df.with_columns([
	        description.str.contains('#').alias('has_hashtags'),
	        description.str.count_matches('#').alias('hashtag_count'),
	    ])

	    hashtag_analysis = df.group_by('has_hashtags').agg([
	        pl.len().alias('video_count'),
	        pl.col('digg_count').mean().alias('avg_likes'),
	        pl.col('play_count').mean().alias('avg_views'),
	    ])

	    print("\nHashtag usage analysis:")
	    print(hashtag_analysis)

	    # Restrict to videos that use at least one hashtag.
	    tagged_videos = df.filter(pl.col('hashtag_count') > 0)
	    hashtag_count_analysis = tagged_videos.select([
	        pl.col('hashtag_count').mean().alias('avg_hashtags_per_video'),
	        pl.col('hashtag_count').max().alias('max_hashtags'),
	        pl.corr('hashtag_count', 'digg_count').alias('hashtags_vs_likes_correlation'),
	    ])

	    print("\nHashtag count analysis:")
	    print(hashtag_count_analysis)

	    return df

	def analyze_location_data(df):
	    """Aggregate engagement per creation location, when that column exists.

	    Args:
	        df: DataFrame with ``digg_count`` and ``play_count`` columns and,
	            optionally, ``location_created``.

	    Returns:
	        Per-location video counts and average likes/views, sorted by
	        ``video_count`` in descending order, or ``None`` when the frame
	        has no ``location_created`` column.
	    """
	    print("\n🌍 Location Analysis")

	    if 'location_created' not in df.columns:
	        print("No location data available")
	        return None

	    # Rows without a location are left out of the grouping.
	    located = df.filter(pl.col('location_created').is_not_null())
	    per_location = located.group_by('location_created').agg([
	        pl.len().alias('video_count'),
	        pl.col('digg_count').mean().alias('avg_likes'),
	        pl.col('play_count').mean().alias('avg_views'),
	    ])
	    location_stats = per_location.sort('video_count', descending=True)

	    print("Location-based statistics:")
	    print(location_stats.head(10))

	    return location_stats

	def _safe_ratio(numerator, denominator, scale=1):
	    """Return ``numerator / denominator * scale``, or None when undefined.

	    The ratio is undefined when either operand is None (Polars aggregations
	    return None for empty or all-null input) or the denominator is zero.
	    """
	    if numerator is None or denominator is None or denominator == 0:
	        return None
	    return numerator / denominator * scale


	def create_summary_report(df, correlation):
	    """Print a human-readable summary of the analysed dataset.

	    Every figure is computed from ``df``. The top creators are the three
	    authors with the most total likes in the data, not a fixed name list.
	    An insight is skipped when its column is missing or its ratio is
	    undefined.

	    Args:
	        df: DataFrame with ``play_count``, ``digg_count``,
	            ``comment_count``, ``share_count``, ``duration`` and
	            ``author_unique_id`` columns. ``duration_category``,
	            ``has_hashtags`` and ``location_created`` are used when present.
	        correlation: Frame whose ``likes_vs_views`` column holds the
	            likes/views correlation in its first row.

	    Returns:
	        None. The report is written to stdout.
	    """
	    print("\n📋 SUMMARY REPORT")
	    print("=" * 60)

	    total_videos = df.height
	    print(f"Total Videos Analyzed: {total_videos:,}")
	    if total_videos == 0:
	        # Every mean/max below would be None and break the number formatting.
	        print("No videos to summarise.")
	        return

	    views = df['play_count']
	    likes = df['digg_count']
	    comments = df['comment_count']
	    shares = df['share_count']

	    # Basic metrics
	    print(f"Average Views per Video: {views.mean():,.0f}")
	    print(f"Average Likes (Diggs) per Video: {likes.mean():,.0f}")
	    print(f"Average Comments per Video: {comments.mean():,.0f}")
	    print(f"Average Shares per Video: {shares.mean():,.0f}")
	    print(f"Average Video Duration: {df['duration'].mean():.1f} seconds")

	    # Top performers
	    print("\n🎯 Peak Performance:")
	    print(f"Maximum Views: {views.max():,}")
	    print(f"Maximum Likes: {likes.max():,}")
	    print(f"Maximum Comments: {comments.max():,}")

	    # Overall (view-weighted) engagement rates
	    total_views = views.sum()
	    total_likes = likes.sum()
	    print("\n📊 Overall Engagement Rates:")
	    rate_rows = (
	        ("Like Rate", total_likes, ".2f"),
	        ("Comment Rate", comments.sum(), ".4f"),
	        ("Share Rate", shares.sum(), ".4f"),
	    )
	    for label, total, spec in rate_rows:
	        rate = _safe_ratio(total, total_views, scale=100)
	        if rate is not None:
	            print(f"{label}: {rate:{spec}}%")

	    # Author statistics
	    unique_authors = df['author_unique_id'].n_unique()
	    print("\n👥 Creator Statistics:")
	    print(f"Unique Authors: {unique_authors}")

	    videos_per_author = df.group_by('author_unique_id').agg(pl.len().alias('count'))
	    print(f"Average Videos per Author: {videos_per_author['count'].mean():.1f}")

	    # Duration insights need the bucket column added by the duration analysis.
	    has_duration_category = 'duration_category' in df.columns
	    if has_duration_category:
	        duration_categories = (
	            df.group_by('duration_category')
	            .agg(pl.len().alias('count'))
	            .sort('count', descending=True)
	        )
	        most_common_duration = duration_categories[0, 'duration_category']
	        print(f"Most Common Video Length: {most_common_duration}")

	    print("\n🔍 KEY INSIGHTS:")

	    if has_duration_category:
	        very_short = df.filter(pl.col('duration_category') == 'Very Short (≤15s)')
	        performance_multiplier = _safe_ratio(very_short['digg_count'].mean(), likes.mean())
	        if performance_multiplier is not None:
	            print(f"• Very short videos (≤15s) have {performance_multiplier:.1f}x higher average likes")

	    likes_vs_views_corr = correlation['likes_vs_views'][0]
	    if likes_vs_views_corr is not None:
	        print(f"• Strong correlation between views and likes: {likes_vs_views_corr:.3f}")

	    # Rank creators by total likes in the data.
	    top_authors = (
	        df.filter(pl.col('author_unique_id').is_not_null())
	        .group_by('author_unique_id')
	        .agg(pl.col('digg_count').sum().alias('total_likes'))
	        .sort('total_likes', descending=True)
	        .head(3)
	    )
	    if top_authors.height > 0:
	        top_creator_percentage = _safe_ratio(
	            top_authors['total_likes'].sum(), total_likes, scale=100
	        )
	        if top_creator_percentage is not None:
	            print(
	                f"• Top {top_authors.height} creators account for "
	                f"{top_creator_percentage:.1f}% of all likes"
	            )

	    # The hashtag and location columns are optional.
	    if 'has_hashtags' in df.columns:
	        tagged_likes = df.filter(pl.col('has_hashtags'))['digg_count'].mean()
	        untagged_likes = df.filter(~pl.col('has_hashtags'))['digg_count'].mean()
	        hashtag_multiplier = _safe_ratio(tagged_likes, untagged_likes)
	        if hashtag_multiplier is not None:
	            print(f"• Videos with hashtags have {hashtag_multiplier:.1f}x higher engagement")

	    if 'location_created' in df.columns:
	        us_likes = df.filter(pl.col('location_created') == 'US')['digg_count'].mean()
	        other_likes = df.filter(pl.col('location_created') != 'US')['digg_count'].mean()
	        location_multiplier = _safe_ratio(us_likes, other_likes)
	        if location_multiplier is not None:
	            print(f"• US-based videos perform {location_multiplier:.1f}x better than international videos")

	def save_analysis_results(df, engagement_stats, duration_engagement, author_stats, engagement_rates, location_stats=None):
	    """Write each analysis table to its own CSV file in the working directory.

	    Args:
	        df: Cleaned dataset, written to ``tiktok_cleaned.csv``.
	        engagement_stats: Written to ``engagement_statistics.csv``.
	        duration_engagement: Written to ``duration_analysis.csv``.
	        author_stats: Written to ``author_analysis.csv``.
	        engagement_rates: Written to ``engagement_rates.csv``.
	        location_stats: Optional; written to ``location_analysis.csv``
	            when it is not ``None``.
	    """
	    print("\n💾 Saving analysis results...")

	    # (table, target file, confirmation line), written in this order.
	    exports = [
	        (df, 'tiktok_cleaned.csv', "✓ Cleaned dataset → 'tiktok_cleaned.csv'"),
	        (engagement_stats, 'engagement_statistics.csv', "✓ Engagement statistics → 'engagement_statistics.csv'"),
	        (duration_engagement, 'duration_analysis.csv', "✓ Duration analysis → 'duration_analysis.csv'"),
	        (author_stats, 'author_analysis.csv', "✓ Author analysis → 'author_analysis.csv'"),
	        (engagement_rates, 'engagement_rates.csv', "✓ Engagement rates → 'engagement_rates.csv'"),
	    ]
	    if location_stats is not None:
	        exports.append(
	            (location_stats, 'location_analysis.csv', "✓ Location analysis → 'location_analysis.csv'")
	        )

	    for table, filename, confirmation in exports:
	        table.write_csv(filename)
	        print(confirmation)

	def main():
	    """Run the full TikTok analysis pipeline: load, clean, analyse, report, save.

	    Prints an error and returns early when ``train.csv`` is not in the
	    current directory. Any exception raised by a pipeline step is caught,
	    reported with its traceback, and not re-raised.
	    """
	    try:
	        # Nothing can run without the input file.
	        if not Path('train.csv').exists():
	            print("❌ Error: train.csv not found in current directory")
	            return

	        print("🚀 Starting TikTok Dataset Analysis")
	        print("=" * 50)

	        # Load, then clean.
	        videos = clean_data(load_and_explore_data())

	        # Each step that adds columns hands back the enriched frame.
	        engagement_stats, _top_liked, correlation = analyze_engagement(videos)
	        videos, duration_engagement = analyze_video_duration(videos)
	        author_stats = analyze_authors(videos)
	        videos, _temporal_stats = analyze_temporal_patterns(videos)

	        # The second value is the one-row table of average rates; the
	        # per-video rate columns live on the returned frame.
	        videos, average_rates = calculate_engagement_rates(videos)

	        videos = analyze_video_descriptions(videos)
	        location_stats = analyze_location_data(videos)

	        create_summary_report(videos, correlation)

	        save_analysis_results(
	            videos,
	            engagement_stats,
	            duration_engagement,
	            author_stats,
	            average_rates,
	            location_stats,
	        )

	        print("\n✅ Analysis completed successfully!")
	        print("\n📈 KEY FINDINGS SUMMARY:")
	        # NOTE(review): these lines are fixed text, not values computed by
	        # this run. They can disagree with the report printed above.
	        static_findings = (
	            "• Very short videos (≤15s) perform best",
	            "• Strong positive correlation between views and likes",
	            "• zachking, mrbeast, and addisonre dominate engagement",
	            "• Average engagement: ~7.2% like rate",
	            "• Videos with hashtags perform better",
	            "• US-based content outperforms international content",
	        )
	        for finding in static_findings:
	            print(finding)

	    except Exception as e:
	        # Top-level boundary: report the failure with its traceback.
	        print(f"❌ Error during analysis: {e}")
	        import traceback
	        traceback.print_exc()

	# Run the pipeline only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()