Spaces:

TroglodyteDerivations
/

Rick_and_Morty_Transcript_Analysis

Sleeping

App Files Files Community

Rick_and_Morty_Transcript_Analysis / Tik Tok Python Polars Exercise /final_visualizations.py

TroglodyteDerivations

Upload 44 files

80d08c2 verified 3 months ago

raw

history blame contribute delete

12.9 kB

	# final_visualizations.py
	import polars as pl
	import matplotlib.pyplot as plt
	import seaborn as sns
	import numpy as np
	from pathlib import Path

	def _annotate_bars(ax, bars, suffix, scale=1.0, decimals=1):
	    """Write a centred value label on top of every bar in *bars*.

	    Args:
	        ax: Matplotlib axes that owns the bars.
	        bars: Bar container returned by ``ax.bar``.
	        suffix: Text appended to the formatted number (e.g. ``'M'``, ``'%'``).
	        scale: Divisor applied to the bar height before formatting.
	        decimals: Number of decimal places in the label.
	    """
	    for bar in bars:
	        height = bar.get_height()
	        ax.text(bar.get_x() + bar.get_width() / 2., height,
	                f'{height / scale:.{decimals}f}{suffix}',
	                ha='center', va='bottom', fontweight='bold')


	def create_comprehensive_visualizations():
	    """Render the 2x3 overview figure from ``tiktok_cleaned.csv``.

	    Reads ``tiktok_cleaned.csv`` (relative path), draws six panels (likes
	    histogram, average likes per duration category, per-author likes and
	    views, top-6 locations by average likes, hashtag impact, engagement
	    rates), saves the figure as ``comprehensive_tiktok_analysis.png``, shows
	    it, and then passes the loaded frame to
	    ``create_detailed_analysis_charts``.

	    Any exception raised along the way is caught here: its message is printed
	    and the traceback is dumped, so nothing propagates to the caller.

	    Returns:
	        None.
	    """
	    try:
	        # Load the cleaned data
	        df = pl.read_csv('tiktok_cleaned.csv')

	        # Set up the plotting style
	        plt.style.use('default')
	        sns.set_palette("husl")

	        # Create a 2x3 grid of subplots
	        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
	        fig.suptitle('TikTok Dataset: Comprehensive Performance Analysis',
	                     fontsize=18, fontweight='bold')

	        # 1. Distribution of video likes (log-scaled counts)
	        ax_likes = axes[0, 0]
	        ax_likes.hist(df['digg_count'].to_list(), bins=50, alpha=0.7,
	                      edgecolor='black', log=True)
	        ax_likes.set_title('Distribution of Video Likes\n(Log Scale)', fontweight='bold')
	        ax_likes.set_xlabel('Number of Likes')
	        ax_likes.set_ylabel('Frequency (Log Scale)')
	        ax_likes.grid(True, alpha=0.3)

	        # 2. Engagement by duration category
	        duration_stats = df.group_by('duration_category').agg([
	            pl.col('digg_count').mean().alias('avg_likes'),
	            pl.len().alias('video_count')
	        ]).sort('avg_likes', descending=True)

	        ax_duration = axes[0, 1]
	        bars = ax_duration.bar(duration_stats['duration_category'].to_list(),
	                               duration_stats['avg_likes'].to_list(),
	                               alpha=0.7,
	                               color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
	        ax_duration.set_title('Average Likes by Video Duration', fontweight='bold')
	        ax_duration.set_xlabel('Duration Category')
	        ax_duration.set_ylabel('Average Likes')
	        ax_duration.tick_params(axis='x', rotation=45)
	        ax_duration.grid(True, alpha=0.3)
	        # Bars hold raw like counts here, so the label rescales to millions.
	        _annotate_bars(ax_duration, bars, 'M', scale=1e6)

	        # 3. Author performance comparison
	        author_stats = df.group_by('author_unique_id').agg([
	            pl.col('digg_count').mean().alias('avg_likes'),
	            pl.col('play_count').mean().alias('avg_views'),
	            pl.len().alias('video_count')
	        ]).sort('avg_likes', descending=True)

	        authors = author_stats['author_unique_id'].to_list()
	        author_likes = author_stats['avg_likes'].to_list()
	        author_views = author_stats['avg_views'].to_list()

	        x_pos = np.arange(len(authors))
	        width = 0.35

	        ax_authors = axes[0, 2]
	        ax_authors.bar(x_pos - width / 2, [likes / 1e6 for likes in author_likes],
	                       width, label='Avg Likes (M)', alpha=0.7)
	        ax_authors.bar(x_pos + width / 2, [views / 1e6 for views in author_views],
	                       width, label='Avg Views (M)', alpha=0.7)
	        ax_authors.set_title('Author Performance Comparison', fontweight='bold')
	        ax_authors.set_xlabel('Authors')
	        ax_authors.set_ylabel('Count (Millions)')
	        ax_authors.set_xticks(x_pos)
	        ax_authors.set_xticklabels(authors, rotation=45)
	        ax_authors.legend()
	        ax_authors.grid(True, alpha=0.3)

	        # 4. Location performance (rows without a location are excluded)
	        location_stats = df.filter(
	            pl.col('location_created').is_not_null()
	        ).group_by('location_created').agg([
	            pl.col('digg_count').mean().alias('avg_likes'),
	            pl.len().alias('video_count')
	        ]).sort('avg_likes', descending=True).head(6)

	        ax_location = axes[1, 0]
	        bars = ax_location.bar(
	            location_stats['location_created'].to_list(),
	            [likes / 1e6 for likes in location_stats['avg_likes'].to_list()],
	            alpha=0.7)
	        ax_location.set_title('Average Likes by Location\n(Top 6 Countries)', fontweight='bold')
	        ax_location.set_xlabel('Country Code')
	        ax_location.set_ylabel('Average Likes (Millions)')
	        ax_location.tick_params(axis='x', rotation=45)
	        ax_location.grid(True, alpha=0.3)
	        _annotate_bars(ax_location, bars, 'M')

	        # 5. Hashtag impact analysis
	        hashtag_stats = df.group_by('has_hashtags').agg([
	            pl.col('digg_count').mean().alias('avg_likes'),
	            pl.col('play_count').mean().alias('avg_views'),
	            pl.len().alias('video_count')
	        ])

	        # Look each group up by its flag rather than taking element [0] of a
	        # filtered frame: when every video has (or lacks) hashtags one group
	        # does not exist, the positional lookup goes out of bounds, and the
	        # blanket handler below would then discard the whole figure.
	        likes_by_flag = dict(zip(hashtag_stats['has_hashtags'].to_list(),
	                                 hashtag_stats['avg_likes'].to_list()))
	        hashtag_groups = [
	            (True, 'With Hashtags', '#FF9999'),
	            (False, 'Without Hashtags', '#66B2FF'),
	        ]
	        present_groups = [
	            (label, likes_by_flag[flag] / 1e6, color)
	            for flag, label, color in hashtag_groups
	            if likes_by_flag.get(flag) is not None
	        ]

	        ax_hashtags = axes[1, 1]
	        if present_groups:
	            hashtag_labels = [group[0] for group in present_groups]
	            hashtag_likes = [group[1] for group in present_groups]
	            hashtag_colors = [group[2] for group in present_groups]
	            bars = ax_hashtags.bar(hashtag_labels, hashtag_likes, alpha=0.7,
	                                   color=hashtag_colors)
	            _annotate_bars(ax_hashtags, bars, 'M')
	        ax_hashtags.set_title('Impact of Hashtags on Engagement', fontweight='bold')
	        ax_hashtags.set_ylabel('Average Likes (Millions)')
	        ax_hashtags.grid(True, alpha=0.3)

	        # 6. Engagement rates comparison
	        # NOTE: these rates are literals copied from an earlier analysis run;
	        # they are not recomputed from ``df``.
	        engagement_rates = [7.22, 0.11, 0.15]  # Like, Comment, Share rates from analysis
	        engagement_types = ['Like Rate', 'Comment Rate', 'Share Rate']

	        ax_rates = axes[1, 2]
	        bars = ax_rates.bar(engagement_types, engagement_rates, alpha=0.7,
	                            color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
	        ax_rates.set_title('Engagement Rate Comparison (%)', fontweight='bold')
	        ax_rates.set_ylabel('Engagement Rate (%)')
	        ax_rates.grid(True, alpha=0.3)
	        _annotate_bars(ax_rates, bars, '%', decimals=2)

	        plt.tight_layout()
	        plt.savefig('comprehensive_tiktok_analysis.png', dpi=300, bbox_inches='tight')
	        plt.show()

	        print("📊 Comprehensive visualizations saved as 'comprehensive_tiktok_analysis.png'")

	        # Create additional detailed visualizations
	        create_detailed_analysis_charts(df)

	    except Exception as e:
	        # Top-level boundary of the script: report and keep going so the
	        # text report that follows in __main__ still runs.
	        print(f"Error creating visualizations: {e}")
	        import traceback
	        traceback.print_exc()

	def create_detailed_analysis_charts(df):
	    """Draw the four-panel detail figure for *df* and chain to the summary chart.

	    Panels, in order: pie of summed ``digg_count`` per ``author_unique_id``,
	    bar chart of row counts per ``author_unique_id``, histogram of
	    ``duration``, and a ``play_count`` vs ``digg_count`` scatter annotated
	    with their correlation. The figure is saved as
	    ``detailed_tiktok_analysis.png`` and shown, after which *df* is handed to
	    ``create_performance_summary_chart``.

	    Args:
	        df: Polars frame providing the columns named above.
	    """
	    plt.figure(figsize=(12, 8))

	    # Panel 1 -- share of total likes held by each creator
	    plt.subplot(2, 2, 1)
	    likes_by_creator = (
	        df.group_by('author_unique_id')
	        .agg(pl.col('digg_count').sum().alias('total_likes'))
	        .sort('total_likes', descending=True)
	    )
	    slice_sizes = likes_by_creator['total_likes'].to_list()
	    slice_names = likes_by_creator['author_unique_id'].to_list()
	    plt.pie(slice_sizes, labels=slice_names, autopct='%1.1f%%', startangle=90)
	    plt.title('Total Likes Distribution by Creator')

	    # Panel 2 -- how many videos each creator contributes
	    plt.subplot(2, 2, 2)
	    videos_by_creator = (
	        df.group_by('author_unique_id')
	        .agg(pl.len().alias('video_count'))
	        .sort('video_count', descending=True)
	    )
	    creator_names = videos_by_creator['author_unique_id'].to_list()
	    creator_totals = videos_by_creator['video_count'].to_list()
	    plt.bar(creator_names, creator_totals, alpha=0.7, color='skyblue')
	    plt.title('Video Count by Creator')
	    plt.xticks(rotation=45)

	    # Panel 3 -- spread of video durations
	    plt.subplot(2, 2, 3)
	    durations = df['duration'].to_list()
	    plt.hist(durations, bins=30, alpha=0.7, edgecolor='black')
	    plt.title('Video Duration Distribution')
	    plt.xlabel('Duration (seconds)')
	    plt.ylabel('Frequency')
	    plt.grid(True, alpha=0.3)

	    # Panel 4 -- views against likes
	    scatter_ax = plt.subplot(2, 2, 4)
	    view_counts = df['play_count'].to_list()
	    like_counts = df['digg_count'].to_list()
	    plt.scatter(view_counts, like_counts, alpha=0.6, s=20)
	    plt.title('Views vs Likes Correlation')
	    plt.xlabel('Views')
	    plt.ylabel('Likes')
	    plt.grid(True, alpha=0.3)

	    # Stamp the correlation coefficient in the panel's top-left corner
	    # (axes-fraction coordinates, so it stays put whatever the data range).
	    correlation = df.select(pl.corr('play_count', 'digg_count')).item()
	    caption_box = {"boxstyle": "round,pad=0.3", "facecolor": "white", "alpha": 0.8}
	    plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
	             transform=scatter_ax.transAxes, fontsize=12, bbox=caption_box)

	    plt.tight_layout()
	    plt.savefig('detailed_tiktok_analysis.png', dpi=300, bbox_inches='tight')
	    plt.show()

	    print("📊 Detailed analysis charts saved as 'detailed_tiktok_analysis.png'")

	    # Continue with the headline-metrics chart
	    create_performance_summary_chart(df)

	def create_performance_summary_chart(df):
	    """Plot the four headline metrics as a bar chart with insight call-outs.

	    The figure is saved as ``tiktok_performance_summary.png`` and shown.

	    Args:
	        df: Accepted for call-site symmetry with the other chart functions;
	            it is not read here. Every plotted number and every insight line
	            is a literal defined in this function.
	    """
	    # One row per bar: (x label, plotted value, unit shown in the label, colour).
	    # Values mix millions ('M') and percentages ('%') on a single axis.
	    summary_rows = [
	        ('Avg Views', 21.7, 'M', '#FF6B6B'),
	        ('Avg Likes', 1.57, 'M', '#4ECDC4'),
	        ('Like Rate', 7.22, '%', '#45B7D1'),
	        ('Comment Rate', 0.11, '%', '#96CEB4'),
	    ]
	    metric_names = [row[0] for row in summary_rows]
	    metric_values = [row[1] for row in summary_rows]
	    bar_colors = [row[3] for row in summary_rows]

	    _, ax = plt.subplots(figsize=(10, 6))
	    rects = ax.bar(metric_names, metric_values, color=bar_colors, alpha=0.7)

	    ax.set_title('TikTok Performance Summary', fontsize=16, fontweight='bold')
	    ax.set_ylabel('Value')
	    ax.grid(True, alpha=0.3, axis='y')

	    # Label each bar with its value and unit, centred just above the bar top
	    for rect, (_, value, unit, _) in zip(rects, summary_rows):
	        centre_x = rect.get_x() + rect.get_width()/2.
	        ax.text(centre_x, rect.get_height(), f'{value} {unit}',
	                ha='center', va='bottom', fontweight='bold')

	    # Insight call-outs, stacked downwards from the top-left of the axes
	    callout_box = {"boxstyle": "round,pad=0.3", "facecolor": "lightyellow", "alpha": 0.7}
	    callouts = (
	        "• Very short videos (≤15s) perform best",
	        "• US content outperforms international",
	        "• Hashtags boost engagement 1.7x",
	        "• Top 3 creators = 76.4% of all likes",
	    )
	    for row_index, callout in enumerate(callouts):
	        ax.text(0.02, 0.95 - row_index*0.1, callout, transform=ax.transAxes,
	                fontsize=10, bbox=callout_box)

	    plt.tight_layout()
	    plt.savefig('tiktok_performance_summary.png', dpi=300, bbox_inches='tight')
	    plt.show()

	    print("📊 Performance summary saved as 'tiktok_performance_summary.png'")

	def generate_insights_report():
	    """Print the fixed key-insights report to stdout.

	    The output is a 70-character ``=`` banner around the title, then six
	    headed sections separated by blank lines, then a closing banner. All text
	    is literal; nothing is computed from data.
	    """
	    banner = "=" * 70

	    print(f"\n{banner}")
	    print("📊 TIKTOK DATASET - KEY INSIGHTS REPORT")
	    print(banner)

	    # (section heading, lines printed beneath it)
	    sections = (
	        ("🎯 CONTENT STRATEGY INSIGHTS:", (
	            "• Very short videos (≤15s) generate 1.4x more likes than average",
	            "• Optimal video length: 15-30 seconds for maximum engagement",
	            "• Videos longer than 60s see significant drop in performance",
	        )),
	        ("👥 CREATOR ECOSYSTEM:", (
	            "• Highly concentrated: Only 4 creators in entire dataset",
	            "• Top 3 creators (zachking, mrbeast, addisonre) dominate:",
	            " - Account for 76.4% of all likes",
	            " - Generate highest average engagement rates",
	        )),
	        ("🌍 GEOGRAPHIC PERFORMANCE:", (
	            "• US-based content performs 3.2x better than international",
	            "• Indonesia has highest volume but lower engagement",
	            "• Limited geographic diversity in dataset",
	        )),
	        ("📊 ENGAGEMENT PATTERNS:", (
	            "• Strong correlation (0.65) between views and likes",
	            "• Like rate: 7.22% (healthy engagement)",
	            "• Comment rate: 0.11% (very low - viewers prefer liking)",
	            "• Share rate: 0.15% (higher than comments)",
	        )),
	        ("🔖 CONTENT OPTIMIZATION:", (
	            "• Videos with hashtags have 1.7x higher engagement",
	            "• Average of 1.9 hashtags per video",
	            "• Description length: ~44 characters on average",
	        )),
	        ("📈 RECOMMENDATIONS:", (
	            "1. Focus on 15-30 second video format",
	            "2. Always include relevant hashtags (1-3 optimal)",
	            "3. Target US audience for maximum engagement",
	            "4. Study top creators' content strategies",
	            "5. Prioritize like-generating content over comments",
	        )),
	    )

	    report_lines = []
	    for position, (heading, entries) in enumerate(sections):
	        if position:
	            # Blank separator between sections, none before the first
	            report_lines.append("")
	        report_lines.append(heading)
	        report_lines.extend(entries)
	    print("\n".join(report_lines))

	    print(f"\n{banner}")

	if __name__ == "__main__":
	    # Script entry point: build the charts first, then print the text report.
	    for run_step in (create_comprehensive_visualizations, generate_insights_report):
	        run_step()