File size: 3,885 Bytes
80d08c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# visualization.py
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def create_visualizations():
    """Render the TikTok engagement overview and save it as a PNG.

    Reads ``tiktok_cleaned.csv`` from the current working directory and draws
    a 2x2 grid of charts:

    * histogram of ``digg_count`` (likes),
    * histogram of ``play_count`` (views),
    * scatter plot of views against likes,
    * bar chart of the mean ``digg_count``, ``comment_count`` and
      ``share_count``.

    The figure is written to ``tiktok_analysis_visualizations.png`` and then
    shown. If the data has a ``duration`` column,
    ``create_duration_visualizations`` is called afterwards with the same
    frame.

    Returns:
        None. This is a best-effort entry point: any exception is reported
        (message via ``print``, traceback via ``traceback.print_exc``)
        instead of being propagated to the caller.
    """
    try:
        # Load the cleaned data
        df = pl.read_csv('tiktok_cleaned.csv')

        # Reset to matplotlib's default style, then apply seaborn's palette
        plt.style.use('default')
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        try:
            fig.suptitle('TikTok Dataset Analysis', fontsize=16, fontweight='bold')

            likes_data = df['digg_count'].to_list()
            views_data = df['play_count'].to_list()

            # 1. Distribution of video likes (digg_count)
            likes_ax = axes[0, 0]
            likes_ax.hist(likes_data, bins=50, alpha=0.7, edgecolor='black')
            likes_ax.set_title('Distribution of Video Likes (Digg Count)')
            likes_ax.set_xlabel('Number of Likes')
            likes_ax.set_ylabel('Frequency')
            likes_ax.grid(True, alpha=0.3)

            # 2. Distribution of video views (play_count)
            views_ax = axes[0, 1]
            views_ax.hist(views_data, bins=50, alpha=0.7, edgecolor='black')
            views_ax.set_title('Distribution of Video Views (Play Count)')
            views_ax.set_xlabel('Number of Views')
            views_ax.set_ylabel('Frequency')
            views_ax.grid(True, alpha=0.3)

            # 3. Scatter plot: Views vs Likes
            scatter_ax = axes[1, 0]
            scatter_ax.scatter(views_data, likes_data, alpha=0.6)
            scatter_ax.set_title('Views vs Likes Correlation')
            scatter_ax.set_xlabel('Views (Play Count)')
            scatter_ax.set_ylabel('Likes (Digg Count)')
            scatter_ax.grid(True, alpha=0.3)

            # 4. Engagement metrics comparison. Labels and columns live in
            # one mapping so they cannot drift out of step.
            engagement_metrics = {
                'Likes': 'digg_count',
                'Comments': 'comment_count',
                'Shares': 'share_count',
            }
            avg_engagement = [
                df[column].mean() for column in engagement_metrics.values()
            ]

            bars_ax = axes[1, 1]
            bars = bars_ax.bar(list(engagement_metrics), avg_engagement)
            bars_ax.set_title('Average Engagement Metrics')
            bars_ax.set_ylabel('Average Count')

            # Add value labels centred just above each bar
            for bar in bars:
                height = bar.get_height()
                bars_ax.text(bar.get_x() + bar.get_width() / 2., height,
                             f'{height:,.0f}',
                             ha='center', va='bottom')

            # Use the figure's own methods so layout and saving are tied to
            # this figure rather than to whatever pyplot considers current.
            fig.tight_layout()
            fig.savefig('tiktok_analysis_visualizations.png', dpi=300,
                        bbox_inches='tight')
        except Exception:
            # A half-drawn figure would otherwise stay registered with pyplot
            # and could pop up on a later plt.show(); drop it before the
            # outer handler reports the error.
            plt.close(fig)
            raise

        plt.show()

        print("📊 Visualizations saved as 'tiktok_analysis_visualizations.png'")

        # Additional visualizations if duration data is available
        if 'duration' in df.columns:
            create_duration_visualizations(df)

    except Exception as e:
        # Deliberate best-effort boundary: report the failure, do not crash.
        print(f"Error creating visualizations: {e}")
        import traceback
        traceback.print_exc()

def create_duration_visualizations(df):
    """Plot video-duration charts and save them as ``duration_analysis.png``.

    Draws two panels side by side: a 30-bin histogram of the ``duration``
    column and a scatter plot of ``duration`` against ``digg_count``. The
    figure is saved to the current working directory and then shown.

    Args:
        df: Data frame providing ``duration`` and ``digg_count`` columns,
            each exposing ``.to_list()`` (a polars DataFrame at the call
            site in this file). The axis labels describe durations as
            seconds; that unit is assumed, not checked.

    Raises:
        Whatever the column lookup, plotting or saving raises. The partially
        drawn figure is closed first and the exception is re-raised
        unchanged.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    try:
        hist_ax, scatter_ax = axes
        duration_data = df['duration'].to_list()

        # Duration distribution
        hist_ax.hist(duration_data, bins=30, alpha=0.7, edgecolor='black')
        hist_ax.set_title('Distribution of Video Duration')
        hist_ax.set_xlabel('Duration (seconds)')
        hist_ax.set_ylabel('Frequency')
        hist_ax.grid(True, alpha=0.3)

        # Duration vs Engagement
        scatter_ax.scatter(duration_data, df['digg_count'].to_list(), alpha=0.6)
        scatter_ax.set_title('Duration vs Likes')
        scatter_ax.set_xlabel('Duration (seconds)')
        scatter_ax.set_ylabel('Likes (Digg Count)')
        scatter_ax.grid(True, alpha=0.3)

        # Use the figure's own methods so layout and saving are tied to this
        # figure rather than to whatever pyplot considers current.
        fig.tight_layout()
        fig.savefig('duration_analysis.png', dpi=300, bbox_inches='tight')
    except Exception:
        # Do not leave a half-drawn figure registered with pyplot, where it
        # could surface on a later plt.show().
        plt.close(fig)
        raise

    plt.show()

    print("📊 Duration visualizations saved as 'duration_analysis.png'")

# Script entry point: build the charts only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_visualizations()