# File size: 8,263 Bytes
# Revision: 98bc1c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from contextlib import contextmanager

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

class VisualizationEngine:
    """Render static (matplotlib/seaborn) and interactive (Plotly) charts.

    Every chart is saved under a relative file name derived from the
    feature name(s); each public method returns the list of file names
    it wrote, in creation order.
    """

    # Column skipped by every plotting entry point (presumably a row
    # identifier, for which distributions/correlations are meaningless).
    _ID_COLUMN = 'ID'

    def __init__(self):
        """Select the seaborn matplotlib style and an 8-colour "husl" palette."""
        plt.style.use('seaborn-v0_8')
        self.color_palette = sns.color_palette("husl", 8)

    # ------------------------------------------------------------------
    # Small helpers (no plotting backend required)
    # ------------------------------------------------------------------
    @staticmethod
    def _unique_features(features):
        """Return *features* as a list without duplicates, order preserved.

        ``None`` is treated as "no features".  Materialising the input
        avoids the ambiguous truth value of ``pandas.Index``/ndarray
        arguments and lets generators be iterated more than once.
        """
        if features is None:
            return []
        return list(dict.fromkeys(features))

    @staticmethod
    def _is_numeric(series):
        """Return True for real-valued numeric dtypes of any width.

        Booleans and complex numbers are excluded: booleans read better
        as categories and complex values cannot be drawn on a real axis.
        """
        dtype = series.dtype
        return (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
            and not pd.api.types.is_complex_dtype(dtype)
        )

    @staticmethod
    def _is_categorical(series):
        """Return True for object, string and pandas categorical dtypes."""
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    @staticmethod
    def _file_stem(feature):
        """Build the file-name stem for *feature*.

        The label is stringified (column labels need not be ``str``),
        lower-cased and spaces become underscores.  Path separators are
        also replaced so a label can never point into another directory.
        """
        stem = str(feature).lower().replace(" ", "_")
        return stem.replace("/", "_").replace("\\", "_")

    @staticmethod
    @contextmanager
    def _figure(figsize, plot_name):
        """Open a figure, let the caller draw on it, then save and close it.

        The figure becomes the current pyplot figure inside the ``with``
        body.  On normal exit it is laid out and written to *plot_name*
        (300 dpi, tight bounding box).  It is closed in all cases, so a
        failing plot call does not leave an open figure behind.
        """
        fig = plt.figure(figsize=figsize)
        try:
            yield fig
            fig.tight_layout()
            fig.savefig(plot_name, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    # ------------------------------------------------------------------
    # Static plots
    # ------------------------------------------------------------------
    def create_visualizations(self, df, selected_features):
        """Create per-feature and comparison plots for the selected features.

        Args:
            df: DataFrame holding the data.
            selected_features: iterable of column labels; when empty or
                ``None`` the first four columns of *df* are used.

        Returns:
            List of the PNG file names that were written.

        Features that are not columns of *df*, and the ``'ID'`` column,
        are ignored for both the per-feature and the comparison plots.
        """
        features = self._unique_features(selected_features)
        if not features:
            features = self._unique_features(df.columns[:4])  # Default to first 4 columns

        usable = [
            feature for feature in features
            if feature in df.columns and feature != self._ID_COLUMN
        ]

        plots = []
        for feature in usable:
            if self._is_numeric(df[feature]):
                plots.extend(self._create_numerical_plots(df, feature))
            else:
                plots.extend(self._create_categorical_plots(df, feature))

        # Comparison plots only make sense with at least two usable features.
        if len(usable) >= 2:
            plots.extend(self._create_comparison_plots(df, usable))

        return plots

    def _create_numerical_plots(self, df, feature):
        """Create histogram, box plot and density plot for a numeric column.

        Missing and non-finite values are dropped first.  Returns the
        list of written file names; it is empty when no finite value is
        left, and the density plot is skipped when fewer than two
        distinct values remain (a kernel density estimate is undefined
        there).
        """
        values = df[feature].dropna().astype(float)
        values = values[np.isfinite(values)]
        if values.empty:
            return []

        data = values.to_numpy()  # plain ndarray: independent of the index labels
        stem = self._file_stem(feature)
        plots = []

        # Histogram
        plot_name = f'{stem}_histogram.png'
        with self._figure((10, 6), plot_name):
            plt.hist(data, bins=30, alpha=0.7, color=self.color_palette[0], edgecolor='black')
            plt.title(f'{feature} Distribution')
            plt.xlabel(feature)
            plt.ylabel('Frequency')
            plt.grid(True, alpha=0.3)
        plots.append(plot_name)

        # Box plot
        plot_name = f'{stem}_boxplot.png'
        with self._figure((8, 6), plot_name):
            plt.boxplot(data, patch_artist=True,
                        boxprops=dict(facecolor=self.color_palette[1]))
            plt.title(f'{feature} Box Plot')
            plt.ylabel(feature)
            plt.grid(True, alpha=0.3)
        plots.append(plot_name)

        # Density plot
        if values.nunique() >= 2:
            plot_name = f'{stem}_density.png'
            with self._figure((10, 6), plot_name):
                values.plot(kind='density', ax=plt.gca(),
                            color=self.color_palette[2], linewidth=2)
                plt.title(f'{feature} Density Plot')
                plt.xlabel(feature)
                plt.ylabel('Density')
                plt.grid(True, alpha=0.3)
            plots.append(plot_name)

        return plots

    def _create_categorical_plots(self, df, feature):
        """Create a bar plot and a pie chart of the value counts of a column.

        Returns the list of written file names; it is empty when the
        column has no non-missing value.
        """
        value_counts = df[feature].value_counts()
        if value_counts.empty:
            return []

        # String labels keep matplotlib on a categorical axis whatever
        # the underlying value type is (bool, datetime, mixed objects...).
        labels = [str(label) for label in value_counts.index]
        counts = value_counts.to_numpy(dtype=int)
        colors = self.color_palette[:len(value_counts)]
        stem = self._file_stem(feature)
        plots = []

        # Bar plot
        plot_name = f'{stem}_barplot.png'
        with self._figure((12, 6), plot_name):
            bars = plt.bar(labels, counts, color=colors)
            plt.title(f'{feature} Distribution')
            plt.xlabel(feature)
            plt.ylabel('Count')
            plt.xticks(rotation=45)

            # Add value labels on bars
            for bar in bars:
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width() / 2., height,
                         f'{int(height)}', ha='center', va='bottom')
        plots.append(plot_name)

        # Pie chart
        plot_name = f'{stem}_piechart.png'
        with self._figure((10, 8), plot_name):
            plt.pie(counts, labels=labels, autopct='%1.1f%%', colors=colors)
            plt.title(f'{feature} Distribution (Pie Chart)')
        plots.append(plot_name)

        return plots

    def _create_comparison_plots(self, df, features):
        """Create comparison plots between features.

        Produces a scatter plot for every pair of numeric features, box
        plots of the first two numeric features grouped by the first two
        categorical features, and one correlation heatmap when at least
        two numeric features exist.  Features that are not columns of
        *df* are ignored.

        Returns:
            List of the PNG file names that were written.
        """
        present = [f for f in self._unique_features(features) if f in df.columns]
        numeric_features = [f for f in present if self._is_numeric(df[f])]
        categorical_features = [f for f in present if self._is_categorical(df[f])]
        plots = []

        # Float view used for scatter/correlation so that every numeric
        # dtype (narrow ints, nullable ints, ...) is handled uniformly.
        numeric_df = df[numeric_features].astype(float) if numeric_features else None

        # Scatter plots for numeric features
        for position, x_feat in enumerate(numeric_features):
            for y_feat in numeric_features[position + 1:]:
                plot_name = (f'{self._file_stem(x_feat)}_vs_'
                             f'{self._file_stem(y_feat)}_scatter.png')
                with self._figure((10, 8), plot_name):
                    plt.scatter(numeric_df[x_feat], numeric_df[y_feat],
                                alpha=0.6, color=self.color_palette[0])
                    plt.xlabel(x_feat)
                    plt.ylabel(y_feat)
                    plt.title(f'{x_feat} vs {y_feat}')
                    plt.grid(True, alpha=0.3)
                plots.append(plot_name)

        # Box plots for numeric vs categorical
        for num_feat in numeric_features[:2]:  # Limit to avoid too many plots
            for cat_feat in categorical_features[:2]:
                plot_name = (f'{self._file_stem(num_feat)}_by_'
                             f'{self._file_stem(cat_feat)}_boxplot.png')
                with self._figure((12, 8), plot_name):
                    df.boxplot(column=num_feat, by=cat_feat, ax=plt.gca())
                    plt.title(f'{num_feat} by {cat_feat}')
                    plt.suptitle('')  # Remove default title
                    plt.xticks(rotation=45)
                plots.append(plot_name)

        # Correlation heatmap for numeric features
        if len(numeric_features) >= 2:
            plot_name = 'selected_features_correlation.png'
            with self._figure((10, 8), plot_name):
                correlation_matrix = numeric_df.corr()
                sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                            square=True, linewidths=0.5)
                plt.title('Feature Correlation Matrix')
            plots.append(plot_name)

        return plots

    # ------------------------------------------------------------------
    # Interactive plots
    # ------------------------------------------------------------------
    def create_interactive_plots(self, df, features):
        """Create interactive Plotly visualizations.

        Writes one HTML file per usable feature: a histogram for numeric
        columns, otherwise a bar chart of the value counts.  Features
        that are not columns of *df*, and the ``'ID'`` column, are
        skipped.

        Returns:
            List of the HTML file names that were written.
        """
        plots = []

        for feature in self._unique_features(features):
            if feature not in df.columns or feature == self._ID_COLUMN:
                continue

            stem = self._file_stem(feature)
            if self._is_numeric(df[feature]):
                # Interactive histogram
                fig = px.histogram(df, x=feature, title=f'{feature} Distribution')
                plot_name = f'{stem}_interactive_hist.html'
            else:
                # Interactive bar chart
                value_counts = df[feature].value_counts()
                fig = px.bar(x=value_counts.index, y=value_counts.values,
                             title=f'{feature} Distribution')
                plot_name = f'{stem}_interactive_bar.html'

            fig.write_html(plot_name)
            plots.append(plot_name)

        return plots