File size: 7,053 Bytes
ac2b2fb
 
 
 
23b9f7f
5f31ef5
 
ac2b2fb
5f31ef5
23b9f7f
28fce0e
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
 
5f31ef5
ac2b2fb
5b85e09
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ded942c
 
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
ac2b2fb
5f31ef5
ac2b2fb
5f31ef5
 
 
28fce0e
5f31ef5
28fce0e
5f31ef5
5b85e09
28fce0e
5f31ef5
5b85e09
5f31ef5
28fce0e
5f31ef5
 
ac2b2fb
 
5b85e09
23b9f7f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
import base64

# File extensions accepted by advanced_analysis (matched case-insensitively).
_SUPPORTED_EXTENSIONS = ('.csv', '.xlsx', '.xls', '.txt')
# Fixed location (relative to the working directory) of the rendered figure.
_PLOT_PATH = 'data_analysis_advanced.png'


def _load_dataframe(file):
    """Read the uploaded file into a DataFrame, picking the parser by extension.

    Raises:
        ValueError: if no file was supplied or its extension is unsupported.
    """
    if file is None:
        raise ValueError("No file was uploaded. Please select a file to analyze.")

    path = str(file)
    # Lower-case once so that validation and parser selection agree; checking
    # the raw path here would let 'DATA.CSV' pass validation but match no parser.
    lowered = path.lower()
    if not lowered.endswith(_SUPPORTED_EXTENSIONS):
        raise ValueError(f"Unsupported file type. Please upload a file with one of these extensions: {', '.join(_SUPPORTED_EXTENSIONS)}")

    if lowered.endswith('.csv'):
        return pd.read_csv(path)
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(path)
    # Only '.txt' remains; it is parsed as tab-separated text.
    return pd.read_csv(path, sep='\t')


def _unique_pair_correlations(correlation_matrix):
    """Return each distinct column pair's correlation exactly once.

    Only the strictly-lower triangle of the matrix is kept, so self-pairs and
    mirrored (b, a) duplicates are excluded by position rather than by value.
    The result is a Series indexed by (column, column) tuples with NaN dropped.
    """
    lower = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
    return correlation_matrix.where(lower).unstack().dropna()


def _build_report(df, numeric_cols, pair_correlations):
    """Assemble the Markdown report text for *df*."""
    parts = ["# 📊 Comprehensive Data Analysis Report\n\n"]

    # 1. Basic dataset information
    parts.append("## 1. Dataset Overview\n")
    parts.append(f"- **Total Rows:** {len(df)}\n")
    parts.append(f"- **Total Columns:** {len(df.columns)}\n")
    parts.append("- **Column Types:**\n")
    parts.extend(f"  - `{col}`: {dtype}\n" for col, dtype in df.dtypes.items())

    # 2. Missing values (count and percentage per column)
    parts.append("\n## 2. Missing Value Analysis\n")
    missing_count = df.isnull().sum()
    missing_table = pd.concat(
        [missing_count, 100 * missing_count / len(df)],
        axis=1,
        keys=['Missing Count', 'Missing Percentage'],
    )
    parts.append("```\n" + missing_table.to_string() + "\n```\n")

    # 3. Descriptive statistics of the numeric columns
    parts.append("\n## 3. Statistical Summary\n")
    if len(numeric_cols) > 0:
        parts.append("### Numerical Columns Statistics\n")
        parts.append("```\n" + df[numeric_cols].describe().to_string() + "\n```\n")

    # 4. Outliers: values further than 1.5 * IQR outside the quartiles
    parts.append("\n## 4. Outlier Analysis\n")
    for col in numeric_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        outside = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
        outlier_count = int(outside.sum())
        if outlier_count > 0:
            parts.append(f"- **{col}:** {outlier_count} outliers detected\n")

    # 5. Highest correlations, one line per distinct column pair
    parts.append("\n## 5. Correlation Analysis\n")
    if pair_correlations is not None:
        parts.append("### Top Correlations\n")
        for (col1, col2), corr_value in pair_correlations.nlargest(5).items():
            parts.append(f"- **{col1}** & **{col2}**: {corr_value:.2f}\n")

    return "".join(parts)


def _render_plots(df, numeric_cols, correlation_matrix, pair_correlations):
    """Draw the 2x2 overview figure, save it to _PLOT_PATH and return that path."""
    plt.close('all')
    fig, axs = plt.subplots(2, 2, figsize=(20, 15))
    try:
        fig.subplots_adjust(hspace=0.4, wspace=0.3)

        # Correlation heatmap (needs at least two numeric columns)
        if correlation_matrix is not None:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axs[0, 0],
                        square=True, cbar=True, linewidths=0.5)
            axs[0, 0].set_title('Correlation Heatmap', fontsize=14, fontweight='bold')

        # Box plot of the numeric columns; skipped when there are none to draw
        if len(numeric_cols) > 0:
            df[numeric_cols].boxplot(ax=axs[0, 1])
            axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14, fontweight='bold')
            axs[0, 1].tick_params(axis='x', rotation=45)

        # Value counts of the first categorical column
        categorical_cols = df.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0:
            first_categorical = categorical_cols[0]
            df[first_categorical].value_counts().plot(kind='bar', ax=axs[1, 0])
            axs[1, 0].set_title(f'Distribution of {first_categorical}',
                                fontsize=14, fontweight='bold')
            axs[1, 0].tick_params(axis='x', rotation=45)

        # Scatter of the most correlated pair. A pandas scatter_matrix is not
        # used here: given a single Axes it clears the whole figure to lay out
        # its own grid, which would erase the three panels drawn above.
        if pair_correlations is not None and not pair_correlations.empty:
            x_col, y_col = pair_correlations.idxmax()
            axs[1, 1].scatter(df[x_col], df[y_col], alpha=0.6)
            axs[1, 1].set_xlabel(str(x_col))
            axs[1, 1].set_ylabel(str(y_col))
            axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
                                fontsize=14, fontweight='bold')

        fig.suptitle('🔍 Data Analysis Visualizations', fontsize=16, fontweight='bold')
        fig.tight_layout()
        fig.savefig(_PLOT_PATH, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when a plotting call raises.
        plt.close(fig)
    return _PLOT_PATH


def advanced_analysis(file):
    """Analyze an uploaded tabular file and produce a report plus a figure.

    Args:
        file: Path of the uploaded file. Supported extensions (matched
            case-insensitively) are .csv, .xlsx, .xls and .txt; .txt files
            are read as tab-separated text.

    Returns:
        A ``(report, image_path)`` tuple. ``report`` is Markdown text and
        ``image_path`` is the path of the saved PNG figure. If anything goes
        wrong, ``report`` describes the error and ``image_path`` is ``None``;
        the function itself does not raise.
    """
    try:
        df = _load_dataframe(file)

        # 'number' covers every numeric width, not only float64/int64.
        numeric_cols = df.select_dtypes(include='number').columns

        correlation_matrix = None
        pair_correlations = None
        if len(numeric_cols) > 1:
            correlation_matrix = df[numeric_cols].corr()
            pair_correlations = _unique_pair_correlations(correlation_matrix)

        report = _build_report(df, numeric_cols, pair_correlations)
        image_path = _render_plots(df, numeric_cols, correlation_matrix, pair_correlations)
        return report, image_path

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
        error_report += "Possible reasons:\n"
        error_report += "- Incorrect file format\n"
        error_report += "- Unsupported data types\n"
        error_report += "- Corrupted or incomplete dataset"
        return error_report, None

# Custom CSS for a more modern look.
# Passed to gr.Interface through its `css` argument. It styles three selectors:
#   .gradio-container - page background colour and font stack
#   .output-markdown  - white, rounded, shadowed card that scrolls past 500px
#   .file-upload      - green button-like styling with a darker hover shade
# NOTE(review): no component visible in this file sets `elem_classes`, so
# confirm that .output-markdown and .file-upload match Gradio's generated markup.
css = """
.gradio-container {
    background-color: #f0f2f6;
    font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
}
.output-markdown {
    background-color: white;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    max-height: 500px;
    overflow-y: auto;
}
.file-upload {
    background-color: #4CAF50;
    color: white;
    border-radius: 5px;
    padding: 10px 20px;
    transition: background-color 0.3s ease;
}
.file-upload:hover {
    background-color: #45a049;
}
"""

# Gradio interface: one file upload in, a Markdown report and a figure out.
demo = gr.Interface(
    fn=advanced_analysis,
    inputs=gr.File(
        type="filepath",
        label="📤 Upload File",
        file_count="single",
        # Extension filters must start with a dot; a bare name such as "csv"
        # is interpreted as a file category rather than an extension.
        file_types=[".csv", ".xlsx", ".xls", ".txt"],
    ),
    outputs=[
        gr.Markdown(),
        gr.Image(label="📊 Advanced Visualizations"),
    ],
    title="🧠 Smart Data Analyzer",
    description="Upload a CSV, Excel, or Text file for comprehensive data analysis, statistical insights, and interactive visualizations.",
    theme='default',
    css=css,
)

# Launch the interface, listening on all network interfaces at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)