Spaces:

shvy
/

data-visulaizer-69

Sleeping

File size: 7,053 Bytes

ac2b2fb
 
 
 
23b9f7f
5f31ef5
 
ac2b2fb
5f31ef5
23b9f7f
28fce0e
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
23b9f7f
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23b9f7f
 
5f31ef5
ac2b2fb
5b85e09
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ded942c
 
5f31ef5
 
 
 
 
 
 
 
 
 
 
 
ac2b2fb
5f31ef5
ac2b2fb
5f31ef5
 
 
28fce0e
5f31ef5
28fce0e
5f31ef5
5b85e09
28fce0e
5f31ef5
5b85e09
5f31ef5
28fce0e
5f31ef5
 
ac2b2fb
 
5b85e09
23b9f7f

import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
import base64

def _load_dataframe(file):
    """Read the uploaded file into a DataFrame based on its extension.

    Args:
        file: Path of the uploaded file as a string.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If no file was given or its extension is not supported.
    """
    supported_extensions = ['.csv', '.xlsx', '.xls', '.txt']
    if not file:
        raise ValueError("No file was uploaded. Please upload a file to analyze.")

    # Validate and dispatch on the same lower-cased name so "DATA.CSV" is
    # handled exactly like "data.csv" instead of slipping past every branch.
    lowered = file.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(file)
    if lowered.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file)
    if lowered.endswith('.txt'):
        return pd.read_csv(file, sep='\t')
    raise ValueError(f"Unsupported file type. Please upload a file with one of these extensions: {', '.join(supported_extensions)}")


def _ranked_correlation_pairs(correlation_matrix):
    """Return unique ``(col_a, col_b, value)`` pairs, highest correlation first.

    Only the upper triangle is walked, so each pair appears once and the
    diagonal (self-correlation) is never included. Undefined (NaN)
    correlations, e.g. from constant columns, are skipped.
    """
    columns = list(correlation_matrix.columns)
    pairs = []
    for i, first in enumerate(columns):
        for j in range(i + 1, len(columns)):
            value = correlation_matrix.iloc[i, j]
            if not pd.isna(value):
                pairs.append((first, columns[j], value))
    pairs.sort(key=lambda pair: pair[2], reverse=True)
    return pairs


def _build_report(df, numeric_cols, ranked_pairs):
    """Assemble the Markdown analysis report for ``df`` and return it as a string."""
    parts = ["# 📊 Comprehensive Data Analysis Report\n\n"]

    # 1. Basic Dataset Information
    parts.append("## 1. Dataset Overview\n")
    parts.append(f"- **Total Rows:** {len(df)}\n")
    parts.append(f"- **Total Columns:** {len(df.columns)}\n")
    parts.append("- **Column Types:**\n")
    parts.extend(f"  - `{col}`: {dtype}\n" for col, dtype in df.dtypes.items())

    # 2. Missing Value Analysis
    parts.append("\n## 2. Missing Value Analysis\n")
    row_count = len(df)
    missing_data = df.isnull().sum()
    if row_count:
        missing_percentage = 100 * missing_data / row_count
    else:
        # An empty dataset has nothing missing; avoid a 0/0 division.
        missing_percentage = missing_data.astype(float)
    missing_table = pd.concat([missing_data, missing_percentage], axis=1,
                              keys=['Missing Count', 'Missing Percentage'])
    parts.append("```\n" + missing_table.to_string() + "\n```\n")

    # 3. Statistical Summary
    parts.append("\n## 3. Statistical Summary\n")
    if len(numeric_cols) > 0:
        parts.append("### Numerical Columns Statistics\n")
        parts.append("```\n" + df[numeric_cols].describe().to_string() + "\n```\n")
    else:
        parts.append("_No numeric columns found._\n")

    # 4. Outlier Detection (values outside 1.5 * IQR from the quartiles)
    parts.append("\n## 4. Outlier Analysis\n")
    outlier_lines = []
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outside = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
        outlier_count = int(outside.sum())
        if outlier_count > 0:
            outlier_lines.append(f"- **{col}:** {outlier_count} outliers detected\n")
    parts.extend(outlier_lines or ["_No outliers detected._\n"])

    # 5. Correlation Analysis
    parts.append("\n## 5. Correlation Analysis\n")
    if len(numeric_cols) > 1:
        parts.append("### Top Correlations\n")
        parts.extend(
            f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"
            for col1, col2, corr_value in ranked_pairs[:5]
        )
    else:
        parts.append("_Not enough numeric columns for correlation analysis._\n")

    return "".join(parts)


def _show_placeholder(ax, message):
    """Replace an unused panel with a centered explanatory message."""
    ax.set_axis_off()
    ax.text(0.5, 0.5, message, ha='center', va='center',
            fontsize=12, transform=ax.transAxes)


def _render_visualizations(df, numeric_cols, correlation_matrix, ranked_pairs):
    """Draw the 2x2 overview figure, save it as a PNG and return its path."""
    output_path = 'data_analysis_advanced.png'
    max_category_bars = 20
    fig, axs = plt.subplots(2, 2, figsize=(20, 15))
    try:
        # Correlation Heatmap
        if correlation_matrix is not None:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axs[0, 0],
                        square=True, cbar=True, linewidths=0.5)
            axs[0, 0].set_title('Correlation Heatmap', fontsize=14, fontweight='bold')
        else:
            _show_placeholder(axs[0, 0], 'Need at least two numeric columns for a heatmap')

        # Box Plot for Numeric Columns
        if len(numeric_cols) > 0:
            df[numeric_cols].boxplot(ax=axs[0, 1])
            axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14, fontweight='bold')
            axs[0, 1].tick_params(axis='x', rotation=45)
        else:
            _show_placeholder(axs[0, 1], 'No numeric columns to plot')

        # Distribution of the first categorical column
        categorical_cols = df.select_dtypes(include=['object']).columns
        cat_value_counts = (
            df[categorical_cols[0]].value_counts() if len(categorical_cols) > 0 else None
        )
        if cat_value_counts is not None and len(cat_value_counts) > 0:
            title = f'Distribution of {categorical_cols[0]}'
            if len(cat_value_counts) > max_category_bars:
                # High-cardinality columns (ids, names) would produce an
                # unreadable and very slow chart; show the most frequent only.
                cat_value_counts = cat_value_counts.head(max_category_bars)
                title += f' (top {max_category_bars})'
            cat_value_counts.plot(kind='bar', ax=axs[1, 0])
            axs[1, 0].set_title(title, fontsize=14, fontweight='bold')
            axs[1, 0].tick_params(axis='x', rotation=45)
        else:
            _show_placeholder(axs[1, 0], 'No categorical data to plot')

        # Scatter plot of the most correlated pair. A scatter *matrix* cannot
        # be drawn into a single Axes: pandas clears the whole figure to make
        # room for its own grid, which would wipe the three panels above.
        if ranked_pairs:
            x_col, y_col, _ = ranked_pairs[0]
            axs[1, 1].scatter(df[x_col], df[y_col], alpha=0.6)
            axs[1, 1].set_xlabel(x_col)
            axs[1, 1].set_ylabel(y_col)
            axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
                                fontsize=14, fontweight='bold')
        else:
            _show_placeholder(axs[1, 1], 'Need at least two numeric columns for a scatter plot')

        fig.suptitle('🔍 Data Analysis Visualizations', fontsize=16, fontweight='bold')
        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when a plotting call fails.
        plt.close(fig)
    return output_path


def advanced_analysis(file):
    """Analyze an uploaded tabular file and produce a report plus charts.

    Args:
        file: Path to the uploaded file (``.csv``, ``.xlsx``, ``.xls`` or
            tab-separated ``.txt``; the extension check is case-insensitive).

    Returns:
        A ``(report, image_path)`` tuple. ``report`` is a Markdown string.
        ``image_path`` is the path of the saved PNG, or ``None`` when the
        analysis failed or the charts could not be rendered. If only the
        chart rendering fails, the text report is still returned with a note
        appended.
    """
    try:
        df = _load_dataframe(file)
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        correlation_matrix = None
        ranked_pairs = []
        if len(numeric_cols) > 1:
            correlation_matrix = df[numeric_cols].corr()
            ranked_pairs = _ranked_correlation_pairs(correlation_matrix)
        report = _build_report(df, numeric_cols, ranked_pairs)
    except Exception as e:
        # Top-level UI boundary: surface any failure as a readable report.
        error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
        error_report += "Possible reasons:\n"
        error_report += "- Incorrect file format\n"
        error_report += "- Unsupported data types\n"
        error_report += "- Corrupted or incomplete dataset"
        return error_report, None

    try:
        image_path = _render_visualizations(df, numeric_cols, correlation_matrix, ranked_pairs)
    except Exception as e:
        # The text report is already complete; keep it rather than discarding
        # it because a chart could not be drawn.
        report += f"\n## ⚠️ Visualizations Unavailable\n\nThe charts could not be generated: {str(e)}\n"
        return report, None

    return report, image_path

# Custom CSS for a more modern look.
# This stylesheet string is passed to gr.Interface via its `css` argument below.
# It styles three selectors: the page container (.gradio-container), a
# scrollable white card (.output-markdown, capped at 500px tall), and a green
# upload control with a hover state (.file-upload).
# NOTE(review): none of the components created in this file set
# `elem_classes`, so confirm that `.output-markdown` and `.file-upload`
# actually match elements in the rendered page.
css = """
.gradio-container {
    background-color: #f0f2f6;
    font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
}
.output-markdown {
    background-color: white;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    max-height: 500px;
    overflow-y: auto;
}
.file-upload {
    background-color: #4CAF50;
    color: white;
    border-radius: 5px;
    padding: 10px 20px;
    transition: background-color 0.3s ease;
}
.file-upload:hover {
    background-color: #45a049;
}
"""

# Gradio Interface with Enhanced UI.
# Wires `advanced_analysis` to a single file upload and shows its two return
# values as a Markdown report and an image.
demo = gr.Interface(
    fn=advanced_analysis,
    inputs=gr.File(
        type="filepath",
        label="📤 Upload File",
        file_count="single",
        # Extension filters need a leading dot; bare words are interpreted as
        # type categories (e.g. "image", "text"), not as file extensions.
        file_types=[".csv", ".xlsx", ".xls", ".txt"],
    ),
    outputs=[
        gr.Markdown(),
        gr.Image(label="📊 Advanced Visualizations")
    ],
    title="🧠 Smart Data Analyzer",
    description="Upload a CSV, Excel, or Text file for comprehensive data analysis, statistical insights, and interactive visualizations.",
    theme='default',
    css=css
)

# Launch the interface.
# server_name="0.0.0.0" makes the server listen on all network interfaces
# instead of only localhost, and server_port pins it to port 7860.
# NOTE(review): this call is at module top level with no
# `if __name__ == "__main__":` guard, so importing this module starts the server.
demo.launch(server_name="0.0.0.0", server_port=7860)