Spaces:

shvy
/

data-visulaizer-69

Sleeping

App Files Files Community

shvy committed on Mar 28, 2025

Commit

5f31ef5

verified ·

1 Parent(s): 23b9f7f

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -61

app.py CHANGED Viewed

@@ -3,88 +3,160 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import numpy as np
-import os
-def analyze_dataset(file):
     try:
         # Load dataset
         df = pd.read_csv(file)
-        # Generate summary statistics
-        summary = df.describe().to_string()
-        missing_values = df.isnull().sum().to_string()
-        duplicates = df.duplicated().sum()
-        # Prepare analysis text
-        insights = f"""Dataset Analysis:
-Summary Statistics:
-{summary}
-Missing Values:
-{missing_values}
-Duplicate Entries: {duplicates}
-Recommended Cleaning Strategies:
-1. Handle missing values through imputation or removal
-2. Remove or investigate duplicate entries
-3. Consider normalizing numerical features
-4. Check for outliers in the dataset
-"""
-        # Generate visualizations
-        plt.close('all')  # Close any existing plots
-        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
-        # Select numerical columns
-        numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
-        # Correlation heatmap
-        try:
-            if len(numerical_cols) > 1:
-                corr_matrix = df[numerical_cols].corr()
-                sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax1)
-                ax1.set_title('Correlation Heatmap')
-            else:
-                ax1.text(0.5, 0.5, "Insufficient columns\nfor correlation",
-                         horizontalalignment='center', verticalalignment='center')
-        except Exception as e:
-            ax1.text(0.5, 0.5, f"Correlation plot error: {str(e)}",
-                     horizontalalignment='center', verticalalignment='center')
-        # Distribution plot for numerical columns
-        if len(numerical_cols) > 0:
-            for col in numerical_cols:
-                df[col].hist(ax=ax2, bins=15, alpha=0.5, label=col)
-            ax2.set_title('Numerical Features Distribution')
-            ax2.legend()
-        else:
-            ax2.text(0.5, 0.5, "No numerical columns\nfor distribution",
-                     horizontalalignment='center', verticalalignment='center')
-        plt.tight_layout()
-        # Save plot to file instead of BytesIO
-        output_path = 'data_analysis_plot.png'
-        plt.savefig(output_path)
         plt.close()
-        return insights, output_path
     except Exception as e:
-        return f"Error processing file: {str(e)}", None
-# Gradio UI
 demo = gr.Interface(
-    fn=analyze_dataset,
-    inputs=gr.File(type="filepath"),
     outputs=[
-        gr.Textbox(label="Analysis"),
-        gr.Image(label="Data Visualizations")
     ],
-    title="Data Analyzer",
-    description="Upload a CSV file for automatic data analysis and visualization."
 )
 # Launch the interface

 import matplotlib.pyplot as plt
 import seaborn as sns
 import numpy as np
+import scipy.stats as stats
+import base64
+def advanced_analysis(file):
     try:
         # Load dataset
         df = pd.read_csv(file)
+        # Comprehensive Analysis Report
+        report = "# 📊 Comprehensive Data Analysis Report\n\n"
+        # 1. Basic Dataset Information
+        report += "## 1. Dataset Overview\n"
+        report += f"- **Total Rows:** {len(df)}\n"
+        report += f"- **Total Columns:** {len(df.columns)}\n"
+        report += f"- **Column Types:**\n"
+        for col, dtype in df.dtypes.items():
+            report += f"  - `{col}`: {dtype}\n"
+        # 2. Missing Value Analysis
+        report += "\n## 2. Missing Value Analysis\n"
+        missing_data = df.isnull().sum()
+        missing_percentage = 100 * df.isnull().sum() / len(df)
+        missing_table = pd.concat([missing_data, missing_percentage], axis=1,
+                                   keys=['Missing Count', 'Missing Percentage'])
+        report += "```\n" + missing_table.to_string() + "\n```\n"
+        # 3. Statistical Summary
+        report += "\n## 3. Statistical Summary\n"
+        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
+        if len(numeric_cols) > 0:
+            stats_summary = df[numeric_cols].describe()
+            report += "### Numerical Columns Statistics\n"
+            report += "```\n" + stats_summary.to_string() + "\n```\n"
+        # 4. Outlier Detection
+        report += "\n## 4. Outlier Analysis\n"
+        outliers = {}
+        for col in numeric_cols:
+            Q1 = df[col].quantile(0.25)
+            Q3 = df[col].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            column_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
+            if len(column_outliers) > 0:
+                outliers[col] = len(column_outliers)
+                report += f"- **{col}:** {len(column_outliers)} outliers detected\n"
+        # 5. Correlation Analysis
+        report += "\n## 5. Correlation Analysis\n"
+        if len(numeric_cols) > 1:
+            correlation_matrix = df[numeric_cols].corr()
+            report += "### Top Correlations\n"
+            # Find and report top correlations
+            corr_unstack = correlation_matrix.unstack()
+            top_correlations = corr_unstack[corr_unstack != 1].nlargest(5)
+            for (col1, col2), corr_value in top_correlations.items():
+                report += f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"
+        # Visualizations
+        plt.close('all')
+        fig, axs = plt.subplots(2, 2, figsize=(20, 15))
+        plt.subplots_adjust(hspace=0.4, wspace=0.3)
+        # Correlation Heatmap
+        if len(numeric_cols) > 1:
+            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axs[0, 0],
+                        square=True, cbar=True, linewidths=0.5)
+            axs[0, 0].set_title('Correlation Heatmap', fontsize=14, fontweight='bold')
+        # Box Plot for Numeric Columns
+        df[numeric_cols].boxplot(ax=axs[0, 1])
+        axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14, fontweight='bold')
+        axs[0, 1].tick_params(axis='x', rotation=45)
+        # Distribution of Categorical Columns
+        categorical_cols = df.select_dtypes(include=['object']).columns
+        if len(categorical_cols) > 0:
+            cat_value_counts = df[categorical_cols[0]].value_counts()
+            cat_value_counts.plot(kind='bar', ax=axs[1, 0])
+            axs[1, 0].set_title(f'Distribution of {categorical_cols[0]}',
+                                 fontsize=14, fontweight='bold')
+            axs[1, 0].tick_params(axis='x', rotation=45)
+        # Scatter Plot Matrix for Top Correlated Features
+        if len(numeric_cols) > 2:
+            top_corr_features = correlation_matrix.unstack().sort_values(
+                kind="quicksort", ascending=False).head(5)
+            top_features = list(set([x[0] for x in top_corr_features.index] +
+                                    [x[1] for x in top_corr_features.index]))[:3]
+            pd.plotting.scatter_matrix(df[top_features],
+                                       figsize=(10,10),
+                                       diagonal='hist',
+                                       ax=axs[1, 1])
+            axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
+                                fontsize=14, fontweight='bold')
+        plt.suptitle('🔍 Data Analysis Visualizations', fontsize=16, fontweight='bold')
+        plt.tight_layout()
+        plt.savefig('data_analysis_advanced.png', dpi=300, bbox_inches='tight')
         plt.close()
+        return report, 'data_analysis_advanced.png'
     except Exception as e:
+        error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
+        error_report += "Possible reasons:\n"
+        error_report += "- Incorrect file format\n"
+        error_report += "- Unsupported data types\n"
+        error_report += "- Corrupted or incomplete dataset"
+        return error_report, None
+# Custom CSS for a more modern look
+css = """
+.gradio-container {
+    background-color: #f0f2f6;
+    font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
+}
+.output-markdown {
+    background-color: white;
+    border-radius: 10px;
+    padding: 20px;
+    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
+}
+.file-upload {
+    background-color: #4CAF50;
+    color: white;
+    border-radius: 5px;
+    padding: 10px 20px;
+    transition: background-color 0.3s ease;
+}
+.file-upload:hover {
+    background-color: #45a049;
+}
+"""
+# Gradio Interface with Enhanced UI
 demo = gr.Interface(
+    fn=advanced_analysis,
+    inputs=gr.File(
+        type="filepath",
+        label="📤 Upload CSV File",
+        file_count="single",
+        file_types=["csv"]
+    ),
     outputs=[
+        gr.Markdown(label="📋 Analysis Report", lines=20),
+        gr.Image(label="📊 Advanced Visualizations")
     ],
+    title="🧠 Smart Data Analyzer",
+    description="Upload a CSV file for comprehensive data analysis, statistical insights, and interactive visualizations.",
+    theme='default',
+    css=css
 )
 # Launch the interface