"""Gradio app that profiles an uploaded tabular file.

The app loads a CSV, Excel or tab-separated text file into a DataFrame,
builds a Markdown report (overview, missing values, summary statistics,
IQR outliers, correlations) and renders a 2x2 panel of plots to a PNG.
"""
import base64
import tempfile
from itertools import combinations

import gradio as gr
import matplotlib

# Plots are rendered to a file from Gradio worker threads, so force the
# non-interactive backend before pyplot is imported.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

# Extensions accepted by the loader (compared case-insensitively).
SUPPORTED_EXTENSIONS = ['.csv', '.xlsx', '.xls', '.txt']

# Upper bound on bars in the categorical panel so that high-cardinality
# columns (ids, names, ...) do not produce an unreadable, slow plot.
MAX_CATEGORY_BARS = 20


def _load_dataframe(file):
    """Read ``file`` into a DataFrame, choosing the parser by extension.

    Args:
        file: Path of the uploaded file as a string.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If no file was given or the extension is unsupported.
    """
    if not file:
        raise ValueError("No file provided. Please upload a file to analyze.")

    # Lower-case once so validation and dispatch agree (e.g. "DATA.CSV").
    name = str(file).lower()
    if not name.endswith(tuple(SUPPORTED_EXTENSIONS)):
        raise ValueError(
            "Unsupported file type. Please upload a file with one of these "
            f"extensions: {', '.join(SUPPORTED_EXTENSIONS)}"
        )

    if name.endswith('.csv'):
        return pd.read_csv(file)
    if name.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file)
    # Remaining supported case: '.txt', parsed as tab-separated values.
    return pd.read_csv(file, sep='\t')


def _top_correlated_pairs(correlation_matrix, limit=5):
    """Return up to ``limit`` distinct column pairs with the largest correlation.

    Only the upper triangle of the matrix is visited, so every pair appears
    once and the diagonal (self-correlation) is never considered. Pairs whose
    correlation is NaN are skipped. Ordering is by signed value, descending.

    Returns:
        A list of ``(column_a, column_b, correlation)`` tuples.
    """
    columns = list(correlation_matrix.columns)
    ranked = []
    for i, j in combinations(range(len(columns)), 2):
        value = correlation_matrix.iat[i, j]
        if pd.notna(value):
            ranked.append((columns[i], columns[j], value))
    ranked.sort(key=lambda item: item[2], reverse=True)
    return ranked[:limit]


def _build_report(df, numeric_cols, top_pairs):
    """Assemble the Markdown report for ``df``.

    Args:
        df: The loaded dataset.
        numeric_cols: Index of the numeric column labels to analyse.
        top_pairs: Output of ``_top_correlated_pairs`` (may be empty).

    Returns:
        The report as a single Markdown string.
    """
    parts = ["# 📊 Comprehensive Data Analysis Report\n\n"]

    # 1. Basic dataset information
    parts.append("## 1. Dataset Overview\n")
    parts.append(f"- **Total Rows:** {len(df)}\n")
    parts.append(f"- **Total Columns:** {len(df.columns)}\n")
    parts.append("- **Column Types:**\n")
    # Two-space indent so Markdown renders these as a nested list.
    parts.extend(f"  - `{col}`: {dtype}\n" for col, dtype in df.dtypes.items())

    # 2. Missing value analysis
    parts.append("\n## 2. Missing Value Analysis\n")
    missing_count = df.isnull().sum()
    missing_percentage = 100 * missing_count / len(df)
    missing_table = pd.concat(
        [missing_count, missing_percentage],
        axis=1,
        keys=['Missing Count', 'Missing Percentage'],
    )
    parts.append("```\n" + missing_table.to_string() + "\n```\n")

    # 3. Statistical summary
    parts.append("\n## 3. Statistical Summary\n")
    if len(numeric_cols) > 0:
        stats_summary = df[numeric_cols].describe()
        parts.append("### Numerical Columns Statistics\n")
        parts.append("```\n" + stats_summary.to_string() + "\n```\n")

    # 4. Outlier detection using the 1.5 * IQR rule
    parts.append("\n## 4. Outlier Analysis\n")
    outlier_lines = []
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_count = int(
            ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        )
        if outlier_count > 0:
            outlier_lines.append(
                f"- **{col}:** {outlier_count} outliers detected\n"
            )
    parts.extend(outlier_lines or ["- No outliers detected\n"])

    # 5. Correlation analysis
    parts.append("\n## 5. Correlation Analysis\n")
    if top_pairs:
        parts.append("### Top Correlations\n")
        parts.extend(
            f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"
            for col1, col2, corr_value in top_pairs
        )

    return "".join(parts)


def _mark_empty(ax, message):
    """Replace an unused panel with a short centred explanation."""
    ax.set_axis_off()
    ax.text(0.5, 0.5, message, ha='center', va='center', fontsize=12,
            transform=ax.transAxes)


def _render_figure(df, numeric_cols, correlation_matrix, top_pairs):
    """Draw the 2x2 visualization panel and save it as a PNG.

    Args:
        df: The loaded dataset.
        numeric_cols: Index of the numeric column labels.
        correlation_matrix: Correlation matrix of the numeric columns, or
            ``None`` when fewer than two numeric columns exist.
        top_pairs: Output of ``_top_correlated_pairs`` (may be empty).

    Returns:
        Path of the written PNG. A unique temporary file is used so that
        concurrent requests do not overwrite each other's image.
    """
    fig, axs = plt.subplots(2, 2, figsize=(20, 15))
    try:
        # Correlation heatmap
        if correlation_matrix is not None:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
                        ax=axs[0, 0], square=True, cbar=True, linewidths=0.5)
            axs[0, 0].set_title('Correlation Heatmap', fontsize=14,
                                fontweight='bold')
        else:
            _mark_empty(axs[0, 0], 'Need at least two numeric columns')

        # Box plot for numeric columns
        if len(numeric_cols) > 0:
            df[numeric_cols].boxplot(ax=axs[0, 1])
            axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14,
                                fontweight='bold')
            axs[0, 1].tick_params(axis='x', rotation=45)
        else:
            _mark_empty(axs[0, 1], 'No numeric columns')

        # Distribution of the first categorical column
        categorical_cols = df.select_dtypes(include=['object']).columns
        cat_value_counts = (
            df[categorical_cols[0]].value_counts()
            if len(categorical_cols) > 0 else None
        )
        if cat_value_counts is not None and not cat_value_counts.empty:
            title = f'Distribution of {categorical_cols[0]}'
            if len(cat_value_counts) > MAX_CATEGORY_BARS:
                # value_counts() is sorted by frequency, so head() keeps the
                # most common categories.
                cat_value_counts = cat_value_counts.head(MAX_CATEGORY_BARS)
                title += f' (top {MAX_CATEGORY_BARS})'
            cat_value_counts.plot(kind='bar', ax=axs[1, 0])
            axs[1, 0].set_title(title, fontsize=14, fontweight='bold')
            axs[1, 0].tick_params(axis='x', rotation=45)
        else:
            _mark_empty(axs[1, 0], 'No categorical data')

        # Scatter plot of the most correlated pair. A scatter *matrix* cannot
        # be drawn into a single Axes: pandas clears the whole figure to make
        # room for its own grid, which would discard the panels above.
        if top_pairs:
            x_col, y_col, _ = top_pairs[0]
            axs[1, 1].scatter(df[x_col], df[y_col], alpha=0.6)
            axs[1, 1].set_xlabel(str(x_col))
            axs[1, 1].set_ylabel(str(y_col))
            axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
                                fontsize=14, fontweight='bold')
        else:
            _mark_empty(axs[1, 1], 'No correlated numeric pair available')

        fig.suptitle('🔍 Data Analysis Visualizations', fontsize=16,
                     fontweight='bold')
        # Leave head-room for the suptitle.
        fig.tight_layout(rect=(0, 0, 1, 0.97))

        with tempfile.NamedTemporaryFile(prefix='data_analysis_',
                                         suffix='.png',
                                         delete=False) as handle:
            image_path = handle.name
        fig.savefig(image_path, dpi=300, bbox_inches='tight')
        return image_path
    finally:
        # Close only this figure; other requests may have their own open.
        plt.close(fig)


def advanced_analysis(file):
    """Analyse an uploaded data file and return a report plus a plot image.

    Args:
        file: Filesystem path of the uploaded file (``.csv``, ``.xlsx``,
            ``.xls`` or tab-separated ``.txt``), or ``None`` if nothing was
            uploaded.

    Returns:
        A ``(markdown_report, image_path)`` tuple. If anything fails, the
        first element is a Markdown error message and the second is ``None``;
        no exception propagates to the caller.
    """
    # This is the UI boundary: any failure is turned into a readable message
    # instead of a stack trace in the browser.
    try:
        df = _load_dataframe(file)

        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        correlation_matrix = None
        top_pairs = []
        if len(numeric_cols) > 1:
            correlation_matrix = df[numeric_cols].corr()
            top_pairs = _top_correlated_pairs(correlation_matrix)

        report = _build_report(df, numeric_cols, top_pairs)
        image_path = _render_figure(df, numeric_cols, correlation_matrix,
                                    top_pairs)
        return report, image_path

    except Exception as e:
        error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
        error_report += "Possible reasons:\n"
        error_report += "- Incorrect file format\n"
        error_report += "- Unsupported data types\n"
        error_report += "- Corrupted or incomplete dataset"
        return error_report, None


# Custom CSS for a more modern look
css = """
.gradio-container {
    background-color: #f0f2f6;
    font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
}
.output-markdown {
    background-color: white;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    max-height: 500px;
    overflow-y: auto;
}
.file-upload {
    background-color: #4CAF50;
    color: white;
    border-radius: 5px;
    padding: 10px 20px;
    transition: background-color 0.3s ease;
}
.file-upload:hover {
    background-color: #45a049;
}
"""

# Gradio Interface with Enhanced UI
demo = gr.Interface(
    fn=advanced_analysis,
    inputs=gr.File(
        type="filepath",
        label="📤 Upload File",
        file_count="single",
        # Extensions need a leading dot; bare words are read as type
        # categories such as "image" or "text".
        file_types=[".csv", ".xlsx", ".xls", ".txt"],
    ),
    outputs=[
        gr.Markdown(),
        gr.Image(label="📊 Advanced Visualizations"),
    ],
    title="🧠 Smart Data Analyzer",
    description="Upload a CSV, Excel, or Text file for comprehensive data analysis, statistical insights, and interactive visualizations.",
    theme='default',
    css=css,
)

# Launch the interface (binds on all interfaces, port 7860)
demo.launch(server_name="0.0.0.0", server_port=7860)