# Spaces status: Sleeping
import base64
import tempfile

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
def _load_dataframe(file):
    """Read the uploaded file into a DataFrame, picking the parser by extension.

    Raises:
        ValueError: if no file was given or the extension is not supported.
    """
    if not file:
        raise ValueError("No file provided. Please upload a file to analyze.")

    # A "filepath" upload arrives as a plain string; objects that expose the
    # path as `.name` are accepted as well.
    path = str(getattr(file, "name", file))
    loaders = {
        '.csv': pd.read_csv,
        '.xlsx': pd.read_excel,
        '.xls': pd.read_excel,
        '.txt': lambda p: pd.read_csv(p, sep='\t'),
    }

    # Validation and dispatch share one case-insensitive comparison, so a
    # name such as "DATA.CSV" cannot pass the check and then miss every branch.
    lowered = path.lower()
    for extension, loader in loaders.items():
        if lowered.endswith(extension):
            return loader(path)

    raise ValueError(
        "Unsupported file type. Please upload a file with one of these "
        f"extensions: {', '.join(loaders)}"
    )


def _rank_correlations(correlation_matrix):
    """Return each unique column pair as (col1, col2, r), largest r first.

    Only the strict upper triangle is read, so the diagonal and the mirrored
    (B, A) duplicates never appear, while genuinely perfect correlations
    between two different columns are kept. NaN coefficients are skipped.
    """
    values = correlation_matrix.to_numpy()
    labels = list(correlation_matrix.columns)
    upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
    pairs = [
        (labels[i], labels[j], values[i, j])
        for i, j in zip(upper_rows, upper_cols)
        if not np.isnan(values[i, j])
    ]
    pairs.sort(key=lambda pair: pair[2], reverse=True)
    return pairs


def _build_report(df, numeric_cols, ranked_pairs):
    """Assemble the Markdown report text for `df`."""
    parts = ["# π Comprehensive Data Analysis Report\n\n"]

    # 1. Basic Dataset Information
    parts.append("## 1. Dataset Overview\n")
    parts.append(f"- **Total Rows:** {len(df)}\n")
    parts.append(f"- **Total Columns:** {len(df.columns)}\n")
    parts.append("- **Column Types:**\n")
    # Two leading spaces so Markdown renders these as a nested list.
    parts.extend(f"  - `{col}`: {dtype}\n" for col, dtype in df.dtypes.items())

    # 2. Missing Value Analysis
    parts.append("\n## 2. Missing Value Analysis\n")
    missing_count = df.isnull().sum()
    if len(df):
        missing_percentage = 100 * missing_count / len(df)
    else:
        # An empty dataset has nothing missing; avoid reporting 0/0 as NaN.
        missing_percentage = missing_count.astype(float)
    missing_table = pd.concat(
        [missing_count, missing_percentage],
        axis=1,
        keys=['Missing Count', 'Missing Percentage'],
    )
    parts.append("```\n" + missing_table.to_string() + "\n```\n")

    # 3. Statistical Summary
    parts.append("\n## 3. Statistical Summary\n")
    if len(numeric_cols) > 0:
        parts.append("### Numerical Columns Statistics\n")
        parts.append("```\n" + df[numeric_cols].describe().to_string() + "\n```\n")
    else:
        parts.append("- No numeric columns found\n")

    # 4. Outlier Detection (1.5 * IQR rule)
    parts.append("\n## 4. Outlier Analysis\n")
    outlier_lines = []
    for col in numeric_cols:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
        count = int(is_outlier.sum())
        if count > 0:
            outlier_lines.append(f"- **{col}:** {count} outliers detected\n")
    parts.extend(outlier_lines or ["- No outliers detected\n"])

    # 5. Correlation Analysis
    parts.append("\n## 5. Correlation Analysis\n")
    if len(numeric_cols) > 1:
        parts.append("### Top Correlations\n")
        parts.extend(
            f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"
            for col1, col2, corr_value in ranked_pairs[:5]
        )

    return "".join(parts)


def _blank_panel(ax, message):
    """Hide an unused subplot and show a short explanation in its place."""
    ax.set_axis_off()
    ax.text(0.5, 0.5, message, ha='center', va='center', fontsize=12,
            transform=ax.transAxes)


def _render_visualizations(df, numeric_cols, correlation_matrix, ranked_pairs):
    """Draw the 2x2 chart grid, save it as a PNG and return the file path."""
    fig, axs = plt.subplots(2, 2, figsize=(20, 15))
    try:
        # Correlation Heatmap
        if correlation_matrix is not None:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm',
                        ax=axs[0, 0], square=True, cbar=True, linewidths=0.5)
            axs[0, 0].set_title('Correlation Heatmap', fontsize=14,
                                fontweight='bold')
        else:
            _blank_panel(axs[0, 0], 'Need at least two numeric columns')

        # Box Plot for Numeric Columns
        if len(numeric_cols) > 0:
            df[numeric_cols].boxplot(ax=axs[0, 1])
            axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14,
                                fontweight='bold')
            axs[0, 1].tick_params(axis='x', rotation=45)
        else:
            _blank_panel(axs[0, 1], 'No numeric columns')

        # Distribution of the first categorical column
        categorical_cols = df.select_dtypes(include=['object']).columns
        if len(categorical_cols) > 0:
            first_categorical = categorical_cols[0]
            df[first_categorical].value_counts().plot(kind='bar', ax=axs[1, 0])
            axs[1, 0].set_title(f'Distribution of {first_categorical}',
                                fontsize=14, fontweight='bold')
            axs[1, 0].tick_params(axis='x', rotation=45)
        else:
            _blank_panel(axs[1, 0], 'No categorical columns')

        # Scatter plot of the top-ranked pair. A plain scatter on this one
        # Axes is used on purpose: handing a single Axes to
        # pd.plotting.scatter_matrix makes pandas clear the entire figure,
        # which would erase the three panels drawn above.
        if ranked_pairs:
            x_name, y_name, _ = ranked_pairs[0]
            axs[1, 1].scatter(df[x_name], df[y_name], alpha=0.6)
            axs[1, 1].set_xlabel(str(x_name))
            axs[1, 1].set_ylabel(str(y_name))
            axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
                                fontsize=14, fontweight='bold')
        else:
            _blank_panel(axs[1, 1], 'Need at least two numeric columns')

        fig.suptitle('π Data Analysis Visualizations', fontsize=16,
                     fontweight='bold')
        # Reserve a strip at the top so the suptitle does not overlap panels.
        fig.tight_layout(rect=(0, 0, 1, 0.97))

        # A unique file per call: a fixed name in the working directory would
        # be overwritten when two requests are processed at the same time.
        with tempfile.NamedTemporaryFile(prefix='data_analysis_advanced_',
                                         suffix='.png',
                                         delete=False) as handle:
            image_path = handle.name
        fig.savefig(image_path, dpi=300, bbox_inches='tight')
        return image_path
    finally:
        # Close only this figure (never plt.close('all')), and do it even
        # when drawing fails, so figures do not accumulate or interfere.
        plt.close(fig)


def advanced_analysis(file):
    """Analyze an uploaded tabular file and return a report plus a chart image.

    Args:
        file: Path to a `.csv`, `.xlsx`, `.xls` or tab-separated `.txt` file
            (extension matched case-insensitively).

    Returns:
        A `(report, image_path)` tuple. `report` is Markdown text and
        `image_path` is the path of a PNG with the charts. If loading or
        analysis fails, `report` is an error message and `image_path` is
        None. If only the charts fail, the text report is still returned
        with a note appended and `image_path` is None.
    """
    try:
        df = _load_dataframe(file)
        # 'number' covers every numeric dtype, not just int64/float64.
        numeric_cols = df.select_dtypes(include='number').columns
        correlation_matrix = (
            df[numeric_cols].corr() if len(numeric_cols) > 1 else None
        )
        ranked_pairs = (
            _rank_correlations(correlation_matrix)
            if correlation_matrix is not None
            else []
        )
        report = _build_report(df, numeric_cols, ranked_pairs)
    except Exception as e:
        # UI boundary: turn any failure into a readable message for the user.
        error_report = f"## β Analysis Failed\n\n**Error:** {str(e)}\n\n"
        error_report += "Possible reasons:\n"
        error_report += "- Incorrect file format\n"
        error_report += "- Unsupported data types\n"
        error_report += "- Corrupted or incomplete dataset"
        return error_report, None

    try:
        image_path = _render_visualizations(
            df, numeric_cols, correlation_matrix, ranked_pairs
        )
    except Exception as e:
        # Charts are best-effort: keep the finished text report and say why
        # the image is missing instead of discarding everything.
        report += "\n## Visualizations\n"
        report += f"- Charts could not be generated: {e}\n"
        return report, None

    return report, image_path
# Custom CSS for a more modern look: page background and font, a scrollable
# card for the Markdown report, and a green upload control with hover state.
css = """
.gradio-container {
    background-color: #f0f2f6;
    font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
}
.output-markdown {
    background-color: white;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    max-height: 500px;
    overflow-y: auto;
}
.file-upload {
    background-color: #4CAF50;
    color: white;
    border-radius: 5px;
    padding: 10px 20px;
    transition: background-color 0.3s ease;
}
.file-upload:hover {
    background-color: #45a049;
}
"""
# Gradio Interface with Enhanced UI: one file upload in, a Markdown report
# and a chart image out.
demo = gr.Interface(
    fn=advanced_analysis,
    inputs=gr.File(
        type="filepath",
        label="π€ Upload File",
        file_count="single",
        # Extensions need a leading dot; entries without one are interpreted
        # by Gradio as media categories (e.g. "image"), not file extensions.
        file_types=[".csv", ".xlsx", ".xls", ".txt"],
    ),
    outputs=[
        gr.Markdown(),
        gr.Image(label="π Advanced Visualizations"),
    ],
    title="π§ Smart Data Analyzer",
    description="Upload a CSV, Excel, or Text file for comprehensive data analysis, statistical insights, and interactive visualizations.",
    theme='default',
    css=css,
)
# Launch the interface only when this file is run as a script, so that
# importing the module does not start a web server as a side effect.
# Binds to all network interfaces on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)