shvy committed on
Commit
5f31ef5
·
verified ·
1 Parent(s): 23b9f7f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -61
app.py CHANGED
@@ -3,88 +3,160 @@ import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import numpy as np
6
- import os
 
7
 
8
- def analyze_dataset(file):
9
  try:
10
  # Load dataset
11
  df = pd.read_csv(file)
12
 
13
- # Generate summary statistics
14
- summary = df.describe().to_string()
15
- missing_values = df.isnull().sum().to_string()
16
- duplicates = df.duplicated().sum()
17
 
18
- # Prepare analysis text
19
- insights = f"""Dataset Analysis:
 
 
 
 
 
20
 
21
- Summary Statistics:
22
- {summary}
23
-
24
- Missing Values:
25
- {missing_values}
26
-
27
- Duplicate Entries: {duplicates}
28
-
29
- Recommended Cleaning Strategies:
30
- 1. Handle missing values through imputation or removal
31
- 2. Remove or investigate duplicate entries
32
- 3. Consider normalizing numerical features
33
- 4. Check for outliers in the dataset
34
- """
35
 
36
- # Generate visualizations
37
- plt.close('all') # Close any existing plots
38
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
 
 
 
 
39
 
40
- # Select numerical columns
41
- numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Correlation heatmap
44
- try:
45
- if len(numerical_cols) > 1:
46
- corr_matrix = df[numerical_cols].corr()
47
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax1)
48
- ax1.set_title('Correlation Heatmap')
49
- else:
50
- ax1.text(0.5, 0.5, "Insufficient columns\nfor correlation",
51
- horizontalalignment='center', verticalalignment='center')
52
- except Exception as e:
53
- ax1.text(0.5, 0.5, f"Correlation plot error: {str(e)}",
54
- horizontalalignment='center', verticalalignment='center')
55
 
56
- # Distribution plot for numerical columns
57
- if len(numerical_cols) > 0:
58
- for col in numerical_cols:
59
- df[col].hist(ax=ax2, bins=15, alpha=0.5, label=col)
60
- ax2.set_title('Numerical Features Distribution')
61
- ax2.legend()
62
- else:
63
- ax2.text(0.5, 0.5, "No numerical columns\nfor distribution",
64
- horizontalalignment='center', verticalalignment='center')
65
 
66
- plt.tight_layout()
 
 
 
 
 
 
 
 
 
67
 
68
- # Save plot to file instead of BytesIO
69
- output_path = 'data_analysis_plot.png'
70
- plt.savefig(output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  plt.close()
72
 
73
- return insights, output_path
74
 
75
  except Exception as e:
76
- return f"Error processing file: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Gradio UI
79
  demo = gr.Interface(
80
- fn=analyze_dataset,
81
- inputs=gr.File(type="filepath"),
 
 
 
 
 
82
  outputs=[
83
- gr.Textbox(label="Analysis"),
84
- gr.Image(label="Data Visualizations")
85
  ],
86
- title="Data Analyzer",
87
- description="Upload a CSV file for automatic data analysis and visualization."
 
 
88
  )
89
 
90
  # Launch the interface
 
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import numpy as np
6
+ import scipy.stats as stats
7
+ import base64
8
 
9
+ def advanced_analysis(file):
10
  try:
11
  # Load dataset
12
  df = pd.read_csv(file)
13
 
14
+ # Comprehensive Analysis Report
15
+ report = "# 📊 Comprehensive Data Analysis Report\n\n"
 
 
16
 
17
+ # 1. Basic Dataset Information
18
+ report += "## 1. Dataset Overview\n"
19
+ report += f"- **Total Rows:** {len(df)}\n"
20
+ report += f"- **Total Columns:** {len(df.columns)}\n"
21
+ report += f"- **Column Types:**\n"
22
+ for col, dtype in df.dtypes.items():
23
+ report += f" - `{col}`: {dtype}\n"
24
 
25
+ # 2. Missing Value Analysis
26
+ report += "\n## 2. Missing Value Analysis\n"
27
+ missing_data = df.isnull().sum()
28
+ missing_percentage = 100 * df.isnull().sum() / len(df)
29
+ missing_table = pd.concat([missing_data, missing_percentage], axis=1,
30
+ keys=['Missing Count', 'Missing Percentage'])
31
+ report += "```\n" + missing_table.to_string() + "\n```\n"
 
 
 
 
 
 
 
32
 
33
+ # 3. Statistical Summary
34
+ report += "\n## 3. Statistical Summary\n"
35
+ numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
36
+ if len(numeric_cols) > 0:
37
+ stats_summary = df[numeric_cols].describe()
38
+ report += "### Numerical Columns Statistics\n"
39
+ report += "```\n" + stats_summary.to_string() + "\n```\n"
40
 
41
+ # 4. Outlier Detection
42
+ report += "\n## 4. Outlier Analysis\n"
43
+ outliers = {}
44
+ for col in numeric_cols:
45
+ Q1 = df[col].quantile(0.25)
46
+ Q3 = df[col].quantile(0.75)
47
+ IQR = Q3 - Q1
48
+ lower_bound = Q1 - 1.5 * IQR
49
+ upper_bound = Q3 + 1.5 * IQR
50
+ column_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
51
+ if len(column_outliers) > 0:
52
+ outliers[col] = len(column_outliers)
53
+ report += f"- **{col}:** {len(column_outliers)} outliers detected\n"
54
 
55
+ # 5. Correlation Analysis
56
+ report += "\n## 5. Correlation Analysis\n"
57
+ if len(numeric_cols) > 1:
58
+ correlation_matrix = df[numeric_cols].corr()
59
+ report += "### Top Correlations\n"
60
+ # Find and report top correlations
61
+ corr_unstack = correlation_matrix.unstack()
62
+ top_correlations = corr_unstack[corr_unstack != 1].nlargest(5)
63
+ for (col1, col2), corr_value in top_correlations.items():
64
+ report += f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"
 
 
65
 
66
+ # Visualizations
67
+ plt.close('all')
68
+ fig, axs = plt.subplots(2, 2, figsize=(20, 15))
69
+ plt.subplots_adjust(hspace=0.4, wspace=0.3)
 
 
 
 
 
70
 
71
+ # Correlation Heatmap
72
+ if len(numeric_cols) > 1:
73
+ sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axs[0, 0],
74
+ square=True, cbar=True, linewidths=0.5)
75
+ axs[0, 0].set_title('Correlation Heatmap', fontsize=14, fontweight='bold')
76
+
77
+ # Box Plot for Numeric Columns
78
+ df[numeric_cols].boxplot(ax=axs[0, 1])
79
+ axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14, fontweight='bold')
80
+ axs[0, 1].tick_params(axis='x', rotation=45)
81
 
82
+ # Distribution of Categorical Columns
83
+ categorical_cols = df.select_dtypes(include=['object']).columns
84
+ if len(categorical_cols) > 0:
85
+ cat_value_counts = df[categorical_cols[0]].value_counts()
86
+ cat_value_counts.plot(kind='bar', ax=axs[1, 0])
87
+ axs[1, 0].set_title(f'Distribution of {categorical_cols[0]}',
88
+ fontsize=14, fontweight='bold')
89
+ axs[1, 0].tick_params(axis='x', rotation=45)
90
+
91
+ # Scatter Plot Matrix for Top Correlated Features
92
+ if len(numeric_cols) > 2:
93
+ top_corr_features = correlation_matrix.unstack().sort_values(
94
+ kind="quicksort", ascending=False).head(5)
95
+ top_features = list(set([x[0] for x in top_corr_features.index] +
96
+ [x[1] for x in top_corr_features.index]))[:3]
97
+ pd.plotting.scatter_matrix(df[top_features],
98
+ figsize=(10,10),
99
+ diagonal='hist',
100
+ ax=axs[1, 1])
101
+ axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
102
+ fontsize=14, fontweight='bold')
103
+
104
+ plt.suptitle('🔍 Data Analysis Visualizations', fontsize=16, fontweight='bold')
105
+ plt.tight_layout()
106
+ plt.savefig('data_analysis_advanced.png', dpi=300, bbox_inches='tight')
107
  plt.close()
108
 
109
+ return report, 'data_analysis_advanced.png'
110
 
111
  except Exception as e:
112
+ error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
113
+ error_report += "Possible reasons:\n"
114
+ error_report += "- Incorrect file format\n"
115
+ error_report += "- Unsupported data types\n"
116
+ error_report += "- Corrupted or incomplete dataset"
117
+ return error_report, None
118
+
119
+ # Custom CSS for a more modern look
120
+ css = """
121
+ .gradio-container {
122
+ background-color: #f0f2f6;
123
+ font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
124
+ }
125
+ .output-markdown {
126
+ background-color: white;
127
+ border-radius: 10px;
128
+ padding: 20px;
129
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
130
+ }
131
+ .file-upload {
132
+ background-color: #4CAF50;
133
+ color: white;
134
+ border-radius: 5px;
135
+ padding: 10px 20px;
136
+ transition: background-color 0.3s ease;
137
+ }
138
+ .file-upload:hover {
139
+ background-color: #45a049;
140
+ }
141
+ """
142
 
143
+ # Gradio Interface with Enhanced UI
144
  demo = gr.Interface(
145
+ fn=advanced_analysis,
146
+ inputs=gr.File(
147
+ type="filepath",
148
+ label="📤 Upload CSV File",
149
+ file_count="single",
150
+ file_types=["csv"]
151
+ ),
152
  outputs=[
153
+ gr.Markdown(label="📋 Analysis Report", lines=20),
154
+ gr.Image(label="📊 Advanced Visualizations")
155
  ],
156
+ title="🧠 Smart Data Analyzer",
157
+ description="Upload a CSV file for comprehensive data analysis, statistical insights, and interactive visualizations.",
158
+ theme='default',
159
+ css=css
160
  )
161
 
162
  # Launch the interface