entropy25 commited on
Commit
c42749b
·
verified ·
1 Parent(s): f71de9c

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +669 -304
analyzer.py CHANGED
@@ -1,328 +1,693 @@
 
1
  import pandas as pd
2
  import numpy as np
3
- import streamlit as st
4
- from typing import Dict, List, Any, Optional, Tuple
5
- import warnings
6
- warnings.filterwarnings('ignore')
 
 
 
 
 
 
7
 
8
- # Machine Learning imports
9
  try:
10
- from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
11
- from sklearn.linear_model import LinearRegression, LogisticRegression
12
- from sklearn.model_selection import train_test_split, cross_val_score
13
- from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
14
- from sklearn.preprocessing import StandardScaler, LabelEncoder
15
- ML_AVAILABLE = True
16
  except ImportError:
17
- ML_AVAILABLE = False
18
- st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn for ML features.")
19
 
20
- class DataAnalyzer:
21
- """Enhanced data analyzer with ML capabilities"""
 
 
 
 
 
 
22
 
23
- def __init__(self, df: pd.DataFrame):
24
- """Initialize analyzer with dataframe"""
25
- self.df = df.copy()
26
- self.numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
27
- self.categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
28
- self.results = {}
 
29
 
30
- def run_basic_analysis(self) -> Dict[str, Any]:
31
- """Run basic statistical analysis"""
32
- try:
33
- analysis = {}
34
-
35
- # Shape and basic info
36
- analysis['dataset_info'] = {
37
- 'rows': self.df.shape[0],
38
- 'columns': self.df.shape[1],
39
- 'memory_usage_mb': self.df.memory_usage(deep=True).sum() / (1024**2)
40
- }
41
-
42
- # Missing data summary
43
- missing_data = self.df.isnull().sum()
44
- analysis['missing_data'] = {
45
- 'total_missing': int(missing_data.sum()),
46
- 'missing_percentage': float((missing_data.sum() / (self.df.shape[0] * self.df.shape[1])) * 100),
47
- 'columns_with_missing': missing_data[missing_data > 0].to_dict()
48
- }
49
-
50
- # Data types summary
51
- dtype_counts = self.df.dtypes.value_counts()
52
- analysis['data_types'] = {str(k): int(v) for k, v in dtype_counts.items()}
53
-
54
- # Numeric columns analysis
55
- if self.numeric_cols:
56
- numeric_analysis = {}
57
- for col in self.numeric_cols:
58
- try:
59
- numeric_analysis[col] = {
60
- 'mean': float(self.df[col].mean()),
61
- 'median': float(self.df[col].median()),
62
- 'std': float(self.df[col].std()),
63
- 'min': float(self.df[col].min()),
64
- 'max': float(self.df[col].max()),
65
- 'skewness': float(self.df[col].skew()),
66
- 'kurtosis': float(self.df[col].kurtosis())
67
- }
68
- except:
69
- numeric_analysis[col] = {'error': 'Could not calculate statistics'}
70
- analysis['numeric_analysis'] = numeric_analysis
71
-
72
- # Categorical columns analysis
73
- if self.categorical_cols:
74
- categorical_analysis = {}
75
- for col in self.categorical_cols:
76
- try:
77
- mode_val = self.df[col].mode()
78
- most_frequent = str(mode_val.iloc[0]) if not mode_val.empty else 'None'
79
- most_frequent_count = int(self.df[col].value_counts().iloc[0]) if len(self.df[col].value_counts()) > 0 else 0
80
-
81
- categorical_analysis[col] = {
82
- 'unique_values': int(self.df[col].nunique()),
83
- 'most_frequent': most_frequent,
84
- 'most_frequent_count': most_frequent_count
85
- }
86
- except:
87
- categorical_analysis[col] = {'error': 'Could not calculate statistics'}
88
- analysis['categorical_analysis'] = categorical_analysis
89
-
90
- self.results['basic_analysis'] = analysis
91
- return analysis
92
-
93
- except Exception as e:
94
- st.error(f"Error in basic analysis: {str(e)}")
95
- return {}
96
 
97
- def run_correlation_analysis(self) -> Dict[str, Any]:
98
- """Run correlation analysis for numeric columns"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  try:
100
- if len(self.numeric_cols) < 2:
101
- return {'message': 'Need at least 2 numeric columns for correlation analysis'}
102
-
103
- # Calculate correlation matrix
104
- correlation_matrix = self.df[self.numeric_cols].corr()
105
-
106
- # Find strong correlations (threshold > 0.7)
107
- strong_correlations = []
108
- for i in range(len(correlation_matrix.columns)):
109
- for j in range(i+1, len(correlation_matrix.columns)):
110
- corr_value = correlation_matrix.iloc[i, j]
111
- if not pd.isna(corr_value) and abs(corr_value) > 0.7:
112
- strong_correlations.append({
113
- 'variable_1': correlation_matrix.columns[i],
114
- 'variable_2': correlation_matrix.columns[j],
115
- 'correlation': float(corr_value),
116
- 'strength': 'Strong Positive' if corr_value > 0.7 else 'Strong Negative'
117
- })
118
-
119
- analysis = {
120
- 'correlation_matrix': correlation_matrix.to_dict(),
121
- 'strong_correlations': strong_correlations,
122
- 'total_pairs': len(strong_correlations)
123
- }
124
-
125
- self.results['correlation_analysis'] = analysis
126
- return analysis
127
-
128
  except Exception as e:
129
- st.error(f"Error in correlation analysis: {str(e)}")
130
- return {}
 
 
131
 
132
- def run_ml_analysis(self, target_column: str) -> Dict[str, Any]:
133
- """Run machine learning analysis"""
134
- if not ML_AVAILABLE:
135
- return {'error': 'Machine learning libraries not available'}
 
 
136
 
137
- try:
138
- # Prepare data
139
- features = [col for col in self.numeric_cols if col != target_column]
140
- if len(features) < 1:
141
- return {'error': 'Not enough features for ML analysis'}
142
-
143
- # Get clean data (no missing values)
144
- ml_data = self.df[features + [target_column]].dropna()
145
- if len(ml_data) < 10:
146
- return {'error': 'Not enough data points for ML analysis'}
147
-
148
- X = ml_data[features]
149
- y = ml_data[target_column]
150
-
151
- # Split data
152
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
153
-
154
- # Scale features
155
- scaler = StandardScaler()
156
- X_train_scaled = scaler.fit_transform(X_train)
157
- X_test_scaled = scaler.transform(X_test)
158
-
159
- results = {}
160
-
161
- # Determine if regression or classification
162
- is_classification = len(np.unique(y)) < 10 and (y.dtype == 'object' or len(np.unique(y)) <= 5)
163
-
164
- if is_classification:
165
- # Classification models
166
- models = {
167
- 'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
168
- 'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
169
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- for name, model in models.items():
172
- try:
173
- # Train model
174
- if name == 'Logistic Regression':
175
- model.fit(X_train_scaled, y_train)
176
- y_pred = model.predict(X_test_scaled)
177
- else:
178
- model.fit(X_train, y_train)
179
- y_pred = model.predict(X_test)
180
-
181
- # Calculate metrics
182
- accuracy = accuracy_score(y_test, y_pred)
183
-
184
- results[name] = {
185
- 'accuracy': float(accuracy),
186
- 'type': 'classification'
187
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- except Exception as e:
190
- results[name] = {'error': str(e)}
191
-
192
- else:
193
- # Regression models
194
- models = {
195
- 'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
196
- 'Linear Regression': LinearRegression()
197
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- for name, model in models.items():
200
  try:
201
- # Train model
202
- if name == 'Linear Regression':
203
- model.fit(X_train_scaled, y_train)
204
- y_pred = model.predict(X_test_scaled)
205
- else:
206
- model.fit(X_train, y_train)
207
- y_pred = model.predict(X_test)
208
-
209
- # Calculate metrics
210
- r2 = r2_score(y_test, y_pred)
211
- mse = mean_squared_error(y_test, y_pred)
212
-
213
- results[name] = {
214
- 'r2_score': float(r2),
215
- 'mse': float(mse),
216
- 'rmse': float(np.sqrt(mse)),
217
- 'type': 'regression'
218
- }
219
 
 
220
  except Exception as e:
221
- results[name] = {'error': str(e)}
222
-
223
- self.results['ml_analysis'] = results
224
- return results
225
-
226
- except Exception as e:
227
- st.error(f"Error in ML analysis: {str(e)}")
228
- return {'error': str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
- def generate_insights(self) -> Dict[str, Any]:
231
- """Generate comprehensive insights from all analyses"""
232
- try:
233
- insights = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- # Basic insights
236
- basic = self.run_basic_analysis()
237
- if basic:
238
- insights['data_summary'] = [
239
- f"Dataset contains {basic['dataset_info']['rows']:,} rows and {basic['dataset_info']['columns']} columns",
240
- f"Memory usage: {basic['dataset_info']['memory_usage_mb']:.1f} MB",
241
- f"Missing data: {basic['missing_data']['missing_percentage']:.1f}% of total cells"
242
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
- # Correlation insights
245
- correlation = self.run_correlation_analysis()
246
- if correlation and 'strong_correlations' in correlation:
247
- if correlation['strong_correlations']:
248
- corr_insights = []
249
- for corr in correlation['strong_correlations'][:5]: # Top 5
250
- corr_insights.append(
251
- f"{corr['variable_1']} and {corr['variable_2']} are strongly correlated (r={corr['correlation']:.3f})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  )
253
- insights['correlation_insights'] = corr_insights
254
- else:
255
- insights['correlation_insights'] = ["No strong correlations found between numeric variables"]
256
-
257
- # Data quality insights
258
- quality_insights = []
259
-
260
- # Missing data insights
261
- if basic and basic['missing_data']['total_missing'] > 0:
262
- quality_insights.append(f"Found {basic['missing_data']['total_missing']} missing values")
263
- if basic['missing_data']['missing_percentage'] > 10:
264
- quality_insights.append("⚠️ High percentage of missing data may affect analysis quality")
265
-
266
- # Duplicates
267
- duplicates = self.df.duplicated().sum()
268
- if duplicates > 0:
269
- quality_insights.append(f"Found {duplicates} duplicate rows")
270
-
271
- if not quality_insights:
272
- quality_insights.append("✅ Data quality looks good - no major issues detected")
273
-
274
- insights['data_quality'] = quality_insights
275
-
276
- # Recommendations
277
- recommendations = []
278
-
279
- if basic and basic['missing_data']['missing_percentage'] > 5:
280
- recommendations.append("Consider handling missing values before analysis")
281
-
282
- if len(self.numeric_cols) < 2:
283
- recommendations.append("Add more numeric columns for better analysis capabilities")
284
-
285
- if self.df.shape[0] < 100:
286
- recommendations.append("Consider collecting more data points for robust analysis")
287
-
288
- if not recommendations:
289
- recommendations.append("Dataset is ready for comprehensive analysis")
290
-
291
- insights['recommendations'] = recommendations
292
-
293
- return insights
294
-
295
- except Exception as e:
296
- st.error(f"Error generating insights: {str(e)}")
297
- return {'error': str(e)}
298
 
299
- def get_summary_statistics(self) -> Dict[str, Any]:
300
- """Get comprehensive summary statistics"""
301
- try:
302
- summary = {
303
- 'shape': self.df.shape,
304
- 'columns': self.df.columns.tolist(),
305
- 'dtypes': self.df.dtypes.to_dict(),
306
- 'missing_values': self.df.isnull().sum().to_dict(),
307
- 'memory_usage': self.df.memory_usage(deep=True).sum() / (1024**2) # MB
308
- }
309
-
310
- # Numeric statistics
311
- if self.numeric_cols:
312
- summary['numeric_stats'] = self.df[self.numeric_cols].describe().to_dict()
313
-
314
- # Categorical statistics
315
- if self.categorical_cols:
316
- categorical_stats = {}
317
- for col in self.categorical_cols:
318
- categorical_stats[col] = {
319
- 'unique_count': self.df[col].nunique(),
320
- 'top_values': self.df[col].value_counts().head(5).to_dict()
321
- }
322
- summary['categorical_stats'] = categorical_stats
323
-
324
- return summary
325
-
326
- except Exception as e:
327
- st.error(f"Error getting summary statistics: {str(e)}")
328
- return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from typing import Dict, List, Any, Optional
7
+ import os
8
+ from dotenv import load_dotenv
9
+ from data_handler import *
10
+ from io import BytesIO
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
 
15
+ # Optional AI Integration
16
  try:
17
+ import openai
18
+ OPENAI_AVAILABLE = True
 
 
 
 
19
  except ImportError:
20
+ OPENAI_AVAILABLE = False
 
21
 
22
+ try:
23
+ import google.generativeai as genai
24
+ GEMINI_AVAILABLE = True
25
+ except ImportError:
26
+ GEMINI_AVAILABLE = False
27
+
28
+ class AIAssistant:
29
+ """AI-powered analysis assistant"""
30
 
31
+ def __init__(self):
32
+ self.openai_key = os.getenv('OPENAI_API_KEY')
33
+ self.gemini_key = os.getenv('GOOGLE_API_KEY')
34
+
35
+ if self.gemini_key and GEMINI_AVAILABLE:
36
+ genai.configure(api_key=self.gemini_key)
37
+ self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
38
 
39
+ def get_available_models(self) -> List[str]:
40
+ """Get list of available AI models"""
41
+ models = []
42
+ if self.openai_key and OPENAI_AVAILABLE:
43
+ models.append("OpenAI GPT")
44
+ if self.gemini_key and GEMINI_AVAILABLE:
45
+ models.append("Google Gemini")
46
+ return models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
49
+ """Get AI analysis of insights"""
50
+
51
+ # Prepare data summary
52
+ summary = f"""
53
+ Dataset Summary:
54
+ - Shape: {df.shape}
55
+ - Columns: {list(df.columns)}
56
+ - Data types: {df.dtypes.value_counts().to_dict()}
57
+
58
+ Key Insights Found:
59
+ """
60
+
61
+ for insight in insights:
62
+ summary += f"\n- {insight['insight']}"
63
+
64
+ prompt = f"""
65
+ As a senior data scientist, analyze this dataset and provide:
66
+
67
+ 1. Business implications of the findings
68
+ 2. Potential opportunities or risks
69
+ 3. Recommendations for decision-making
70
+ 4. Suggestions for further analysis
71
+
72
+ {summary}
73
+
74
+ Provide actionable insights in a professional format.
75
+ """
76
+
77
  try:
78
+ if model == "Google Gemini" and hasattr(self, 'gemini_model'):
79
+ response = self.gemini_model.generate_content(prompt)
80
+ return response.text
81
+ elif model == "OpenAI GPT" and self.openai_key:
82
+ client = openai.OpenAI(api_key=self.openai_key)
83
+ response = client.chat.completions.create(
84
+ model="gpt-3.5-turbo",
85
+ messages=[{"role": "user", "content": prompt}]
86
+ )
87
+ return response.choices[0].message.content
88
+ else:
89
+ return "AI analysis not available. Please configure API keys."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  except Exception as e:
91
+ return f"AI Analysis Error: {str(e)}"
92
+
93
+ class DataAnalysisWorkflow:
94
+ """Optimized data analysis workflow with caching and pagination"""
95
 
96
+ def __init__(self, df: pd.DataFrame):
97
+ self.df = df
98
+ self.stats = calculate_basic_stats(df)
99
+ self.column_types = get_column_types(df)
100
+ self.insights = []
101
+ self.page_size = 1000 # For pagination
102
 
103
+ def add_insight(self, insight: str, stage: int):
104
+ """Add insight to analysis report"""
105
+ self.insights.append({
106
+ 'stage': stage,
107
+ 'insight': insight,
108
+ 'timestamp': pd.Timestamp.now()
109
+ })
110
+
111
+ def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
112
+ """Get paginated data for display"""
113
+ start_idx = page * self.page_size
114
+ end_idx = start_idx + self.page_size
115
+ return self.df.iloc[start_idx:end_idx]
116
+
117
+ def stage_1_overview(self):
118
+ """Stage 1: Data Overview with caching"""
119
+ st.subheader("📊 Data Overview")
120
+
121
+ # Data Quality Score
122
+ quality_metrics = calculate_data_quality_score(self.df)
123
+ col1, col2, col3, col4 = st.columns(4)
124
+ with col1:
125
+ st.metric("Rows", f"{self.stats['shape'][0]:,}")
126
+ with col2:
127
+ st.metric("Columns", f"{self.stats['shape'][1]:,}")
128
+ with col3:
129
+ st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
130
+ with col4:
131
+ st.metric("Grade", quality_metrics['grade'])
132
+
133
+ if quality_metrics['issues']:
134
+ st.warning("Quality Issues Found:")
135
+ for issue in quality_metrics['issues']:
136
+ st.write(f"• {issue}")
137
+
138
+ # Memory Usage and Optimization
139
+ st.subheader("Memory Analysis")
140
+ memory_opt = calculate_memory_optimization(self.df)
141
+ col1, col2 = st.columns(2)
142
+ with col1:
143
+ st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
144
+ with col2:
145
+ if memory_opt['potential_savings_mb'] > 0:
146
+ st.metric("Potential Savings",
147
+ f"{memory_opt['potential_savings_mb']:.1f} MB",
148
+ f"{memory_opt['potential_savings_pct']:.1f}%")
149
 
150
+ if st.button("Show Optimization Details"):
151
+ st.dataframe(pd.DataFrame(memory_opt['suggestions']))
152
+
153
+ # Column Cardinality Analysis
154
+ st.subheader("Column Cardinality Analysis")
155
+ cardinality_df = calculate_column_cardinality(self.df)
156
+
157
+ # Filter options
158
+ col_types = cardinality_df['Type'].unique()
159
+ selected_types = st.multiselect("Filter by Column Type",
160
+ col_types,
161
+ default=col_types)
162
+
163
+ filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
164
+ st.dataframe(filtered_df, use_container_width=True)
165
+
166
+ # Highlight important findings
167
+ id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
168
+ if id_cols:
169
+ st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
170
+
171
+ const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
172
+ if const_cols:
173
+ st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
174
+
175
+ # Data types visualization
176
+ if self.stats['dtypes']:
177
+ st.subheader("Data Types Distribution")
178
+ fig = px.pie(values=list(self.stats['dtypes'].values()),
179
+ names=list(self.stats['dtypes'].keys()),
180
+ title="Data Types")
181
+ st.plotly_chart(fig, use_container_width=True)
182
+
183
+ # Sample data with pagination
184
+ st.subheader("Sample Data")
185
+ total_pages = (len(self.df) - 1) // self.page_size + 1
186
+
187
+ if total_pages > 1:
188
+ page = st.slider("Page", 0, total_pages - 1, 0)
189
+ sample_data = self.get_paginated_data(page)
190
+ st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
191
+ else:
192
+ sample_data = self.df.head(10)
193
+
194
+ st.dataframe(sample_data, use_container_width=True)
195
+
196
+ # Missing values analysis
197
+ missing_df = calculate_missing_data(self.df)
198
+ if not missing_df.empty:
199
+ st.subheader("Missing Values Analysis")
200
+ st.dataframe(missing_df, use_container_width=True)
201
+
202
+ worst_column = missing_df.iloc[0]['Column']
203
+ worst_percentage = missing_df.iloc[0]['Missing %']
204
+ self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
205
+ else:
206
+ st.success("✅ No missing values found!")
207
+ self.add_insight("Dataset has no missing values - excellent data quality", 1)
208
+
209
+ # Add insights about data quality and cardinality
210
+ if quality_metrics['score'] < 80:
211
+ self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
212
+
213
+ if memory_opt['potential_savings_pct'] > 20:
214
+ self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
215
+
216
+ if id_cols:
217
+ self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
218
+
219
+ def stage_2_exploration(self):
220
+ """Stage 2: Exploratory Data Analysis with caching"""
221
+ st.subheader("🔍 Exploratory Data Analysis")
222
+
223
+ numeric_cols = self.column_types['numeric']
224
+ categorical_cols = self.column_types['categorical']
225
+
226
+ # Numeric analysis
227
+ if numeric_cols:
228
+ st.subheader("Numeric Variables")
229
+ selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
230
+
231
+ col1, col2 = st.columns(2)
232
+ with col1:
233
+ fig = px.histogram(self.df, x=selected_numeric,
234
+ title=f"Distribution of {selected_numeric}")
235
+ st.plotly_chart(fig, use_container_width=True)
236
+
237
+ with col2:
238
+ fig = px.box(self.df, y=selected_numeric,
239
+ title=f"Box Plot of {selected_numeric}")
240
+ st.plotly_chart(fig, use_container_width=True)
241
+
242
+ # Statistical summary
243
+ st.subheader("Statistical Summary")
244
+ summary_stats = self.df[numeric_cols].describe()
245
+ st.dataframe(summary_stats, use_container_width=True)
246
+
247
+ # Correlation analysis
248
+ if len(numeric_cols) > 1:
249
+ st.subheader("Correlation Analysis")
250
+ corr_matrix = calculate_correlation_matrix(self.df)
251
+ if not corr_matrix.empty:
252
+ fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
+ title="Correlation Matrix")
254
+ st.plotly_chart(fig, use_container_width=True)
255
+
256
+ # Find highest correlation
257
+ corr_values = []
258
+ for i in range(len(corr_matrix.columns)):
259
+ for j in range(i+1, len(corr_matrix.columns)):
260
+ corr_values.append(abs(corr_matrix.iloc[i, j]))
261
+
262
+ if corr_values:
263
+ max_corr = max(corr_values)
264
+ self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
+
266
+ # Categorical analysis
267
+ if categorical_cols:
268
+ st.subheader("Categorical Variables")
269
+ selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
270
+
271
+ value_counts = get_value_counts(self.df, selected_categorical)
272
+ fig = px.bar(x=value_counts.index, y=value_counts.values,
273
+ title=f"Top 10 {selected_categorical} Values")
274
+ st.plotly_chart(fig, use_container_width=True)
275
+
276
+ total_categories = self.df[selected_categorical].nunique()
277
+ self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
278
+
279
+ def stage_3_cleaning(self):
280
+ """Stage 3: Data Quality Assessment"""
281
+ st.subheader("🧹 Data Quality Assessment")
282
+
283
+ cleaning_actions = []
284
+ cleaning_history = []
285
+
286
+ # Missing values handling
287
+ if self.stats['missing_values'] > 0:
288
+ st.subheader("Missing Values Treatment")
289
+ missing_df = calculate_missing_data(self.df)
290
+ st.dataframe(missing_df, use_container_width=True)
291
+
292
+ col1, col2 = st.columns(2)
293
+ with col1:
294
+ selected_col = st.selectbox("Select column to handle missing values:",
295
+ missing_df['Column'].tolist())
296
+ with col2:
297
+ fill_method = st.selectbox("Choose fill method:",
298
+ ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
299
+
300
+ if st.button("Apply Missing Value Treatment"):
301
+ try:
302
+ if fill_method == "Drop rows":
303
+ self.df = self.df.dropna(subset=[selected_col])
304
+ cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
305
+ else:
306
+ if fill_method == "Mean":
307
+ fill_value = self.df[selected_col].mean()
308
+ elif fill_method == "Median":
309
+ fill_value = self.df[selected_col].median()
310
+ elif fill_method == "Mode":
311
+ fill_value = self.df[selected_col].mode()[0]
312
+ else: # Custom value
313
+ fill_value = st.number_input("Enter custom value:", value=0.0)
314
 
315
+ self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
+ cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
+
318
+ st.success("✅ Missing values handled successfully!")
319
+ except Exception as e:
320
+ st.error(f"Error handling missing values: {str(e)}")
321
+
322
+ # Duplicates handling
323
+ if self.stats['duplicates'] > 0:
324
+ st.subheader("Duplicate Rows")
325
+ st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
+
327
+ if st.button("Remove Duplicate Rows"):
328
+ original_len = len(self.df)
329
+ self.df = self.df.drop_duplicates()
330
+ removed = original_len - len(self.df)
331
+ cleaning_history.append(f"Removed {removed} duplicate rows")
332
+ st.success(f"✅ Removed {removed} duplicate rows")
333
+ else:
334
+ st.success("✅ No duplicate rows found")
335
+
336
+ # Mixed type detection and handling
337
+ mixed_types = detect_mixed_types(self.df)
338
+ if mixed_types:
339
+ st.subheader("Mixed Data Types")
340
+ mixed_df = pd.DataFrame(mixed_types)
341
+ st.dataframe(mixed_df, use_container_width=True)
342
+
343
+ selected_col = st.selectbox("Select column to fix data type:",
344
+ [item['column'] for item in mixed_types])
345
+
346
+ fix_method = st.selectbox("Choose fix method:",
347
+ ["Convert to numeric", "Convert to string"])
348
+
349
+ if st.button("Fix Data Type"):
350
+ try:
351
+ if fix_method == "Convert to numeric":
352
+ self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
+ else:
354
+ self.df[selected_col] = self.df[selected_col].astype(str)
355
+
356
+ cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
+ st.success("✅ Data type fixed successfully!")
358
+ except Exception as e:
359
+ st.error(f"Error fixing data type: {str(e)}")
360
+
361
+ # Outlier detection and handling
362
+ numeric_cols = self.column_types['numeric']
363
+ if numeric_cols:
364
+ st.subheader("Outlier Detection")
365
+ selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
+
367
+ outliers = calculate_outliers(self.df, selected_col)
368
+ outlier_count = len(outliers)
369
+
370
+ if outlier_count > 0:
371
+ st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
+ st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
373
+
374
+ treatment_method = st.selectbox("Choose outlier treatment method:",
375
+ ["None", "Remove", "Cap at percentiles"])
376
 
377
+ if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
  try:
379
+ if treatment_method == "Remove":
380
+ self.df = self.df[~self.df.index.isin(outliers.index)]
381
+ cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
+ else: # Cap at percentiles
383
+ Q1 = self.df[selected_col].quantile(0.25)
384
+ Q3 = self.df[selected_col].quantile(0.75)
385
+ IQR = Q3 - Q1
386
+ lower_bound = Q1 - 1.5 * IQR
387
+ upper_bound = Q3 + 1.5 * IQR
388
+
389
+ self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
+ cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
 
 
 
 
 
 
391
 
392
+ st.success("✅ Outliers handled successfully!")
393
  except Exception as e:
394
+ st.error(f"Error handling outliers: {str(e)}")
395
+ else:
396
+ st.success(f"✅ No outliers detected in '{selected_col}'")
397
+
398
+ # Cleaning History
399
+ if cleaning_history:
400
+ st.subheader("Cleaning Operations History")
401
+ for i, operation in enumerate(cleaning_history, 1):
402
+ st.write(f"{i}. {operation}")
403
+ self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
+
405
+ # Summary
406
+ if cleaning_actions:
407
+ st.subheader("Remaining Action Items")
408
+ for i, action in enumerate(cleaning_actions, 1):
409
+ st.write(f"{i}. {action}")
410
+ self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
+ else:
412
+ st.success("✅ Data quality is excellent!")
413
+ self.add_insight("No major data quality issues found", 3)
414
 
415
+ def stage_4_analysis(self):
416
+ """Stage 4: Advanced Analysis"""
417
+ st.subheader("🔬 Advanced Analysis")
418
+
419
+ numeric_cols = self.column_types['numeric']
420
+ categorical_cols = self.column_types['categorical']
421
+
422
+ # Relationship analysis
423
+ if len(numeric_cols) >= 2:
424
+ st.subheader("Variable Relationships")
425
+
426
+ col1, col2 = st.columns(2)
427
+ with col1:
428
+ x_var = st.selectbox("X Variable:", numeric_cols)
429
+ with col2:
430
+ y_var = st.selectbox("Y Variable:",
431
+ [col for col in numeric_cols if col != x_var])
432
+
433
+ # Sample data for performance if dataset is large
434
+ sample_size = min(5000, len(self.df))
435
+ sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
436
+
437
+ fig = px.scatter(sample_df, x=x_var, y=y_var,
438
+ title=f"Relationship: {x_var} vs {y_var}")
439
+ st.plotly_chart(fig, use_container_width=True)
440
+
441
+ correlation = self.df[x_var].corr(self.df[y_var])
442
+ st.metric("Correlation", f"{correlation:.3f}")
443
+
444
+ if abs(correlation) > 0.7:
445
+ strength = "Strong"
446
+ elif abs(correlation) > 0.3:
447
+ strength = "Moderate"
448
+ else:
449
+ strength = "Weak"
450
 
451
+ direction = "positive" if correlation > 0 else "negative"
452
+ st.write(f"**Result:** {strength} {direction} correlation")
453
+ self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
454
+
455
+ # Group analysis
456
+ if categorical_cols and numeric_cols:
457
+ st.subheader("Group Analysis")
458
+
459
+ col1, col2 = st.columns(2)
460
+ with col1:
461
+ group_var = st.selectbox("Group by:", categorical_cols)
462
+ with col2:
463
+ metric_var = st.selectbox("Analyze:", numeric_cols)
464
+
465
+ group_stats = calculate_group_stats(self.df, group_var, metric_var)
466
+ st.dataframe(group_stats, use_container_width=True)
467
+
468
+ # Sample for visualization if too many groups
469
+ unique_groups = self.df[group_var].nunique()
470
+ if unique_groups <= 20:
471
+ fig = px.box(self.df, x=group_var, y=metric_var,
472
+ title=f"{metric_var} by {group_var}")
473
+ st.plotly_chart(fig, use_container_width=True)
474
+ else:
475
+ st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
476
 
477
+ best_group = group_stats['mean'].idxmax()
478
+ best_value = group_stats.loc[best_group, 'mean']
479
+ self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
480
+
481
def stage_5_summary(self):
    """Stage 5: Summary and Export.

    Renders the final dashboard: headline metrics, the insights collected
    during earlier stages, and export widgets for reports, reproducible
    code, and the cleaned dataframe.
    """
    st.subheader("📈 Analysis Summary")

    # Headline metrics.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Insights", len(self.insights))
    with col2:
        # NOTE(review): quality heuristic only considers missing values;
        # duplicates/outliers are not factored in — confirm intent.
        quality = "High" if self.stats['missing_values'] == 0 else "Medium"
        st.metric("Data Quality", quality)
    with col3:
        st.metric("Analysis Complete", "✅")

    # Insights summary.
    st.subheader("Key Insights")
    for i, insight in enumerate(self.insights, 1):
        st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")

    # Export options.
    st.subheader("Export Results")
    export_format = st.selectbox("Choose export format:",
                                 ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])

    if export_format == "Text Report":
        report = self.generate_text_report()
        st.download_button(
            label="Download Text Report",
            data=report,
            file_name="analysis_report.txt",
            mime="text/plain"
        )

    elif export_format == "Markdown Report":
        report = self.generate_markdown_report()
        st.download_button(
            label="Download Markdown Report",
            data=report,
            file_name="analysis_report.md",
            mime="text/markdown"
        )

    elif export_format == "Python Code":
        code = self.generate_python_code()
        st.code(code, language="python")
        st.download_button(
            label="Download Python Script",
            data=code,
            file_name="analysis_script.py",
            mime="text/plain"
        )

    else:  # Cleaned Data
        data_format = st.selectbox("Choose data format:",
                                   ["CSV", "Excel", "Parquet"])

        # BUG FIX: the original wrapped the download buttons in
        # `if st.button("Export Data"):`. Clicking a st.download_button
        # triggers a Streamlit rerun in which the outer button evaluates
        # to False, so the download widget disappeared before the file
        # could be fetched. Serialize eagerly and show the download
        # button directly instead.
        try:
            if data_format == "CSV":
                csv = self.df.to_csv(index=False)
                st.download_button(
                    label="Download CSV",
                    data=csv,
                    file_name="cleaned_data.csv",
                    mime="text/csv"
                )
            elif data_format == "Excel":
                excel_buffer = BytesIO()
                self.df.to_excel(excel_buffer, index=False)
                st.download_button(
                    label="Download Excel",
                    data=excel_buffer.getvalue(),
                    file_name="cleaned_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
            else:  # Parquet — requires an optional engine (pyarrow/fastparquet)
                parquet_buffer = BytesIO()
                self.df.to_parquet(parquet_buffer, index=False)
                st.download_button(
                    label="Download Parquet",
                    data=parquet_buffer.getvalue(),
                    file_name="cleaned_data.parquet",
                    mime="application/octet-stream"
                )
        except Exception as e:
            # Surface serialization failures (e.g. missing parquet engine)
            # in the UI rather than crashing the app.
            st.error(f"Error exporting data: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
 
571
def generate_text_report(self) -> str:
    """Generate a plain-text analysis report.

    Pulls the dataset overview figures from ``self.stats`` and appends
    one line per insight collected in ``self.insights``, followed by a
    generation timestamp.

    Returns:
        str: the complete report as a single newline-joined string.
    """
    # Build the report as a list of lines and join once at the end —
    # avoids repeated string concatenation inside the loop.
    lines = [
        "DATA ANALYSIS REPORT",
        "==================",
        "",
        "Dataset Overview:",
        f"- Rows: {self.stats['shape'][0]:,}",
        f"- Columns: {self.stats['shape'][1]:,}",
        f"- Missing Values: {self.stats['missing_values']:,}",
        f"- Memory Usage: {self.stats['memory_usage']:.1f} MB",
        "",
        "Key Insights:",
    ]
    lines.extend(
        f"- Stage {insight['stage']}: {insight['insight']}"
        for insight in self.insights
    )
    lines.append("")
    lines.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
    return "\n".join(lines)
589
+
590
def generate_markdown_report(self) -> str:
    """Generate a Markdown analysis report.

    Contains a dataset overview, a dtype summary table, and the insights
    grouped by analysis stage (stages 1-5), plus a generation timestamp.

    Returns:
        str: the complete Markdown document.
    """
    lines = [
        "# Data Analysis Report",
        "",
        "## Dataset Overview",
        f"* **Rows:** {self.stats['shape'][0]:,}",
        f"* **Columns:** {self.stats['shape'][1]:,}",
        f"* **Missing Values:** {self.stats['missing_values']:,}",
        f"* **Memory Usage:** {self.stats['memory_usage']:.1f} MB",
        "",
        "## Data Types",
        "```",
    ]

    dtype_df = pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count'])
    try:
        # DataFrame.to_markdown needs the optional 'tabulate' package;
        # fall back to a plain-text table so report generation never
        # crashes when it is not installed.
        lines.append(dtype_df.to_markdown())
    except ImportError:
        lines.append(dtype_df.to_string(index=False))
    lines.append("```")
    lines.append("")
    lines.append("## Key Insights")

    # Group insights by stage (pipeline has stages 1-5).
    for stage in range(1, 6):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if stage_insights:
            lines.append(f"### Stage {stage}")
            lines.extend(f"* {insight['insight']}" for insight in stage_insights)
            lines.append("")

    lines.append(f"*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*")
    return "\n".join(lines)
617
+
618
def generate_python_code(self) -> str:
    """Generate a standalone, reproducible Python analysis script.

    The emitted script loads the data, prints basic statistics, replays
    any cleaning operations recorded in ``self.cleaning_history`` (when
    that attribute exists), and renders the standard plots.

    Returns:
        str: the script source text.
    """
    # Collect script fragments and join once at the end instead of
    # repeated string concatenation.
    parts = ['''import pandas as pd
import numpy as np
import plotly.express as px
from typing import Dict, List, Any

# Load and prepare data
df = pd.read_csv('your_data.csv')  # Update with your data source

# Basic statistics
def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
    return {
        'shape': df.shape,
        'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
        'missing_values': int(df.isnull().sum().sum()),
        'dtypes': df.dtypes.value_counts().to_dict(),
        'duplicates': int(df.duplicated().sum())
    }

stats = calculate_basic_stats(df)
print("\\nBasic Statistics:")
print(f"- Shape: {stats['shape']}")
print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
print(f"- Missing Values: {stats['missing_values']}")
print(f"- Duplicates: {stats['duplicates']}")
''']

    # Replay recorded data-cleaning steps, if any were performed.
    if hasattr(self, 'cleaning_history'):
        parts.append("\n# Data Cleaning\n")
        for operation in self.cleaning_history:
            op = operation.lower()
            if "missing values" in op:
                # BUG FIX: fillna(method='ffill') is deprecated (and removed
                # in pandas 3.x); emit the modern DataFrame.ffill() instead.
                parts.append("# Handle missing values\n"
                             "df = df.ffill()  # Update with your chosen method\n")
            elif "duplicate" in op:
                parts.append("# Remove duplicates\n"
                             "df = df.drop_duplicates()\n")
            elif "outlier" in op:
                parts.append('''# Handle outliers
def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]

# Apply to numeric columns as needed
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df = remove_outliers(df, col)
''')

    # Visualization helpers are always emitted.
    parts.append('''
# Visualizations
def plot_missing_values(df: pd.DataFrame):
    missing = df.isnull().sum()
    if missing.sum() > 0:
        missing = missing[missing > 0]
        fig = px.bar(x=missing.index, y=missing.values,
                     title='Missing Values by Column')
        fig.show()

def plot_correlations(df: pd.DataFrame):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        corr = df[numeric_cols].corr()
        fig = px.imshow(corr, title='Correlation Matrix')
        fig.show()

# Generate plots
plot_missing_values(df)
plot_correlations(df)
''')

    return "".join(parts)