entropy25 commited on
Commit
86805f4
·
verified ·
1 Parent(s): 1cc5290

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +1261 -483
analyzer.py CHANGED
@@ -5,6 +5,7 @@ import plotly.express as px
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
 
8
  from dotenv import load_dotenv
9
  from data_handler import *
10
  from io import BytesIO
@@ -12,29 +13,42 @@ from io import BytesIO
12
  # Load environment variables
13
  load_dotenv()
14
 
15
- # Optional AI Integration
 
 
 
16
  try:
17
  import openai
18
  OPENAI_AVAILABLE = True
19
  except ImportError:
20
  OPENAI_AVAILABLE = False
 
21
 
22
  try:
23
  import google.generativeai as genai
24
  GEMINI_AVAILABLE = True
25
  except ImportError:
26
  GEMINI_AVAILABLE = False
 
27
 
28
  class AIAssistant:
29
- """AI-powered analysis assistant"""
30
 
31
  def __init__(self):
32
  self.openai_key = os.getenv('OPENAI_API_KEY')
33
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
34
-
35
- if self.gemini_key and GEMINI_AVAILABLE:
36
- genai.configure(api_key=self.gemini_key)
37
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
 
 
 
 
 
 
 
38
 
39
  def get_available_models(self) -> List[str]:
40
  """Get list of available AI models"""
@@ -46,648 +60,1412 @@ class AIAssistant:
46
  return models
47
 
48
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
49
- """Get AI analysis of insights"""
50
-
51
- # Prepare data summary
52
- summary = f"""
53
- Dataset Summary:
54
- - Shape: {df.shape}
55
- - Columns: {list(df.columns)}
56
- - Data types: {df.dtypes.value_counts().to_dict()}
57
-
58
- Key Insights Found:
59
- """
60
-
61
- for insight in insights:
62
- summary += f"\n- {insight['insight']}"
63
-
64
- prompt = f"""
65
- As a senior data scientist, analyze this dataset and provide:
66
 
67
- 1. Business implications of the findings
68
- 2. Potential opportunities or risks
69
- 3. Recommendations for decision-making
70
- 4. Suggestions for further analysis
71
-
72
- {summary}
73
-
74
- Provide actionable insights in a professional format.
75
- """
76
 
77
  try:
 
 
 
 
78
  if model == "Google Gemini" and hasattr(self, 'gemini_model'):
79
  response = self.gemini_model.generate_content(prompt)
80
- return response.text
81
- elif model == "OpenAI GPT" and self.openai_key:
 
82
  client = openai.OpenAI(api_key=self.openai_key)
83
  response = client.chat.completions.create(
84
  model="gpt-3.5-turbo",
85
- messages=[{"role": "user", "content": prompt}]
 
 
86
  )
87
- return response.choices[0].message.content
 
88
  else:
89
- return "AI analysis not available. Please configure API keys."
 
90
  except Exception as e:
91
- return f"AI Analysis Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  class DataAnalysisWorkflow:
94
- """Optimized data analysis workflow with caching and pagination"""
95
 
96
  def __init__(self, df: pd.DataFrame):
97
  self.df = df
 
98
  self.stats = calculate_basic_stats(df)
99
  self.column_types = get_column_types(df)
100
  self.insights = []
101
- self.page_size = 1000 # For pagination
 
102
 
103
- def add_insight(self, insight: str, stage: int):
104
- """Add insight to analysis report"""
 
 
 
 
 
 
105
  self.insights.append({
106
  'stage': stage,
107
  'insight': insight,
 
108
  'timestamp': pd.Timestamp.now()
109
  })
110
 
111
  def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
112
- """Get paginated data for display"""
113
- start_idx = page * self.page_size
114
- end_idx = start_idx + self.page_size
115
- return self.df.iloc[start_idx:end_idx]
 
 
 
 
116
 
117
  def stage_1_overview(self):
118
- """Stage 1: Data Overview with caching"""
119
  st.subheader("📊 Data Overview")
120
 
121
- # Data Quality Score
 
 
 
 
 
 
 
 
 
 
122
  quality_metrics = calculate_data_quality_score(self.df)
 
123
  col1, col2, col3, col4 = st.columns(4)
124
  with col1:
125
- st.metric("Rows", f"{self.stats['shape'][0]:,}")
126
  with col2:
127
- st.metric("Columns", f"{self.stats['shape'][1]:,}")
128
  with col3:
129
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
 
 
130
  with col4:
131
- st.metric("Grade", quality_metrics['grade'])
 
132
 
 
133
  if quality_metrics['issues']:
134
- st.warning("Quality Issues Found:")
135
  for issue in quality_metrics['issues']:
136
  st.write(f"• {issue}")
137
 
138
- # Memory Usage and Optimization
139
- st.subheader("Memory Analysis")
 
 
 
 
 
140
  memory_opt = calculate_memory_optimization(self.df)
141
- col1, col2 = st.columns(2)
 
142
  with col1:
143
  st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
144
  with col2:
145
  if memory_opt['potential_savings_mb'] > 0:
146
  st.metric("Potential Savings",
147
  f"{memory_opt['potential_savings_mb']:.1f} MB",
148
- f"{memory_opt['potential_savings_pct']:.1f}%")
149
-
150
- if st.button("Show Optimization Details"):
151
- st.dataframe(pd.DataFrame(memory_opt['suggestions']))
 
 
 
 
 
152
 
153
- # Column Cardinality Analysis
154
- st.subheader("Column Cardinality Analysis")
155
  cardinality_df = calculate_column_cardinality(self.df)
156
 
157
- # Filter options
158
- col_types = cardinality_df['Type'].unique()
159
- selected_types = st.multiselect("Filter by Column Type",
160
- col_types,
161
- default=col_types)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
164
- st.dataframe(filtered_df, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- # Highlight important findings
167
- id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
168
  if id_cols:
169
- st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
170
-
171
- const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
172
  if const_cols:
173
- st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
 
 
 
 
 
 
 
 
 
174
 
175
- # Data types visualization
176
- if self.stats['dtypes']:
177
- st.subheader("Data Types Distribution")
178
- fig = px.pie(values=list(self.stats['dtypes'].values()),
179
- names=list(self.stats['dtypes'].keys()),
180
- title="Data Types")
181
- st.plotly_chart(fig, use_container_width=True)
182
-
183
- # Sample data with pagination
184
- st.subheader("Sample Data")
185
  total_pages = (len(self.df) - 1) // self.page_size + 1
186
 
187
- if total_pages > 1:
188
- page = st.slider("Page", 0, total_pages - 1, 0)
189
- sample_data = self.get_paginated_data(page)
190
- st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
191
- else:
192
- sample_data = self.df.head(10)
 
 
 
 
 
 
193
 
194
- st.dataframe(sample_data, use_container_width=True)
 
 
 
 
195
 
196
- # Missing values analysis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  missing_df = calculate_missing_data(self.df)
 
198
  if not missing_df.empty:
199
- st.subheader("Missing Values Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  st.dataframe(missing_df, use_container_width=True)
201
 
202
- worst_column = missing_df.iloc[0]['Column']
203
- worst_percentage = missing_df.iloc[0]['Missing %']
204
- self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
 
 
 
 
 
 
 
 
 
 
 
 
205
  else:
206
- st.success("✅ No missing values found!")
207
- self.add_insight("Dataset has no missing values - excellent data quality", 1)
208
-
209
- # Add insights about data quality and cardinality
210
- if quality_metrics['score'] < 80:
211
- self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
 
 
 
212
 
 
213
  if memory_opt['potential_savings_pct'] > 20:
214
- self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
215
 
216
- if id_cols:
217
- self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
 
 
 
 
 
 
 
218
 
219
  def stage_2_exploration(self):
220
- """Stage 2: Exploratory Data Analysis with caching"""
221
  st.subheader("🔍 Exploratory Data Analysis")
222
 
 
 
 
 
 
 
 
 
 
223
  numeric_cols = self.column_types['numeric']
224
  categorical_cols = self.column_types['categorical']
225
 
226
- # Numeric analysis
 
 
 
 
227
  if numeric_cols:
228
- st.subheader("Numeric Variables")
229
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
230
-
231
- col1, col2 = st.columns(2)
232
- with col1:
233
- fig = px.histogram(self.df, x=selected_numeric,
234
- title=f"Distribution of {selected_numeric}")
235
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- with col2:
238
- fig = px.box(self.df, y=selected_numeric,
239
- title=f"Box Plot of {selected_numeric}")
240
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
241
 
242
- # Statistical summary
243
- st.subheader("Statistical Summary")
244
- summary_stats = self.df[numeric_cols].describe()
245
- st.dataframe(summary_stats, use_container_width=True)
246
-
247
- # Correlation analysis
248
- if len(numeric_cols) > 1:
249
- st.subheader("Correlation Analysis")
250
- corr_matrix = calculate_correlation_matrix(self.df)
251
- if not corr_matrix.empty:
252
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
- title="Correlation Matrix")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- # Find highest correlation
257
- corr_values = []
258
- for i in range(len(corr_matrix.columns)):
259
- for j in range(i+1, len(corr_matrix.columns)):
260
- corr_values.append(abs(corr_matrix.iloc[i, j]))
 
 
 
 
 
261
 
262
- if corr_values:
263
- max_corr = max(corr_values)
264
- self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
-
266
- # Categorical analysis
267
- if categorical_cols:
268
- st.subheader("Categorical Variables")
269
- selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
- value_counts = get_value_counts(self.df, selected_categorical)
272
- fig = px.bar(x=value_counts.index, y=value_counts.values,
273
- title=f"Top 10 {selected_categorical} Values")
274
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
275
 
276
- total_categories = self.df[selected_categorical].nunique()
277
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  def stage_3_cleaning(self):
280
- """Stage 3: Data Quality Assessment"""
281
- st.subheader("🧹 Data Quality Assessment")
282
 
283
- cleaning_actions = []
284
- cleaning_history = []
 
 
 
 
 
 
285
 
286
- # Missing values handling
287
- if self.stats['missing_values'] > 0:
288
- st.subheader("Missing Values Treatment")
289
- missing_df = calculate_missing_data(self.df)
290
- st.dataframe(missing_df, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
- col1, col2 = st.columns(2)
 
293
  with col1:
294
- selected_col = st.selectbox("Select column to handle missing values:",
295
- missing_df['Column'].tolist())
296
  with col2:
297
- fill_method = st.selectbox("Choose fill method:",
298
- ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
 
 
 
 
 
 
 
 
 
 
299
 
300
- if st.button("Apply Missing Value Treatment"):
 
 
 
 
 
 
 
 
 
 
 
 
301
  try:
 
 
302
  if fill_method == "Drop rows":
303
  self.df = self.df.dropna(subset=[selected_col])
304
- cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
305
  else:
306
  if fill_method == "Mean":
307
  fill_value = self.df[selected_col].mean()
308
  elif fill_method == "Median":
309
  fill_value = self.df[selected_col].median()
310
  elif fill_method == "Mode":
311
- fill_value = self.df[selected_col].mode()[0]
312
- else: # Custom value
313
- fill_value = st.number_input("Enter custom value:", value=0.0)
 
314
 
315
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
- cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
 
318
- st.success("✅ Missing values handled successfully!")
 
 
 
319
  except Exception as e:
320
- st.error(f"Error handling missing values: {str(e)}")
321
-
322
- # Duplicates handling
323
- if self.stats['duplicates'] > 0:
324
- st.subheader("Duplicate Rows")
325
- st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
-
327
- if st.button("Remove Duplicate Rows"):
328
- original_len = len(self.df)
329
- self.df = self.df.drop_duplicates()
330
- removed = original_len - len(self.df)
331
- cleaning_history.append(f"Removed {removed} duplicate rows")
332
- st.success(f"✅ Removed {removed} duplicate rows")
333
  else:
334
- st.success("✅ No duplicate rows found")
335
-
336
- # Mixed type detection and handling
337
- mixed_types = detect_mixed_types(self.df)
338
- if mixed_types:
339
- st.subheader("Mixed Data Types")
340
- mixed_df = pd.DataFrame(mixed_types)
341
- st.dataframe(mixed_df, use_container_width=True)
342
 
343
- selected_col = st.selectbox("Select column to fix data type:",
344
- [item['column'] for item in mixed_types])
345
 
346
- fix_method = st.selectbox("Choose fix method:",
347
- ["Convert to numeric", "Convert to string"])
 
 
 
348
 
349
- if st.button("Fix Data Type"):
350
  try:
351
- if fix_method == "Convert to numeric":
352
- self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
- else:
354
- self.df[selected_col] = self.df[selected_col].astype(str)
355
-
356
- cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
- st.success("✅ Data type fixed successfully!")
358
  except Exception as e:
359
- st.error(f"Error fixing data type: {str(e)}")
 
 
 
 
 
 
360
 
361
- # Outlier detection and handling
362
- numeric_cols = self.column_types['numeric']
363
- if numeric_cols:
364
- st.subheader("Outlier Detection")
365
- selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
-
367
- outliers = calculate_outliers(self.df, selected_col)
368
- outlier_count = len(outliers)
369
 
370
- if outlier_count > 0:
371
- st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
- st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
 
373
 
374
- treatment_method = st.selectbox("Choose outlier treatment method:",
375
- ["None", "Remove", "Cap at percentiles"])
376
 
377
- if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
- try:
379
- if treatment_method == "Remove":
380
- self.df = self.df[~self.df.index.isin(outliers.index)]
381
- cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
- else: # Cap at percentiles
383
- Q1 = self.df[selected_col].quantile(0.25)
384
- Q3 = self.df[selected_col].quantile(0.75)
385
- IQR = Q3 - Q1
386
- lower_bound = Q1 - 1.5 * IQR
387
- upper_bound = Q3 + 1.5 * IQR
 
 
 
 
 
 
 
 
 
388
 
389
- self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
- cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
391
-
392
- st.success("✅ Outliers handled successfully!")
393
- except Exception as e:
394
- st.error(f"Error handling outliers: {str(e)}")
395
- else:
396
- st.success(f"✅ No outliers detected in '{selected_col}'")
397
-
398
- # Cleaning History
399
- if cleaning_history:
400
- st.subheader("Cleaning Operations History")
401
- for i, operation in enumerate(cleaning_history, 1):
402
- st.write(f"{i}. {operation}")
403
- self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
-
405
- # Summary
406
- if cleaning_actions:
407
- st.subheader("Remaining Action Items")
408
- for i, action in enumerate(cleaning_actions, 1):
409
- st.write(f"{i}. {action}")
410
- self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
  else:
412
- st.success("✅ Data quality is excellent!")
413
- self.add_insight("No major data quality issues found", 3)
414
 
415
- def stage_4_analysis(self):
416
- """Stage 4: Advanced Analysis"""
417
- st.subheader("🔬 Advanced Analysis")
418
-
419
  numeric_cols = self.column_types['numeric']
420
- categorical_cols = self.column_types['categorical']
421
 
422
- # Relationship analysis
423
- if len(numeric_cols) >= 2:
424
- st.subheader("Variable Relationships")
425
 
426
- col1, col2 = st.columns(2)
427
  with col1:
428
- x_var = st.selectbox("X Variable:", numeric_cols)
429
  with col2:
430
- y_var = st.selectbox("Y Variable:",
431
- [col for col in numeric_cols if col != x_var])
432
-
433
- # Sample data for performance if dataset is large
434
- sample_size = min(5000, len(self.df))
435
- sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
 
436
 
437
- fig = px.scatter(sample_df, x=x_var, y=y_var,
438
- title=f"Relationship: {x_var} vs {y_var}")
439
- st.plotly_chart(fig, use_container_width=True)
440
-
441
- correlation = self.df[x_var].corr(self.df[y_var])
442
- st.metric("Correlation", f"{correlation:.3f}")
443
-
444
- if abs(correlation) > 0.7:
445
- strength = "Strong"
446
- elif abs(correlation) > 0.3:
447
- strength = "Moderate"
448
- else:
449
- strength = "Weak"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
- direction = "positive" if correlation > 0 else "negative"
452
- st.write(f"**Result:** {strength} {direction} correlation")
453
- self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
454
-
455
- # Group analysis
456
- if categorical_cols and numeric_cols:
457
- st.subheader("Group Analysis")
458
 
 
459
  col1, col2 = st.columns(2)
460
  with col1:
461
- group_var = st.selectbox("Group by:", categorical_cols)
 
462
  with col2:
463
- metric_var = st.selectbox("Analyze:", numeric_cols)
 
 
 
 
 
464
 
465
- group_stats = calculate_group_stats(self.df, group_var, metric_var)
466
- st.dataframe(group_stats, use_container_width=True)
 
 
 
 
467
 
468
- # Sample for visualization if too many groups
469
- unique_groups = self.df[group_var].nunique()
470
- if unique_groups <= 20:
471
- fig = px.box(self.df, x=group_var, y=metric_var,
472
- title=f"{metric_var} by {group_var}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  st.plotly_chart(fig, use_container_width=True)
474
- else:
475
- st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
- best_group = group_stats['mean'].idxmax()
478
- best_value = group_stats.loc[best_group, 'mean']
479
- self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  def stage_5_summary(self):
482
- """Stage 5: Summary and Export"""
483
- st.subheader("📈 Analysis Summary")
484
 
485
- # Key metrics
486
- col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
 
 
487
  with col1:
488
- st.metric("Total Insights", len(self.insights))
489
  with col2:
490
- quality = "High" if self.stats['missing_values'] == 0 else "Medium"
491
- st.metric("Data Quality", quality)
492
  with col3:
493
- st.metric("Analysis Complete", "✅")
494
-
495
- # Insights summary
496
- st.subheader("Key Insights")
497
- for i, insight in enumerate(self.insights, 1):
498
- st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
499
-
500
- # Export options
501
- st.subheader("Export Results")
502
- export_format = st.selectbox("Choose export format:",
503
- ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
504
-
505
- if export_format == "Text Report":
506
- report = self.generate_text_report()
507
- st.download_button(
508
- label="Download Text Report",
509
- data=report,
510
- file_name="analysis_report.txt",
511
- mime="text/plain"
512
- )
513
-
514
- elif export_format == "Markdown Report":
515
- report = self.generate_markdown_report()
516
- st.download_button(
517
- label="Download Markdown Report",
518
- data=report,
519
- file_name="analysis_report.md",
520
- mime="text/markdown"
521
- )
522
-
523
- elif export_format == "Python Code":
524
- code = self.generate_python_code()
525
- st.code(code, language="python")
526
- st.download_button(
527
- label="Download Python Script",
528
- data=code,
529
- file_name="analysis_script.py",
530
- mime="text/plain"
531
- )
532
-
533
- else: # Cleaned Data
534
- # Offer different export formats
535
- data_format = st.selectbox("Choose data format:",
536
- ["CSV", "Excel", "Parquet"])
537
-
538
- if st.button("Export Data"):
539
- try:
540
- if data_format == "CSV":
541
- csv = self.df.to_csv(index=False)
542
- st.download_button(
543
- label="Download CSV",
544
- data=csv,
545
- file_name="cleaned_data.csv",
546
- mime="text/csv"
547
- )
548
- elif data_format == "Excel":
549
- excel_buffer = BytesIO()
550
- self.df.to_excel(excel_buffer, index=False)
551
- excel_data = excel_buffer.getvalue()
552
- st.download_button(
553
- label="Download Excel",
554
- data=excel_data,
555
- file_name="cleaned_data.xlsx",
556
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
557
- )
558
- else: # Parquet
559
- parquet_buffer = BytesIO()
560
- self.df.to_parquet(parquet_buffer, index=False)
561
- parquet_data = parquet_buffer.getvalue()
562
- st.download_button(
563
- label="Download Parquet",
564
- data=parquet_data,
565
- file_name="cleaned_data.parquet",
566
- mime="application/octet-stream"
567
- )
568
- except Exception as e:
569
- st.error(f"Error exporting data: {str(e)}")
570
 
571
- def generate_text_report(self) -> str:
572
- """Generate text analysis report"""
573
- report = f"""DATA ANALYSIS REPORT
574
- ==================
575
-
576
- Dataset Overview:
577
- - Rows: {self.stats['shape'][0]:,}
578
- - Columns: {self.stats['shape'][1]:,}
579
- - Missing Values: {self.stats['missing_values']:,}
580
- - Memory Usage: {self.stats['memory_usage']:.1f} MB
581
-
582
- Key Insights:
583
- """
584
- for insight in self.insights:
585
- report += f"\n- Stage {insight['stage']}: {insight['insight']}"
586
 
587
- report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
588
- return report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
  def generate_markdown_report(self) -> str:
591
- """Generate markdown analysis report"""
592
- report = f"""# Data Analysis Report
593
 
594
- ## Dataset Overview
595
- * **Rows:** {self.stats['shape'][0]:,}
596
- * **Columns:** {self.stats['shape'][1]:,}
597
- * **Missing Values:** {self.stats['missing_values']:,}
598
- * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
599
 
600
- ## Data Types
601
- ```
602
- {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
603
- ```
 
 
 
604
 
605
- ## Key Insights
606
  """
 
 
 
 
 
607
  # Group insights by stage
608
- for stage in range(1, 6):
 
 
609
  stage_insights = [i for i in self.insights if i['stage'] == stage]
610
  if stage_insights:
611
- report += f"\n### Stage {stage}\n"
612
  for insight in stage_insights:
613
- report += f"* {insight['insight']}\n"
 
614
 
615
- report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
 
 
 
 
 
616
  return report
617
 
618
- def generate_python_code(self) -> str:
619
- """Generate reproducible Python code"""
620
- code = """import pandas as pd
 
 
 
 
 
 
 
621
  import numpy as np
622
  import plotly.express as px
623
- from typing import Dict, List, Any
624
-
625
- # Load and prepare data
626
- df = pd.read_csv('your_data.csv') # Update with your data source
627
 
628
- # Basic statistics
629
- def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
630
- return {
631
- 'shape': df.shape,
632
- 'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
633
- 'missing_values': int(df.isnull().sum().sum()),
634
- 'dtypes': df.dtypes.value_counts().to_dict(),
635
- 'duplicates': int(df.duplicated().sum())
636
- }
 
 
 
 
 
 
 
637
 
638
- stats = calculate_basic_stats(df)
639
- print("\\nBasic Statistics:")
640
- print(f"- Shape: {stats['shape']}")
641
- print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
642
- print(f"- Missing Values: {stats['missing_values']}")
643
- print(f"- Duplicates: {stats['duplicates']}")
 
 
 
 
 
 
 
 
644
 
645
- """
646
- # Add data cleaning operations if any were performed
647
- if hasattr(self, 'cleaning_history'):
648
- code += "\n# Data Cleaning\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
  for operation in self.cleaning_history:
650
- if "missing values" in operation.lower():
651
- code += "# Handle missing values\n"
652
- code += "df = df.fillna(method='ffill') # Update with your chosen method\n"
653
  elif "duplicate" in operation.lower():
654
- code += "# Remove duplicates\n"
655
- code += "df = df.drop_duplicates()\n"
656
  elif "outlier" in operation.lower():
657
- code += """# Handle outliers
658
- def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
659
- Q1 = df[column].quantile(0.25)
660
- Q3 = df[column].quantile(0.75)
661
- IQR = Q3 - Q1
662
- return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
663
-
664
- # Apply to numeric columns as needed
665
- numeric_cols = df.select_dtypes(include=[np.number]).columns
666
- for col in numeric_cols:
667
- df = remove_outliers(df, col)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  """
669
 
670
- # Add visualization code
671
- code += """
672
- # Visualizations
673
- def plot_missing_values(df: pd.DataFrame):
674
- missing = df.isnull().sum()
675
- if missing.sum() > 0:
676
- missing = missing[missing > 0]
677
- fig = px.bar(x=missing.index, y=missing.values,
678
- title='Missing Values by Column')
679
- fig.show()
 
 
680
 
681
- def plot_correlations(df: pd.DataFrame):
682
- numeric_cols = df.select_dtypes(include=[np.number]).columns
683
- if len(numeric_cols) > 1:
684
- corr = df[numeric_cols].corr()
685
- fig = px.imshow(corr, title='Correlation Matrix')
686
- fig.show()
687
 
688
- # Generate plots
689
- plot_missing_values(df)
690
- plot_correlations(df)
691
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
 
693
- return code
 
 
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
8
+ import logging
9
  from dotenv import load_dotenv
10
  from data_handler import *
11
  from io import BytesIO
 
13
  # Load environment variables
14
  load_dotenv()
15
 
16
+ # Configure logging
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Optional AI Integration with enhanced error handling
20
  try:
21
  import openai
22
  OPENAI_AVAILABLE = True
23
  except ImportError:
24
  OPENAI_AVAILABLE = False
25
+ logger.info("OpenAI not available - install openai package for AI features")
26
 
27
  try:
28
  import google.generativeai as genai
29
  GEMINI_AVAILABLE = True
30
  except ImportError:
31
  GEMINI_AVAILABLE = False
32
+ logger.info("Gemini not available - install google-generativeai package for AI features")
33
 
34
  class AIAssistant:
35
+ """Enhanced AI-powered analysis assistant with better error handling"""
36
 
37
  def __init__(self):
38
  self.openai_key = os.getenv('OPENAI_API_KEY')
39
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
40
+ self.setup_models()
41
+
42
+ def setup_models(self):
43
+ """Initialize AI models with error handling"""
44
+ try:
45
+ if self.gemini_key and GEMINI_AVAILABLE:
46
+ genai.configure(api_key=self.gemini_key)
47
+ self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
48
+ logger.info("Gemini model initialized successfully")
49
+ except Exception as e:
50
+ logger.error(f"Failed to initialize Gemini: {str(e)}")
51
+ self.gemini_key = None
52
 
53
  def get_available_models(self) -> List[str]:
54
  """Get list of available AI models"""
 
60
  return models
61
 
62
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
63
+ """Get AI analysis with enhanced error handling and rate limiting"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ if not insights:
66
+ return "No insights available for analysis. Please complete the data analysis stages first."
 
 
 
 
 
 
 
67
 
68
  try:
69
+ # Prepare concise data summary
70
+ summary = self._prepare_data_summary(df, insights)
71
+ prompt = self._create_analysis_prompt(summary)
72
+
73
  if model == "Google Gemini" and hasattr(self, 'gemini_model'):
74
  response = self.gemini_model.generate_content(prompt)
75
+ return self._format_ai_response(response.text)
76
+
77
+ elif model == "OpenAI GPT" and self.openai_key and OPENAI_AVAILABLE:
78
  client = openai.OpenAI(api_key=self.openai_key)
79
  response = client.chat.completions.create(
80
  model="gpt-3.5-turbo",
81
+ messages=[{"role": "user", "content": prompt}],
82
+ max_tokens=800,
83
+ temperature=0.7
84
  )
85
+ return self._format_ai_response(response.choices[0].message.content)
86
+
87
  else:
88
+ return "AI analysis not available. Please check your API configuration."
89
+
90
  except Exception as e:
91
+ error_msg = f"AI Analysis Error: {str(e)}"
92
+ logger.error(error_msg)
93
+ return f"❌ {error_msg}\n\n💡 Try checking your API keys or internet connection."
94
+
95
+ def _prepare_data_summary(self, df: pd.DataFrame, insights: List[Dict]) -> str:
96
+ """Prepare concise data summary for AI analysis"""
97
+ summary = f"""Dataset: {df.shape[0]} rows × {df.shape[1]} columns
98
+ Data Types: {dict(df.dtypes.value_counts())}
99
+ Missing Data: {df.isnull().sum().sum()} cells
100
+
101
+ Key Findings:"""
102
+
103
+ for insight in insights[-5:]: # Last 5 insights
104
+ summary += f"\n• {insight['insight']}"
105
+
106
+ return summary
107
+
108
+ def _create_analysis_prompt(self, summary: str) -> str:
109
+ """Create optimized prompt for AI analysis"""
110
+ return f"""As a data scientist, provide a brief analysis focusing on:
111
+
112
+ 1. **Business Impact**: What do these findings mean?
113
+ 2. **Recommendations**: 2-3 actionable next steps
114
+ 3. **Risks**: Potential data quality concerns
115
+
116
+ {summary}
117
+
118
+ Keep response under 300 words and focus on actionable insights."""
119
+
120
+ def _format_ai_response(self, response: str) -> str:
121
+ """Format AI response for better readability"""
122
+ if not response:
123
+ return "No response received from AI model."
124
+
125
+ # Clean up response
126
+ formatted = response.strip()
127
+
128
+ # Add emoji headers if not present
129
+ if "Business Impact" in formatted and "🎯" not in formatted:
130
+ formatted = formatted.replace("Business Impact", "🎯 **Business Impact**")
131
+ if "Recommendations" in formatted and "💡" not in formatted:
132
+ formatted = formatted.replace("Recommendations", "💡 **Recommendations**")
133
+ if "Risks" in formatted and "⚠️" not in formatted:
134
+ formatted = formatted.replace("Risks", "⚠️ **Risks**")
135
+
136
+ return formatted
137
 
138
  class DataAnalysisWorkflow:
139
+ """Enhanced data analysis workflow with improved UX and error handling"""
140
 
141
  def __init__(self, df: pd.DataFrame):
142
  self.df = df
143
+ self.original_df = df.copy() # Keep original for rollback
144
  self.stats = calculate_basic_stats(df)
145
  self.column_types = get_column_types(df)
146
  self.insights = []
147
+ self.page_size = 1000
148
+ self.cleaning_history = []
149
 
150
+ # Validate data on initialization
151
+ is_valid, validation_issues = validate_dataframe(df)
152
+ if not is_valid:
153
+ for issue in validation_issues:
154
+ self.add_insight(f"Data validation issue: {issue}", 0)
155
+
156
+ def add_insight(self, insight: str, stage: int, insight_type: str = "info"):
157
+ """Enhanced insight tracking with types"""
158
  self.insights.append({
159
  'stage': stage,
160
  'insight': insight,
161
+ 'type': insight_type,
162
  'timestamp': pd.Timestamp.now()
163
  })
164
 
165
  def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
166
+ """Get paginated data with validation"""
167
+ try:
168
+ start_idx = page * self.page_size
169
+ end_idx = min(start_idx + self.page_size, len(self.df))
170
+ return self.df.iloc[start_idx:end_idx]
171
+ except Exception as e:
172
+ logger.error(f"Pagination error: {str(e)}")
173
+ return self.df.head(10)
174
 
175
  def stage_1_overview(self):
176
+ """Stage 1: Enhanced Data Overview with better UX"""
177
  st.subheader("📊 Data Overview")
178
 
179
+ # Help section
180
+ with st.expander("ℹ️ Help - Understanding Your Data", expanded=False):
181
+ st.markdown("""
182
+ **This stage provides:**
183
+ - Basic dataset statistics and structure
184
+ - Data quality assessment and scoring
185
+ - Memory usage analysis and optimization suggestions
186
+ - Column type classification and cardinality analysis
187
+ """)
188
+
189
+ # Data Quality Score with enhanced display
190
  quality_metrics = calculate_data_quality_score(self.df)
191
+
192
  col1, col2, col3, col4 = st.columns(4)
193
  with col1:
194
+ st.metric("Rows", f"{self.stats['shape'][0]:,}", help="Total number of records")
195
  with col2:
196
+ st.metric("Columns", f"{self.stats['shape'][1]:,}", help="Total number of features")
197
  with col3:
198
+ score_color = "normal" if quality_metrics['score'] >= 80 else "inverse"
199
+ st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100",
200
+ help="Overall data quality assessment")
201
  with col4:
202
+ grade_emoji = {"A+": "🌟", "A": "✅", "B+": "👍", "B": "👌", "C+": "⚠️", "C": "⚠️", "D": "❌", "F": "💥"}
203
+ st.metric("Grade", f"{grade_emoji.get(quality_metrics['grade'], '❓')} {quality_metrics['grade']}")
204
 
205
+ # Quality Issues and Recommendations
206
  if quality_metrics['issues']:
207
+ st.error("🚨 **Data Quality Issues Found:**")
208
  for issue in quality_metrics['issues']:
209
  st.write(f"• {issue}")
210
 
211
+ if quality_metrics.get('recommendations'):
212
+ st.info("💡 **Recommendations:**")
213
+ for rec in quality_metrics['recommendations']:
214
+ st.write(f"• {rec}")
215
+
216
+ # Memory Analysis with actionable insights
217
+ st.subheader("💾 Memory Analysis")
218
  memory_opt = calculate_memory_optimization(self.df)
219
+
220
+ col1, col2, col3 = st.columns(3)
221
  with col1:
222
  st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
223
  with col2:
224
  if memory_opt['potential_savings_mb'] > 0:
225
  st.metric("Potential Savings",
226
  f"{memory_opt['potential_savings_mb']:.1f} MB",
227
+ f"-{memory_opt['potential_savings_pct']:.1f}%")
228
+ with col3:
229
+ efficiency = 100 - memory_opt['potential_savings_pct']
230
+ st.metric("Memory Efficiency", f"{efficiency:.1f}%")
231
+
232
+ if memory_opt['suggestions']:
233
+ with st.expander("🔧 View Optimization Suggestions", expanded=False):
234
+ st.dataframe(pd.DataFrame(memory_opt['suggestions']), use_container_width=True)
235
+ st.info("💡 Converting object columns to categories can significantly reduce memory usage for repeated values.")
236
 
237
+ # Enhanced Column Analysis
238
+ st.subheader("📋 Column Analysis")
239
  cardinality_df = calculate_column_cardinality(self.df)
240
 
241
+ if not cardinality_df.empty:
242
+ # Interactive filters
243
+ col1, col2 = st.columns(2)
244
+ with col1:
245
+ col_types = cardinality_df['Type'].unique()
246
+ selected_types = st.multiselect("Filter by Cardinality Type",
247
+ col_types,
248
+ default=col_types,
249
+ help="Filter columns by their cardinality classification")
250
+ with col2:
251
+ data_types = cardinality_df['Data Type'].unique()
252
+ selected_data_types = st.multiselect("Filter by Data Type",
253
+ data_types,
254
+ default=data_types,
255
+ help="Filter columns by their pandas data type")
256
+
257
+ # Apply filters
258
+ filtered_df = cardinality_df[
259
+ (cardinality_df['Type'].isin(selected_types)) &
260
+ (cardinality_df['Data Type'].isin(selected_data_types))
261
+ ]
262
+
263
+ st.dataframe(filtered_df, use_container_width=True)
264
+
265
+ # Actionable insights
266
+ self._display_cardinality_insights(filtered_df)
267
+
268
+ # Data Types Visualization
269
+ if self.stats['dtypes']:
270
+ col1, col2 = st.columns(2)
271
+ with col1:
272
+ st.subheader("📊 Data Types Distribution")
273
+ fig = px.pie(values=list(self.stats['dtypes'].values()),
274
+ names=list(self.stats['dtypes'].keys()),
275
+ title="Data Types Distribution")
276
+ fig.update_traces(textposition='inside', textinfo='percent+label')
277
+ st.plotly_chart(fig, use_container_width=True)
278
+
279
+ with col2:
280
+ st.subheader("📈 Column Count by Type")
281
+ fig = px.bar(x=list(self.stats['dtypes'].keys()),
282
+ y=list(self.stats['dtypes'].values()),
283
+ title="Column Count by Data Type")
284
+ st.plotly_chart(fig, use_container_width=True)
285
+
286
+ # Enhanced Sample Data Display
287
+ self._display_sample_data()
288
 
289
+ # Missing Values Analysis
290
+ self._analyze_missing_values()
291
+
292
+ # Record insights
293
+ self._record_stage1_insights(quality_metrics, memory_opt, cardinality_df)
294
+
295
+ def _display_cardinality_insights(self, cardinality_df: pd.DataFrame):
296
+ """Display actionable insights from cardinality analysis"""
297
+ if cardinality_df.empty:
298
+ return
299
+
300
+ # Key findings
301
+ id_cols = cardinality_df[cardinality_df['Type'] == 'Unique Identifier']['Column'].tolist()
302
+ const_cols = cardinality_df[cardinality_df['Type'] == 'Constant']['Column'].tolist()
303
+ low_card_cols = cardinality_df[cardinality_df['Type'].str.contains('Low')]['Column'].tolist()
304
 
 
 
305
  if id_cols:
306
+ st.success(f"🔑 **Potential ID Columns:** {', '.join(id_cols[:3])}" +
307
+ (f" (+{len(id_cols)-3} more)" if len(id_cols) > 3 else ""))
308
+
309
  if const_cols:
310
+ st.warning(f"⚠️ **Constant Columns (consider removing):** {', '.join(const_cols[:3])}" +
311
+ (f" (+{len(const_cols)-3} more)" if len(const_cols) > 3 else ""))
312
+
313
+ if low_card_cols:
314
+ st.info(f"📊 **Good for Grouping/Filtering:** {', '.join(low_card_cols[:3])}" +
315
+ (f" (+{len(low_card_cols)-3} more)" if len(low_card_cols) > 3 else ""))
316
+
317
+ def _display_sample_data(self):
318
+ """Enhanced sample data display with pagination"""
319
+ st.subheader("👀 Sample Data")
320
 
 
 
 
 
 
 
 
 
 
 
321
  total_pages = (len(self.df) - 1) // self.page_size + 1
322
 
323
+ col1, col2, col3 = st.columns([2, 1, 1])
324
+ with col1:
325
+ if total_pages > 1:
326
+ page = st.slider("Page", 0, total_pages - 1, 0,
327
+ help=f"Navigate through {total_pages} pages of data")
328
+ sample_data = self.get_paginated_data(page)
329
+ start_row = page * self.page_size + 1
330
+ end_row = min((page + 1) * self.page_size, len(self.df))
331
+ st.caption(f"Showing rows {start_row:,} to {end_row:,} of {len(self.df):,}")
332
+ else:
333
+ sample_data = self.df.head(20)
334
+ page = 0
335
 
336
+ with col2:
337
+ show_dtypes = st.checkbox("Show Data Types", help="Display column data types")
338
+ with col3:
339
+ max_cols = st.number_input("Max Columns", min_value=5, max_value=50, value=10,
340
+ help="Limit displayed columns for better readability")
341
 
342
+ # Display data with optional type info
343
+ display_df = sample_data.iloc[:, :max_cols]
344
+
345
+ if show_dtypes:
346
+ # Create a summary row with data types
347
+ type_row = pd.DataFrame([display_df.dtypes.astype(str)],
348
+ index=['Data Type'])
349
+ type_row.columns = display_df.columns
350
+
351
+ st.dataframe(type_row, use_container_width=True)
352
+ st.dataframe(display_df, use_container_width=True)
353
+ else:
354
+ st.dataframe(display_df, use_container_width=True)
355
+
356
+ def _analyze_missing_values(self):
357
+ """Enhanced missing values analysis"""
358
  missing_df = calculate_missing_data(self.df)
359
+
360
  if not missing_df.empty:
361
+ st.subheader("🕳️ Missing Values Analysis")
362
+
363
+ # Summary metrics
364
+ total_missing = missing_df['Missing Count'].sum()
365
+ affected_cols = len(missing_df)
366
+
367
+ col1, col2, col3 = st.columns(3)
368
+ with col1:
369
+ st.metric("Total Missing", f"{total_missing:,}")
370
+ with col2:
371
+ st.metric("Affected Columns", affected_cols)
372
+ with col3:
373
+ worst_col_pct = missing_df.iloc[0]['Missing %'] if len(missing_df) > 0 else 0
374
+ st.metric("Worst Column", f"{worst_col_pct:.1f}%")
375
+
376
+ # Detailed table
377
  st.dataframe(missing_df, use_container_width=True)
378
 
379
+ # Visualization for top missing columns
380
+ if len(missing_df) > 1:
381
+ top_missing = missing_df.head(10)
382
+ fig = px.bar(top_missing, x='Column', y='Missing %',
383
+ title="Missing Values by Column",
384
+ color='Missing %',
385
+ color_continuous_scale='Reds')
386
+ fig.update_layout(xaxis_tickangle=-45)
387
+ st.plotly_chart(fig, use_container_width=True)
388
+
389
+ # Actionable recommendations
390
+ high_missing = missing_df[missing_df['Missing %'] > 50]
391
+ if not high_missing.empty:
392
+ st.error(f"⚠️ **Critical:** {len(high_missing)} columns have >50% missing data")
393
+ st.write("Consider removing these columns or investigating data collection issues.")
394
  else:
395
+ st.success("✅ **Excellent!** No missing values found in the dataset")
396
+
397
+ def _record_stage1_insights(self, quality_metrics, memory_opt, cardinality_df):
398
+ """Record insights from stage 1 analysis"""
399
+ # Quality insights
400
+ if quality_metrics['score'] >= 90:
401
+ self.add_insight("Excellent data quality detected", 1, "success")
402
+ elif quality_metrics['score'] < 70:
403
+ self.add_insight(f"Data quality needs attention (Score: {quality_metrics['score']:.1f}/100)", 1, "warning")
404
 
405
+ # Memory insights
406
  if memory_opt['potential_savings_pct'] > 20:
407
+ self.add_insight(f"Significant memory optimization opportunity: {memory_opt['potential_savings_pct']:.1f}%", 1, "info")
408
 
409
+ # Structure insights
410
+ if not cardinality_df.empty:
411
+ id_cols = len(cardinality_df[cardinality_df['Type'] == 'Unique Identifier'])
412
+ const_cols = len(cardinality_df[cardinality_df['Type'] == 'Constant'])
413
+
414
+ if id_cols > 0:
415
+ self.add_insight(f"Found {id_cols} potential identifier column(s)", 1, "info")
416
+ if const_cols > 0:
417
+ self.add_insight(f"Found {const_cols} constant column(s) - consider removal", 1, "warning")
418
 
419
  def stage_2_exploration(self):
420
+ """Stage 2: Enhanced Exploratory Data Analysis"""
421
  st.subheader("🔍 Exploratory Data Analysis")
422
 
423
+ with st.expander("ℹ️ Help - Exploratory Analysis", expanded=False):
424
+ st.markdown("""
425
+ **This stage helps you:**
426
+ - Understand distributions of your variables
427
+ - Identify patterns and relationships
428
+ - Spot potential anomalies or interesting features
429
+ - Guide further analysis decisions
430
+ """)
431
+
432
  numeric_cols = self.column_types['numeric']
433
  categorical_cols = self.column_types['categorical']
434
 
435
+ if not numeric_cols and not categorical_cols:
436
+ st.warning("⚠️ No suitable columns found for analysis. Please check your data types.")
437
+ return
438
+
439
+ # Enhanced Numeric Analysis
440
  if numeric_cols:
441
+ self._analyze_numeric_variables(numeric_cols)
442
+
443
+ # Enhanced Categorical Analysis
444
+ if categorical_cols:
445
+ self._analyze_categorical_variables(categorical_cols)
446
+
447
+ # Relationship Analysis
448
+ self._analyze_relationships(numeric_cols, categorical_cols)
449
+
450
+ def _analyze_numeric_variables(self, numeric_cols: List[str]):
451
+ """Enhanced numeric variable analysis"""
452
+ st.subheader("🔢 Numeric Variables Analysis")
453
+
454
+ col1, col2 = st.columns([1, 1])
455
+ with col1:
456
+ selected_numeric = st.selectbox("Select numeric column:", numeric_cols,
457
+ help="Choose a numeric column to analyze its distribution")
458
+ with col2:
459
+ chart_type = st.selectbox("Chart type:", ["Histogram", "Box Plot", "Violin Plot", "Q-Q Plot"])
460
+
461
+ if selected_numeric:
462
+ # Statistics summary
463
+ stats_dict = calculate_numeric_stats(self.df, selected_numeric)
464
 
465
+ if stats_dict:
466
+ col1, col2, col3, col4 = st.columns(4)
467
+ with col1:
468
+ st.metric("Mean", f"{stats_dict['mean']:.2f}")
469
+ with col2:
470
+ st.metric("Median", f"{stats_dict['median']:.2f}")
471
+ with col3:
472
+ st.metric("Std Dev", f"{stats_dict['std']:.2f}")
473
+ with col4:
474
+ skew_interpretation = "Right-skewed" if stats_dict['skewness'] > 0.5 else "Left-skewed" if stats_dict['skewness'] < -0.5 else "Symmetric"
475
+ st.metric("Skewness", f"{stats_dict['skewness']:.2f}", help=skew_interpretation)
476
 
477
+ # Enhanced visualizations
478
+ try:
479
+ col1, col2 = st.columns(2)
480
+
481
+ with col1:
482
+ if chart_type == "Histogram":
483
+ fig = px.histogram(self.df, x=selected_numeric,
484
+ title=f"Distribution of {selected_numeric}",
485
+ marginal="rug")
486
+ elif chart_type == "Box Plot":
487
+ fig = px.box(self.df, y=selected_numeric,
488
+ title=f"Box Plot of {selected_numeric}")
489
+ elif chart_type == "Violin Plot":
490
+ fig = px.violin(self.df, y=selected_numeric,
491
+ title=f"Violin Plot of {selected_numeric}")
492
+ else: # Q-Q Plot
493
+ from scipy import stats
494
+ qq_data = stats.probplot(self.df[selected_numeric].dropna(), dist="norm")
495
+ fig = go.Figure()
496
+ fig.add_scatter(x=qq_data[0][0], y=qq_data[0][1], mode='markers',
497
+ name='Data Points')
498
+ fig.add_scatter(x=qq_data[0][0], y=qq_data[1][1] + qq_data[1][0] * qq_data[0][0],
499
+ mode='lines', name='Normal Distribution')
500
+ fig.update_layout(title=f"Q-Q Plot of {selected_numeric}",
501
+ xaxis_title="Theoretical Quantiles",
502
+ yaxis_title="Sample Quantiles")
503
+
504
  st.plotly_chart(fig, use_container_width=True)
505
+
506
+ with col2:
507
+ # Summary statistics table
508
+ if stats_dict:
509
+ summary_data = {
510
+ 'Statistic': ['Count', 'Mean', 'Median', 'Std Dev', 'Min', 'Max', 'Q25', 'Q75', 'Skewness', 'Kurtosis'],
511
+ 'Value': [
512
+ len(self.df[selected_numeric].dropna()),
513
+ f"{stats_dict['mean']:.3f}",
514
+ f"{stats_dict['median']:.3f}",
515
+ f"{stats_dict['std']:.3f}",
516
+ f"{stats_dict['min']:.3f}",
517
+ f"{stats_dict['max']:.3f}",
518
+ f"{stats_dict['q25']:.3f}",
519
+ f"{stats_dict['q75']:.3f}",
520
+ f"{stats_dict['skewness']:.3f}",
521
+ f"{stats_dict['kurtosis']:.3f}"
522
+ ]
523
+ }
524
+ st.dataframe(pd.DataFrame(summary_data), use_container_width=True, hide_index=True)
525
+
526
+ # Distribution insights
527
+ if abs(stats_dict['skewness']) > 1:
528
+ skew_type = "highly right-skewed" if stats_dict['skewness'] > 1 else "highly left-skewed"
529
+ self.add_insight(f"{selected_numeric} is {skew_type} (skewness: {stats_dict['skewness']:.2f})", 2, "info")
530
+
531
+ if stats_dict['kurtosis'] > 3:
532
+ self.add_insight(f"{selected_numeric} has heavy tails (kurtosis: {stats_dict['kurtosis']:.2f})", 2, "info")
533
+
534
+ except Exception as e:
535
+ st.error(f"Error creating visualization: {str(e)}")
536
+ logger.error(f"Visualization error for {selected_numeric}: {str(e)}")
537
+
538
+ def _analyze_categorical_variables(self, categorical_cols: List[str]):
539
+ """Enhanced categorical variable analysis"""
540
+ st.subheader("📝 Categorical Variables Analysis")
541
+
542
+ selected_categorical = st.selectbox("Select categorical column:", categorical_cols,
543
+ help="Choose a categorical column to analyze its distribution")
544
+
545
+ if selected_categorical:
546
+ try:
547
+ # Get value counts with error handling
548
+ value_counts = get_value_counts(self.df, selected_categorical, top_n=20)
549
+
550
+ if value_counts is not None and not value_counts.empty:
551
+ total_categories = self.df[selected_categorical].nunique()
552
 
553
+ # Summary metrics
554
+ col1, col2, col3 = st.columns(3)
555
+ with col1:
556
+ st.metric("Total Categories", total_categories)
557
+ with col2:
558
+ top_category_pct = (value_counts.iloc[0] / len(self.df)) * 100
559
+ st.metric("Top Category", f"{top_category_pct:.1f}%")
560
+ with col3:
561
+ entropy = -sum((value_counts / value_counts.sum()) * np.log2(value_counts / value_counts.sum() + 1e-10))
562
+ st.metric("Diversity (Entropy)", f"{entropy:.2f}")
563
 
564
+ # Visualization
565
+ col1, col2 = st.columns(2)
566
+ with col1:
567
+ fig = px.bar(x=value_counts.index, y=value_counts.values,
568
+ title=f"Top {min(20, len(value_counts))} Values in {selected_categorical}")
569
+ fig.update_layout(xaxis_tickangle=-45)
570
+ st.plotly_chart(fig, use_container_width=True)
571
+
572
+ with col2:
573
+ # Show data table
574
+ display_data = pd.DataFrame({
575
+ 'Category': value_counts.index,
576
+ 'Count': value_counts.values,
577
+ 'Percentage': np.round((value_counts.values / len(self.df)) * 100, 2)
578
+ })
579
+ st.dataframe(display_data, use_container_width=True, hide_index=True)
580
+
581
+ # Insights
582
+ if total_categories > 100:
583
+ self.add_insight(f"{selected_categorical} has very high cardinality ({total_categories} categories)", 2, "warning")
584
+ elif top_category_pct > 90:
585
+ self.add_insight(f"{selected_categorical} is highly imbalanced (top category: {top_category_pct:.1f}%)", 2, "warning")
586
+
587
+ else:
588
+ st.warning(f"⚠️ Unable to analyze column '{selected_categorical}' - it may be empty or have issues")
589
 
590
+ except Exception as e:
591
+ st.error(f"Error analyzing categorical variable: {str(e)}")
592
+ logger.error(f"Categorical analysis error for {selected_categorical}: {str(e)}")
593
+
594
+ def _analyze_relationships(self, numeric_cols: List[str], categorical_cols: List[str]):
595
+ """Enhanced relationship analysis"""
596
+ if len(numeric_cols) >= 2:
597
+ st.subheader("🔗 Variable Relationships")
598
 
599
+ # Correlation matrix
600
+ corr_matrix = calculate_correlation_matrix(self.df)
601
+ if corr_matrix is not None and not corr_matrix.empty:
602
+ col1, col2 = st.columns(2)
603
+
604
+ with col1:
605
+ fig = px.imshow(corr_matrix,
606
+ text_auto=True,
607
+ aspect="auto",
608
+ title="Correlation Matrix",
609
+ color_continuous_scale='RdBu')
610
+ st.plotly_chart(fig, use_container_width=True)
611
+
612
+ with col2:
613
+ # Find strongest correlations
614
+ corr_pairs = []
615
+ for i in range(len(corr_matrix.columns)):
616
+ for j in range(i+1, len(corr_matrix.columns)):
617
+ col1_name = corr_matrix.columns[i]
618
+ col2_name = corr_matrix.columns[j]
619
+ corr_val = corr_matrix.iloc[i, j]
620
+ if not np.isnan(corr_val):
621
+ corr_pairs.append({
622
+ 'Variable 1': col1_name,
623
+ 'Variable 2': col2_name,
624
+ 'Correlation': round(corr_val, 3),
625
+ 'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
626
+ })
627
+
628
+ if corr_pairs:
629
+ corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
630
+ st.subheader("🎯 Strongest Correlations")
631
+ st.dataframe(corr_df.head(10), use_container_width=True, hide_index=True)
632
+
633
+ # Record strongest correlation insight
634
+ strongest = corr_df.iloc[0]
635
+ self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} ({strongest['Correlation']})", 2, "info")
636
 
637
  def stage_3_cleaning(self):
638
+ """Stage 3: Enhanced Data Quality Assessment and Cleaning"""
639
+ st.subheader("🧹 Data Quality & Cleaning")
640
 
641
+ with st.expander("ℹ️ Help - Data Cleaning", expanded=False):
642
+ st.markdown("""
643
+ **Available cleaning operations:**
644
+ - **Missing Values:** Fill with statistics, drop rows, or use custom values
645
+ - **Duplicates:** Remove identical rows
646
+ - **Outliers:** Remove or cap extreme values
647
+ - **Data Types:** Convert columns to appropriate types
648
+ """)
649
 
650
+ # Progress tracking
651
+ cleaning_progress = st.empty()
652
+
653
+ # Enhanced Missing Values Handling
654
+ self._handle_missing_values()
655
+
656
+ # Enhanced Duplicates Handling
657
+ self._handle_duplicates()
658
+
659
+ # Enhanced Mixed Types Handling
660
+ self._handle_mixed_types()
661
+
662
+ # Enhanced Outlier Detection
663
+ self._handle_outliers()
664
+
665
+ # Cleaning Summary
666
+ self._display_cleaning_summary()
667
+
668
+ def _handle_missing_values(self):
669
+ """Enhanced missing values handling with preview"""
670
+ missing_df = calculate_missing_data(self.df)
671
+
672
+ if not missing_df.empty:
673
+ st.subheader("🕳️ Missing Values Treatment")
674
 
675
+ # Select column and method
676
+ col1, col2, col3 = st.columns(3)
677
  with col1:
678
+ selected_col = st.selectbox("Column to clean:", missing_df['Column'].tolist())
 
679
  with col2:
680
+ col_dtype = str(self.df[selected_col].dtype)
681
+ if 'int' in col_dtype or 'float' in col_dtype:
682
+ methods = ["Drop rows", "Mean", "Median", "Mode", "Custom value"]
683
+ else:
684
+ methods = ["Drop rows", "Mode", "Custom value"]
685
+ fill_method = st.selectbox("Fill method:", methods)
686
+ with col3:
687
+ if fill_method == "Custom value":
688
+ if 'int' in col_dtype or 'float' in col_dtype:
689
+ custom_value = st.number_input("Custom value:", value=0.0)
690
+ else:
691
+ custom_value = st.text_input("Custom value:", value="Unknown")
692
 
693
+ # Preview impact
694
+ if selected_col:
695
+ missing_count = self.df[selected_col].isnull().sum()
696
+ total_count = len(self.df)
697
+
698
+ if fill_method == "Drop rows":
699
+ remaining_rows = total_count - missing_count
700
+ st.info(f"📊 **Preview:** Will remove {missing_count} rows, keeping {remaining_rows} rows")
701
+ else:
702
+ st.info(f"📊 **Preview:** Will fill {missing_count} missing values")
703
+
704
+ # Apply cleaning
705
+ if st.button("✨ Apply Missing Value Treatment", type="primary"):
706
  try:
707
+ original_missing = self.df[selected_col].isnull().sum()
708
+
709
  if fill_method == "Drop rows":
710
  self.df = self.df.dropna(subset=[selected_col])
711
+ operation = f"Dropped {original_missing} rows with missing values in {selected_col}"
712
  else:
713
  if fill_method == "Mean":
714
  fill_value = self.df[selected_col].mean()
715
  elif fill_method == "Median":
716
  fill_value = self.df[selected_col].median()
717
  elif fill_method == "Mode":
718
+ mode_result = self.df[selected_col].mode()
719
+ fill_value = mode_result.iloc[0] if not mode_result.empty else "Unknown"
720
+ else:
721
+ fill_value = custom_value
722
 
723
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
724
+ operation = f"Filled {original_missing} missing values in {selected_col} with {fill_method}"
725
 
726
+ self.cleaning_history.append(operation)
727
+ st.success(f"✅ {operation}")
728
+ st.rerun()
729
+
730
  except Exception as e:
731
+ st.error(f"Error applying treatment: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
732
  else:
733
+ st.success("✅ No missing values found!")
734
+
735
+ def _handle_duplicates(self):
736
+ """Enhanced duplicate handling"""
737
+ if self.stats['duplicates'] > 0:
738
+ st.subheader("👥 Duplicate Rows")
 
 
739
 
740
+ duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100
741
+ st.warning(f"⚠️ Found **{self.stats['duplicates']:,}** duplicate rows ({duplicate_pct:.1f}% of data)")
742
 
743
+ # Show sample duplicates
744
+ duplicates = self.df[self.df.duplicated(keep=False)].head(10)
745
+ if not duplicates.empty:
746
+ st.write("**Sample duplicate rows:**")
747
+ st.dataframe(duplicates, use_container_width=True)
748
 
749
+ if st.button("🗑️ Remove Duplicate Rows", type="primary"):
750
  try:
751
+ original_len = len(self.df)
752
+ self.df = self.df.drop_duplicates()
753
+ removed = original_len - len(self.df)
754
+ operation = f"Removed {removed} duplicate rows"
755
+ self.cleaning_history.append(operation)
756
+ st.success(f" {operation}")
757
+ st.rerun()
758
  except Exception as e:
759
+ st.error(f"Error removing duplicates: {str(e)}")
760
+ else:
761
+ st.success("✅ No duplicate rows found!")
762
+
763
+ def _handle_mixed_types(self):
764
+ """Enhanced mixed types handling"""
765
+ mixed_types = detect_mixed_types(self.df)
766
 
767
+ if mixed_types:
768
+ st.subheader("🔀 Mixed Data Types")
 
 
 
 
 
 
769
 
770
+ for issue in mixed_types:
771
+ col = issue['column']
772
+ problems = issue['problematic_values']
773
+ pct = issue['percentage']
774
 
775
+ st.warning(f"⚠️ **{col}:** {problems} values ({pct:.1f}%) cannot be converted to numeric")
 
776
 
777
+ # Show sample problematic values
778
+ if 'sample_issues' in issue:
779
+ sample_issues = issue['sample_issues']
780
+ st.write("**Sample problematic values:**")
781
+ for value, count in list(sample_issues.items())[:5]:
782
+ st.write(f"• '{value}' ({count} occurrences)")
783
+
784
+ col1, col2 = st.columns(2)
785
+ with col1:
786
+ fix_method = st.selectbox(f"Fix method for {col}:",
787
+ ["Convert to numeric (coerce errors)", "Keep as text"],
788
+ key=f"fix_{col}")
789
+ with col2:
790
+ if st.button(f"🔧 Fix {col}", key=f"apply_{col}"):
791
+ try:
792
+ if fix_method == "Convert to numeric (coerce errors)":
793
+ self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
794
+ operation = f"Converted {col} to numeric (with coercion)"
795
+ else:
796
+ operation = f"Kept {col} as text type"
797
 
798
+ self.cleaning_history.append(operation)
799
+ st.success(f" {operation}")
800
+ st.rerun()
801
+ except Exception as e:
802
+ st.error(f"❌ Error fixing {col}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  else:
804
+ st.success("✅ No mixed data type issues found!")
 
805
 
806
+ def _handle_outliers(self):
807
+ """Enhanced outlier detection and handling"""
 
 
808
  numeric_cols = self.column_types['numeric']
 
809
 
810
+ if numeric_cols:
811
+ st.subheader("🎯 Outlier Detection")
 
812
 
813
+ col1, col2, col3 = st.columns(3)
814
  with col1:
815
+ selected_col = st.selectbox("Column for outlier detection:", numeric_cols)
816
  with col2:
817
+ detection_method = st.selectbox("Detection method:",
818
+ ["IQR (Interquartile Range)", "Z-Score", "Percentile"])
819
+ with col3:
820
+ if detection_method == "Z-Score":
821
+ threshold = st.number_input("Z-Score threshold:", min_value=1.0, max_value=5.0, value=3.0)
822
+ elif detection_method == "Percentile":
823
+ percentile = st.slider("Outlier percentile:", 0.1, 5.0, 1.0)
824
 
825
+ if selected_col:
826
+ try:
827
+ method_map = {
828
+ "IQR (Interquartile Range)": "iqr",
829
+ "Z-Score": "zscore",
830
+ "Percentile": "percentile"
831
+ }
832
+ outliers = calculate_outliers(self.df, selected_col, method_map[detection_method])
833
+
834
+ if outliers is not None and not outliers.empty:
835
+ outlier_count = len(outliers)
836
+ outlier_pct = (outlier_count / len(self.df)) * 100
837
+
838
+ st.warning(f"⚠️ Found **{outlier_count}** potential outliers ({outlier_pct:.1f}% of data)")
839
+
840
+ # Show outlier statistics
841
+ col1, col2 = st.columns(2)
842
+ with col1:
843
+ outlier_stats = outliers[selected_col].describe()
844
+ st.write("**Outlier Statistics:**")
845
+ st.dataframe(outlier_stats.to_frame().T, use_container_width=True)
846
+
847
+ with col2:
848
+ # Visualization of outliers
849
+ fig = go.Figure()
850
+ fig.add_trace(go.Scatter(
851
+ x=self.df.index,
852
+ y=self.df[selected_col],
853
+ mode='markers',
854
+ name='Normal Data',
855
+ marker=dict(color='blue', opacity=0.6)
856
+ ))
857
+ fig.add_trace(go.Scatter(
858
+ x=outliers.index,
859
+ y=outliers[selected_col],
860
+ mode='markers',
861
+ name='Outliers',
862
+ marker=dict(color='red', size=8)
863
+ ))
864
+ fig.update_layout(title=f"Outliers in {selected_col}")
865
+ st.plotly_chart(fig, use_container_width=True)
866
+
867
+ # Treatment options
868
+ treatment_method = st.selectbox("Outlier treatment:",
869
+ ["None", "Remove outliers", "Cap at bounds"])
870
+
871
+ if treatment_method != "None":
872
+ st.info(f"📊 **Preview:** This will affect {outlier_count} data points")
873
+
874
+ if st.button("🔧 Apply Outlier Treatment", type="primary"):
875
+ try:
876
+ if treatment_method == "Remove outliers":
877
+ self.df = self.df[~self.df.index.isin(outliers.index)]
878
+ operation = f"Removed {outlier_count} outliers from {selected_col}"
879
+ else: # Cap at bounds
880
+ Q1 = self.df[selected_col].quantile(0.25)
881
+ Q3 = self.df[selected_col].quantile(0.75)
882
+ IQR = Q3 - Q1
883
+ lower_bound = Q1 - 1.5 * IQR
884
+ upper_bound = Q3 + 1.5 * IQR
885
+
886
+ self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
887
+ operation = f"Capped outliers in {selected_col} to bounds"
888
+
889
+ self.cleaning_history.append(operation)
890
+ st.success(f"✅ {operation}")
891
+ st.rerun()
892
+
893
+ except Exception as e:
894
+ st.error(f"❌ Error treating outliers: {str(e)}")
895
+ else:
896
+ st.success(f"✅ No outliers detected in '{selected_col}' using {detection_method}")
897
+
898
+ except Exception as e:
899
+ st.error(f"❌ Error detecting outliers: {str(e)}")
900
+
901
+ def _display_cleaning_summary(self):
902
+ """Display comprehensive cleaning summary"""
903
+ if self.cleaning_history:
904
+ st.subheader("📋 Cleaning Operations History")
905
 
906
+ for i, operation in enumerate(self.cleaning_history, 1):
907
+ st.write(f"**{i}.** {operation}")
 
 
 
 
 
908
 
909
+ # Show data changes
910
  col1, col2 = st.columns(2)
911
  with col1:
912
+ st.metric("Original Rows", f"{self.original_df.shape[0]:,}")
913
+ st.metric("Original Memory", f"{self.original_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
914
  with col2:
915
+ st.metric("Current Rows", f"{self.df.shape[0]:,}",
916
+ delta=f"{self.df.shape[0] - self.original_df.shape[0]:,}")
917
+ current_memory = self.df.memory_usage(deep=True).sum() / 1024**2
918
+ original_memory = self.original_df.memory_usage(deep=True).sum() / 1024**2
919
+ st.metric("Current Memory", f"{current_memory:.1f} MB",
920
+ delta=f"{current_memory - original_memory:.1f} MB")
921
 
922
+ # Rollback option
923
+ if st.button("↩️ Reset to Original Data", help="Restore original dataset"):
924
+ self.df = self.original_df.copy()
925
+ self.cleaning_history = []
926
+ st.success("✅ Data reset to original state")
927
+ st.rerun()
928
 
929
+ self.add_insight(f"Applied {len(self.cleaning_history)} cleaning operations", 3, "info")
930
+ else:
931
+ st.info("ℹ️ No cleaning operations performed yet")
932
+
933
+ def stage_4_analysis(self):
934
+ """Stage 4: Enhanced Advanced Analysis"""
935
+ st.subheader("🔬 Advanced Analysis")
936
+
937
+ with st.expander("ℹ️ Help - Advanced Analysis", expanded=False):
938
+ st.markdown("""
939
+ **Advanced analysis includes:**
940
+ - **Relationships:** Correlation and scatter plot analysis
941
+ - **Group Analysis:** Compare metrics across categories
942
+ - **Distribution Analysis:** Statistical testing and comparisons
943
+ """)
944
+
945
+ numeric_cols = self.column_types['numeric']
946
+ categorical_cols = self.column_types['categorical']
947
+
948
+ # Enhanced Relationship Analysis
949
+ if len(numeric_cols) >= 2:
950
+ self._advanced_relationship_analysis(numeric_cols)
951
+
952
+ # Enhanced Group Analysis
953
+ if categorical_cols and numeric_cols:
954
+ self._advanced_group_analysis(categorical_cols, numeric_cols)
955
+
956
+ # Statistical Testing
957
+ if len(numeric_cols) >= 2:
958
+ self._statistical_testing(numeric_cols, categorical_cols)
959
+
960
def _advanced_relationship_analysis(self, numeric_cols: List[str]):
    """Enhanced relationship analysis with statistical insights.

    Lets the user pick an X/Y pair (optionally colored by a categorical
    column), draws an OLS-trend scatter on a sample, and reports the
    Pearson correlation with a strength/direction classification.
    """
    st.subheader("🔗 Variable Relationships")

    col1, col2, col3 = st.columns(3)
    with col1:
        x_var = st.selectbox("X Variable:", numeric_cols)
    with col2:
        y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
    with col3:
        color_var = st.selectbox("Color by (optional):",
                                 ["None"] + self.column_types['categorical'][:10])

    if not (x_var and y_var):
        return

    try:
        # Downsample large frames so the scatter stays responsive.
        sample_size = min(5000, len(self.df))
        if len(self.df) > sample_size:
            sample_df = self.df.sample(n=sample_size, random_state=42)
            st.info(f"📊 Showing sample of {sample_size:,} points for performance")
        else:
            sample_df = self.df

        # Build the scatter; the color argument is only added when chosen.
        scatter_kwargs = dict(x=x_var, y=y_var,
                              title=f"Relationship: {x_var} vs {y_var}",
                              trendline="ols")
        if color_var != "None":
            scatter_kwargs['color'] = color_var
        fig = px.scatter(sample_df, **scatter_kwargs)
        st.plotly_chart(fig, use_container_width=True)

        # Correlation is computed on the full (non-sampled) data.
        correlation = self.df[x_var].corr(self.df[y_var])

        abs_corr = abs(correlation)
        if abs_corr > 0.7:
            strength = "Strong"
        elif abs_corr > 0.3:
            strength = "Moderate"
        else:
            strength = "Weak"
        direction = "Positive" if correlation > 0 else "Negative"

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Correlation", f"{correlation:.3f}")
        with col2:
            st.metric("Strength", strength)
        with col3:
            st.metric("Direction", direction)

        self.add_insight(f"{strength} {direction.lower()} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4, "info")

    except Exception as e:
        st.error(f"❌ Error in relationship analysis: {str(e)}")
1019
def _advanced_group_analysis(self, categorical_cols: List[str], numeric_cols: List[str]):
    """Enhanced group analysis with statistical comparisons.

    Shows per-group statistics for a chosen metric, box/bar charts when
    the number of groups is small enough, and highlights the best and
    worst performing groups.
    """
    st.subheader("👥 Group Analysis")

    col1, col2 = st.columns(2)
    with col1:
        group_var = st.selectbox("Group by:", categorical_cols)
    with col2:
        metric_var = st.selectbox("Analyze metric:", numeric_cols)

    if not (group_var and metric_var):
        return

    try:
        group_stats = calculate_group_stats(self.df, group_var, metric_var)
        if group_stats is None or group_stats.empty:
            return

        st.dataframe(group_stats, use_container_width=True)

        # Charts only make sense for a manageable number of groups.
        unique_groups = self.df[group_var].nunique()
        if unique_groups <= 20:
            col1, col2 = st.columns(2)
            with col1:
                fig = px.box(self.df, x=group_var, y=metric_var,
                             title=f"{metric_var} by {group_var}")
                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)
            with col2:
                group_means = self.df.groupby(group_var)[metric_var].mean().sort_values(ascending=False)
                fig = px.bar(x=group_means.index, y=group_means.values,
                             title=f"Average {metric_var} by {group_var}")
                fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(fig, use_container_width=True)
        else:
            st.info(f"ℹ️ Too many groups ({unique_groups}) for visualization. Showing statistics only.")

        # Surface the extremes of the per-group means.
        best_idx = group_stats['mean'].idxmax()
        worst_idx = group_stats['mean'].idxmin()
        best_group = group_stats.loc[best_idx, group_var]
        best_value = group_stats['mean'].max()
        worst_group = group_stats.loc[worst_idx, group_var]
        worst_value = group_stats['mean'].min()

        col1, col2 = st.columns(2)
        with col1:
            st.success(f"🏆 **Highest {metric_var}:** {best_group} ({best_value:.2f})")
        with col2:
            st.info(f"📉 **Lowest {metric_var}:** {worst_group} ({worst_value:.2f})")

        self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4, "success")

    except Exception as e:
        st.error(f"❌ Error in group analysis: {str(e)}")
1074
def _statistical_testing(self, numeric_cols: List[str], categorical_cols: List[str]):
    """Enhanced statistical testing capabilities.

    Two tests are offered:
    - Correlation Test: Pearson + Spearman correlation between two
      numeric columns, with significance reported at p < 0.05.
    - Group Comparison: Welch's t-test (2 groups) or one-way ANOVA
      (3+ groups) of a numeric metric across a categorical column.
      This option was previously listed in the selectbox but had no
      implementation, so selecting it silently did nothing.
    """
    if len(numeric_cols) < 2:
        return

    st.subheader("📊 Statistical Testing")

    test_type = st.selectbox("Select test type:",
                             ["Correlation Test", "Group Comparison"])

    if test_type == "Correlation Test":
        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("Variable 1:", numeric_cols, key="corr_var1")
        with col2:
            var2 = st.selectbox("Variable 2:",
                                [col for col in numeric_cols if col != var1],
                                key="corr_var2")

        if st.button("🧪 Run Correlation Test"):
            try:
                from scipy.stats import pearsonr, spearmanr

                # Drop rows with NaN in either column before testing.
                clean_data = self.df[[var1, var2]].dropna()

                if len(clean_data) < 10:
                    st.warning("⚠️ Insufficient data for reliable correlation testing")
                else:
                    # Pearson (linear) and Spearman (rank-based) correlations.
                    pearson_corr, pearson_p = pearsonr(clean_data[var1], clean_data[var2])
                    spearman_corr, spearman_p = spearmanr(clean_data[var1], clean_data[var2])

                    col1, col2 = st.columns(2)
                    with col1:
                        st.subheader("Pearson Correlation")
                        st.metric("Correlation", f"{pearson_corr:.3f}")
                        st.metric("P-value", f"{pearson_p:.4f}")
                        if pearson_p < 0.05:
                            st.success("✅ Statistically significant")
                        else:
                            st.warning("⚠️ Not statistically significant")

                    with col2:
                        st.subheader("Spearman Correlation")
                        st.metric("Correlation", f"{spearman_corr:.3f}")
                        st.metric("P-value", f"{spearman_p:.4f}")
                        if spearman_p < 0.05:
                            st.success("✅ Statistically significant")
                        else:
                            st.warning("⚠️ Not statistically significant")

                    if pearson_p < 0.05:
                        self.add_insight(f"Significant correlation between {var1} and {var2} (p={pearson_p:.4f})", 4, "success")

            except Exception as e:
                st.error(f"❌ Error in correlation testing: {str(e)}")

    elif test_type == "Group Comparison" and categorical_cols:
        col1, col2 = st.columns(2)
        with col1:
            group_col = st.selectbox("Group column:", categorical_cols, key="grpcmp_group")
        with col2:
            value_col = st.selectbox("Metric column:", numeric_cols, key="grpcmp_value")

        if st.button("🧪 Run Group Comparison"):
            try:
                from scipy.stats import f_oneway, ttest_ind

                clean_data = self.df[[group_col, value_col]].dropna()
                # One sample per group; require 2+ observations each so the
                # tests have a variance estimate to work with.
                samples = [grp[value_col].values
                           for _, grp in clean_data.groupby(group_col)
                           if len(grp) >= 2]

                if len(samples) < 2:
                    st.warning("⚠️ Need at least two groups with 2+ observations each")
                else:
                    if len(samples) == 2:
                        # Welch's t-test: does not assume equal variances.
                        stat, p_value = ttest_ind(samples[0], samples[1], equal_var=False)
                        test_name = "Welch's t-test"
                    else:
                        stat, p_value = f_oneway(*samples)
                        test_name = "One-way ANOVA"

                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric(f"{test_name} statistic", f"{stat:.3f}")
                    with col2:
                        st.metric("P-value", f"{p_value:.4f}")

                    if p_value < 0.05:
                        st.success("✅ Statistically significant group differences")
                        self.add_insight(f"{value_col} differs significantly across {group_col} (p={p_value:.4f})", 4, "success")
                    else:
                        st.warning("⚠️ No statistically significant group differences")

            except Exception as e:
                st.error(f"❌ Error in group comparison: {str(e)}")
1133
def stage_5_summary(self):
    """Stage 5: Enhanced Summary and Export.

    Shows a key-metrics dashboard, the categorized insight list, the
    cleaning history, and the export UI.
    """
    st.subheader("📈 Analysis Summary & Export")

    with st.expander("ℹ️ Help - Summary & Export", expanded=False):
        st.markdown("""
        **This final stage provides:**
        - Complete analysis summary with all insights
        - Multiple export formats for your results
        - Code generation for reproducible analysis
        - Data quality final report
        """)

    # Key-metrics dashboard.
    insight_types = [i.get('type') for i in self.insights]
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📊 Total Insights", len(self.insights))
    with col2:
        st.metric(" Positive Findings", insight_types.count('success'))
    with col3:
        st.metric("⚠️ Issues Found", insight_types.count('warning'))
    with col4:
        is_clean = self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0
        st.metric("🎯 Final Quality", "High" if is_clean else "Medium")

    # Insights grouped by pipeline stage.
    self._display_categorized_insights()

    # Cleaning history, if any operations were applied.
    if self.cleaning_history:
        st.subheader("🔄 Data Transformations Applied")
        for i, operation in enumerate(self.cleaning_history, 1):
            st.write(f"**{i}.** {operation}")
        st.info(f" Dataset transformed from {self.original_df.shape} to {self.df.shape}")

    # Export UI (reports, cleaned data, code).
    self._display_export_options()
1174
def _display_categorized_insights(self):
    """Display insights organized by category and stage."""
    st.subheader("💡 Key Insights by Stage")

    stage_names = {
        0: "🔍 Validation",
        1: "📊 Overview",
        2: "🔍 Exploration",
        3: "🧹 Cleaning",
        4: "🔬 Analysis"
    }
    # Icon per insight type; unknown types fall back to the info icon.
    type_icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        st.write(f"**{stage_names.get(stage, f'Stage {stage}')}**")
        for insight in stage_insights:
            icon = type_icons.get(insight.get('type'), "ℹ️")
            st.write(f" {icon} {insight['insight']}")
1194
def _display_export_options(self):
    """Enhanced export options with previews.

    Renders a download UI for the analysis report (Markdown/HTML/Text),
    the cleaned dataset (CSV/Excel/Parquet), a reproducible Python
    script, or an inline summary dashboard.

    Fixes over the previous version:
    - "Summary Dashboard" was listed in the selectbox but never handled.
    - The HTML report choice downloaded plain text with a ``.html`` name;
      it is now wrapped in a minimal HTML document with the right mime.
    - Dataset downloads nested ``st.download_button`` inside ``st.button``,
      which makes the download control vanish on the next rerun; payloads
      are now prepared up front and offered directly.
    """
    st.subheader("📤 Export Results")

    export_type = st.selectbox("Choose export type:",
                               ["Analysis Report", "Cleaned Dataset", "Python Code", "Summary Dashboard"])

    try:
        if export_type == "Analysis Report":
            format_choice = st.selectbox("Report format:", ["Markdown", "HTML", "Text"])

            # Build the report once, in the requested format.
            if format_choice == "Markdown":
                report = self.generate_markdown_report()
                mime = "text/markdown"
            elif format_choice == "HTML":
                # Minimal HTML wrapper so the downloaded file actually
                # renders as HTML in a browser.
                text_report = self.generate_text_report()
                report = f"<html><body><pre>{text_report}</pre></body></html>"
                mime = "text/html"
            else:
                report = self.generate_text_report()
                mime = "text/plain"

            col1, col2 = st.columns([3, 1])
            with col1:
                preview = report[:500] + "..." if len(report) > 500 else report
                st.code(preview, language="markdown" if format_choice == "Markdown" else None)
            with col2:
                st.download_button(
                    label=f"📄 Download {format_choice} Report",
                    data=report,
                    file_name=f"analysis_report.{format_choice.lower()}",
                    mime=mime
                )

        elif export_type == "Cleaned Dataset":
            format_choice = st.selectbox("Data format:", ["CSV", "Excel", "Parquet"])

            col1, col2 = st.columns([3, 1])
            with col1:
                st.write("**Data Preview:**")
                st.dataframe(self.df.head(), use_container_width=True)
                st.write(f"**Final Shape:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")

            with col2:
                # Payloads are prepared eagerly so the download works on
                # the first click.
                if format_choice == "CSV":
                    csv = self.df.to_csv(index=False)
                    st.download_button("💾 Download CSV", csv, "cleaned_data.csv", "text/csv")

                elif format_choice == "Excel":
                    buffer = BytesIO()
                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                        self.df.to_excel(writer, sheet_name='Cleaned_Data', index=False)
                        # Companion summary sheet describing the cleaning run.
                        summary_df = pd.DataFrame({
                            'Metric': ['Original Rows', 'Final Rows', 'Columns', 'Cleaning Operations'],
                            'Value': [self.original_df.shape[0], self.df.shape[0],
                                      self.df.shape[1], len(self.cleaning_history)]
                        })
                        summary_df.to_excel(writer, sheet_name='Summary', index=False)
                    st.download_button("💾 Download Excel", buffer.getvalue(),
                                       "cleaned_data.xlsx",
                                       "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

                elif format_choice == "Parquet":
                    buffer = BytesIO()
                    self.df.to_parquet(buffer, index=False)
                    st.download_button("💾 Download Parquet", buffer.getvalue(),
                                       "cleaned_data.parquet", "application/octet-stream")

        elif export_type == "Python Code":
            code = self.generate_enhanced_python_code()
            st.code(code, language="python")
            st.download_button("💾 Download Python Script", code,
                               "analysis_script.py", "text/plain")

        elif export_type == "Summary Dashboard":
            # Previously listed but unimplemented: show the key numbers inline.
            st.write(f"**Dataset:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")
            st.write(f"**Insights recorded:** {len(self.insights)}")
            st.write(f"**Cleaning operations applied:** {len(self.cleaning_history)}")

    except Exception as e:
        st.error(f"❌ Export error: {str(e)}")
1269
def generate_markdown_report(self) -> str:
    """Generate comprehensive markdown report."""
    quality_grade = calculate_data_quality_score(self.df)['grade']
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    report = f"""# 📊 Data Analysis Report

## Executive Summary
- **Dataset Size:** {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
- **Data Quality:** {quality_grade} grade
- **Memory Usage:** {self.stats['memory_usage']:.1f} MB
- **Analysis Completed:** {timestamp}

## 📈 Data Overview
| Metric | Value |
|--------|-------|
| Total Records | {self.stats['shape'][0]:,} |
| Total Features | {self.stats['shape'][1]:,} |
| Missing Values | {self.stats['missing_values']:,} |
| Duplicate Rows | {self.stats['duplicates']:,} |

## 📊 Data Types
"""
    report += "".join(f"- **{dtype}:** {count} columns\n"
                      for dtype, count in self.stats['dtypes'].items())

    report += "\n## 💡 Key Insights\n"

    # Group insights by pipeline stage.
    stage_names = {0: "Validation", 1: "Overview", 2: "Exploration", 3: "Cleaning", 4: "Analysis"}
    type_icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        report += f"\n### {stage_names.get(stage, f'Stage {stage}')}\n"
        for insight in stage_insights:
            icon = type_icons.get(insight.get('type'), "ℹ️")
            report += f"- {icon} {insight['insight']}\n"

    if self.cleaning_history:
        report += "\n## 🔄 Data Transformations\n"
        for i, operation in enumerate(self.cleaning_history, 1):
            report += f"{i}. {operation}\n"

    report += "\n---\n*Report generated by Data Analysis Platform*"
    return report
1313
def generate_enhanced_python_code(self) -> str:
    """Generate comprehensive Python code for reproducible analysis.

    Emits a standalone script that reloads the raw data, reports data
    quality, replays the cleaning operations recorded in
    ``self.cleaning_history``, and runs basic correlation and
    missing-value analysis.

    Fixes over the previous version:
    - The generated ``assess_data_quality`` divided by zero on an empty
      dataframe; percentages are now guarded.
    - The generated analysis tail was emitted at module level, so it ran
      (and crashed) even when loading failed; it is now indented inside
      the ``if df is not None:`` guard, as are the replayed cleaning ops.
    - The constant tail is a plain string instead of an f-string full of
      escaped braces.

    Returns:
        str: Complete Python source for the generated script.
    """
    code = f'''"""
Data Analysis Script
Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
Original Dataset: {self.original_df.shape[0]:,} rows × {self.original_df.shape[1]:,} columns
Final Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns
"""

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load data
def load_and_prepare_data(file_path: str) -> pd.DataFrame:
    """Load and prepare data with error handling"""
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format")

        print(f"Loaded data: {{df.shape[0]:,}} rows × {{df.shape[1]:,}} columns")
        return df
    except Exception as e:
        print(f"Error loading data: {{e}}")
        return None

# Data quality assessment
def assess_data_quality(df: pd.DataFrame) -> dict:
    """Calculate comprehensive data quality metrics"""
    total_cells = len(df) * len(df.columns)
    missing_count = df.isnull().sum().sum()
    duplicate_count = df.duplicated().sum()

    return {{
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_percentage': (missing_count / total_cells) * 100 if total_cells else 0.0,
        'duplicate_percentage': (duplicate_count / len(df)) * 100 if len(df) else 0.0,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
    }}

# Main analysis
if __name__ == "__main__":
    # Load your data
    df = load_and_prepare_data('your_data_file.csv')  # Update with your file path

    if df is not None:
        # Data quality assessment
        quality = assess_data_quality(df)
        print("\\n=== DATA QUALITY REPORT ===")
        print(f"Rows: {{quality['total_rows']:,}}")
        print(f"Columns: {{quality['total_columns']:,}}")
        print(f"Missing Data: {{quality['missing_percentage']:.2f}}%")
        print(f"Duplicates: {{quality['duplicate_percentage']:.2f}}%")
        print(f"Memory Usage: {{quality['memory_usage_mb']:.1f}} MB")
'''

    # Replay recorded cleaning operations as script lines inside the guard.
    if self.cleaning_history:
        code += "\n        # Applied cleaning operations:\n"
        for operation in self.cleaning_history:
            if "missing" in operation.lower():
                code += "        # df = df.fillna(method='your_chosen_method')\n"
            elif "duplicate" in operation.lower():
                code += "        df = df.drop_duplicates()\n"
            elif "outlier" in operation.lower():
                code += (
                    "        # Remove outliers using IQR method\n"
                    "        def remove_outliers(df, column):\n"
                    "            Q1 = df[column].quantile(0.25)\n"
                    "            Q3 = df[column].quantile(0.75)\n"
                    "            IQR = Q3 - Q1\n"
                    "            return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]\n"
                    "\n"
                    "        # df = remove_outliers(df, 'your_column')\n"
                )

    # Constant analysis tail: no interpolation needed, so a plain string
    # avoids the brace escaping an f-string would require.
    code += """
        # Basic statistics
        print("\\n=== BASIC STATISTICS ===")
        print(df.describe())

        # Correlation analysis (if numeric columns exist)
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) > 1:
            print("\\n=== CORRELATION MATRIX ===")
            corr_matrix = df[numeric_cols].corr()
            print(corr_matrix)

            # Visualize correlation matrix
            fig = px.imshow(corr_matrix, title='Correlation Matrix')
            fig.show()

        # Missing values visualization
        missing = df.isnull().sum()
        if missing.sum() > 0:
            missing = missing[missing > 0]
            fig = px.bar(x=missing.index, y=missing.values,
                         title='Missing Values by Column')
            fig.show()

        # Final data quality report
        final_quality = assess_data_quality(df)
        print("\\n=== FINAL QUALITY REPORT ===")
        for key, value in final_quality.items():
            print(f"{key}: {value}")
"""

    return code
1431
def generate_text_report(self) -> str:
    """Generate enhanced text analysis report."""
    quality_grade = calculate_data_quality_score(self.df)['grade']
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    report = f"""DATA ANALYSIS REPORT
{'='*50}

EXECUTIVE SUMMARY
Dataset: {self.stats['shape'][0]:,} rows × {self.stats['shape'][1]:,} columns
Quality Grade: {quality_grade}
Memory Usage: {self.stats['memory_usage']:.1f} MB
Analysis Date: {timestamp}

DATA OVERVIEW
- Total Records: {self.stats['shape'][0]:,}
- Total Features: {self.stats['shape'][1]:,}
- Missing Values: {self.stats['missing_values']:,}
- Duplicate Rows: {self.stats['duplicates']:,}


DATA TYPES DISTRIBUTION
"""
    for dtype, count in self.stats['dtypes'].items():
        report += f"- {dtype}: {count} columns\n"

    report += "\nKEY INSIGHTS\n" + "=" * 20 + "\n"

    # Insights grouped by pipeline stage, numbered within each stage.
    stage_names = {0: "VALIDATION", 1: "OVERVIEW", 2: "EXPLORATION", 3: "CLEANING", 4: "ANALYSIS"}

    for stage in range(5):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if not stage_insights:
            continue
        report += f"\n{stage_names.get(stage, f'STAGE {stage}')}:\n"
        for i, insight in enumerate(stage_insights, 1):
            report += f" {i}. {insight['insight']}\n"

    if self.cleaning_history:
        report += f"\nDATA TRANSFORMATIONS\n{'='*20}\n"
        for i, operation in enumerate(self.cleaning_history, 1):
            report += f"{i}. {operation}\n"

    report += f"\n{'='*50}\nReport generated by Data Analysis Platform\n"
    return report