entropy25 committed on
Commit
c50f214
·
verified ·
1 Parent(s): aa64ef2

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +483 -1261
analyzer.py CHANGED
@@ -5,7 +5,6 @@ import plotly.express as px
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
8
- import logging
9
  from dotenv import load_dotenv
10
  from data_handler import *
11
  from io import BytesIO
@@ -13,42 +12,29 @@ from io import BytesIO
13
  # Load environment variables
14
  load_dotenv()
15
 
16
- # Configure logging
17
- logger = logging.getLogger(__name__)
18
-
19
- # Optional AI Integration with enhanced error handling
20
  try:
21
  import openai
22
  OPENAI_AVAILABLE = True
23
  except ImportError:
24
  OPENAI_AVAILABLE = False
25
- logger.info("OpenAI not available - install openai package for AI features")
26
 
27
  try:
28
  import google.generativeai as genai
29
  GEMINI_AVAILABLE = True
30
  except ImportError:
31
  GEMINI_AVAILABLE = False
32
- logger.info("Gemini not available - install google-generativeai package for AI features")
33
 
34
  class AIAssistant:
35
- """Enhanced AI-powered analysis assistant with better error handling"""
36
 
37
  def __init__(self):
38
  self.openai_key = os.getenv('OPENAI_API_KEY')
39
  self.gemini_key = os.getenv('GOOGLE_API_KEY')
40
- self.setup_models()
41
-
42
- def setup_models(self):
43
- """Initialize AI models with error handling"""
44
- try:
45
- if self.gemini_key and GEMINI_AVAILABLE:
46
- genai.configure(api_key=self.gemini_key)
47
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
48
- logger.info("Gemini model initialized successfully")
49
- except Exception as e:
50
- logger.error(f"Failed to initialize Gemini: {str(e)}")
51
- self.gemini_key = None
52
 
53
  def get_available_models(self) -> List[str]:
54
  """Get list of available AI models"""
@@ -60,1412 +46,648 @@ class AIAssistant:
60
  return models
61
 
62
  def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
63
- """Get AI analysis with enhanced error handling and rate limiting"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- if not insights:
66
- return "No insights available for analysis. Please complete the data analysis stages first."
 
 
 
 
 
 
 
67
 
68
  try:
69
- # Prepare concise data summary
70
- summary = self._prepare_data_summary(df, insights)
71
- prompt = self._create_analysis_prompt(summary)
72
-
73
  if model == "Google Gemini" and hasattr(self, 'gemini_model'):
74
  response = self.gemini_model.generate_content(prompt)
75
- return self._format_ai_response(response.text)
76
-
77
- elif model == "OpenAI GPT" and self.openai_key and OPENAI_AVAILABLE:
78
  client = openai.OpenAI(api_key=self.openai_key)
79
  response = client.chat.completions.create(
80
  model="gpt-3.5-turbo",
81
- messages=[{"role": "user", "content": prompt}],
82
- max_tokens=800,
83
- temperature=0.7
84
  )
85
- return self._format_ai_response(response.choices[0].message.content)
86
-
87
  else:
88
- return "AI analysis not available. Please check your API configuration."
89
-
90
  except Exception as e:
91
- error_msg = f"AI Analysis Error: {str(e)}"
92
- logger.error(error_msg)
93
- return f"❌ {error_msg}\n\n💡 Try checking your API keys or internet connection."
94
-
95
- def _prepare_data_summary(self, df: pd.DataFrame, insights: List[Dict]) -> str:
96
- """Prepare concise data summary for AI analysis"""
97
- summary = f"""Dataset: {df.shape[0]} rows × {df.shape[1]} columns
98
- Data Types: {dict(df.dtypes.value_counts())}
99
- Missing Data: {df.isnull().sum().sum()} cells
100
-
101
- Key Findings:"""
102
-
103
- for insight in insights[-5:]: # Last 5 insights
104
- summary += f"\n• {insight['insight']}"
105
-
106
- return summary
107
-
108
- def _create_analysis_prompt(self, summary: str) -> str:
109
- """Create optimized prompt for AI analysis"""
110
- return f"""As a data scientist, provide a brief analysis focusing on:
111
-
112
- 1. **Business Impact**: What do these findings mean?
113
- 2. **Recommendations**: 2-3 actionable next steps
114
- 3. **Risks**: Potential data quality concerns
115
-
116
- {summary}
117
-
118
- Keep response under 300 words and focus on actionable insights."""
119
-
120
- def _format_ai_response(self, response: str) -> str:
121
- """Format AI response for better readability"""
122
- if not response:
123
- return "No response received from AI model."
124
-
125
- # Clean up response
126
- formatted = response.strip()
127
-
128
- # Add emoji headers if not present
129
- if "Business Impact" in formatted and "🎯" not in formatted:
130
- formatted = formatted.replace("Business Impact", "🎯 **Business Impact**")
131
- if "Recommendations" in formatted and "💡" not in formatted:
132
- formatted = formatted.replace("Recommendations", "💡 **Recommendations**")
133
- if "Risks" in formatted and "⚠️" not in formatted:
134
- formatted = formatted.replace("Risks", "⚠️ **Risks**")
135
-
136
- return formatted
137
 
138
  class DataAnalysisWorkflow:
139
- """Enhanced data analysis workflow with improved UX and error handling"""
140
 
141
  def __init__(self, df: pd.DataFrame):
142
  self.df = df
143
- self.original_df = df.copy() # Keep original for rollback
144
  self.stats = calculate_basic_stats(df)
145
  self.column_types = get_column_types(df)
146
  self.insights = []
147
- self.page_size = 1000
148
- self.cleaning_history = []
149
 
150
- # Validate data on initialization
151
- is_valid, validation_issues = validate_dataframe(df)
152
- if not is_valid:
153
- for issue in validation_issues:
154
- self.add_insight(f"Data validation issue: {issue}", 0)
155
-
156
- def add_insight(self, insight: str, stage: int, insight_type: str = "info"):
157
- """Enhanced insight tracking with types"""
158
  self.insights.append({
159
  'stage': stage,
160
  'insight': insight,
161
- 'type': insight_type,
162
  'timestamp': pd.Timestamp.now()
163
  })
164
 
165
  def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
166
- """Get paginated data with validation"""
167
- try:
168
- start_idx = page * self.page_size
169
- end_idx = min(start_idx + self.page_size, len(self.df))
170
- return self.df.iloc[start_idx:end_idx]
171
- except Exception as e:
172
- logger.error(f"Pagination error: {str(e)}")
173
- return self.df.head(10)
174
 
175
  def stage_1_overview(self):
176
- """Stage 1: Enhanced Data Overview with better UX"""
177
  st.subheader("📊 Data Overview")
178
 
179
- # Help section
180
- with st.expander("ℹ️ Help - Understanding Your Data", expanded=False):
181
- st.markdown("""
182
- **This stage provides:**
183
- - Basic dataset statistics and structure
184
- - Data quality assessment and scoring
185
- - Memory usage analysis and optimization suggestions
186
- - Column type classification and cardinality analysis
187
- """)
188
-
189
- # Data Quality Score with enhanced display
190
  quality_metrics = calculate_data_quality_score(self.df)
191
-
192
  col1, col2, col3, col4 = st.columns(4)
193
  with col1:
194
- st.metric("Rows", f"{self.stats['shape'][0]:,}", help="Total number of records")
195
  with col2:
196
- st.metric("Columns", f"{self.stats['shape'][1]:,}", help="Total number of features")
197
  with col3:
198
- score_color = "normal" if quality_metrics['score'] >= 80 else "inverse"
199
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100",
200
- help="Overall data quality assessment")
201
  with col4:
202
- grade_emoji = {"A+": "🌟", "A": "✅", "B+": "👍", "B": "👌", "C+": "⚠️", "C": "⚠️", "D": "❌", "F": "💥"}
203
- st.metric("Grade", f"{grade_emoji.get(quality_metrics['grade'], '❓')} {quality_metrics['grade']}")
204
 
205
- # Quality Issues and Recommendations
206
  if quality_metrics['issues']:
207
- st.error("🚨 **Data Quality Issues Found:**")
208
  for issue in quality_metrics['issues']:
209
  st.write(f"• {issue}")
210
 
211
- if quality_metrics.get('recommendations'):
212
- st.info("💡 **Recommendations:**")
213
- for rec in quality_metrics['recommendations']:
214
- st.write(f"• {rec}")
215
-
216
- # Memory Analysis with actionable insights
217
- st.subheader("💾 Memory Analysis")
218
  memory_opt = calculate_memory_optimization(self.df)
219
-
220
- col1, col2, col3 = st.columns(3)
221
  with col1:
222
  st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
223
  with col2:
224
  if memory_opt['potential_savings_mb'] > 0:
225
  st.metric("Potential Savings",
226
  f"{memory_opt['potential_savings_mb']:.1f} MB",
227
- f"-{memory_opt['potential_savings_pct']:.1f}%")
228
- with col3:
229
- efficiency = 100 - memory_opt['potential_savings_pct']
230
- st.metric("Memory Efficiency", f"{efficiency:.1f}%")
231
-
232
- if memory_opt['suggestions']:
233
- with st.expander("🔧 View Optimization Suggestions", expanded=False):
234
- st.dataframe(pd.DataFrame(memory_opt['suggestions']), use_container_width=True)
235
- st.info("💡 Converting object columns to categories can significantly reduce memory usage for repeated values.")
236
 
237
- # Enhanced Column Analysis
238
- st.subheader("📋 Column Analysis")
239
  cardinality_df = calculate_column_cardinality(self.df)
240
 
241
- if not cardinality_df.empty:
242
- # Interactive filters
243
- col1, col2 = st.columns(2)
244
- with col1:
245
- col_types = cardinality_df['Type'].unique()
246
- selected_types = st.multiselect("Filter by Cardinality Type",
247
- col_types,
248
- default=col_types,
249
- help="Filter columns by their cardinality classification")
250
- with col2:
251
- data_types = cardinality_df['Data Type'].unique()
252
- selected_data_types = st.multiselect("Filter by Data Type",
253
- data_types,
254
- default=data_types,
255
- help="Filter columns by their pandas data type")
256
-
257
- # Apply filters
258
- filtered_df = cardinality_df[
259
- (cardinality_df['Type'].isin(selected_types)) &
260
- (cardinality_df['Data Type'].isin(selected_data_types))
261
- ]
262
-
263
- st.dataframe(filtered_df, use_container_width=True)
264
-
265
- # Actionable insights
266
- self._display_cardinality_insights(filtered_df)
267
-
268
- # Data Types Visualization
269
- if self.stats['dtypes']:
270
- col1, col2 = st.columns(2)
271
- with col1:
272
- st.subheader("📊 Data Types Distribution")
273
- fig = px.pie(values=list(self.stats['dtypes'].values()),
274
- names=list(self.stats['dtypes'].keys()),
275
- title="Data Types Distribution")
276
- fig.update_traces(textposition='inside', textinfo='percent+label')
277
- st.plotly_chart(fig, use_container_width=True)
278
-
279
- with col2:
280
- st.subheader("📈 Column Count by Type")
281
- fig = px.bar(x=list(self.stats['dtypes'].keys()),
282
- y=list(self.stats['dtypes'].values()),
283
- title="Column Count by Data Type")
284
- st.plotly_chart(fig, use_container_width=True)
285
-
286
- # Enhanced Sample Data Display
287
- self._display_sample_data()
288
 
289
- # Missing Values Analysis
290
- self._analyze_missing_values()
291
-
292
- # Record insights
293
- self._record_stage1_insights(quality_metrics, memory_opt, cardinality_df)
294
-
295
- def _display_cardinality_insights(self, cardinality_df: pd.DataFrame):
296
- """Display actionable insights from cardinality analysis"""
297
- if cardinality_df.empty:
298
- return
299
-
300
- # Key findings
301
- id_cols = cardinality_df[cardinality_df['Type'] == 'Unique Identifier']['Column'].tolist()
302
- const_cols = cardinality_df[cardinality_df['Type'] == 'Constant']['Column'].tolist()
303
- low_card_cols = cardinality_df[cardinality_df['Type'].str.contains('Low')]['Column'].tolist()
304
 
 
 
305
  if id_cols:
306
- st.success(f"🔑 **Potential ID Columns:** {', '.join(id_cols[:3])}" +
307
- (f" (+{len(id_cols)-3} more)" if len(id_cols) > 3 else ""))
308
-
309
  if const_cols:
310
- st.warning(f"⚠️ **Constant Columns (consider removing):** {', '.join(const_cols[:3])}" +
311
- (f" (+{len(const_cols)-3} more)" if len(const_cols) > 3 else ""))
312
-
313
- if low_card_cols:
314
- st.info(f"📊 **Good for Grouping/Filtering:** {', '.join(low_card_cols[:3])}" +
315
- (f" (+{len(low_card_cols)-3} more)" if len(low_card_cols) > 3 else ""))
316
-
317
- def _display_sample_data(self):
318
- """Enhanced sample data display with pagination"""
319
- st.subheader("👀 Sample Data")
320
 
 
 
 
 
 
 
 
 
 
 
321
  total_pages = (len(self.df) - 1) // self.page_size + 1
322
 
323
- col1, col2, col3 = st.columns([2, 1, 1])
324
- with col1:
325
- if total_pages > 1:
326
- page = st.slider("Page", 0, total_pages - 1, 0,
327
- help=f"Navigate through {total_pages} pages of data")
328
- sample_data = self.get_paginated_data(page)
329
- start_row = page * self.page_size + 1
330
- end_row = min((page + 1) * self.page_size, len(self.df))
331
- st.caption(f"Showing rows {start_row:,} to {end_row:,} of {len(self.df):,}")
332
- else:
333
- sample_data = self.df.head(20)
334
- page = 0
335
-
336
- with col2:
337
- show_dtypes = st.checkbox("Show Data Types", help="Display column data types")
338
- with col3:
339
- max_cols = st.number_input("Max Columns", min_value=5, max_value=50, value=10,
340
- help="Limit displayed columns for better readability")
341
 
342
- # Display data with optional type info
343
- display_df = sample_data.iloc[:, :max_cols]
344
 
345
- if show_dtypes:
346
- # Create a summary row with data types
347
- type_row = pd.DataFrame([display_df.dtypes.astype(str)],
348
- index=['Data Type'])
349
- type_row.columns = display_df.columns
350
-
351
- st.dataframe(type_row, use_container_width=True)
352
- st.dataframe(display_df, use_container_width=True)
353
- else:
354
- st.dataframe(display_df, use_container_width=True)
355
-
356
- def _analyze_missing_values(self):
357
- """Enhanced missing values analysis"""
358
  missing_df = calculate_missing_data(self.df)
359
-
360
  if not missing_df.empty:
361
- st.subheader("🕳️ Missing Values Analysis")
362
-
363
- # Summary metrics
364
- total_missing = missing_df['Missing Count'].sum()
365
- affected_cols = len(missing_df)
366
-
367
- col1, col2, col3 = st.columns(3)
368
- with col1:
369
- st.metric("Total Missing", f"{total_missing:,}")
370
- with col2:
371
- st.metric("Affected Columns", affected_cols)
372
- with col3:
373
- worst_col_pct = missing_df.iloc[0]['Missing %'] if len(missing_df) > 0 else 0
374
- st.metric("Worst Column", f"{worst_col_pct:.1f}%")
375
-
376
- # Detailed table
377
  st.dataframe(missing_df, use_container_width=True)
378
 
379
- # Visualization for top missing columns
380
- if len(missing_df) > 1:
381
- top_missing = missing_df.head(10)
382
- fig = px.bar(top_missing, x='Column', y='Missing %',
383
- title="Missing Values by Column",
384
- color='Missing %',
385
- color_continuous_scale='Reds')
386
- fig.update_layout(xaxis_tickangle=-45)
387
- st.plotly_chart(fig, use_container_width=True)
388
-
389
- # Actionable recommendations
390
- high_missing = missing_df[missing_df['Missing %'] > 50]
391
- if not high_missing.empty:
392
- st.error(f"⚠️ **Critical:** {len(high_missing)} columns have >50% missing data")
393
- st.write("Consider removing these columns or investigating data collection issues.")
394
  else:
395
- st.success("✅ **Excellent!** No missing values found in the dataset")
396
-
397
- def _record_stage1_insights(self, quality_metrics, memory_opt, cardinality_df):
398
- """Record insights from stage 1 analysis"""
399
- # Quality insights
400
- if quality_metrics['score'] >= 90:
401
- self.add_insight("Excellent data quality detected", 1, "success")
402
- elif quality_metrics['score'] < 70:
403
- self.add_insight(f"Data quality needs attention (Score: {quality_metrics['score']:.1f}/100)", 1, "warning")
404
 
405
- # Memory insights
406
  if memory_opt['potential_savings_pct'] > 20:
407
- self.add_insight(f"Significant memory optimization opportunity: {memory_opt['potential_savings_pct']:.1f}%", 1, "info")
408
 
409
- # Structure insights
410
- if not cardinality_df.empty:
411
- id_cols = len(cardinality_df[cardinality_df['Type'] == 'Unique Identifier'])
412
- const_cols = len(cardinality_df[cardinality_df['Type'] == 'Constant'])
413
-
414
- if id_cols > 0:
415
- self.add_insight(f"Found {id_cols} potential identifier column(s)", 1, "info")
416
- if const_cols > 0:
417
- self.add_insight(f"Found {const_cols} constant column(s) - consider removal", 1, "warning")
418
 
419
  def stage_2_exploration(self):
420
- """Stage 2: Enhanced Exploratory Data Analysis"""
421
  st.subheader("🔍 Exploratory Data Analysis")
422
 
423
- with st.expander("ℹ️ Help - Exploratory Analysis", expanded=False):
424
- st.markdown("""
425
- **This stage helps you:**
426
- - Understand distributions of your variables
427
- - Identify patterns and relationships
428
- - Spot potential anomalies or interesting features
429
- - Guide further analysis decisions
430
- """)
431
-
432
  numeric_cols = self.column_types['numeric']
433
  categorical_cols = self.column_types['categorical']
434
 
435
- if not numeric_cols and not categorical_cols:
436
- st.warning("⚠️ No suitable columns found for analysis. Please check your data types.")
437
- return
438
-
439
- # Enhanced Numeric Analysis
440
  if numeric_cols:
441
- self._analyze_numeric_variables(numeric_cols)
442
-
443
- # Enhanced Categorical Analysis
444
- if categorical_cols:
445
- self._analyze_categorical_variables(categorical_cols)
446
-
447
- # Relationship Analysis
448
- self._analyze_relationships(numeric_cols, categorical_cols)
449
-
450
- def _analyze_numeric_variables(self, numeric_cols: List[str]):
451
- """Enhanced numeric variable analysis"""
452
- st.subheader("🔢 Numeric Variables Analysis")
453
-
454
- col1, col2 = st.columns([1, 1])
455
- with col1:
456
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols,
457
- help="Choose a numeric column to analyze its distribution")
458
- with col2:
459
- chart_type = st.selectbox("Chart type:", ["Histogram", "Box Plot", "Violin Plot", "Q-Q Plot"])
460
-
461
- if selected_numeric:
462
- # Statistics summary
463
- stats_dict = calculate_numeric_stats(self.df, selected_numeric)
464
-
465
- if stats_dict:
466
- col1, col2, col3, col4 = st.columns(4)
467
- with col1:
468
- st.metric("Mean", f"{stats_dict['mean']:.2f}")
469
- with col2:
470
- st.metric("Median", f"{stats_dict['median']:.2f}")
471
- with col3:
472
- st.metric("Std Dev", f"{stats_dict['std']:.2f}")
473
- with col4:
474
- skew_interpretation = "Right-skewed" if stats_dict['skewness'] > 0.5 else "Left-skewed" if stats_dict['skewness'] < -0.5 else "Symmetric"
475
- st.metric("Skewness", f"{stats_dict['skewness']:.2f}", help=skew_interpretation)
476
-
477
- # Enhanced visualizations
478
- try:
479
- col1, col2 = st.columns(2)
480
-
481
- with col1:
482
- if chart_type == "Histogram":
483
- fig = px.histogram(self.df, x=selected_numeric,
484
- title=f"Distribution of {selected_numeric}",
485
- marginal="rug")
486
- elif chart_type == "Box Plot":
487
- fig = px.box(self.df, y=selected_numeric,
488
- title=f"Box Plot of {selected_numeric}")
489
- elif chart_type == "Violin Plot":
490
- fig = px.violin(self.df, y=selected_numeric,
491
- title=f"Violin Plot of {selected_numeric}")
492
- else: # Q-Q Plot
493
- from scipy import stats
494
- qq_data = stats.probplot(self.df[selected_numeric].dropna(), dist="norm")
495
- fig = go.Figure()
496
- fig.add_scatter(x=qq_data[0][0], y=qq_data[0][1], mode='markers',
497
- name='Data Points')
498
- fig.add_scatter(x=qq_data[0][0], y=qq_data[1][1] + qq_data[1][0] * qq_data[0][0],
499
- mode='lines', name='Normal Distribution')
500
- fig.update_layout(title=f"Q-Q Plot of {selected_numeric}",
501
- xaxis_title="Theoretical Quantiles",
502
- yaxis_title="Sample Quantiles")
503
-
504
- st.plotly_chart(fig, use_container_width=True)
505
-
506
- with col2:
507
- # Summary statistics table
508
- if stats_dict:
509
- summary_data = {
510
- 'Statistic': ['Count', 'Mean', 'Median', 'Std Dev', 'Min', 'Max', 'Q25', 'Q75', 'Skewness', 'Kurtosis'],
511
- 'Value': [
512
- len(self.df[selected_numeric].dropna()),
513
- f"{stats_dict['mean']:.3f}",
514
- f"{stats_dict['median']:.3f}",
515
- f"{stats_dict['std']:.3f}",
516
- f"{stats_dict['min']:.3f}",
517
- f"{stats_dict['max']:.3f}",
518
- f"{stats_dict['q25']:.3f}",
519
- f"{stats_dict['q75']:.3f}",
520
- f"{stats_dict['skewness']:.3f}",
521
- f"{stats_dict['kurtosis']:.3f}"
522
- ]
523
- }
524
- st.dataframe(pd.DataFrame(summary_data), use_container_width=True, hide_index=True)
525
-
526
- # Distribution insights
527
- if abs(stats_dict['skewness']) > 1:
528
- skew_type = "highly right-skewed" if stats_dict['skewness'] > 1 else "highly left-skewed"
529
- self.add_insight(f"{selected_numeric} is {skew_type} (skewness: {stats_dict['skewness']:.2f})", 2, "info")
530
-
531
- if stats_dict['kurtosis'] > 3:
532
- self.add_insight(f"{selected_numeric} has heavy tails (kurtosis: {stats_dict['kurtosis']:.2f})", 2, "info")
533
 
534
- except Exception as e:
535
- st.error(f"Error creating visualization: {str(e)}")
536
- logger.error(f"Visualization error for {selected_numeric}: {str(e)}")
537
-
538
- def _analyze_categorical_variables(self, categorical_cols: List[str]):
539
- """Enhanced categorical variable analysis"""
540
- st.subheader("📝 Categorical Variables Analysis")
541
-
542
- selected_categorical = st.selectbox("Select categorical column:", categorical_cols,
543
- help="Choose a categorical column to analyze its distribution")
544
-
545
- if selected_categorical:
546
- try:
547
- # Get value counts with error handling
548
- value_counts = get_value_counts(self.df, selected_categorical, top_n=20)
549
-
550
- if value_counts is not None and not value_counts.empty:
551
- total_categories = self.df[selected_categorical].nunique()
552
-
553
- # Summary metrics
554
- col1, col2, col3 = st.columns(3)
555
- with col1:
556
- st.metric("Total Categories", total_categories)
557
- with col2:
558
- top_category_pct = (value_counts.iloc[0] / len(self.df)) * 100
559
- st.metric("Top Category", f"{top_category_pct:.1f}%")
560
- with col3:
561
- entropy = -sum((value_counts / value_counts.sum()) * np.log2(value_counts / value_counts.sum() + 1e-10))
562
- st.metric("Diversity (Entropy)", f"{entropy:.2f}")
563
-
564
- # Visualization
565
- col1, col2 = st.columns(2)
566
- with col1:
567
- fig = px.bar(x=value_counts.index, y=value_counts.values,
568
- title=f"Top {min(20, len(value_counts))} Values in {selected_categorical}")
569
- fig.update_layout(xaxis_tickangle=-45)
570
- st.plotly_chart(fig, use_container_width=True)
571
-
572
- with col2:
573
- # Show data table
574
- display_data = pd.DataFrame({
575
- 'Category': value_counts.index,
576
- 'Count': value_counts.values,
577
- 'Percentage': np.round((value_counts.values / len(self.df)) * 100, 2)
578
- })
579
- st.dataframe(display_data, use_container_width=True, hide_index=True)
580
-
581
- # Insights
582
- if total_categories > 100:
583
- self.add_insight(f"{selected_categorical} has very high cardinality ({total_categories} categories)", 2, "warning")
584
- elif top_category_pct > 90:
585
- self.add_insight(f"{selected_categorical} is highly imbalanced (top category: {top_category_pct:.1f}%)", 2, "warning")
586
-
587
- else:
588
- st.warning(f"⚠️ Unable to analyze column '{selected_categorical}' - it may be empty or have issues")
589
 
590
- except Exception as e:
591
- st.error(f"Error analyzing categorical variable: {str(e)}")
592
- logger.error(f"Categorical analysis error for {selected_categorical}: {str(e)}")
593
-
594
- def _analyze_relationships(self, numeric_cols: List[str], categorical_cols: List[str]):
595
- """Enhanced relationship analysis"""
596
- if len(numeric_cols) >= 2:
597
- st.subheader("🔗 Variable Relationships")
598
 
599
- # Correlation matrix
600
- corr_matrix = calculate_correlation_matrix(self.df)
601
- if corr_matrix is not None and not corr_matrix.empty:
602
- col1, col2 = st.columns(2)
603
-
604
- with col1:
605
- fig = px.imshow(corr_matrix,
606
- text_auto=True,
607
- aspect="auto",
608
- title="Correlation Matrix",
609
- color_continuous_scale='RdBu')
 
610
  st.plotly_chart(fig, use_container_width=True)
611
-
612
- with col2:
613
- # Find strongest correlations
614
- corr_pairs = []
615
  for i in range(len(corr_matrix.columns)):
616
  for j in range(i+1, len(corr_matrix.columns)):
617
- col1_name = corr_matrix.columns[i]
618
- col2_name = corr_matrix.columns[j]
619
- corr_val = corr_matrix.iloc[i, j]
620
- if not np.isnan(corr_val):
621
- corr_pairs.append({
622
- 'Variable 1': col1_name,
623
- 'Variable 2': col2_name,
624
- 'Correlation': round(corr_val, 3),
625
- 'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
626
- })
627
 
628
- if corr_pairs:
629
- corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
630
- st.subheader("🎯 Strongest Correlations")
631
- st.dataframe(corr_df.head(10), use_container_width=True, hide_index=True)
632
-
633
- # Record strongest correlation insight
634
- strongest = corr_df.iloc[0]
635
- self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} ({strongest['Correlation']})", 2, "info")
 
 
 
 
 
 
 
 
636
 
637
  def stage_3_cleaning(self):
638
- """Stage 3: Enhanced Data Quality Assessment and Cleaning"""
639
- st.subheader("🧹 Data Quality & Cleaning")
640
-
641
- with st.expander("ℹ️ Help - Data Cleaning", expanded=False):
642
- st.markdown("""
643
- **Available cleaning operations:**
644
- - **Missing Values:** Fill with statistics, drop rows, or use custom values
645
- - **Duplicates:** Remove identical rows
646
- - **Outliers:** Remove or cap extreme values
647
- - **Data Types:** Convert columns to appropriate types
648
- """)
649
-
650
- # Progress tracking
651
- cleaning_progress = st.empty()
652
 
653
- # Enhanced Missing Values Handling
654
- self._handle_missing_values()
655
 
656
- # Enhanced Duplicates Handling
657
- self._handle_duplicates()
658
-
659
- # Enhanced Mixed Types Handling
660
- self._handle_mixed_types()
661
-
662
- # Enhanced Outlier Detection
663
- self._handle_outliers()
664
-
665
- # Cleaning Summary
666
- self._display_cleaning_summary()
667
-
668
- def _handle_missing_values(self):
669
- """Enhanced missing values handling with preview"""
670
- missing_df = calculate_missing_data(self.df)
671
-
672
- if not missing_df.empty:
673
- st.subheader("🕳️ Missing Values Treatment")
674
 
675
- # Select column and method
676
- col1, col2, col3 = st.columns(3)
677
  with col1:
678
- selected_col = st.selectbox("Column to clean:", missing_df['Column'].tolist())
 
679
  with col2:
680
- col_dtype = str(self.df[selected_col].dtype)
681
- if 'int' in col_dtype or 'float' in col_dtype:
682
- methods = ["Drop rows", "Mean", "Median", "Mode", "Custom value"]
683
- else:
684
- methods = ["Drop rows", "Mode", "Custom value"]
685
- fill_method = st.selectbox("Fill method:", methods)
686
- with col3:
687
- if fill_method == "Custom value":
688
- if 'int' in col_dtype or 'float' in col_dtype:
689
- custom_value = st.number_input("Custom value:", value=0.0)
690
- else:
691
- custom_value = st.text_input("Custom value:", value="Unknown")
692
 
693
- # Preview impact
694
- if selected_col:
695
- missing_count = self.df[selected_col].isnull().sum()
696
- total_count = len(self.df)
697
-
698
- if fill_method == "Drop rows":
699
- remaining_rows = total_count - missing_count
700
- st.info(f"📊 **Preview:** Will remove {missing_count} rows, keeping {remaining_rows} rows")
701
- else:
702
- st.info(f"📊 **Preview:** Will fill {missing_count} missing values")
703
-
704
- # Apply cleaning
705
- if st.button("✨ Apply Missing Value Treatment", type="primary"):
706
  try:
707
- original_missing = self.df[selected_col].isnull().sum()
708
-
709
  if fill_method == "Drop rows":
710
  self.df = self.df.dropna(subset=[selected_col])
711
- operation = f"Dropped {original_missing} rows with missing values in {selected_col}"
712
  else:
713
  if fill_method == "Mean":
714
  fill_value = self.df[selected_col].mean()
715
  elif fill_method == "Median":
716
  fill_value = self.df[selected_col].median()
717
  elif fill_method == "Mode":
718
- mode_result = self.df[selected_col].mode()
719
- fill_value = mode_result.iloc[0] if not mode_result.empty else "Unknown"
720
- else:
721
- fill_value = custom_value
722
 
723
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
724
- operation = f"Filled {original_missing} missing values in {selected_col} with {fill_method}"
725
 
726
- self.cleaning_history.append(operation)
727
- st.success(f"✅ {operation}")
728
- st.rerun()
729
-
730
  except Exception as e:
731
- st.error(f"Error applying treatment: {str(e)}")
732
- else:
733
- st.success("✅ No missing values found!")
734
-
735
- def _handle_duplicates(self):
736
- """Enhanced duplicate handling"""
737
  if self.stats['duplicates'] > 0:
738
- st.subheader("👥 Duplicate Rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
 
740
- duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100
741
- st.warning(f"⚠️ Found **{self.stats['duplicates']:,}** duplicate rows ({duplicate_pct:.1f}% of data)")
742
 
743
- # Show sample duplicates
744
- duplicates = self.df[self.df.duplicated(keep=False)].head(10)
745
- if not duplicates.empty:
746
- st.write("**Sample duplicate rows:**")
747
- st.dataframe(duplicates, use_container_width=True)
748
 
749
- if st.button("🗑️ Remove Duplicate Rows", type="primary"):
750
  try:
751
- original_len = len(self.df)
752
- self.df = self.df.drop_duplicates()
753
- removed = original_len - len(self.df)
754
- operation = f"Removed {removed} duplicate rows"
755
- self.cleaning_history.append(operation)
756
- st.success(f" {operation}")
757
- st.rerun()
758
  except Exception as e:
759
- st.error(f"Error removing duplicates: {str(e)}")
760
- else:
761
- st.success("✅ No duplicate rows found!")
762
-
763
- def _handle_mixed_types(self):
764
- """Enhanced mixed types handling"""
765
- mixed_types = detect_mixed_types(self.df)
766
 
767
- if mixed_types:
768
- st.subheader("🔀 Mixed Data Types")
 
 
 
769
 
770
- for issue in mixed_types:
771
- col = issue['column']
772
- problems = issue['problematic_values']
773
- pct = issue['percentage']
774
-
775
- st.warning(f"⚠️ **{col}:** {problems} values ({pct:.1f}%) cannot be converted to numeric")
776
 
777
- # Show sample problematic values
778
- if 'sample_issues' in issue:
779
- sample_issues = issue['sample_issues']
780
- st.write("**Sample problematic values:**")
781
- for value, count in list(sample_issues.items())[:5]:
782
- st.write(f"• '{value}' ({count} occurrences)")
783
 
784
- col1, col2 = st.columns(2)
785
- with col1:
786
- fix_method = st.selectbox(f"Fix method for {col}:",
787
- ["Convert to numeric (coerce errors)", "Keep as text"],
788
- key=f"fix_{col}")
789
- with col2:
790
- if st.button(f"🔧 Fix {col}", key=f"apply_{col}"):
791
- try:
792
- if fix_method == "Convert to numeric (coerce errors)":
793
- self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
794
- operation = f"Converted {col} to numeric (with coercion)"
795
- else:
796
- operation = f"Kept {col} as text type"
797
 
798
- self.cleaning_history.append(operation)
799
- st.success(f" {operation}")
800
- st.rerun()
801
- except Exception as e:
802
- st.error(f"❌ Error fixing {col}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  else:
804
- st.success("✅ No mixed data type issues found!")
 
805
 
806
def _handle_outliers(self):
    """Detect outliers in one numeric column and optionally remove or cap them.

    Renders the column and detection-method pickers, asks
    ``calculate_outliers`` for the flagged rows, shows their statistics and a
    scatter plot, and on request either drops the flagged rows or clips the
    column. Each applied treatment is appended to ``self.cleaning_history``
    before the app reruns.
    """
    numeric_cols = self.column_types['numeric']
    if not numeric_cols:
        return

    st.subheader("🎯 Outlier Detection")

    # Defaults keep both tuning values defined whichever method is selected.
    threshold = 3.0
    percentile = 1.0

    col1, col2, col3 = st.columns(3)
    with col1:
        selected_col = st.selectbox("Column for outlier detection:", numeric_cols)
    with col2:
        detection_method = st.selectbox("Detection method:",
                                        ["IQR (Interquartile Range)", "Z-Score", "Percentile"])
    with col3:
        if detection_method == "Z-Score":
            threshold = st.number_input("Z-Score threshold:", min_value=1.0, max_value=5.0, value=3.0)
        elif detection_method == "Percentile":
            percentile = st.slider("Outlier percentile:", 0.1, 5.0, 1.0)

    if not selected_col:
        return

    def cap_bounds(series):
        """Return (lower, upper) clipping bounds matching the chosen method."""
        if detection_method == "Z-Score":
            center, spread = series.mean(), series.std()
            return center - threshold * spread, center + threshold * spread
        if detection_method == "Percentile":
            return series.quantile(percentile / 100), series.quantile(1 - percentile / 100)
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    try:
        method_map = {
            "IQR (Interquartile Range)": "iqr",
            "Z-Score": "zscore",
            "Percentile": "percentile"
        }
        # NOTE(review): calculate_outliers' signature is not visible here, so the
        # threshold/percentile inputs are not forwarded to detection - confirm
        # whether it accepts them.
        outliers = calculate_outliers(self.df, selected_col, method_map[detection_method])

        if outliers is None or outliers.empty:
            st.success(f"✅ No outliers detected in '{selected_col}' using {detection_method}")
            return

        outlier_count = len(outliers)
        outlier_pct = (outlier_count / len(self.df)) * 100
        st.warning(f"⚠️ Found **{outlier_count}** potential outliers ({outlier_pct:.1f}% of data)")

        stats_col, plot_col = st.columns(2)
        with stats_col:
            outlier_stats = outliers[selected_col].describe()
            st.write("**Outlier Statistics:**")
            st.dataframe(outlier_stats.to_frame().T, use_container_width=True)

        with plot_col:
            # All points in blue, flagged points overlaid in red.
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=self.df.index,
                y=self.df[selected_col],
                mode='markers',
                name='Normal Data',
                marker=dict(color='blue', opacity=0.6)
            ))
            fig.add_trace(go.Scatter(
                x=outliers.index,
                y=outliers[selected_col],
                mode='markers',
                name='Outliers',
                marker=dict(color='red', size=8)
            ))
            fig.update_layout(title=f"Outliers in {selected_col}")
            st.plotly_chart(fig, use_container_width=True)

        treatment_method = st.selectbox("Outlier treatment:",
                                        ["None", "Remove outliers", "Cap at bounds"])
        if treatment_method == "None":
            return

        st.info(f"📊 **Preview:** This will affect {outlier_count} data points")

        if st.button("🔧 Apply Outlier Treatment", type="primary"):
            try:
                if treatment_method == "Remove outliers":
                    self.df = self.df[~self.df.index.isin(outliers.index)]
                    operation = f"Removed {outlier_count} outliers from {selected_col}"
                else:  # Cap at bounds
                    # Clip with bounds derived from the selected method, so that
                    # capping does not alter points the method never flagged.
                    lower_bound, upper_bound = cap_bounds(self.df[selected_col])
                    self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
                    operation = f"Capped outliers in {selected_col} to bounds"

                self.cleaning_history.append(operation)
                st.success(f"✅ {operation}")
                st.rerun()

            except Exception as e:
                st.error(f"❌ Error treating outliers: {str(e)}")

    except Exception as e:
        st.error(f"❌ Error detecting outliers: {str(e)}")
900
-
901
def _display_cleaning_summary(self):
    """Show the history of cleaning operations and their effect on the data.

    Lists every entry in ``self.cleaning_history``, compares row count and
    memory footprint of ``self.df`` against ``self.original_df``, and offers a
    button that restores the original data. When no operation has been
    applied, only an informational note is shown.
    """
    if not self.cleaning_history:
        st.info("ℹ️ No cleaning operations performed yet")
        return

    st.subheader("📋 Cleaning Operations History")

    for i, operation in enumerate(self.cleaning_history, 1):
        st.write(f"**{i}.** {operation}")

    # Measure each frame once; deep=True includes object-column payloads.
    original_memory = self.original_df.memory_usage(deep=True).sum() / 1024**2
    current_memory = self.df.memory_usage(deep=True).sum() / 1024**2

    col1, col2 = st.columns(2)
    with col1:
        st.metric("Original Rows", f"{self.original_df.shape[0]:,}")
        st.metric("Original Memory", f"{original_memory:.1f} MB")
    with col2:
        st.metric("Current Rows", f"{self.df.shape[0]:,}",
                  delta=f"{self.df.shape[0] - self.original_df.shape[0]:,}")
        st.metric("Current Memory", f"{current_memory:.1f} MB",
                  delta=f"{current_memory - original_memory:.1f} MB")

    # Rollback option
    if st.button("↩️ Reset to Original Data", help="Restore original dataset"):
        self.df = self.original_df.copy()
        self.cleaning_history = []
        st.success("✅ Data reset to original state")
        st.rerun()

    # This method runs on every render; record the summary insight only once
    # per distinct operation count instead of appending a duplicate each time.
    summary = f"Applied {len(self.cleaning_history)} cleaning operations"
    if not any(entry.get('insight') == summary for entry in self.insights):
        self.add_insight(summary, 3, "info")
932
-
933
def stage_4_analysis(self):
    """Stage 4: render the advanced-analysis sections the data can support.

    Relationship analysis and statistical testing need at least two numeric
    columns; group analysis needs one categorical and one numeric column.
    """
    st.subheader("🔬 Advanced Analysis")

    help_panel = st.expander("ℹ️ Help - Advanced Analysis", expanded=False)
    with help_panel:
        st.markdown("""
        **Advanced analysis includes:**
        - **Relationships:** Correlation and scatter plot analysis
        - **Group Analysis:** Compare metrics across categories
        - **Distribution Analysis:** Statistical testing and comparisons
        """)

    numeric_cols = self.column_types['numeric']
    categorical_cols = self.column_types['categorical']
    has_numeric_pair = len(numeric_cols) >= 2

    # Relationship analysis
    if has_numeric_pair:
        self._advanced_relationship_analysis(numeric_cols)

    # Group analysis
    if categorical_cols and numeric_cols:
        self._advanced_group_analysis(categorical_cols, numeric_cols)

    # Statistical testing
    if has_numeric_pair:
        self._statistical_testing(numeric_cols, categorical_cols)
959
-
960
def _advanced_relationship_analysis(self, numeric_cols: List[str]):
    """Plot two numeric columns against each other and describe their correlation.

    Args:
        numeric_cols: Names of the numeric columns offered for the X and Y axes.

    A scatter plot with an OLS trendline is drawn from at most 5,000 sampled
    rows; the correlation itself is computed on the full frame. When the
    correlation is undefined (NaN) a warning is shown and no insight is
    recorded.
    """
    st.subheader("🔗 Variable Relationships")

    col1, col2, col3 = st.columns(3)
    with col1:
        x_var = st.selectbox("X Variable:", numeric_cols)
    with col2:
        y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
    with col3:
        color_var = st.selectbox("Color by (optional):",
                                 ["None"] + self.column_types['categorical'][:10])

    if not (x_var and y_var):
        return

    try:
        # Sample for performance
        sample_size = min(5000, len(self.df))
        if len(self.df) > sample_size:
            sample_df = self.df.sample(n=sample_size, random_state=42)
            st.info(f"📊 Showing sample of {sample_size:,} points for performance")
        else:
            sample_df = self.df

        scatter_kwargs = dict(x=x_var, y=y_var,
                              title=f"Relationship: {x_var} vs {y_var}",
                              trendline="ols")
        if color_var != "None":
            scatter_kwargs["color"] = color_var
        fig = px.scatter(sample_df, **scatter_kwargs)
        st.plotly_chart(fig, use_container_width=True)

        correlation = self.df[x_var].corr(self.df[y_var])

        # A constant column or too few paired values gives NaN; reporting that
        # as a "weak negative" correlation would be misleading.
        if pd.isna(correlation):
            st.warning(f"⚠️ Correlation between {x_var} and {y_var} is undefined")
            return

        if abs(correlation) > 0.7:
            strength = "Strong"
        elif abs(correlation) > 0.3:
            strength = "Moderate"
        else:
            strength = "Weak"

        if correlation > 0:
            direction = "Positive"
        elif correlation < 0:
            direction = "Negative"
        else:
            direction = "None"

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Correlation", f"{correlation:.3f}")
        with col2:
            st.metric("Strength", strength)
        with col3:
            st.metric("Direction", direction)

        # Record insight
        if direction == "None":
            message = f"No linear correlation ({correlation:.3f}) between {x_var} and {y_var}"
        else:
            message = (f"{strength} {direction.lower()} correlation "
                       f"({correlation:.3f}) between {x_var} and {y_var}")
        self.add_insight(message, 4, "info")

    except Exception as e:
        st.error(f"❌ Error in relationship analysis: {str(e)}")
1018
-
1019
def _advanced_group_analysis(self, categorical_cols: List[str], numeric_cols: List[str]):
    """Compare a numeric metric across the levels of a categorical column.

    Args:
        categorical_cols: Columns offered as the grouping variable.
        numeric_cols: Columns offered as the metric to summarise.

    Shows the table returned by ``calculate_group_stats``, a box plot and a
    bar chart of group means (only for 20 groups or fewer), and highlights the
    groups with the highest and lowest mean.
    """
    st.subheader("👥 Group Analysis")

    picker_left, picker_right = st.columns(2)
    with picker_left:
        group_var = st.selectbox("Group by:", categorical_cols)
    with picker_right:
        metric_var = st.selectbox("Analyze metric:", numeric_cols)

    if not (group_var and metric_var):
        return

    try:
        group_stats = calculate_group_stats(self.df, group_var, metric_var)
        if group_stats is None or group_stats.empty:
            return

        # Display statistics
        st.dataframe(group_stats, use_container_width=True)

        # Visualization
        unique_groups = self.df[group_var].nunique()
        if unique_groups > 20:
            st.info(f"ℹ️ Too many groups ({unique_groups}) for visualization. Showing statistics only.")
        else:
            box_col, bar_col = st.columns(2)
            with box_col:
                box_fig = px.box(self.df, x=group_var, y=metric_var,
                                 title=f"{metric_var} by {group_var}")
                box_fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(box_fig, use_container_width=True)

            with bar_col:
                # Mean comparison, largest first
                group_means = self.df.groupby(group_var)[metric_var].mean().sort_values(ascending=False)
                bar_fig = px.bar(x=group_means.index, y=group_means.values,
                                 title=f"Average {metric_var} by {group_var}")
                bar_fig.update_layout(xaxis_tickangle=-45)
                st.plotly_chart(bar_fig, use_container_width=True)

        # Extremes of the per-group mean
        means = group_stats['mean']
        best_group = group_stats.loc[means.idxmax(), group_var]
        best_value = means.max()
        worst_group = group_stats.loc[means.idxmin(), group_var]
        worst_value = means.min()

        high_col, low_col = st.columns(2)
        with high_col:
            st.success(f"🏆 **Highest {metric_var}:** {best_group} ({best_value:.2f})")
        with low_col:
            st.info(f"📉 **Lowest {metric_var}:** {worst_group} ({worst_value:.2f})")

        self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4, "success")

    except Exception as e:
        st.error(f"❌ Error in group analysis: {str(e)}")
1073
-
1074
def _statistical_testing(self, numeric_cols: List[str], categorical_cols: List[str]):
    """Run an on-demand significance test chosen by the user.

    Args:
        numeric_cols: Numeric columns available for testing; nothing is
            rendered unless there are at least two.
        categorical_cols: Categorical columns offered as the grouping variable
            for the group comparison.

    "Correlation Test" reports Pearson and Spearman coefficients with their
    p-values. "Group Comparison" runs a one-way ANOVA of a numeric metric
    across the levels of a categorical column. Results with p < 0.05 are
    recorded as insights.
    """
    if len(numeric_cols) < 2:
        return

    st.subheader("📊 Statistical Testing")

    test_type = st.selectbox("Select test type:",
                             ["Correlation Test", "Group Comparison"])

    if test_type == "Correlation Test":
        col1, col2 = st.columns(2)
        with col1:
            var1 = st.selectbox("Variable 1:", numeric_cols, key="corr_var1")
        with col2:
            var2 = st.selectbox("Variable 2:",
                                [col for col in numeric_cols if col != var1],
                                key="corr_var2")

        if st.button("🧪 Run Correlation Test"):
            try:
                from scipy.stats import pearsonr, spearmanr

                # Both tests need complete pairs.
                clean_data = self.df[[var1, var2]].dropna()

                if len(clean_data) < 10:
                    st.warning("⚠️ Insufficient data for reliable correlation testing")
                else:
                    pearson_corr, pearson_p = pearsonr(clean_data[var1], clean_data[var2])
                    # Spearman is rank-based.
                    spearman_corr, spearman_p = spearmanr(clean_data[var1], clean_data[var2])

                    results = (
                        ("Pearson Correlation", pearson_corr, pearson_p),
                        ("Spearman Correlation", spearman_corr, spearman_p),
                    )
                    for column, (title, coefficient, p_value) in zip(st.columns(2), results):
                        with column:
                            st.subheader(title)
                            st.metric("Correlation", f"{coefficient:.3f}")
                            st.metric("P-value", f"{p_value:.4f}")
                            if p_value < 0.05:
                                st.success("✅ Statistically significant")
                            else:
                                st.warning("⚠️ Not statistically significant")

                    if pearson_p < 0.05:
                        self.add_insight(
                            f"Significant correlation between {var1} and {var2} (p={pearson_p:.4f})",
                            4, "success")

            except Exception as e:
                st.error(f"❌ Error in correlation testing: {str(e)}")

    elif test_type == "Group Comparison":
        # Previously this option was selectable but rendered nothing.
        if not categorical_cols:
            st.info("ℹ️ Group comparison needs at least one categorical column")
            return

        col1, col2 = st.columns(2)
        with col1:
            group_var = st.selectbox("Group by:", categorical_cols, key="group_test_var")
        with col2:
            metric_var = st.selectbox("Metric:", numeric_cols, key="group_test_metric")

        if st.button("🧪 Run Group Comparison"):
            try:
                from scipy.stats import f_oneway

                clean_data = self.df[[group_var, metric_var]].dropna()
                # Groups with fewer than two observations are left out.
                samples = [grp[metric_var].to_numpy()
                           for _, grp in clean_data.groupby(group_var)
                           if len(grp) >= 2]

                if len(samples) < 2:
                    st.warning("⚠️ Need at least two groups with two or more observations each")
                else:
                    f_stat, p_value = f_oneway(*samples)
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("F-statistic", f"{f_stat:.3f}")
                    with col2:
                        st.metric("P-value", f"{p_value:.4f}")

                    if p_value < 0.05:
                        st.success("✅ Statistically significant")
                        self.add_insight(
                            f"Significant difference in {metric_var} across {group_var} (p={p_value:.4f})",
                            4, "success")
                    else:
                        st.warning("⚠️ Not statistically significant")

            except Exception as e:
                st.error(f"❌ Error in group comparison: {str(e)}")
1132
 
1133
def stage_5_summary(self):
    """Stage 5: show headline metrics, insights, applied transformations and export tools."""
    st.subheader("📈 Analysis Summary & Export")

    with st.expander("ℹ️ Help - Summary & Export", expanded=False):
        st.markdown("""
        **This final stage provides:**
        - Complete analysis summary with all insights
        - Multiple export formats for your results
        - Code generation for reproducible analysis
        - Data quality final report
        """)

    # Headline numbers, computed up front
    success_insights = sum(1 for item in self.insights if item.get('type') == 'success')
    warning_insights = sum(1 for item in self.insights if item.get('type') == 'warning')
    is_clean = self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0
    final_quality = "High" if is_clean else "Medium"

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("📊 Total Insights", len(self.insights))
    with col2:
        st.metric(" Positive Findings", success_insights)
    with col3:
        st.metric("⚠️ Issues Found", warning_insights)
    with col4:
        st.metric("🎯 Final Quality", final_quality)

    # Insights grouped by stage
    self._display_categorized_insights()

    # Transformations applied during cleaning
    if self.cleaning_history:
        st.subheader("🔄 Data Transformations Applied")
        for position, operation in enumerate(self.cleaning_history, 1):
            st.write(f"**{position}.** {operation}")

        st.info(f" Dataset transformed from {self.original_df.shape} to {self.df.shape}")

    # Export options
    self._display_export_options()
1173
-
1174
def _display_categorized_insights(self):
    """List the recorded insights under a heading for each stage (0 to 4) that has any."""
    st.subheader("💡 Key Insights by Stage")

    stage_names = {
        0: "🔍 Validation",
        1: "📊 Overview",
        2: "🔍 Exploration",
        3: "🧹 Cleaning",
        4: "🔬 Analysis"
    }
    icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        matching = [entry for entry in self.insights if entry['stage'] == stage]
        if not matching:
            continue

        st.write(f"**{stage_names.get(stage, f'Stage {stage}')}**")
        for entry in matching:
            # Any unknown or missing type falls back to the info icon.
            icon = icons.get(entry.get('type'), "ℹ️")
            st.write(f" {icon} {entry['insight']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1193
 
1194
def _display_export_options(self):
    """Offer downloads of the report, the cleaned dataset, or a Python script.

    The analysis report is available as Markdown, HTML or plain text, each
    with a matching file extension and MIME type. The HTML variant is the text
    report escaped and wrapped in a minimal page. The cleaned dataset can be
    exported as CSV, Excel (with an extra summary sheet) or Parquet.
    """
    st.subheader("📤 Export Results")

    export_type = st.selectbox("Choose export type:",
                               ["Analysis Report", "Cleaned Dataset", "Python Code", "Summary Dashboard"])

    try:
        if export_type == "Analysis Report":
            format_choice = st.selectbox("Report format:", ["Markdown", "HTML", "Text"])

            if format_choice == "Markdown":
                report = self.generate_markdown_report()
                extension, mime, preview_lang = "markdown", "text/markdown", "markdown"
            elif format_choice == "HTML":
                import html

                # Previously the plain-text report was served under an .html
                # name; wrap it so the file really is HTML.
                report = (
                    "<!DOCTYPE html>\n<html>\n<head><meta charset=\"utf-8\">"
                    "<title>Data Analysis Report</title></head>\n<body>\n<pre>"
                    + html.escape(self.generate_text_report())
                    + "</pre>\n</body>\n</html>\n"
                )
                extension, mime, preview_lang = "html", "text/html", "html"
            else:
                report = self.generate_text_report()
                extension, mime, preview_lang = "txt", "text/plain", None

            col1, col2 = st.columns([3, 1])
            with col1:
                st.code(report[:500] + "..." if len(report) > 500 else report,
                        language=preview_lang)
            with col2:
                st.download_button(
                    label=f"📄 Download {format_choice} Report",
                    data=report,
                    file_name=f"analysis_report.{extension}",
                    mime=mime
                )

        elif export_type == "Cleaned Dataset":
            format_choice = st.selectbox("Data format:", ["CSV", "Excel", "Parquet"])

            col1, col2 = st.columns([3, 1])
            with col1:
                st.write("**Data Preview:**")
                st.dataframe(self.df.head(), use_container_width=True)
                st.write(f"**Final Shape:** {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns")

            with col2:
                if st.button(f"📊 Export as {format_choice}"):
                    try:
                        if format_choice == "CSV":
                            csv = self.df.to_csv(index=False)
                            st.download_button("💾 Download CSV", csv, "cleaned_data.csv", "text/csv")

                        elif format_choice == "Excel":
                            buffer = BytesIO()
                            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                                self.df.to_excel(writer, sheet_name='Cleaned_Data', index=False)

                                # Add summary sheet
                                summary_df = pd.DataFrame({
                                    'Metric': ['Original Rows', 'Final Rows', 'Columns', 'Cleaning Operations'],
                                    'Value': [self.original_df.shape[0], self.df.shape[0],
                                              self.df.shape[1], len(self.cleaning_history)]
                                })
                                summary_df.to_excel(writer, sheet_name='Summary', index=False)

                            st.download_button("💾 Download Excel", buffer.getvalue(),
                                               "cleaned_data.xlsx",
                                               "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

                        elif format_choice == "Parquet":
                            buffer = BytesIO()
                            self.df.to_parquet(buffer, index=False)
                            st.download_button("💾 Download Parquet", buffer.getvalue(),
                                               "cleaned_data.parquet", "application/octet-stream")

                    except Exception as e:
                        st.error(f"❌ Export error: {str(e)}")

        elif export_type == "Python Code":
            code = self.generate_enhanced_python_code()
            st.code(code, language="python")
            st.download_button("💾 Download Python Script", code,
                               "analysis_script.py", "text/plain")

        else:
            # "Summary Dashboard" is listed but has no implementation; say so
            # rather than rendering nothing.
            st.info("ℹ️ Summary Dashboard export is not available yet")

    except Exception as e:
        st.error(f"❌ Export error: {str(e)}")
1268
 
1269
def generate_markdown_report(self) -> str:
    """Build the analysis report as a Markdown document.

    Returns:
        Markdown text with an executive summary, an overview table, the dtype
        breakdown, the insights of stages 0 to 4 grouped by stage, and the
        cleaning history when there is one.
    """
    rows, columns = self.stats['shape'][0], self.stats['shape'][1]
    grade = calculate_data_quality_score(self.df)['grade']
    finished_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    parts = [
        "# 📊 Data Analysis Report\n",
        "\n",
        "## Executive Summary\n",
        f"- **Dataset Size:** {rows:,} rows × {columns:,} columns\n",
        f"- **Data Quality:** {grade} grade\n",
        f"- **Memory Usage:** {self.stats['memory_usage']:.1f} MB\n",
        f"- **Analysis Completed:** {finished_at}\n",
        "\n",
        "## 📈 Data Overview\n",
        "| Metric | Value |\n",
        "|--------|-------|\n",
        f"| Total Records | {rows:,} |\n",
        f"| Total Features | {columns:,} |\n",
        f"| Missing Values | {self.stats['missing_values']:,} |\n",
        f"| Duplicate Rows | {self.stats['duplicates']:,} |\n",
        "\n",
        "## 📊 Data Types\n",
    ]

    parts.extend(f"- **{dtype}:** {count} columns\n"
                 for dtype, count in self.stats['dtypes'].items())

    parts.append("\n## 💡 Key Insights\n")

    # Group insights by stage
    stage_names = {0: "Validation", 1: "Overview", 2: "Exploration", 3: "Cleaning", 4: "Analysis"}
    icons = {"success": "✅", "warning": "⚠️", "error": "❌"}

    for stage in range(5):
        stage_insights = [entry for entry in self.insights if entry['stage'] == stage]
        if stage_insights:
            parts.append(f"\n### {stage_names.get(stage, f'Stage {stage}')}\n")
            for entry in stage_insights:
                icon = icons.get(entry.get('type'), "ℹ️")
                parts.append(f"- {icon} {entry['insight']}\n")

    if self.cleaning_history:
        parts.append("\n## 🔄 Data Transformations\n")
        parts.extend(f"{position}. {operation}\n"
                     for position, operation in enumerate(self.cleaning_history, 1))

    parts.append("\n---\n*Report generated by Data Analysis Platform*")
    return "".join(parts)
1312
 
1313
def generate_enhanced_python_code(self) -> str:
    """Generate a standalone Python script that reproduces the analysis.

    Returns:
        Source text of a script that loads a CSV or Excel file, prints a data
        quality report, sketches the cleaning steps found in
        ``self.cleaning_history`` and runs basic statistics, a correlation
        matrix and a missing-value chart.

    Each kind of cleaning snippet (missing values, duplicates, outliers) is
    emitted at most once, however many matching operations were recorded, so
    helper definitions are not repeated in the script.
    """
    generated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    header = (
        '"""\n'
        'Data Analysis Script\n'
        f'Generated on: {generated_at}\n'
        f'Original Dataset: {self.original_df.shape[0]:,} rows × {self.original_df.shape[1]:,} columns\n'
        f'Final Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]:,} columns\n'
        '"""\n'
    )

    # Raw strings: backslashes and braces reach the generated script verbatim.
    setup = r'''
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load data
def load_and_prepare_data(file_path: str) -> pd.DataFrame:
    """Load and prepare data with error handling"""
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format")

        print(f"Loaded data: {df.shape[0]:,} rows × {df.shape[1]:,} columns")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Data quality assessment
def assess_data_quality(df: pd.DataFrame) -> dict:
    """Calculate comprehensive data quality metrics"""
    total_cells = len(df) * len(df.columns)
    missing_count = df.isnull().sum().sum()
    duplicate_count = df.duplicated().sum()

    return {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_percentage': (missing_count / total_cells) * 100,
        'duplicate_percentage': (duplicate_count / len(df)) * 100,
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2
    }

# Main analysis
if __name__ == "__main__":
    # Load your data
    df = load_and_prepare_data('your_data_file.csv')  # Update with your file path

    if df is not None:
        # Data quality assessment
        quality = assess_data_quality(df)
        print("\n=== DATA QUALITY REPORT ===")
        print(f"Rows: {quality['total_rows']:,}")
        print(f"Columns: {quality['total_columns']:,}")
        print(f"Missing Data: {quality['missing_percentage']:.2f}%")
        print(f"Duplicates: {quality['duplicate_percentage']:.2f}%")
        print(f"Memory Usage: {quality['memory_usage_mb']:.1f} MB")
'''

    # Snippets are indented to sit inside the script's `if df is not None:` body.
    snippets = {
        "missing": "        # df = df.fillna(method='your_chosen_method')\n",
        "duplicate": "        df = df.drop_duplicates()\n",
        "outlier": r'''        # Remove outliers using IQR method
        def remove_outliers(df, column):
            Q1 = df[column].quantile(0.25)
            Q3 = df[column].quantile(0.75)
            IQR = Q3 - Q1
            return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]

        # df = remove_outliers(df, 'your_column')
''',
    }

    cleaning = []
    if self.cleaning_history:
        cleaning.append("\n        # Applied cleaning operations:\n")
        emitted = set()
        for operation in self.cleaning_history:
            lowered = operation.lower()
            # First matching keyword wins, in this fixed order.
            kind = next((key for key in ("missing", "duplicate", "outlier") if key in lowered), None)
            if kind is not None and kind not in emitted:
                emitted.add(kind)
                cleaning.append(snippets[kind])

    analysis = r'''
        # Basic statistics
        print("\n=== BASIC STATISTICS ===")
        print(df.describe())

        # Correlation analysis (if numeric columns exist)
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if len(numeric_cols) > 1:
            print("\n=== CORRELATION MATRIX ===")
            corr_matrix = df[numeric_cols].corr()
            print(corr_matrix)

            # Visualize correlation matrix
            fig = px.imshow(corr_matrix, title='Correlation Matrix')
            fig.show()

        # Missing values visualization
        missing = df.isnull().sum()
        if missing.sum() > 0:
            missing = missing[missing > 0]
            fig = px.bar(x=missing.index, y=missing.values,
                         title='Missing Values by Column')
            fig.show()

        # Final data quality report
        final_quality = assess_data_quality(df)
        print("\n=== FINAL QUALITY REPORT ===")
        for key, value in final_quality.items():
            print(f"{key}: {value}")
'''

    return "".join([header, setup, *cleaning, analysis])
1430
-
1431
def generate_text_report(self) -> str:
    """Build the analysis report as plain text.

    Returns:
        Text with an executive summary, overview counts, the dtype breakdown,
        the insights of stages 0 to 4 grouped by stage, and the cleaning
        history when there is one.
    """
    wide_rule = '=' * 50
    short_rule = '=' * 20
    rows, columns = self.stats['shape'][0], self.stats['shape'][1]
    grade = calculate_data_quality_score(self.df)['grade']
    analysed_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    chunks = [
        "DATA ANALYSIS REPORT\n",
        f"{wide_rule}\n",
        "\n",
        "EXECUTIVE SUMMARY\n",
        f"Dataset: {rows:,} rows × {columns:,} columns\n",
        f"Quality Grade: {grade}\n",
        f"Memory Usage: {self.stats['memory_usage']:.1f} MB\n",
        f"Analysis Date: {analysed_at}\n",
        "\n",
        "DATA OVERVIEW\n",
        f"- Total Records: {rows:,}\n",
        f"- Total Features: {columns:,}\n",
        f"- Missing Values: {self.stats['missing_values']:,}\n",
        f"- Duplicate Rows: {self.stats['duplicates']:,}\n",
        "\n",
        "DATA TYPES DISTRIBUTION\n",
    ]

    chunks.extend(f"- {dtype}: {count} columns\n"
                  for dtype, count in self.stats['dtypes'].items())

    chunks.append("\nKEY INSIGHTS\n" + short_rule + "\n")

    # Organize insights by stage
    stage_names = {0: "VALIDATION", 1: "OVERVIEW", 2: "EXPLORATION", 3: "CLEANING", 4: "ANALYSIS"}

    for stage in range(5):
        stage_insights = [entry for entry in self.insights if entry['stage'] == stage]
        if not stage_insights:
            continue
        chunks.append(f"\n{stage_names.get(stage, f'STAGE {stage}')}:\n")
        chunks.extend(f" {position}. {entry['insight']}\n"
                      for position, entry in enumerate(stage_insights, 1))

    if self.cleaning_history:
        chunks.append(f"\nDATA TRANSFORMATIONS\n{short_rule}\n")
        chunks.extend(f"{position}. {operation}\n"
                      for position, operation in enumerate(self.cleaning_history, 1))

    chunks.append(f"\n{wide_rule}\nReport generated by Data Analysis Platform\n")
    return "".join(chunks)
 
5
  import plotly.graph_objects as go
6
  from typing import Dict, List, Any, Optional
7
  import os
 
8
  from dotenv import load_dotenv
9
  from data_handler import *
10
  from io import BytesIO
 
12
  # Load environment variables
13
  load_dotenv()
14
 
15
+ # Optional AI Integration
 
 
 
16
  try:
17
  import openai
18
  OPENAI_AVAILABLE = True
19
  except ImportError:
20
  OPENAI_AVAILABLE = False
 
21
 
22
  try:
23
  import google.generativeai as genai
24
  GEMINI_AVAILABLE = True
25
  except ImportError:
26
  GEMINI_AVAILABLE = False
 
27
 
28
  class AIAssistant:
29
+ """AI-powered analysis assistant"""
30
 
31
def __init__(self):
    """Read API keys from the environment and set up Gemini when possible.

    ``self.gemini_model`` is only created when a Google API key is present and
    the ``google.generativeai`` package was importable; otherwise the
    attribute is left undefined.
    """
    self.openai_key = os.getenv('OPENAI_API_KEY')
    self.gemini_key = os.getenv('GOOGLE_API_KEY')

    gemini_ready = self.gemini_key and GEMINI_AVAILABLE
    if not gemini_ready:
        return

    genai.configure(api_key=self.gemini_key)
    self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
 
 
 
 
 
 
 
38
 
39
  def get_available_models(self) -> List[str]:
40
  """Get list of available AI models"""
 
46
  return models
47
 
48
def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
    """Ask the selected AI model to interpret the dataset and its insights.

    Args:
        df: Dataset being analysed; only its shape, column names and dtype
            counts are placed in the prompt.
        insights: Insight records; each must have an ``'insight'`` text entry.
        model: ``"Google Gemini"`` or ``"OpenAI GPT"``.

    Returns:
        The model's answer, a fixed notice when the requested model is not
        configured, or an ``"AI Analysis Error: ..."`` message when the call
        fails.
    """
    insight_lines = "".join(f"\n- {item['insight']}" for item in insights)
    summary = (
        "Dataset Summary:\n"
        f"- Shape: {df.shape}\n"
        f"- Columns: {list(df.columns)}\n"
        f"- Data types: {df.dtypes.value_counts().to_dict()}\n"
        "\n"
        "Key Insights Found:"
        f"{insight_lines}"
    )

    prompt = (
        "As a senior data scientist, analyze this dataset and provide:\n"
        "\n"
        "1. Business implications of the findings\n"
        "2. Potential opportunities or risks\n"
        "3. Recommendations for decision-making\n"
        "4. Suggestions for further analysis\n"
        "\n"
        f"{summary}\n"
        "\n"
        "Provide actionable insights in a professional format.\n"
    )

    try:
        if model == "Google Gemini" and hasattr(self, 'gemini_model'):
            response = self.gemini_model.generate_content(prompt)
            return response.text

        # Also require the package: with a key set but `openai` missing, the
        # call used to fail with a NameError reported as an analysis error.
        if model == "OpenAI GPT" and self.openai_key and OPENAI_AVAILABLE:
            client = openai.OpenAI(api_key=self.openai_key)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content

        return "AI analysis not available. Please configure API keys."
    except Exception as e:
        return f"AI Analysis Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  class DataAnalysisWorkflow:
94
+ """Optimized data analysis workflow with caching and pagination"""
95
 
96
def __init__(self, df: pd.DataFrame):
    """Store the dataset and precompute the summaries the stages read.

    Args:
        df: Dataset to analyse; kept by reference, not copied.
    """
    self.df = df

    # Summaries derived once from the incoming frame.
    self.stats = calculate_basic_stats(df)
    self.column_types = get_column_types(df)

    # Findings collected by the stage methods.
    self.insights = []

    # Number of rows shown per page in paginated views.
    self.page_size = 1000
 
102
 
103
def add_insight(self, insight: str, stage: int):
    """Record a finding for the analysis report.

    Args:
        insight: Human-readable description of the finding.
        stage: Number of the workflow stage that produced it.

    An insight with the same text already recorded for the same stage is not
    added again. The stage methods call this while rendering, so without the
    check a re-rendered stage would append the same finding each time.
    """
    already_recorded = any(
        entry['stage'] == stage and entry['insight'] == insight
        for entry in self.insights
    )
    if already_recorded:
        return

    self.insights.append({
        'stage': stage,
        'insight': insight,
        'timestamp': pd.Timestamp.now()
    })
110
 
111
def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
    """Return one page of rows from the dataset.

    Args:
        page: Zero-based page index. Negative values are treated as the first
            page; an index past the last page gives an empty frame.

    Returns:
        Up to ``self.page_size`` consecutive rows of ``self.df``.
    """
    # A negative start would make iloc count from the end of the frame and
    # return the wrong rows (or none), so clamp to the first page.
    start_idx = max(page, 0) * self.page_size
    return self.df.iloc[start_idx:start_idx + self.page_size]
 
 
 
 
116
 
117
def stage_1_overview(self):
    """Stage 1: render the dataset overview and record first insights.

    Shows quality metrics, memory usage, column cardinality, the dtype
    distribution, a page of sample rows and the missing-value table, then
    records stage-1 insights about missing data, quality score, memory savings
    and potential ID columns.
    """
    st.subheader("📊 Data Overview")

    # --- Data quality score ---
    quality_metrics = calculate_data_quality_score(self.df)
    n_rows, n_cols = self.stats['shape'][0], self.stats['shape'][1]

    rows_box, cols_box, score_box, grade_box = st.columns(4)
    with rows_box:
        st.metric("Rows", f"{n_rows:,}")
    with cols_box:
        st.metric("Columns", f"{n_cols:,}")
    with score_box:
        st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
    with grade_box:
        st.metric("Grade", quality_metrics['grade'])

    if quality_metrics['issues']:
        st.warning("Quality Issues Found:")
        for issue in quality_metrics['issues']:
            st.write(f"• {issue}")

    # --- Memory usage and optimization ---
    st.subheader("Memory Analysis")
    memory_opt = calculate_memory_optimization(self.df)

    current_box, savings_box = st.columns(2)
    with current_box:
        st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
    with savings_box:
        if memory_opt['potential_savings_mb'] > 0:
            st.metric("Potential Savings",
                      f"{memory_opt['potential_savings_mb']:.1f} MB",
                      f"{memory_opt['potential_savings_pct']:.1f}%")

            # NOTE(review): the nesting of this button was reconstructed from a
            # source that lost its indentation - confirm it belongs here.
            if st.button("Show Optimization Details"):
                st.dataframe(pd.DataFrame(memory_opt['suggestions']))

    # --- Column cardinality ---
    st.subheader("Column Cardinality Analysis")
    cardinality_df = calculate_column_cardinality(self.df)

    col_types = cardinality_df['Type'].unique()
    selected_types = st.multiselect("Filter by Column Type",
                                    col_types,
                                    default=col_types)

    type_mask = cardinality_df['Type'].isin(selected_types)
    filtered_df = cardinality_df[type_mask]
    st.dataframe(filtered_df, use_container_width=True)

    # Highlights are taken from the filtered table, so they follow the filter.
    id_cols = filtered_df.loc[filtered_df['Type'] == 'Unique Identifier', 'Column'].tolist()
    if id_cols:
        st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")

    const_cols = filtered_df.loc[filtered_df['Type'] == 'Constant', 'Column'].tolist()
    if const_cols:
        st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")

    # --- Data types distribution ---
    dtype_counts = self.stats['dtypes']
    if dtype_counts:
        st.subheader("Data Types Distribution")
        pie = px.pie(values=list(dtype_counts.values()),
                     names=list(dtype_counts.keys()),
                     title="Data Types")
        st.plotly_chart(pie, use_container_width=True)

    # --- Sample data with pagination ---
    st.subheader("Sample Data")
    total_pages = (len(self.df) - 1) // self.page_size + 1

    if total_pages > 1:
        page = st.slider("Page", 0, total_pages - 1, 0)
        sample_data = self.get_paginated_data(page)
        st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
    else:
        sample_data = self.df.head(10)

    st.dataframe(sample_data, use_container_width=True)

    # --- Missing values ---
    missing_df = calculate_missing_data(self.df)
    if missing_df.empty:
        st.success("✅ No missing values found!")
        self.add_insight("Dataset has no missing values - excellent data quality", 1)
    else:
        st.subheader("Missing Values Analysis")
        st.dataframe(missing_df, use_container_width=True)

        # The first row of the table is reported as the worst column.
        top_row = missing_df.iloc[0]
        worst_column = top_row['Column']
        worst_percentage = top_row['Missing %']
        self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)

    # --- Insights about quality, memory and cardinality ---
    if quality_metrics['score'] < 80:
        self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)

    if memory_opt['potential_savings_pct'] > 20:
        self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)

    if id_cols:
        self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
 
 
 
 
 
 
 
218
 
219
  def stage_2_exploration(self):
220
+ """Stage 2: Exploratory Data Analysis with caching"""
221
  st.subheader("🔍 Exploratory Data Analysis")
222
 
 
 
 
 
 
 
 
 
 
223
  numeric_cols = self.column_types['numeric']
224
  categorical_cols = self.column_types['categorical']
225
 
226
+ # Numeric analysis
 
 
 
 
227
  if numeric_cols:
228
+ st.subheader("Numeric Variables")
229
+ selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
+ col1, col2 = st.columns(2)
232
+ with col1:
233
+ fig = px.histogram(self.df, x=selected_numeric,
234
+ title=f"Distribution of {selected_numeric}")
235
+ st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
+ with col2:
238
+ fig = px.box(self.df, y=selected_numeric,
239
+ title=f"Box Plot of {selected_numeric}")
240
+ st.plotly_chart(fig, use_container_width=True)
 
 
 
 
241
 
242
+ # Statistical summary
243
+ st.subheader("Statistical Summary")
244
+ summary_stats = self.df[numeric_cols].describe()
245
+ st.dataframe(summary_stats, use_container_width=True)
246
+
247
+ # Correlation analysis
248
+ if len(numeric_cols) > 1:
249
+ st.subheader("Correlation Analysis")
250
+ corr_matrix = calculate_correlation_matrix(self.df)
251
+ if not corr_matrix.empty:
252
+ fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
+ title="Correlation Matrix")
254
  st.plotly_chart(fig, use_container_width=True)
255
+
256
+ # Find highest correlation
257
+ corr_values = []
 
258
  for i in range(len(corr_matrix.columns)):
259
  for j in range(i+1, len(corr_matrix.columns)):
260
+ corr_values.append(abs(corr_matrix.iloc[i, j]))
 
 
 
 
 
 
 
 
 
261
 
262
+ if corr_values:
263
+ max_corr = max(corr_values)
264
+ self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
+
266
+ # Categorical analysis
267
+ if categorical_cols:
268
+ st.subheader("Categorical Variables")
269
+ selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
270
+
271
+ value_counts = get_value_counts(self.df, selected_categorical)
272
+ fig = px.bar(x=value_counts.index, y=value_counts.values,
273
+ title=f"Top 10 {selected_categorical} Values")
274
+ st.plotly_chart(fig, use_container_width=True)
275
+
276
+ total_categories = self.df[selected_categorical].nunique()
277
+ self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
278
 
279
  def stage_3_cleaning(self):
280
+ """Stage 3: Data Quality Assessment"""
281
+ st.subheader("🧹 Data Quality Assessment")
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
+ cleaning_actions = []
284
+ cleaning_history = []
285
 
286
+ # Missing values handling
287
+ if self.stats['missing_values'] > 0:
288
+ st.subheader("Missing Values Treatment")
289
+ missing_df = calculate_missing_data(self.df)
290
+ st.dataframe(missing_df, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
+ col1, col2 = st.columns(2)
 
293
  with col1:
294
+ selected_col = st.selectbox("Select column to handle missing values:",
295
+ missing_df['Column'].tolist())
296
  with col2:
297
+ fill_method = st.selectbox("Choose fill method:",
298
+ ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
 
 
 
 
 
 
 
 
 
 
299
 
300
+ if st.button("Apply Missing Value Treatment"):
 
 
 
 
 
 
 
 
 
 
 
 
301
  try:
 
 
302
  if fill_method == "Drop rows":
303
  self.df = self.df.dropna(subset=[selected_col])
304
+ cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
305
  else:
306
  if fill_method == "Mean":
307
  fill_value = self.df[selected_col].mean()
308
  elif fill_method == "Median":
309
  fill_value = self.df[selected_col].median()
310
  elif fill_method == "Mode":
311
+ fill_value = self.df[selected_col].mode()[0]
312
+ else: # Custom value
313
+ fill_value = st.number_input("Enter custom value:", value=0.0)
 
314
 
315
  self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
+ cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
 
318
+ st.success("✅ Missing values handled successfully!")
 
 
 
319
  except Exception as e:
320
+ st.error(f"Error handling missing values: {str(e)}")
321
+
322
+ # Duplicates handling
 
 
 
323
  if self.stats['duplicates'] > 0:
324
+ st.subheader("Duplicate Rows")
325
+ st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
+
327
+ if st.button("Remove Duplicate Rows"):
328
+ original_len = len(self.df)
329
+ self.df = self.df.drop_duplicates()
330
+ removed = original_len - len(self.df)
331
+ cleaning_history.append(f"Removed {removed} duplicate rows")
332
+ st.success(f"✅ Removed {removed} duplicate rows")
333
+ else:
334
+ st.success("✅ No duplicate rows found")
335
+
336
+ # Mixed type detection and handling
337
+ mixed_types = detect_mixed_types(self.df)
338
+ if mixed_types:
339
+ st.subheader("Mixed Data Types")
340
+ mixed_df = pd.DataFrame(mixed_types)
341
+ st.dataframe(mixed_df, use_container_width=True)
342
 
343
+ selected_col = st.selectbox("Select column to fix data type:",
344
+ [item['column'] for item in mixed_types])
345
 
346
+ fix_method = st.selectbox("Choose fix method:",
347
+ ["Convert to numeric", "Convert to string"])
 
 
 
348
 
349
+ if st.button("Fix Data Type"):
350
  try:
351
+ if fix_method == "Convert to numeric":
352
+ self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
+ else:
354
+ self.df[selected_col] = self.df[selected_col].astype(str)
355
+
356
+ cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
+ st.success("✅ Data type fixed successfully!")
358
  except Exception as e:
359
+ st.error(f"Error fixing data type: {str(e)}")
 
 
 
 
 
 
360
 
361
+ # Outlier detection and handling
362
+ numeric_cols = self.column_types['numeric']
363
+ if numeric_cols:
364
+ st.subheader("Outlier Detection")
365
+ selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
 
367
+ outliers = calculate_outliers(self.df, selected_col)
368
+ outlier_count = len(outliers)
369
+
370
+ if outlier_count > 0:
371
+ st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
+ st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
373
 
374
+ treatment_method = st.selectbox("Choose outlier treatment method:",
375
+ ["None", "Remove", "Cap at percentiles"])
 
 
 
 
376
 
377
+ if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
+ try:
379
+ if treatment_method == "Remove":
380
+ self.df = self.df[~self.df.index.isin(outliers.index)]
381
+ cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
+ else: # Cap at percentiles
383
+ Q1 = self.df[selected_col].quantile(0.25)
384
+ Q3 = self.df[selected_col].quantile(0.75)
385
+ IQR = Q3 - Q1
386
+ lower_bound = Q1 - 1.5 * IQR
387
+ upper_bound = Q3 + 1.5 * IQR
 
 
388
 
389
+ self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
+ cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
391
+
392
+ st.success("✅ Outliers handled successfully!")
393
+ except Exception as e:
394
+ st.error(f"Error handling outliers: {str(e)}")
395
+ else:
396
+ st.success(f"✅ No outliers detected in '{selected_col}'")
397
+
398
+ # Cleaning History
399
+ if cleaning_history:
400
+ st.subheader("Cleaning Operations History")
401
+ for i, operation in enumerate(cleaning_history, 1):
402
+ st.write(f"{i}. {operation}")
403
+ self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
+
405
+ # Summary
406
+ if cleaning_actions:
407
+ st.subheader("Remaining Action Items")
408
+ for i, action in enumerate(cleaning_actions, 1):
409
+ st.write(f"{i}. {action}")
410
+ self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
  else:
412
+ st.success("✅ Data quality is excellent!")
413
+ self.add_insight("No major data quality issues found", 3)
414
 
415
def stage_4_analysis(self):
    """Stage 4: Advanced Analysis.

    Renders a scatter plot and Pearson correlation for a user-selected pair
    of numeric columns, then per-group statistics (and a box plot when there
    are at most 20 groups) for a user-selected categorical/numeric pair.
    Findings are recorded through ``self.add_insight(..., 4)``.

    Reads ``self.df`` and ``self.column_types``. ``calculate_group_stats``
    is a helper defined outside this method; this code expects its result to
    have a ``'mean'`` column indexed by group.
    """
    st.subheader("🔬 Advanced Analysis")

    numeric_cols = self.column_types['numeric']
    categorical_cols = self.column_types['categorical']

    # Relationship analysis
    if len(numeric_cols) >= 2:
        st.subheader("Variable Relationships")

        col1, col2 = st.columns(2)
        with col1:
            x_var = st.selectbox("X Variable:", numeric_cols)
        with col2:
            y_var = st.selectbox("Y Variable:",
                                 [col for col in numeric_cols if col != x_var])

        # Sample data for performance if dataset is large. The fixed seed
        # keeps the plot stable across Streamlit reruns instead of drawing a
        # different sample on every widget interaction.
        sample_size = min(5000, len(self.df))
        if len(self.df) > sample_size:
            sample_df = self.df.sample(n=sample_size, random_state=42)
        else:
            sample_df = self.df

        fig = px.scatter(sample_df, x=x_var, y=y_var,
                         title=f"Relationship: {x_var} vs {y_var}")
        st.plotly_chart(fig, use_container_width=True)

        # The correlation is computed on the full frame, not the sample.
        correlation = self.df[x_var].corr(self.df[y_var])

        if pd.isna(correlation):
            # A constant column or too few paired values gives NaN, which the
            # threshold checks below would mislabel as "Weak negative".
            st.metric("Correlation", "N/A")
            st.write("**Result:** Correlation is undefined "
                     "(a variable is constant or has too few paired values)")
        else:
            st.metric("Correlation", f"{correlation:.3f}")

            if abs(correlation) > 0.7:
                strength = "Strong"
            elif abs(correlation) > 0.3:
                strength = "Moderate"
            else:
                strength = "Weak"

            direction = "positive" if correlation > 0 else "negative"
            st.write(f"**Result:** {strength} {direction} correlation")
            self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)

    # Group analysis
    if categorical_cols and numeric_cols:
        st.subheader("Group Analysis")

        col1, col2 = st.columns(2)
        with col1:
            group_var = st.selectbox("Group by:", categorical_cols)
        with col2:
            metric_var = st.selectbox("Analyze:", numeric_cols)

        group_stats = calculate_group_stats(self.df, group_var, metric_var)
        st.dataframe(group_stats, use_container_width=True)

        # Skip the plot when there are too many groups to read it.
        unique_groups = self.df[group_var].nunique()
        if unique_groups <= 20:
            fig = px.box(self.df, x=group_var, y=metric_var,
                         title=f"{metric_var} by {group_var}")
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")

        # idxmax() is only meaningful when at least one group mean is defined.
        group_means = group_stats['mean']
        if group_means.notna().any():
            best_group = group_means.idxmax()
            best_value = group_stats.loc[best_group, 'mean']
            self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  def stage_5_summary(self):
482
+ """Stage 5: Summary and Export"""
483
+ st.subheader("📈 Analysis Summary")
 
 
 
 
 
 
 
 
 
484
 
485
+ # Key metrics
486
+ col1, col2, col3 = st.columns(3)
487
  with col1:
488
+ st.metric("Total Insights", len(self.insights))
489
  with col2:
490
+ quality = "High" if self.stats['missing_values'] == 0 else "Medium"
491
+ st.metric("Data Quality", quality)
492
  with col3:
493
+ st.metric("Analysis Complete", "✅")
494
+
495
+ # Insights summary
496
+ st.subheader("Key Insights")
497
+ for i, insight in enumerate(self.insights, 1):
498
+ st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
499
+
500
+ # Export options
501
+ st.subheader("Export Results")
502
+ export_format = st.selectbox("Choose export format:",
503
+ ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
504
+
505
+ if export_format == "Text Report":
506
+ report = self.generate_text_report()
507
+ st.download_button(
508
+ label="Download Text Report",
509
+ data=report,
510
+ file_name="analysis_report.txt",
511
+ mime="text/plain"
512
+ )
513
+
514
+ elif export_format == "Markdown Report":
515
+ report = self.generate_markdown_report()
516
+ st.download_button(
517
+ label="Download Markdown Report",
518
+ data=report,
519
+ file_name="analysis_report.md",
520
+ mime="text/markdown"
521
+ )
522
+
523
+ elif export_format == "Python Code":
524
+ code = self.generate_python_code()
525
+ st.code(code, language="python")
526
+ st.download_button(
527
+ label="Download Python Script",
528
+ data=code,
529
+ file_name="analysis_script.py",
530
+ mime="text/plain"
531
+ )
532
+
533
+ else: # Cleaned Data
534
+ # Offer different export formats
535
+ data_format = st.selectbox("Choose data format:",
536
+ ["CSV", "Excel", "Parquet"])
537
+
538
+ if st.button("Export Data"):
539
+ try:
540
+ if data_format == "CSV":
541
+ csv = self.df.to_csv(index=False)
542
+ st.download_button(
543
+ label="Download CSV",
544
+ data=csv,
545
+ file_name="cleaned_data.csv",
546
+ mime="text/csv"
547
+ )
548
+ elif data_format == "Excel":
549
+ excel_buffer = BytesIO()
550
+ self.df.to_excel(excel_buffer, index=False)
551
+ excel_data = excel_buffer.getvalue()
552
+ st.download_button(
553
+ label="Download Excel",
554
+ data=excel_data,
555
+ file_name="cleaned_data.xlsx",
556
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
557
+ )
558
+ else: # Parquet
559
+ parquet_buffer = BytesIO()
560
+ self.df.to_parquet(parquet_buffer, index=False)
561
+ parquet_data = parquet_buffer.getvalue()
562
+ st.download_button(
563
+ label="Download Parquet",
564
+ data=parquet_data,
565
+ file_name="cleaned_data.parquet",
566
+ mime="application/octet-stream"
567
+ )
568
+ except Exception as e:
569
+ st.error(f"Error exporting data: {str(e)}")
570
 
571
def generate_text_report(self) -> str:
    """Build the plain-text analysis report.

    The report has a fixed header with the dataset overview (row and column
    counts, missing values, memory usage from ``self.stats``), one bullet
    per entry in ``self.insights``, and a trailing generation timestamp.

    Returns:
        The complete report as a single string.
    """
    n_rows, n_cols = self.stats['shape'][0], self.stats['shape'][1]
    header_lines = [
        "DATA ANALYSIS REPORT",
        "==================",
        "",
        "Dataset Overview:",
        f"- Rows: {n_rows:,}",
        f"- Columns: {n_cols:,}",
        f"- Missing Values: {self.stats['missing_values']:,}",
        f"- Memory Usage: {self.stats['memory_usage']:.1f} MB",
        "",
        "Key Insights:",
    ]
    header = "\n".join(header_lines) + "\n"

    # Each bullet carries its own leading newline, which leaves one blank
    # line between the "Key Insights:" heading and the first bullet.
    bullets = "".join(
        f"\n- Stage {entry['stage']}: {entry['insight']}"
        for entry in self.insights
    )

    stamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    footer = f"\n\nGenerated: {stamp}"
    return header + bullets + footer
589
 
590
  def generate_markdown_report(self) -> str:
591
+ """Generate markdown analysis report"""
592
+ report = f"""# Data Analysis Report
593
 
594
+ ## Dataset Overview
595
+ * **Rows:** {self.stats['shape'][0]:,}
596
+ * **Columns:** {self.stats['shape'][1]:,}
597
+ * **Missing Values:** {self.stats['missing_values']:,}
598
+ * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
599
 
600
+ ## Data Types
601
+ ```
602
+ {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
603
+ ```
 
 
 
604
 
605
+ ## Key Insights
606
  """
 
 
 
 
 
607
  # Group insights by stage
608
+ for stage in range(1, 6):
 
 
609
  stage_insights = [i for i in self.insights if i['stage'] == stage]
610
  if stage_insights:
611
+ report += f"\n### Stage {stage}\n"
612
  for insight in stage_insights:
613
+ report += f"* {insight['insight']}\n"
 
614
 
615
+ report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
 
 
 
 
 
616
  return report
617
 
618
def generate_python_code(self) -> str:
    """Generate a standalone Python script that reproduces the analysis.

    The script loads a CSV, prints basic statistics, replays the kinds of
    cleaning operations recorded in ``self.cleaning_history`` (if that
    attribute exists), and plots missing values and correlations.

    Each kind of cleaning step is emitted at most once, however many
    matching history entries there are, so the generated script does not
    repeat itself or redefine its helper function.

    Returns:
        The generated script source as a string.
    """
    code = """import pandas as pd
import numpy as np
import plotly.express as px
from typing import Dict, List, Any

# Load and prepare data
df = pd.read_csv('your_data.csv') # Update with your data source

# Basic statistics
def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
    return {
        'shape': df.shape,
        'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
        'missing_values': int(df.isnull().sum().sum()),
        'dtypes': df.dtypes.value_counts().to_dict(),
        'duplicates': int(df.duplicated().sum())
    }

stats = calculate_basic_stats(df)
print("\\nBasic Statistics:")
print(f"- Shape: {stats['shape']}")
print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
print(f"- Missing Values: {stats['missing_values']}")
print(f"- Duplicates: {stats['duplicates']}")

"""

    drop_missing_snippet = (
        "# Drop rows with missing values\n"
        "df = df.dropna() # Update with the relevant subset of columns\n"
    )
    # DataFrame.fillna(method=...) is deprecated in current pandas; ffill()
    # is the equivalent supported spelling.
    fill_missing_snippet = (
        "# Handle missing values\n"
        "df = df.ffill() # Update with your chosen method\n"
    )
    duplicates_snippet = (
        "# Remove duplicates\n"
        "df = df.drop_duplicates()\n"
    )
    outliers_snippet = """# Handle outliers
def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]

# Apply to numeric columns as needed
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df = remove_outliers(df, col)
"""

    # Add data cleaning operations if any were performed. The attribute may
    # be absent when no cleaning stage has run on this instance.
    history = getattr(self, 'cleaning_history', None) or []
    snippets = []
    for operation in history:
        text = operation.lower()
        if "missing values" in text:
            # "Dropped rows with missing values in ..." is a drop, not a fill.
            if text.startswith("dropped rows"):
                snippet = drop_missing_snippet
            else:
                snippet = fill_missing_snippet
        elif "duplicate" in text:
            snippet = duplicates_snippet
        elif "outlier" in text:
            snippet = outliers_snippet
        else:
            continue
        if snippet not in snippets:
            snippets.append(snippet)

    if snippets:
        code += "\n# Data Cleaning\n" + "".join(snippets)

    # Add visualization code
    code += """
# Visualizations
def plot_missing_values(df: pd.DataFrame):
    missing = df.isnull().sum()
    if missing.sum() > 0:
        missing = missing[missing > 0]
        fig = px.bar(x=missing.index, y=missing.values,
                     title='Missing Values by Column')
        fig.show()

def plot_correlations(df: pd.DataFrame):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        corr = df[numeric_cols].corr()
        fig = px.imshow(corr, title='Correlation Matrix')
        fig.show()

# Generate plots
plot_missing_values(df)
plot_correlations(df)
"""

    return code