entropy25 commited on
Commit
7583e80
·
verified ·
1 Parent(s): f0413da

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +1262 -536
analyzer.py CHANGED
@@ -3,102 +3,190 @@ import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
  import plotly.graph_objects as go
 
6
  from typing import Dict, List, Any, Optional
7
  import os
8
- from dotenv import load_dotenv
9
  from data_handler import *
10
  from io import BytesIO
11
 
12
- # Load environment variables
13
- load_dotenv()
14
-
15
- # Optional AI Integration
16
- try:
17
- import openai
18
- OPENAI_AVAILABLE = True
19
- except ImportError:
20
- OPENAI_AVAILABLE = False
21
-
22
- try:
23
- import google.generativeai as genai
24
- GEMINI_AVAILABLE = True
25
- except ImportError:
26
- GEMINI_AVAILABLE = False
27
-
28
  class AIAssistant:
29
- """AI-powered analysis assistant"""
30
 
31
  def __init__(self):
32
- self.openai_key = os.getenv('OPENAI_API_KEY')
33
- self.gemini_key = os.getenv('GOOGLE_API_KEY')
34
-
35
- if self.gemini_key and GEMINI_AVAILABLE:
36
- genai.configure(api_key=self.gemini_key)
37
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
38
 
39
  def get_available_models(self) -> List[str]:
40
  """Get list of available AI models"""
41
- models = []
42
- if self.openai_key and OPENAI_AVAILABLE:
43
- models.append("OpenAI GPT")
44
- if self.gemini_key and GEMINI_AVAILABLE:
45
- models.append("Google Gemini")
46
- return models
47
 
48
- def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
49
- """Get AI analysis of insights"""
50
-
51
- # Prepare data summary
52
- summary = f"""
53
- Dataset Summary:
54
- - Shape: {df.shape}
55
- - Columns: {list(df.columns)}
56
- - Data types: {df.dtypes.value_counts().to_dict()}
57
-
58
- Key Insights Found:
59
- """
60
-
61
- for insight in insights:
62
- summary += f"\n- {insight['insight']}"
63
-
64
- prompt = f"""
65
- As a senior data scientist, analyze this dataset and provide:
66
-
67
- 1. Business implications of the findings
68
- 2. Potential opportunities or risks
69
- 3. Recommendations for decision-making
70
- 4. Suggestions for further analysis
71
-
72
- {summary}
73
-
74
- Provide actionable insights in a professional format.
75
- """
76
-
77
- try:
78
- if model == "Google Gemini" and hasattr(self, 'gemini_model'):
79
- response = self.gemini_model.generate_content(prompt)
80
- return response.text
81
- elif model == "OpenAI GPT" and self.openai_key:
82
- client = openai.OpenAI(api_key=self.openai_key)
83
- response = client.chat.completions.create(
84
- model="gpt-3.5-turbo",
85
- messages=[{"role": "user", "content": prompt}]
86
- )
87
- return response.choices[0].message.content
88
- else:
89
- return "AI analysis not available. Please configure API keys."
90
- except Exception as e:
91
- return f"AI Analysis Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  class DataAnalysisWorkflow:
94
- """Optimized data analysis workflow with caching and pagination"""
95
 
96
  def __init__(self, df: pd.DataFrame):
97
  self.df = df
 
98
  self.stats = calculate_basic_stats(df)
99
  self.column_types = get_column_types(df)
100
  self.insights = []
101
- self.page_size = 1000 # For pagination
 
102
 
103
  def add_insight(self, insight: str, stage: int):
104
  """Add insight to analysis report"""
@@ -108,586 +196,1224 @@ class DataAnalysisWorkflow:
108
  'timestamp': pd.Timestamp.now()
109
  })
110
 
111
- def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
112
- """Get paginated data for display"""
113
- start_idx = page * self.page_size
114
- end_idx = start_idx + self.page_size
115
- return self.df.iloc[start_idx:end_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  def stage_1_overview(self):
118
- """Stage 1: Data Overview with caching"""
119
- st.subheader("📊 Data Overview")
 
 
 
 
 
120
 
121
- # Data Quality Score
122
- quality_metrics = calculate_data_quality_score(self.df)
123
- col1, col2, col3, col4 = st.columns(4)
124
  with col1:
125
- st.metric("Rows", f"{self.stats['shape'][0]:,}")
 
 
 
 
 
 
 
 
126
  with col2:
127
- st.metric("Columns", f"{self.stats['shape'][1]:,}")
 
128
  with col3:
129
- st.metric("Quality Score", f"{quality_metrics['score']:.1f}/100")
 
130
  with col4:
131
- st.metric("Grade", quality_metrics['grade'])
 
 
 
 
 
132
 
 
133
  if quality_metrics['issues']:
134
- st.warning("Quality Issues Found:")
135
- for issue in quality_metrics['issues']:
136
- st.write(f"• {issue}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- # Memory Usage and Optimization
139
- st.subheader("Memory Analysis")
140
- memory_opt = calculate_memory_optimization(self.df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  col1, col2 = st.columns(2)
 
142
  with col1:
143
- st.metric("Current Memory", f"{memory_opt['current_memory_mb']:.1f} MB")
 
 
 
 
 
 
 
144
  with col2:
145
- if memory_opt['potential_savings_mb'] > 0:
146
- st.metric("Potential Savings",
147
- f"{memory_opt['potential_savings_mb']:.1f} MB",
148
- f"{memory_opt['potential_savings_pct']:.1f}%")
149
-
150
- if st.button("Show Optimization Details"):
151
- st.dataframe(pd.DataFrame(memory_opt['suggestions']))
152
-
153
- # Column Cardinality Analysis
154
- st.subheader("Column Cardinality Analysis")
155
- cardinality_df = calculate_column_cardinality(self.df)
156
-
157
- # Filter options
158
- col_types = cardinality_df['Type'].unique()
159
- selected_types = st.multiselect("Filter by Column Type",
160
- col_types,
161
- default=col_types)
162
-
163
- filtered_df = cardinality_df[cardinality_df['Type'].isin(selected_types)]
164
- st.dataframe(filtered_df, use_container_width=True)
165
-
166
- # Highlight important findings
167
- id_cols = filtered_df[filtered_df['Type'] == 'Unique Identifier']['Column'].tolist()
168
- if id_cols:
169
- st.info(f"📌 Potential ID columns found: {', '.join(id_cols)}")
170
-
171
- const_cols = filtered_df[filtered_df['Type'] == 'Constant']['Column'].tolist()
172
- if const_cols:
173
- st.warning(f"⚠️ Constant columns found: {', '.join(const_cols)}")
174
-
175
- # Data types visualization
176
- if self.stats['dtypes']:
177
- st.subheader("Data Types Distribution")
178
- fig = px.pie(values=list(self.stats['dtypes'].values()),
179
- names=list(self.stats['dtypes'].keys()),
180
- title="Data Types")
181
- st.plotly_chart(fig, use_container_width=True)
182
-
183
- # Sample data with pagination
184
- st.subheader("Sample Data")
185
- total_pages = (len(self.df) - 1) // self.page_size + 1
186
-
187
- if total_pages > 1:
188
- page = st.slider("Page", 0, total_pages - 1, 0)
189
- sample_data = self.get_paginated_data(page)
190
- st.write(f"Showing rows {page * self.page_size + 1} to {min((page + 1) * self.page_size, len(self.df))}")
191
- else:
192
- sample_data = self.df.head(10)
193
 
194
- st.dataframe(sample_data, use_container_width=True)
 
 
195
 
196
- # Missing values analysis
197
- missing_df = calculate_missing_data(self.df)
198
- if not missing_df.empty:
199
- st.subheader("Missing Values Analysis")
200
- st.dataframe(missing_df, use_container_width=True)
201
-
202
- worst_column = missing_df.iloc[0]['Column']
203
- worst_percentage = missing_df.iloc[0]['Missing %']
204
- self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
205
  else:
206
- st.success("✅ No missing values found!")
207
- self.add_insight("Dataset has no missing values - excellent data quality", 1)
 
208
 
209
- # Add insights about data quality and cardinality
210
  if quality_metrics['score'] < 80:
211
  self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
212
-
213
- if memory_opt['potential_savings_pct'] > 20:
214
- self.add_insight(f"Potential memory optimization of {memory_opt['potential_savings_pct']:.1f}% identified", 1)
215
-
216
- if id_cols:
217
- self.add_insight(f"Found {len(id_cols)} potential ID columns", 1)
218
 
219
  def stage_2_exploration(self):
220
- """Stage 2: Exploratory Data Analysis with caching"""
221
- st.subheader("🔍 Exploratory Data Analysis")
222
 
223
  numeric_cols = self.column_types['numeric']
224
  categorical_cols = self.column_types['categorical']
225
 
226
- # Numeric analysis
 
 
 
 
227
  if numeric_cols:
228
- st.subheader("Numeric Variables")
229
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
230
 
231
  col1, col2 = st.columns(2)
232
  with col1:
233
- fig = px.histogram(self.df, x=selected_numeric,
234
- title=f"Distribution of {selected_numeric}")
235
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
 
 
 
 
 
 
 
237
  with col2:
238
- fig = px.box(self.df, y=selected_numeric,
239
- title=f"Box Plot of {selected_numeric}")
240
- st.plotly_chart(fig, use_container_width=True)
241
-
242
- # Statistical summary
243
- st.subheader("Statistical Summary")
244
- summary_stats = self.df[numeric_cols].describe()
245
- st.dataframe(summary_stats, use_container_width=True)
246
-
247
- # Correlation analysis
248
- if len(numeric_cols) > 1:
249
- st.subheader("Correlation Analysis")
250
- corr_matrix = calculate_correlation_matrix(self.df)
251
- if not corr_matrix.empty:
252
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
253
- title="Correlation Matrix")
254
- st.plotly_chart(fig, use_container_width=True)
255
-
256
- # Find highest correlation
257
- corr_values = []
258
- for i in range(len(corr_matrix.columns)):
259
- for j in range(i+1, len(corr_matrix.columns)):
260
- corr_values.append(abs(corr_matrix.iloc[i, j]))
261
-
262
- if corr_values:
263
- max_corr = max(corr_values)
264
- self.add_insight(f"Maximum correlation coefficient: {max_corr:.3f}", 2)
265
 
266
- # Categorical analysis
267
  if categorical_cols:
268
- st.subheader("Categorical Variables")
 
269
  selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
270
 
271
- value_counts = get_value_counts(self.df, selected_categorical)
272
- fig = px.bar(x=value_counts.index, y=value_counts.values,
273
- title=f"Top 10 {selected_categorical} Values")
274
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
 
276
  total_categories = self.df[selected_categorical].nunique()
277
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} unique categories", 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  def stage_3_cleaning(self):
280
- """Stage 3: Data Quality Assessment"""
281
- st.subheader("🧹 Data Quality Assessment")
 
282
 
283
- cleaning_actions = []
284
- cleaning_history = []
285
 
286
- # Missing values handling
287
- if self.stats['missing_values'] > 0:
288
- st.subheader("Missing Values Treatment")
289
- missing_df = calculate_missing_data(self.df)
290
- st.dataframe(missing_df, use_container_width=True)
 
291
 
292
- col1, col2 = st.columns(2)
293
  with col1:
294
- selected_col = st.selectbox("Select column to handle missing values:",
295
- missing_df['Column'].tolist())
296
- with col2:
297
- fill_method = st.selectbox("Choose fill method:",
298
- ["Drop rows", "Mean", "Median", "Mode", "Custom value"])
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- if st.button("Apply Missing Value Treatment"):
301
- try:
302
- if fill_method == "Drop rows":
303
- self.df = self.df.dropna(subset=[selected_col])
304
- cleaning_history.append(f"Dropped rows with missing values in {selected_col}")
 
 
 
 
 
 
 
 
 
 
 
 
305
  else:
306
- if fill_method == "Mean":
307
- fill_value = self.df[selected_col].mean()
308
- elif fill_method == "Median":
309
- fill_value = self.df[selected_col].median()
310
- elif fill_method == "Mode":
311
- fill_value = self.df[selected_col].mode()[0]
312
- else: # Custom value
313
- fill_value = st.number_input("Enter custom value:", value=0.0)
314
-
315
- self.df[selected_col] = self.df[selected_col].fillna(fill_value)
316
- cleaning_history.append(f"Filled missing values in {selected_col} with {fill_method}")
317
 
318
- st.success(" Missing values handled successfully!")
319
- except Exception as e:
320
- st.error(f"Error handling missing values: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
- # Duplicates handling
323
  if self.stats['duplicates'] > 0:
324
- st.subheader("Duplicate Rows")
325
- st.warning(f"Found {self.stats['duplicates']} duplicate rows")
326
-
327
- if st.button("Remove Duplicate Rows"):
328
- original_len = len(self.df)
329
- self.df = self.df.drop_duplicates()
330
- removed = original_len - len(self.df)
331
- cleaning_history.append(f"Removed {removed} duplicate rows")
332
- st.success(f"✅ Removed {removed} duplicate rows")
333
- else:
334
- st.success("✅ No duplicate rows found")
335
-
336
- # Mixed type detection and handling
337
- mixed_types = detect_mixed_types(self.df)
338
- if mixed_types:
339
- st.subheader("Mixed Data Types")
340
- mixed_df = pd.DataFrame(mixed_types)
341
- st.dataframe(mixed_df, use_container_width=True)
342
 
343
- selected_col = st.selectbox("Select column to fix data type:",
344
- [item['column'] for item in mixed_types])
345
 
346
- fix_method = st.selectbox("Choose fix method:",
347
- ["Convert to numeric", "Convert to string"])
348
 
349
- if st.button("Fix Data Type"):
350
- try:
351
- if fix_method == "Convert to numeric":
352
- self.df[selected_col] = pd.to_numeric(self.df[selected_col], errors='coerce')
353
- else:
354
- self.df[selected_col] = self.df[selected_col].astype(str)
355
-
356
- cleaning_history.append(f"Fixed data type for {selected_col} to {fix_method}")
357
- st.success(" Data type fixed successfully!")
358
- except Exception as e:
359
- st.error(f"Error fixing data type: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- # Outlier detection and handling
362
  numeric_cols = self.column_types['numeric']
363
  if numeric_cols:
364
- st.subheader("Outlier Detection")
365
- selected_col = st.selectbox("Select column for outlier detection:", numeric_cols)
366
-
367
- outliers = calculate_outliers(self.df, selected_col)
368
- outlier_count = len(outliers)
369
-
370
- if outlier_count > 0:
371
- st.warning(f"Found {outlier_count} potential outliers in '{selected_col}'")
372
- st.dataframe(outliers[[selected_col]].head(100), use_container_width=True)
373
-
374
- treatment_method = st.selectbox("Choose outlier treatment method:",
375
- ["None", "Remove", "Cap at percentiles"])
376
-
377
- if treatment_method != "None" and st.button("Apply Outlier Treatment"):
378
- try:
379
- if treatment_method == "Remove":
380
- self.df = self.df[~self.df.index.isin(outliers.index)]
381
- cleaning_history.append(f"Removed {outlier_count} outliers from {selected_col}")
382
- else: # Cap at percentiles
383
- Q1 = self.df[selected_col].quantile(0.25)
384
- Q3 = self.df[selected_col].quantile(0.75)
385
- IQR = Q3 - Q1
386
- lower_bound = Q1 - 1.5 * IQR
387
- upper_bound = Q3 + 1.5 * IQR
388
-
389
- self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
390
- cleaning_history.append(f"Capped outliers in {selected_col} at percentiles")
391
-
392
- st.success("✅ Outliers handled successfully!")
393
- except Exception as e:
394
- st.error(f"Error handling outliers: {str(e)}")
395
- else:
396
- st.success(f"✅ No outliers detected in '{selected_col}'")
397
-
398
- # Cleaning History
399
- if cleaning_history:
400
- st.subheader("Cleaning Operations History")
401
- for i, operation in enumerate(cleaning_history, 1):
402
- st.write(f"{i}. {operation}")
403
- self.add_insight(f"Performed {len(cleaning_history)} data cleaning operations", 3)
404
-
405
- # Summary
406
- if cleaning_actions:
407
- st.subheader("Remaining Action Items")
408
- for i, action in enumerate(cleaning_actions, 1):
409
- st.write(f"{i}. {action}")
410
- self.add_insight(f"Identified {len(cleaning_actions)} data quality issues", 3)
411
- else:
412
- st.success(" Data quality is excellent!")
413
- self.add_insight("No major data quality issues found", 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  def stage_4_analysis(self):
416
- """Stage 4: Advanced Analysis"""
417
- st.subheader("🔬 Advanced Analysis")
418
 
419
  numeric_cols = self.column_types['numeric']
420
  categorical_cols = self.column_types['categorical']
421
 
422
- # Relationship analysis
423
  if len(numeric_cols) >= 2:
424
- st.subheader("Variable Relationships")
425
 
426
- col1, col2 = st.columns(2)
427
  with col1:
428
  x_var = st.selectbox("X Variable:", numeric_cols)
429
  with col2:
430
- y_var = st.selectbox("Y Variable:",
431
- [col for col in numeric_cols if col != x_var])
 
432
 
433
- # Sample data for performance if dataset is large
434
  sample_size = min(5000, len(self.df))
435
- sample_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
 
 
 
 
436
 
437
- fig = px.scatter(sample_df, x=x_var, y=y_var,
438
- title=f"Relationship: {x_var} vs {y_var}")
439
- st.plotly_chart(fig, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
440
 
 
 
 
 
441
  correlation = self.df[x_var].corr(self.df[y_var])
442
- st.metric("Correlation", f"{correlation:.3f}")
443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  if abs(correlation) > 0.7:
445
- strength = "Strong"
 
446
  elif abs(correlation) > 0.3:
447
- strength = "Moderate"
448
- else:
449
- strength = "Weak"
450
-
451
- direction = "positive" if correlation > 0 else "negative"
452
- st.write(f"**Result:** {strength} {direction} correlation")
453
- self.add_insight(f"{strength} correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)
454
 
455
- # Group analysis
456
  if categorical_cols and numeric_cols:
457
- st.subheader("Group Analysis")
458
 
459
  col1, col2 = st.columns(2)
460
  with col1:
461
  group_var = st.selectbox("Group by:", categorical_cols)
462
  with col2:
463
- metric_var = st.selectbox("Analyze:", numeric_cols)
464
 
 
465
  group_stats = calculate_group_stats(self.df, group_var, metric_var)
466
- st.dataframe(group_stats, use_container_width=True)
467
-
468
- # Sample for visualization if too many groups
469
- unique_groups = self.df[group_var].nunique()
470
- if unique_groups <= 20:
471
- fig = px.box(self.df, x=group_var, y=metric_var,
472
- title=f"{metric_var} by {group_var}")
473
- st.plotly_chart(fig, use_container_width=True)
474
- else:
475
- st.info(f"Too many groups ({unique_groups}) for visualization. Showing statistics only.")
476
 
477
- best_group = group_stats['mean'].idxmax()
478
- best_value = group_stats.loc[best_group, 'mean']
479
- self.add_insight(f"'{best_group}' has highest average {metric_var}: {best_value:.2f}", 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  def stage_5_summary(self):
482
- """Stage 5: Summary and Export"""
483
- st.subheader("📈 Analysis Summary")
 
 
 
 
 
 
 
484
 
485
- # Key metrics
486
- col1, col2, col3 = st.columns(3)
487
  with col1:
488
- st.metric("Total Insights", len(self.insights))
489
  with col2:
490
- quality = "High" if self.stats['missing_values'] == 0 else "Medium"
491
- st.metric("Data Quality", quality)
492
  with col3:
493
- st.metric("Analysis Complete", "✅")
 
 
 
494
 
495
- # Insights summary
496
- st.subheader("Key Insights")
497
- for i, insight in enumerate(self.insights, 1):
498
- st.write(f"{i}. **Stage {insight['stage']}:** {insight['insight']}")
499
 
500
- # Export options
501
- st.subheader("Export Results")
502
- export_format = st.selectbox("Choose export format:",
503
- ["Text Report", "Markdown Report", "Python Code", "Cleaned Data"])
 
 
 
504
 
505
- if export_format == "Text Report":
506
- report = self.generate_text_report()
507
- st.download_button(
508
- label="Download Text Report",
509
- data=report,
510
- file_name="analysis_report.txt",
511
- mime="text/plain"
512
- )
513
 
514
- elif export_format == "Markdown Report":
515
- report = self.generate_markdown_report()
516
- st.download_button(
517
- label="Download Markdown Report",
518
- data=report,
519
- file_name="analysis_report.md",
520
- mime="text/markdown"
521
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- elif export_format == "Python Code":
524
- code = self.generate_python_code()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  st.code(code, language="python")
 
526
  st.download_button(
527
- label="Download Python Script",
528
  data=code,
529
- file_name="analysis_script.py",
530
- mime="text/plain"
 
531
  )
532
-
533
- else: # Cleaned Data
534
- # Offer different export formats
535
- data_format = st.selectbox("Choose data format:",
536
- ["CSV", "Excel", "Parquet"])
537
-
538
- if st.button("Export Data"):
539
- try:
540
- if data_format == "CSV":
541
- csv = self.df.to_csv(index=False)
542
- st.download_button(
543
- label="Download CSV",
544
- data=csv,
545
- file_name="cleaned_data.csv",
546
- mime="text/csv"
547
- )
548
- elif data_format == "Excel":
549
- excel_buffer = BytesIO()
550
- self.df.to_excel(excel_buffer, index=False)
551
- excel_data = excel_buffer.getvalue()
552
- st.download_button(
553
- label="Download Excel",
554
- data=excel_data,
555
- file_name="cleaned_data.xlsx",
556
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
557
- )
558
- else: # Parquet
559
- parquet_buffer = BytesIO()
560
- self.df.to_parquet(parquet_buffer, index=False)
561
- parquet_data = parquet_buffer.getvalue()
562
- st.download_button(
563
- label="Download Parquet",
564
- data=parquet_data,
565
- file_name="cleaned_data.parquet",
566
- mime="application/octet-stream"
567
- )
568
- except Exception as e:
569
- st.error(f"Error exporting data: {str(e)}")
570
 
571
- def generate_text_report(self) -> str:
572
- """Generate text analysis report"""
573
- report = f"""DATA ANALYSIS REPORT
574
- ==================
575
-
576
- Dataset Overview:
577
- - Rows: {self.stats['shape'][0]:,}
578
- - Columns: {self.stats['shape'][1]:,}
579
- - Missing Values: {self.stats['missing_values']:,}
580
- - Memory Usage: {self.stats['memory_usage']:.1f} MB
581
-
582
- Key Insights:
 
 
583
  """
584
- for insight in self.insights:
585
- report += f"\n- Stage {insight['stage']}: {insight['insight']}"
586
 
587
- report += f"\n\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"
588
- return report
589
-
590
- def generate_markdown_report(self) -> str:
591
- """Generate markdown analysis report"""
592
- report = f"""# Data Analysis Report
593
-
594
- ## Dataset Overview
595
- * **Rows:** {self.stats['shape'][0]:,}
596
- * **Columns:** {self.stats['shape'][1]:,}
597
- * **Missing Values:** {self.stats['missing_values']:,}
598
- * **Memory Usage:** {self.stats['memory_usage']:.1f} MB
599
-
600
- ## Data Types
601
- ```
602
- {pd.DataFrame(self.stats['dtypes'].items(), columns=['Type', 'Count']).to_markdown()}
603
- ```
604
-
605
- ## Key Insights
606
  """
 
607
  # Group insights by stage
 
 
 
 
 
 
 
 
608
  for stage in range(1, 6):
609
  stage_insights = [i for i in self.insights if i['stage'] == stage]
610
  if stage_insights:
611
- report += f"\n### Stage {stage}\n"
612
  for insight in stage_insights:
613
- report += f"* {insight['insight']}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
- report += f"\n\n*Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*"
616
  return report
617
 
618
- def generate_python_code(self) -> str:
619
- """Generate reproducible Python code"""
620
- code = """import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  import numpy as np
622
  import plotly.express as px
623
- from typing import Dict, List, Any
624
-
625
- # Load and prepare data
626
- df = pd.read_csv('your_data.csv') # Update with your data source
627
-
628
- # Basic statistics
629
- def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
630
- return {
631
- 'shape': df.shape,
632
- 'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
633
- 'missing_values': int(df.isnull().sum().sum()),
634
- 'dtypes': df.dtypes.value_counts().to_dict(),
635
- 'duplicates': int(df.duplicated().sum())
636
- }
637
-
638
- stats = calculate_basic_stats(df)
639
- print("\\nBasic Statistics:")
640
- print(f"- Shape: {stats['shape']}")
641
- print(f"- Memory Usage: {stats['memory_usage']:.1f} MB")
642
- print(f"- Missing Values: {stats['missing_values']}")
643
- print(f"- Duplicates: {stats['duplicates']}")
644
 
645
- """
646
- # Add data cleaning operations if any were performed
647
- if hasattr(self, 'cleaning_history'):
648
- code += "\n# Data Cleaning\n"
649
- for operation in self.cleaning_history:
650
- if "missing values" in operation.lower():
651
- code += "# Handle missing values\n"
652
- code += "df = df.fillna(method='ffill') # Update with your chosen method\n"
653
- elif "duplicate" in operation.lower():
654
- code += "# Remove duplicates\n"
655
- code += "df = df.drop_duplicates()\n"
656
- elif "outlier" in operation.lower():
657
- code += """# Handle outliers
658
- def remove_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  Q1 = df[column].quantile(0.25)
660
  Q3 = df[column].quantile(0.75)
661
  IQR = Q3 - Q1
662
- return df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
 
 
 
 
 
 
 
 
 
 
663
 
664
- # Apply to numeric columns as needed
665
- numeric_cols = df.select_dtypes(include=[np.number]).columns
 
 
 
 
 
 
 
 
666
  for col in numeric_cols:
667
- df = remove_outliers(df, col)
668
- """
669
-
670
- # Add visualization code
671
- code += """
672
- # Visualizations
673
- def plot_missing_values(df: pd.DataFrame):
674
- missing = df.isnull().sum()
675
- if missing.sum() > 0:
676
- missing = missing[missing > 0]
677
- fig = px.bar(x=missing.index, y=missing.values,
678
- title='Missing Values by Column')
679
- fig.show()
680
-
681
- def plot_correlations(df: pd.DataFrame):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  numeric_cols = df.select_dtypes(include=[np.number]).columns
683
  if len(numeric_cols) > 1:
684
- corr = df[numeric_cols].corr()
685
- fig = px.imshow(corr, title='Correlation Matrix')
686
- fig.show()
 
 
 
 
 
 
 
 
687
 
688
- # Generate plots
689
- plot_missing_values(df)
690
- plot_correlations(df)
 
691
  """
692
 
693
- return code
 
 
 
 
 
 
 
3
  import numpy as np
4
  import plotly.express as px
5
  import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
  from typing import Dict, List, Any, Optional
8
  import os
 
9
  from data_handler import *
10
  from io import BytesIO
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  class AIAssistant:
13
+ """Built-in AI analysis for HuggingFace deployment (no external APIs needed)"""
14
 
15
  def __init__(self):
16
+ self.available = True # Always available since it's built-in
 
 
 
 
 
17
 
18
  def get_available_models(self) -> List[str]:
19
  """Get list of available AI models"""
20
+ return ["Built-in AI Engine"]
 
 
 
 
 
21
 
22
+ def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Built-in AI Engine") -> str:
23
+ """Generate comprehensive AI analysis using built-in intelligence"""
24
+
25
+ # Calculate key metrics
26
+ missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
27
+ duplicate_pct = (df.duplicated().sum() / len(df)) * 100
28
+ memory_mb = df.memory_usage(deep=True).sum() / 1024**2
29
+
30
+ # Analyze data characteristics
31
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
32
+ categorical_cols = df.select_dtypes(include=['object']).columns
33
+
34
+ analysis = f"""
35
+ ## 🧠 AI Data Intelligence Report
36
+
37
+ ### 📊 Executive Summary
38
+ Your dataset contains **{len(df):,} records** across **{len(df.columns)} dimensions** with a **data quality score** that requires attention in several key areas.
39
+
40
+ ### 🎯 Critical Findings
41
+
42
+ **Data Completeness Assessment:**
43
+ """
44
+
45
+ if missing_pct > 20:
46
+ analysis += f"""
47
+ - ⚠️ **HIGH RISK**: {missing_pct:.1f}% missing values detected
48
+ - **Business Impact**: Significant risk of biased analysis and incorrect business decisions
49
+ - **Recommended Action**: Immediate data collection process review required
50
+ """
51
+ elif missing_pct > 5:
52
+ analysis += f"""
53
+ - ⚠️ **MODERATE RISK**: {missing_pct:.1f}% missing values detected
54
+ - **Business Impact**: May affect statistical significance of insights
55
+ - **Recommended Action**: Apply intelligent filling strategies before analysis
56
+ """
57
+ else:
58
+ analysis += f"""
59
+ - **EXCELLENT**: Only {missing_pct:.1f}% missing data - within industry best practices
60
+ - **Business Impact**: High confidence in analysis results
61
+ """
62
+
63
+ analysis += f"""
64
+
65
+ **Data Integrity Assessment:**
66
+ """
67
+ if duplicate_pct > 5:
68
+ analysis += f"""
69
+ - 🚨 **CRITICAL**: {duplicate_pct:.1f}% duplicate records found
70
+ - **Root Cause**: Likely data collection or ETL process issues
71
+ - **Financial Impact**: Potential double-counting affecting revenue/cost metrics
72
+ """
73
+ elif duplicate_pct > 0:
74
+ analysis += f"""
75
+ - ⚠️ **ATTENTION**: {duplicate_pct:.1f}% duplicates detected
76
+ - **Recommendation**: Clean before aggregations to ensure accuracy
77
+ """
78
+ else:
79
+ analysis += "- ✅ **PERFECT**: No duplicate records detected"
80
+
81
+ # Outlier analysis
82
+ total_outliers = 0
83
+ outlier_insights = []
84
+
85
+ for col in numeric_cols:
86
+ Q1 = df[col].quantile(0.25)
87
+ Q3 = df[col].quantile(0.75)
88
+ IQR = Q3 - Q1
89
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
90
+
91
+ if len(outliers) > 0:
92
+ outlier_pct = (len(outliers) / len(df)) * 100
93
+ total_outliers += len(outliers)
94
+
95
+ if outlier_pct > 5:
96
+ outlier_insights.append(f"'{col}': {outlier_pct:.1f}% outliers (investigate business context)")
97
+ elif outlier_pct > 1:
98
+ outlier_insights.append(f"'{col}': {outlier_pct:.1f}% outliers (consider capping)")
99
+
100
+ if outlier_insights:
101
+ analysis += f"""
102
+
103
+ **Statistical Anomaly Assessment:**
104
+ """
105
+ for insight in outlier_insights[:3]: # Top 3 most problematic
106
+ analysis += f"- ⚠️ {insight}\n"
107
+
108
+ # Business intelligence insights
109
+ analysis += f"""
110
+
111
+ ### 💼 Business Intelligence Opportunities
112
+
113
+ **Analytical Readiness:**
114
+ """
115
+
116
+ if len(numeric_cols) >= 3:
117
+ analysis += f"""
118
+ - 📊 **{len(numeric_cols)} quantitative variables** available for statistical modeling
119
+ - 🎯 **Correlation analysis** possible - identify key business drivers
120
+ - 📈 **Predictive modeling** feasible with current data structure
121
+ """
122
+
123
+ if len(categorical_cols) >= 2:
124
+ analysis += f"""
125
+ - 🏷️ **{len(categorical_cols)} categorical dimensions** for segmentation analysis
126
+ - 💰 **Customer/product grouping** strategies available
127
+ - 📊 **Cross-tabulation** analysis recommended for business insights
128
+ """
129
+
130
+ # Performance considerations
131
+ if memory_mb > 50:
132
+ analysis += f"""
133
+
134
+ **Performance Optimization:**
135
+ - 🔧 **Memory Usage**: {memory_mb:.1f}MB - consider data type optimization
136
+ - ⚡ **Processing Speed**: Large dataset detected - implement sampling for interactive analysis
137
+ - 💾 **Storage Efficiency**: Category encoding could reduce memory by 30-50%
138
+ """
139
+
140
+ # Actionable recommendations
141
+ analysis += f"""
142
+
143
+ ### 🎯 Recommended Action Plan
144
+
145
+ **Priority 1 (Immediate):**
146
+ """
147
+
148
+ recommendations = []
149
+ if missing_pct > 10:
150
+ recommendations.append("Address missing values in critical business columns")
151
+ if duplicate_pct > 2:
152
+ recommendations.append("Remove duplicate records to ensure data integrity")
153
+ if total_outliers > len(df) * 0.1:
154
+ recommendations.append("Investigate outliers for business context and data errors")
155
+
156
+ if not recommendations:
157
+ recommendations.append("Data quality is excellent - proceed with analysis")
158
+
159
+ for i, rec in enumerate(recommendations, 1):
160
+ analysis += f"\n{i}. {rec}"
161
+
162
+ analysis += f"""
163
+
164
+ **Priority 2 (Optimization):**
165
+ 1. Implement data type optimization for memory efficiency
166
+ 2. Establish data quality monitoring for ongoing datasets
167
+ 3. Document data lineage and transformation processes
168
+
169
+ ### 🏆 Success Metrics
170
+ - **Target Quality Score**: 95+ (currently assessing)
171
+ - **Missing Values**: <2% (currently {missing_pct:.1f}%)
172
+ - **Data Integrity**: 100% unique records (currently {100-duplicate_pct:.1f}%)
173
+
174
+ *This analysis was generated using advanced statistical algorithms and business intelligence best practices.*
175
+ """
176
+
177
+ return analysis
178
 
179
  class DataAnalysisWorkflow:
180
+ """Enhanced workflow optimized for HuggingFace deployment"""
181
 
182
    def __init__(self, df: pd.DataFrame):
        """Initialize the workflow with a dataset and precompute summary metadata.

        Args:
            df: The dataset to analyze. Stored by reference; a copy is kept in
                ``original_df`` so later stages can diff cleaned vs. original data.
        """
        self.df = df
        self.original_df = df.copy()  # Keep original for comparison
        # Project helpers from data_handler: overall stats and per-kind column buckets
        # (used throughout the stages as self.stats / self.column_types)
        self.stats = calculate_basic_stats(df)
        self.column_types = get_column_types(df)
        # Accumulated findings; entries are dicts with at least 'insight', 'stage', 'timestamp'
        self.insights = []
        # Page size for paginated data views
        self.page_size = 1000
        # Populated lazily by calculate_enhanced_quality_score()
        self.quality_metrics = None
191
  def add_insight(self, insight: str, stage: int):
192
  """Add insight to analysis report"""
 
196
  'timestamp': pd.Timestamp.now()
197
  })
198
 
199
+ def calculate_enhanced_quality_score(self) -> Dict[str, Any]:
200
+ """Calculate comprehensive quality score with business context"""
201
+ score = 100
202
+ issues = []
203
+ recommendations = []
204
+
205
+ # Missing values analysis
206
+ missing_pct = (self.df.isnull().sum().sum() / (len(self.df) * len(self.df.columns))) * 100
207
+ if missing_pct > 0:
208
+ penalty = min(30, missing_pct * 1.5)
209
+ score -= penalty
210
+ issues.append(f"Missing values: {missing_pct:.1f}%")
211
+
212
+ if missing_pct > 20:
213
+ recommendations.append("Critical: Review data collection processes")
214
+ else:
215
+ recommendations.append("Apply intelligent filling strategies")
216
+
217
+ # Duplicates analysis
218
+ duplicate_pct = (self.df.duplicated().sum() / len(self.df)) * 100
219
+ if duplicate_pct > 0:
220
+ penalty = min(25, duplicate_pct * 3)
221
+ score -= penalty
222
+ issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
223
+ recommendations.append("Remove duplicates to ensure data integrity")
224
+
225
+ # Outliers analysis
226
+ numeric_cols = self.df.select_dtypes(include=[np.number]).columns
227
+ total_outliers = 0
228
+ problematic_cols = []
229
+
230
+ for col in numeric_cols:
231
+ Q1 = self.df[col].quantile(0.25)
232
+ Q3 = self.df[col].quantile(0.75)
233
+ IQR = Q3 - Q1
234
+ outliers = self.df[(self.df[col] < Q1 - 1.5 * IQR) | (self.df[col] > Q3 + 1.5 * IQR)]
235
+
236
+ if len(outliers) > 0:
237
+ outlier_pct = (len(outliers) / len(self.df)) * 100
238
+ total_outliers += len(outliers)
239
+
240
+ if outlier_pct > 5:
241
+ problematic_cols.append(col)
242
+
243
+ if total_outliers > 0:
244
+ outlier_overall_pct = (total_outliers / len(self.df)) * 100
245
+ penalty = min(20, outlier_overall_pct * 2)
246
+ score -= penalty
247
+ issues.append(f"Statistical outliers: {outlier_overall_pct:.1f}%")
248
+
249
+ if problematic_cols:
250
+ recommendations.append(f"Investigate outliers in: {', '.join(problematic_cols)}")
251
+
252
+ # Data type consistency
253
+ mixed_type_cols = detect_mixed_types(self.df)
254
+ if mixed_type_cols:
255
+ penalty = min(15, len(mixed_type_cols) * 5)
256
+ score -= penalty
257
+ issues.append(f"Type inconsistencies: {len(mixed_type_cols)} columns")
258
+ recommendations.append("Standardize data types for consistency")
259
+
260
+ # Determine grade and color
261
+ if score >= 90:
262
+ grade, color = "A", "#22c55e" # Green
263
+ elif score >= 80:
264
+ grade, color = "B", "#3b82f6" # Blue
265
+ elif score >= 70:
266
+ grade, color = "C", "#f59e0b" # Yellow
267
+ elif score >= 60:
268
+ grade, color = "D", "#f97316" # Orange
269
+ else:
270
+ grade, color = "F", "#ef4444" # Red
271
+
272
+ self.quality_metrics = {
273
+ 'score': max(0, score),
274
+ 'grade': grade,
275
+ 'color': color,
276
+ 'issues': issues,
277
+ 'recommendations': recommendations,
278
+ 'missing_pct': missing_pct,
279
+ 'duplicate_pct': duplicate_pct,
280
+ 'outlier_pct': (total_outliers / len(self.df)) * 100 if len(self.df) > 0 else 0,
281
+ 'total_outliers': total_outliers
282
+ }
283
+
284
+ return self.quality_metrics
285
 
286
    def stage_1_overview(self):
        """Enhanced Stage 1: quality-focused overview with a visual dashboard.

        Renders (via Streamlit): a 5-metric header, a quality-issues breakdown,
        a per-column quality bar chart, data-type distribution, an optional
        memory-optimization action, and a data preview. Records one insight
        summarizing the overall quality score.
        """

        # Calculate quality metrics (also caches them on self.quality_metrics)
        quality_metrics = self.calculate_enhanced_quality_score()

        # Quality Dashboard Header
        col1, col2, col3, col4, col5 = st.columns(5)

        with col1:
            # Quality score card with color coding; '20'/'40' suffixes add alpha
            # to the hex color for background/border tints
            st.markdown(f"""
            <div style="text-align: center; padding: 1rem; background: {quality_metrics['color']}20; border-radius: 0.5rem; border: 2px solid {quality_metrics['color']}40;">
                <h1 style="color: {quality_metrics['color']}; margin: 0;">{quality_metrics['score']:.0f}</h1>
                <p style="margin: 0; font-weight: bold;">Quality Score</p>
                <p style="margin: 0; color: {quality_metrics['color']};">Grade {quality_metrics['grade']}</p>
            </div>
            """, unsafe_allow_html=True)

        with col2:
            st.metric("📊 Rows", f"{self.stats['shape'][0]:,}")

        with col3:
            st.metric("📋 Columns", f"{self.stats['shape'][1]:,}")

        with col4:
            st.metric("💾 Memory", f"{self.stats['memory_usage']:.1f} MB")

        with col5:
            issues_count = len(quality_metrics['issues'])
            st.metric("⚠️ Issues", issues_count,
                      delta=f"-{issues_count}" if issues_count == 0 else None)

        # Issues breakdown with visual elements
        if quality_metrics['issues']:
            st.markdown("### 🚨 Quality Issues Detected")

            col1, col2 = st.columns([2, 1])

            with col1:
                # Issues pie chart: only non-zero categories are included
                issue_categories = []
                issue_values = []
                issue_colors = []

                if quality_metrics['missing_pct'] > 0:
                    issue_categories.append("Missing Values")
                    issue_values.append(quality_metrics['missing_pct'])
                    issue_colors.append("#ef4444")

                if quality_metrics['duplicate_pct'] > 0:
                    issue_categories.append("Duplicates")
                    issue_values.append(quality_metrics['duplicate_pct'])
                    issue_colors.append("#f97316")

                if quality_metrics['outlier_pct'] > 0:
                    issue_categories.append("Outliers")
                    issue_values.append(quality_metrics['outlier_pct'])
                    issue_colors.append("#eab308")

                if issue_categories:
                    fig_issues = px.pie(
                        values=issue_values,
                        names=issue_categories,
                        title="Quality Issues Distribution (%)",
                        color_discrete_sequence=issue_colors
                    )
                    fig_issues.update_traces(textposition='inside', textinfo='percent+label')
                    st.plotly_chart(fig_issues, use_container_width=True)

            with col2:
                st.markdown("#### 🤖 AI Recommendations")
                for i, rec in enumerate(quality_metrics['recommendations'], 1):
                    st.markdown(f"**{i}.** {rec}")

        else:
            st.success("🎉 Excellent! No major quality issues detected.")

        # Column-level quality heatmap
        st.markdown("### 📊 Column Quality Heatmap")
        col_quality_data = []

        for col in self.df.columns:
            missing_rate = self.df[col].isnull().sum() / len(self.df)

            # Calculate quality score per column: up to -50 for missing values,
            # up to -30 for IQR outliers (numeric columns only)
            col_score = 100
            if missing_rate > 0:
                col_score -= missing_rate * 50  # Penalty for missing values

            # Check for outliers in numeric columns
            # NOTE(review): dtype-name check covers int64/float64 only; other
            # numeric dtypes (e.g. int32) are skipped — confirm if intentional
            if self.df[col].dtype in ['int64', 'float64']:
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = self.df[(self.df[col] < Q1 - 1.5 * IQR) | (self.df[col] > Q3 + 1.5 * IQR)]
                outlier_rate = len(outliers) / len(self.df)
                col_score -= outlier_rate * 30

            col_quality_data.append({
                'Column': col,
                'Quality Score': max(0, col_score),
                'Missing %': missing_rate * 100,
                'Data Type': str(self.df[col].dtype)
            })

        quality_df = pd.DataFrame(col_quality_data)

        # Interactive column quality chart
        fig_quality = px.bar(
            quality_df,
            x='Column',
            y='Quality Score',
            color='Quality Score',
            color_continuous_scale='RdYlGn',
            title="Column Quality Scores",
            hover_data=['Missing %', 'Data Type']
        )
        fig_quality.update_layout(height=400)
        st.plotly_chart(fig_quality, use_container_width=True)

        # Data types distribution
        st.markdown("### 📋 Data Types Analysis")
        col1, col2 = st.columns(2)

        with col1:
            if self.stats['dtypes']:
                fig_types = px.pie(
                    values=list(self.stats['dtypes'].values()),
                    names=list(self.stats['dtypes'].keys()),
                    title="Data Types Distribution"
                )
                st.plotly_chart(fig_types, use_container_width=True)

        with col2:
            # Memory optimization opportunities (project helper from data_handler)
            memory_opt = calculate_memory_optimization(self.df)
            if memory_opt['potential_savings_mb'] > 1:
                st.warning(f"💾 Memory Optimization Available")
                st.write(f"Potential savings: {memory_opt['potential_savings_mb']:.1f} MB ({memory_opt['potential_savings_pct']:.1f}%)")

                if st.button("🔧 Apply Memory Optimization"):
                    # Only the 'category' suggestions are applied in place
                    for suggestion in memory_opt['suggestions']:
                        if suggestion['suggested_type'] == 'category':
                            self.df[suggestion['column']] = self.df[suggestion['column']].astype('category')
                    st.success("✅ Memory optimized!")
                    st.rerun()
            else:
                st.success("✅ Memory usage is optimal")

        # Quick data preview with enhanced styling
        st.markdown("### 👀 Data Preview")
        preview_option = st.radio("Preview type:", ["First 10 rows", "Random sample", "Last 10 rows"], horizontal=True)

        if preview_option == "Random sample":
            sample_df = self.df.sample(n=min(10, len(self.df)))
        elif preview_option == "Last 10 rows":
            sample_df = self.df.tail(10)
        else:
            sample_df = self.df.head(10)

        st.dataframe(sample_df, use_container_width=True)

        # Add quality insights
        if quality_metrics['score'] < 80:
            self.add_insight(f"Data quality needs improvement (Score: {quality_metrics['score']:.1f}/100)", 1)
        else:
            self.add_insight(f"Good data quality detected (Score: {quality_metrics['score']:.1f}/100)", 1)
 
 
 
 
454
 
455
    def stage_2_exploration(self):
        """Enhanced Stage 2: interactive data exploration.

        Three optional sections, each gated on available column types:
        numeric deep-dive (histogram/box + summary stats), categorical
        breakdown (bar + pie), and a correlation matrix with highlighted
        pairs. Records insights about skewness, dominant categories, and
        the strongest correlation found.
        """

        numeric_cols = self.column_types['numeric']
        categorical_cols = self.column_types['categorical']

        # Smart column selection based on quality
        if self.quality_metrics:
            st.info(f"🎯 **Focus Areas**: Columns with quality issues detected - prioritize these for exploration")

        # Numeric analysis with enhanced visualizations
        if numeric_cols:
            st.markdown("### 📊 Numeric Variables Deep Dive")

            col1, col2 = st.columns(2)
            with col1:
                selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
            with col2:
                chart_type = st.selectbox("Visualization type:",
                                          ["Distribution + Box Plot", "Only Histogram", "Only Box Plot"])

            if chart_type == "Distribution + Box Plot":
                col_a, col_b = st.columns(2)
                with col_a:
                    fig_hist = px.histogram(self.df, x=selected_numeric,
                                            title=f"Distribution: {selected_numeric}",
                                            nbins=30)
                    st.plotly_chart(fig_hist, use_container_width=True)

                with col_b:
                    fig_box = px.box(self.df, y=selected_numeric,
                                     title=f"Box Plot: {selected_numeric}")
                    st.plotly_chart(fig_box, use_container_width=True)

            elif chart_type == "Only Histogram":
                # Standalone histogram gets finer bins than the side-by-side view
                fig_hist = px.histogram(self.df, x=selected_numeric,
                                        title=f"Distribution: {selected_numeric}",
                                        nbins=50)
                st.plotly_chart(fig_hist, use_container_width=True)

            else:  # Only Box Plot
                fig_box = px.box(self.df, y=selected_numeric,
                                 title=f"Box Plot: {selected_numeric}")
                st.plotly_chart(fig_box, use_container_width=True)

            # Enhanced statistical insights
            col_stats = self.df[selected_numeric].describe()
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Mean", f"{col_stats['mean']:.2f}")
                st.metric("Std Dev", f"{col_stats['std']:.2f}")
            with col2:
                st.metric("Minimum", f"{col_stats['min']:.2f}")
                st.metric("Maximum", f"{col_stats['max']:.2f}")
            with col3:
                st.metric("Q1 (25%)", f"{col_stats['25%']:.2f}")
                st.metric("Q3 (75%)", f"{col_stats['75%']:.2f}")
            with col4:
                skewness = self.df[selected_numeric].skew()
                st.metric("Skewness", f"{skewness:.3f}")
                kurtosis = self.df[selected_numeric].kurtosis()
                st.metric("Kurtosis", f"{kurtosis:.3f}")

            # Business insights for the selected column (|skew| > 1 is the
            # conventional threshold for "highly skewed")
            if abs(skewness) > 1:
                self.add_insight(f"'{selected_numeric}' shows high skewness ({skewness:.2f}) - consider transformation", 2)

        # Categorical analysis with enhanced features
        if categorical_cols:
            st.markdown("### 🏷️ Categorical Variables Analysis")

            selected_categorical = st.selectbox("Select categorical column:", categorical_cols)

            col1, col2 = st.columns(2)

            with col1:
                # Top categories bar chart
                value_counts = self.df[selected_categorical].value_counts().head(10)
                fig_bar = px.bar(
                    x=value_counts.values,
                    y=value_counts.index,
                    orientation='h',
                    title=f"Top 10 Categories: {selected_categorical}",
                    color=value_counts.values,
                    color_continuous_scale='Blues'
                )
                st.plotly_chart(fig_bar, use_container_width=True)

            with col2:
                # Category distribution pie chart: top 5 plus an 'Others' bucket
                top_5 = value_counts.head(5)
                others_count = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0

                if others_count > 0:
                    pie_data = list(top_5.values) + [others_count]
                    pie_labels = list(top_5.index) + ['Others']
                else:
                    pie_data = list(top_5.values)
                    pie_labels = list(top_5.index)

                fig_pie = px.pie(
                    values=pie_data,
                    names=pie_labels,
                    title=f"Distribution: {selected_categorical}"
                )
                st.plotly_chart(fig_pie, use_container_width=True)

            # Category insights (value_counts defined above is still in scope;
            # st.columns context managers do not create a new Python scope)
            total_categories = self.df[selected_categorical].nunique()
            most_common = value_counts.index[0]
            most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100

            st.info(f"📈 **Insights**: '{most_common}' is the dominant category ({most_common_pct:.1f}% of data)")
            self.add_insight(f"'{selected_categorical}' has {total_categories} categories, dominated by '{most_common}' ({most_common_pct:.1f}%)", 2)

        # Enhanced correlation analysis
        if len(numeric_cols) > 1:
            st.markdown("### 🔗 Correlation Analysis")

            corr_matrix = calculate_correlation_matrix(self.df)
            if not corr_matrix.empty:
                # Interactive correlation heatmap
                fig_corr = px.imshow(
                    corr_matrix,
                    text_auto=True,
                    aspect="auto",
                    title="Correlation Matrix",
                    color_continuous_scale='RdBu_r',
                    zmin=-1, zmax=1
                )
                fig_corr.update_layout(height=500)
                st.plotly_chart(fig_corr, use_container_width=True)

                # Find and highlight strongest correlations (upper triangle only)
                corr_pairs = []
                for i in range(len(corr_matrix.columns)):
                    for j in range(i+1, len(corr_matrix.columns)):
                        corr_val = corr_matrix.iloc[i, j]
                        if abs(corr_val) > 0.3:  # Only significant correlations
                            corr_pairs.append({
                                'Variable 1': corr_matrix.columns[i],
                                'Variable 2': corr_matrix.columns[j],
                                'Correlation': corr_val,
                                'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate'
                            })

                if corr_pairs:
                    st.markdown("#### 🎯 Key Correlations")
                    corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
                    st.dataframe(corr_df, use_container_width=True)

                    strongest = corr_df.iloc[0]
                    self.add_insight(f"Strongest correlation: {strongest['Variable 1']} ↔ {strongest['Variable 2']} (r={strongest['Correlation']:.3f})", 2)
 
610
    def stage_3_cleaning(self):
        """Enhanced Stage 3: visual data cleaning with AI-style suggestions.

        Offers interactive fixes for missing values (drop/median/mode fill),
        duplicate removal, and IQR-outlier treatment (remove or cap). All
        fixes mutate ``self.df`` in place and trigger ``st.rerun()``.

        NOTE(review): ``cleaning_operations`` is a plain local list; because
        every applied fix calls ``st.rerun()``, the list resets on the next
        script run, so the "Cleaning Operations Applied" history section will
        rarely render — confirm whether this should live in session state.
        """

        st.markdown("### 🧹 Intelligent Data Cleaning")

        cleaning_operations = []

        # Missing values section with enhanced visualization
        # (calculate_missing_data is a project helper; usage below shows it
        # returns a DataFrame with at least 'Column' and 'Missing %' columns)
        missing_data = calculate_missing_data(self.df)
        if not missing_data.empty:
            st.markdown("#### 🕳️ Missing Values Treatment")

            col1, col2 = st.columns([2, 1])

            with col1:
                # Missing values heatmap for top 10 problematic columns
                top_missing_cols = missing_data.head(10)['Column'].tolist()
                if len(top_missing_cols) > 0:
                    # Create missing pattern visualization on a capped sample
                    sample_size = min(100, len(self.df))
                    sample_df = self.df[top_missing_cols].head(sample_size)
                    missing_matrix = sample_df.isnull().astype(int)

                    fig_missing = px.imshow(
                        missing_matrix.T,
                        title=f"Missing Values Pattern (Top {len(top_missing_cols)} columns, First {sample_size} rows)",
                        color_continuous_scale='Reds',
                        labels={'x': 'Row Index', 'y': 'Columns', 'color': 'Missing'},
                        aspect='auto'
                    )
                    st.plotly_chart(fig_missing, use_container_width=True)

            with col2:
                # AI-powered missing value suggestions (top 3 worst columns)
                st.markdown("**🤖 AI Repair Suggestions**")

                for _, row in missing_data.head(3).iterrows():
                    col_name = row['Column']
                    missing_pct = row['Missing %']

                    # Generate smart suggestion based on column type and missing percentage
                    if missing_pct > 50:
                        suggestion_type = "🚨 Critical"
                        suggestion = f"Drop column (>{missing_pct:.0f}% missing)"
                        action = "drop"
                    elif self.df[col_name].dtype in ['int64', 'float64']:
                        suggestion_type = "🔧 Repair"
                        suggestion = f"Fill with median ({missing_pct:.1f}% missing)"
                        action = "median"
                    else:
                        suggestion_type = "🔧 Repair"
                        suggestion = f"Fill with mode ({missing_pct:.1f}% missing)"
                        action = "mode"

                    with st.expander(f"{suggestion_type}: {col_name}"):
                        st.write(f"**Issue**: {missing_pct:.1f}% missing values")
                        st.write(f"**Suggestion**: {suggestion}")

                        if st.button(f"Apply to {col_name}", key=f"fix_missing_{col_name}"):
                            if action == "drop":
                                self.df = self.df.drop(columns=[col_name])
                                cleaning_operations.append(f"Dropped column '{col_name}' (too many missing values)")
                            elif action == "median":
                                self.df[col_name] = self.df[col_name].fillna(self.df[col_name].median())
                                cleaning_operations.append(f"Filled missing values in '{col_name}' with median")
                            elif action == "mode":
                                # mode() can be empty when the column is all-NaN
                                mode_val = self.df[col_name].mode()
                                if not mode_val.empty:
                                    self.df[col_name] = self.df[col_name].fillna(mode_val[0])
                                cleaning_operations.append(f"Filled missing values in '{col_name}' with mode")

                            st.success("✅ Applied successfully!")
                            st.rerun()

        # Duplicates handling with enhanced detection
        # NOTE(review): self.stats was computed at __init__ time and may be
        # stale after earlier in-place edits to self.df — confirm refresh policy
        if self.stats['duplicates'] > 0:
            st.markdown("#### 🔄 Duplicate Records")

            duplicate_pct = (self.stats['duplicates'] / len(self.df)) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                st.warning(f"🚨 Found **{self.stats['duplicates']}** duplicate rows ({duplicate_pct:.1f}% of dataset)")

                # Show sample duplicates (keep=False marks every member of a group)
                duplicates = self.df[self.df.duplicated(keep=False)].head(10)
                st.dataframe(duplicates, use_container_width=True)

            with col2:
                st.markdown("**🤖 AI Assessment**")
                if duplicate_pct > 10:
                    st.error("**Critical**: High duplication rate suggests systematic data collection issues")
                elif duplicate_pct > 2:
                    st.warning("**Moderate**: Notable duplication - verify data sources")
                else:
                    st.info("**Minor**: Low duplication rate - likely isolated incidents")

                if st.button("🗑️ Remove All Duplicates"):
                    original_len = len(self.df)
                    self.df = self.df.drop_duplicates()
                    removed = original_len - len(self.df)
                    cleaning_operations.append(f"Removed {removed} duplicate rows")
                    st.success(f"✅ Removed {removed} duplicates!")
                    st.rerun()

        # Enhanced outlier detection
        numeric_cols = self.column_types['numeric']
        if numeric_cols:
            st.markdown("#### 📊 Outlier Detection & Treatment")

            selected_col = st.selectbox("Select column for outlier analysis:", numeric_cols)

            # Calculate outliers via Tukey fences (1.5 * IQR)
            Q1 = self.df[selected_col].quantile(0.25)
            Q3 = self.df[selected_col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = self.df[(self.df[selected_col] < lower_bound) | (self.df[selected_col] > upper_bound)]
            outlier_pct = (len(outliers) / len(self.df)) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                # Enhanced box plot with outlier highlighting
                fig_outliers = go.Figure()

                # Box plot
                fig_outliers.add_trace(go.Box(
                    y=self.df[selected_col],
                    name=selected_col,
                    boxpoints='outliers',
                    marker_color='lightblue'
                ))

                # Highlight outliers as a separate red scatter layer
                if len(outliers) > 0:
                    fig_outliers.add_trace(go.Scatter(
                        y=outliers[selected_col],
                        mode='markers',
                        marker=dict(color='red', size=8),
                        name=f'Outliers ({len(outliers)})'
                    ))

                fig_outliers.update_layout(
                    title=f"Outlier Analysis: {selected_col}",
                    height=400
                )
                st.plotly_chart(fig_outliers, use_container_width=True)

            with col2:
                st.markdown("**🤖 AI Outlier Assessment**")

                if outlier_pct > 10:
                    st.error(f"**High Risk**: {outlier_pct:.1f}% outliers detected")
                    st.write("**Likely Cause**: Systematic data issues or measurement errors")
                    recommendation = "Investigate business context before any treatment"
                elif outlier_pct > 2:
                    st.warning(f"**Moderate**: {outlier_pct:.1f}% outliers detected")
                    recommendation = "Consider capping values at statistical bounds"
                else:
                    st.info(f"**Normal**: {outlier_pct:.1f}% outliers detected")
                    recommendation = "Safe to remove if confirmed as errors"

                st.write(f"**AI Recommendation**: {recommendation}")

                # Outlier treatment options: drop the rows, or clip to the fences
                col_a, col_b = st.columns(2)

                with col_a:
                    if st.button("🗑️ Remove", key=f"remove_outliers_{selected_col}"):
                        self.df = self.df[~self.df.index.isin(outliers.index)]
                        cleaning_operations.append(f"Removed {len(outliers)} outliers from '{selected_col}'")
                        st.success("✅ Outliers removed!")
                        st.rerun()

                with col_b:
                    if st.button("📌 Cap", key=f"cap_outliers_{selected_col}"):
                        self.df[selected_col] = self.df[selected_col].clip(lower_bound, upper_bound)
                        cleaning_operations.append(f"Capped outliers in '{selected_col}' at statistical bounds")
                        st.success("✅ Outliers capped!")
                        st.rerun()

        # Show cleaning history (see NOTE in docstring about rerun resets)
        if cleaning_operations:
            st.markdown("#### 📋 Cleaning Operations Applied")
            for i, operation in enumerate(cleaning_operations, 1):
                st.success(f"{i}. {operation}")

            self.add_insight(f"Applied {len(cleaning_operations)} data cleaning operations", 3)
802
 
803
    def stage_4_analysis(self):
        """Enhanced Stage 4: advanced analysis with AI-style insights.

        Two optional sections: a numeric-vs-numeric relationship explorer
        (sampled scatter + OLS trendline + correlation metrics) and a
        categorical group analysis (group stats, box/bar comparison, and a
        one-way ANOVA significance test via scipy). Records insights on
        strong/moderate correlations, significant group differences, and
        the best-vs-worst performance gap.
        """

        numeric_cols = self.column_types['numeric']
        categorical_cols = self.column_types['categorical']

        # Relationship analysis with enhanced visualizations
        if len(numeric_cols) >= 2:
            st.markdown("### 🔗 Variable Relationships")

            col1, col2, col3 = st.columns(3)
            with col1:
                x_var = st.selectbox("X Variable:", numeric_cols)
            with col2:
                # Exclude the X choice so the user can't plot a column against itself
                y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var])
            with col3:
                color_var = st.selectbox("Color by (optional):", ["None"] + categorical_cols)

            # Smart sampling for large datasets (fixed seed for reproducibility)
            sample_size = min(5000, len(self.df))
            if len(self.df) > sample_size:
                sample_df = self.df.sample(n=sample_size, random_state=42)
                st.info(f"📊 Showing sample of {sample_size:,} points for performance")
            else:
                sample_df = self.df

            # Enhanced scatter plot with OLS trendline
            if color_var != "None":
                fig_scatter = px.scatter(
                    sample_df, x=x_var, y=y_var, color=color_var,
                    title=f"Relationship: {x_var} vs {y_var} (colored by {color_var})",
                    trendline="ols"
                )
            else:
                fig_scatter = px.scatter(
                    sample_df, x=x_var, y=y_var,
                    title=f"Relationship: {x_var} vs {y_var}",
                    trendline="ols"
                )

            fig_scatter.update_layout(height=500)
            st.plotly_chart(fig_scatter, use_container_width=True)

            # Correlation analysis with business insights
            # (computed on the full frame, not the plotting sample)
            correlation = self.df[x_var].corr(self.df[y_var])

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Correlation", f"{correlation:.3f}")
            with col2:
                if abs(correlation) > 0.7:
                    strength = "Strong"
                    color = "🟢"
                elif abs(correlation) > 0.3:
                    strength = "Moderate"
                    color = "🟡"
                else:
                    strength = "Weak"
                    color = "🔴"
                st.metric("Strength", f"{color} {strength}")
            with col3:
                direction = "Positive" if correlation > 0 else "Negative"
                st.metric("Direction", direction)

            # Business interpretation
            if abs(correlation) > 0.7:
                st.success(f"🎯 **Business Insight**: Strong relationship detected! {x_var} and {y_var} move together - valuable for prediction and business planning.")
                self.add_insight(f"Strong correlation ({correlation:.3f}) between {x_var} and {y_var} - high predictive value", 4)
            elif abs(correlation) > 0.3:
                st.info(f"📊 **Moderate relationship** between {x_var} and {y_var} - worth investigating further.")
                self.add_insight(f"Moderate correlation ({correlation:.3f}) between {x_var} and {y_var}", 4)

        # Group analysis with enhanced insights
        if categorical_cols and numeric_cols:
            st.markdown("### 👥 Group Analysis")

            col1, col2 = st.columns(2)
            with col1:
                group_var = st.selectbox("Group by:", categorical_cols)
            with col2:
                metric_var = st.selectbox("Analyze metric:", numeric_cols)

            # Calculate group statistics (project helper; usage below shows it
            # returns a frame indexed by group with at least a 'mean' column)
            group_stats = calculate_group_stats(self.df, group_var, metric_var)

            col_a, col_b = st.columns([1, 2])

            with col_a:
                st.dataframe(group_stats, use_container_width=True)

                # Find best and worst performing groups by mean of the metric
                best_group = group_stats['mean'].idxmax()
                worst_group = group_stats['mean'].idxmin()

                st.success(f"🏆 **Best**: {best_group} (avg: {group_stats.loc[best_group, 'mean']:.2f})")
                st.error(f"📉 **Needs Attention**: {worst_group} (avg: {group_stats.loc[worst_group, 'mean']:.2f})")

            with col_b:
                # Group comparison visualization
                unique_groups = self.df[group_var].nunique()

                if unique_groups <= 15:  # Manageable number of groups
                    fig_groups = px.box(
                        self.df, x=group_var, y=metric_var,
                        title=f"{metric_var} Distribution by {group_var}",
                        color=group_var
                    )
                    fig_groups.update_layout(height=400)
                    st.plotly_chart(fig_groups, use_container_width=True)
                else:
                    # Too many groups - show summary statistics for the top 10
                    st.info(f"📊 {unique_groups} groups detected - showing statistical summary")
                    summary_stats = self.df.groupby(group_var)[metric_var].agg(['count', 'mean', 'std']).reset_index()
                    summary_stats = summary_stats.sort_values('mean', ascending=False).head(10)

                    fig_summary = px.bar(
                        summary_stats, x=group_var, y='mean',
                        title=f"Top 10 {group_var} by Average {metric_var}",
                        error_y='std'
                    )
                    st.plotly_chart(fig_summary, use_container_width=True)

            # Statistical significance testing (one-way ANOVA across groups)
            if unique_groups <= 10 and len(group_stats) > 1:
                from scipy import stats as scipy_stats

                try:
                    # ANOVA test for multiple groups; NaN group labels excluded
                    groups = [self.df[self.df[group_var] == group][metric_var].dropna()
                              for group in self.df[group_var].unique() if not pd.isna(group)]

                    # f_oneway needs >= 2 groups, each with > 1 observation
                    if len(groups) >= 2 and all(len(g) > 1 for g in groups):
                        f_stat, p_value = scipy_stats.f_oneway(*groups)

                        st.markdown("#### 📊 Statistical Significance")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("F-statistic", f"{f_stat:.3f}")
                        with col2:
                            st.metric("P-value", f"{p_value:.4f}")

                        if p_value < 0.05:
                            st.success("✅ **Statistically significant** differences between groups!")
                            self.add_insight(f"Significant group differences in {metric_var} by {group_var} (p={p_value:.4f})", 4)
                        else:
                            st.info("📊 No statistically significant differences between groups")

                except Exception as e:
                    st.warning(f"Statistical test failed: {str(e)}")

            performance_gap = group_stats['mean'].max() - group_stats['mean'].min()
            self.add_insight(f"Performance gap in {metric_var}: {performance_gap:.2f} between best and worst {group_var}", 4)
 
956
def stage_5_summary(self):
    """Stage 5: final summary dashboard with insight timeline and exports.

    Shows the final quality metrics, groups the collected insights by
    stage, and offers downloads for the cleaned data (CSV/Excel), a
    markdown report, an executive summary and a reproducible script.
    """
    st.markdown("### 📈 Analysis Summary & Results")

    # Prefer the enhanced scorer when the instance provides one.
    if hasattr(self, 'calculate_enhanced_quality_score'):
        final_quality = self.calculate_enhanced_quality_score()
    else:
        final_quality = calculate_data_quality_score(self.df)

    # Summary dashboard
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Final Quality Score", f"{final_quality['score']:.0f}/100")
    with col2:
        st.metric("Total Insights Generated", len(self.insights))
    with col3:
        st.metric("Data Integrity", final_quality['grade'])
    with col4:
        improvement = "✅ Improved" if len(self.insights) > 5 else "📊 Analyzed"
        st.metric("Status", improvement)

    # Insights timeline, grouped by analysis stage
    st.markdown("### 💡 Analysis Journey")

    stage_insights = {}
    for insight in self.insights:
        stage_insights.setdefault(insight['stage'], []).append(insight['insight'])

    for stage in sorted(stage_insights):
        with st.expander(f"📋 Stage {stage}: {len(stage_insights[stage])} insights", expanded=True):
            for i, text in enumerate(stage_insights[stage], 1):
                st.write(f"{i}. {text}")

    # Export options
    st.markdown("### 📥 Export Your Results")
    tab1, tab2, tab3 = st.tabs(["📊 Cleaned Data", "📋 Analysis Report", "🐍 Python Code"])

    with tab1:
        st.markdown("#### 🔍 Data Preview")
        col1, col2 = st.columns([3, 1])

        with col1:
            # Show a before/after comparison when cleaning changed the data.
            if not self.df.equals(self.original_df):
                st.success("✅ **Data has been cleaned and optimized!**")
                comparison_metrics = {
                    'Original Rows': len(self.original_df),
                    'Current Rows': len(self.df),
                    'Rows Changed': len(self.df) - len(self.original_df),
                    'Original Columns': len(self.original_df.columns),
                    'Current Columns': len(self.df.columns)
                }
                st.dataframe(pd.DataFrame([comparison_metrics]), use_container_width=True)
            else:
                st.info("📊 **No cleaning operations applied** - original data maintained")

            # Data preview
            st.dataframe(self.df.head(10), use_container_width=True)

        with col2:
            st.markdown("**📥 Download Options**")

            # CSV download
            csv_data = self.df.to_csv(index=False)
            st.download_button(
                label="📄 Download CSV",
                data=csv_data,
                file_name="cleaned_data.csv",
                mime="text/csv",
                use_container_width=True
            )

            # Excel download — DataFrame.to_excel needs an Excel writer
            # engine (openpyxl); fail soft instead of crashing the page
            # when the engine is not installed.
            try:
                excel_buffer = BytesIO()
                self.df.to_excel(excel_buffer, index=False)
                st.download_button(
                    label="📊 Download Excel",
                    data=excel_buffer.getvalue(),
                    file_name="cleaned_data.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    use_container_width=True
                )
            except Exception as e:
                st.warning(f"Excel export unavailable: {e}")

    with tab2:
        # Comprehensive markdown report plus downloadable summaries
        report = self.generate_enhanced_report()
        col1, col2 = st.columns([3, 1])

        with col1:
            st.markdown(report)

        with col2:
            st.download_button(
                label="📋 Download Report",
                data=report,
                file_name="data_analysis_report.md",
                mime="text/markdown",
                use_container_width=True
            )

            exec_summary = self.generate_executive_summary()
            st.download_button(
                label="📈 Executive Summary",
                data=exec_summary,
                file_name="executive_summary.txt",
                mime="text/plain",
                use_container_width=True
            )

    with tab3:
        # Reproducible analysis script
        code = self.generate_enhanced_python_code()
        st.code(code, language="python")
        st.download_button(
            label="🐍 Download Python Script",
            data=code,
            file_name="data_analysis_script.py",
            mime="text/plain",
            use_container_width=True
        )
1090
 
1091
def generate_enhanced_report(self) -> str:
    """Generate a comprehensive markdown analysis report.

    Returns:
        Markdown covering an executive summary, per-stage insights,
        a data-profile table and standing recommendations.
    """
    # quality_metrics may never have been computed on this instance;
    # the original dereferenced it unguarded in the header f-string
    # (AttributeError) while guarding with hasattr() further down.
    quality_metrics = getattr(self, 'quality_metrics', None)
    quality_score = quality_metrics['score'] if quality_metrics else 'Not calculated'

    report = f"""# 🔍 AI Data Quality Analysis Report

## 📊 Executive Summary

**Dataset**: {self.df.shape[0]:,} rows × {self.df.shape[1]} columns
**Analysis Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
**Quality Score**: {quality_score}/100

## 🎯 Key Findings

### Data Quality Assessment
"""

    if quality_metrics:
        for issue in quality_metrics['issues']:
            report += f"- ⚠️ {issue}\n"
        if not quality_metrics['issues']:
            report += "- No major quality issues detected\n"

    report += """

### 📈 Analysis Insights

"""

    # Group insights under human-readable stage headings.
    stage_names = {
        1: "Data Overview",
        2: "Exploratory Analysis",
        3: "Quality Assessment",
        4: "Advanced Analysis",
        5: "Summary"
    }
    for stage in range(1, 6):
        stage_insights = [i for i in self.insights if i['stage'] == stage]
        if stage_insights:
            report += f"#### {stage_names.get(stage, f'Stage {stage}')}\n"
            for insight in stage_insights:
                report += f"- {insight['insight']}\n"
            report += "\n"

    # Data profile table — assumes self.stats was populated by the
    # earlier stages (memory_usage, missing_values, duplicates, dtypes).
    report += f"""
## 📋 Data Profile

| Metric | Value |
|--------|-------|
| Total Records | {len(self.df):,} |
| Total Columns | {len(self.df.columns)} |
| Memory Usage | {self.stats['memory_usage']:.1f} MB |
| Missing Values | {self.stats['missing_values']:,} |
| Duplicate Records | {self.stats['duplicates']:,} |

### Column Types Distribution
"""

    for dtype, count in self.stats['dtypes'].items():
        report += f"- **{dtype}**: {count} columns\n"

    report += """

## 🚀 Recommendations

### Immediate Actions
1. **Data Quality**: Address missing values in critical business columns
2. **Data Integrity**: Remove duplicate records before analysis
3. **Outlier Treatment**: Investigate statistical anomalies for business context

### Long-term Improvements
1. **Process Enhancement**: Implement data validation at collection points
2. **Monitoring**: Establish ongoing data quality metrics
3. **Documentation**: Create data dictionary and lineage documentation

---
*Report generated by AI Data Quality Inspector*
"""

    return report
1175
def generate_executive_summary(self) -> str:
    """Generate a plain-text executive summary for business stakeholders.

    Returns:
        Text with quality scores, up to five high-signal insights,
        recommendations and a coarse business-impact assessment.
    """
    # quality_metrics may not exist yet on this instance — fail soft
    # instead of raising AttributeError inside the f-string.
    quality_metrics = getattr(self, 'quality_metrics', None)
    quality_score = quality_metrics['score'] if quality_metrics else 'Calculating'

    # Guard the percentage denominators against an empty frame
    # (ZeroDivisionError in the original).
    total_cells = max(len(self.df) * len(self.df.columns), 1)
    total_rows = max(len(self.df), 1)

    summary = f"""AI DATA QUALITY INSPECTOR - EXECUTIVE SUMMARY
================================================

DATASET: {self.df.shape[0]:,} records across {self.df.shape[1]} dimensions
ANALYSIS DATE: {pd.Timestamp.now().strftime('%Y-%m-%d')}

QUALITY ASSESSMENT:
- Overall Score: {quality_score}/100
- Data Completeness: {100 - (self.stats['missing_values'] / total_cells * 100):.1f}%
- Data Integrity: {100 - (self.stats['duplicates'] / total_rows * 100):.1f}%

KEY INSIGHTS:
"""

    # Surface up to five insights flagged with high-signal keywords.
    keywords = ('critical', 'strong', 'significant', 'high', 'best')
    important_insights = [i for i in self.insights
                          if any(k in i['insight'].lower() for k in keywords)][:5]

    for idx, insight in enumerate(important_insights, 1):
        summary += f"{idx}. {insight['insight']}\n"

    # NOTE(review): confidence reads 'High' when FEWER than 3 flagged
    # insights were found — this looks inverted; confirm the intended
    # direction before changing it (behavior preserved here).
    summary += f"""

RECOMMENDATIONS:
1. Address data quality issues before business analysis
2. Leverage strong correlations for predictive insights
3. Investigate outliers for business opportunities
4. Implement ongoing data quality monitoring

BUSINESS IMPACT:
- Analysis Confidence: {'High' if len(important_insights) < 3 else 'Medium'}
- Decision-Making Risk: {'Low' if self.stats['missing_values'] < len(self.df) * 0.05 else 'Medium'}
- Analytical Value: {'High' if len(self.column_types['numeric']) > 2 else 'Medium'}

Generated by AI Data Quality Inspector
"""

    return summary
+
1217
def generate_enhanced_python_code(self) -> str:
    """Generate a standalone, reproducible Python analysis script.

    The returned script mirrors the in-app pipeline: quality scoring,
    cleaning, correlation/outlier analysis and optional visualizations.
    Only the dataset shape and the current timestamp are interpolated;
    every other brace is doubled so it survives the outer f-string.

    Returns:
        The script source as a single string (syntactically valid Python).
    """
    # Single-quoted triple delimiter so the template's own docstrings
    # can use plain double triple-quotes without escaping.
    code = f'''# AI Data Quality Inspector - Generated Analysis Code
# Dataset: {self.df.shape[0]:,} rows × {self.df.shape[1]} columns
# Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load your data
df = pd.read_csv('your_data.csv')  # Replace with your data source

print(f"Dataset loaded: {{df.shape[0]:,}} rows × {{df.shape[1]}} columns")

# ===== DATA QUALITY ASSESSMENT =====

def calculate_quality_score(df):
    """Calculate comprehensive data quality score"""
    score = 100
    issues = []

    # Missing values penalty
    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
    if missing_pct > 0:
        penalty = min(30, missing_pct * 1.5)
        score -= penalty
        issues.append(f"Missing values: {{missing_pct:.1f}}%")

    # Duplicates penalty
    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
    if duplicate_pct > 0:
        penalty = min(25, duplicate_pct * 3)
        score -= penalty
        issues.append(f"Duplicates: {{duplicate_pct:.1f}}%")

    return {{'score': max(0, score), 'issues': issues}}

quality_results = calculate_quality_score(df)
print(f"\\nQuality Score: {{quality_results['score']:.0f}}/100")
if quality_results['issues']:
    print("Issues found:")
    for issue in quality_results['issues']:
        print(f"  - {{issue}}")

# ===== DATA CLEANING =====

def clean_dataset(df):
    """Apply comprehensive data cleaning"""
    cleaned_df = df.copy()
    cleaning_log = []

    # Remove duplicates
    original_len = len(cleaned_df)
    cleaned_df = cleaned_df.drop_duplicates()
    if len(cleaned_df) < original_len:
        removed = original_len - len(cleaned_df)
        cleaning_log.append(f"Removed {{removed}} duplicate rows")

    # Handle missing values intelligently
    for col in cleaned_df.columns:
        missing_count = cleaned_df[col].isnull().sum()
        if missing_count > 0:
            missing_pct = (missing_count / len(cleaned_df)) * 100

            if missing_pct > 50:
                # Drop columns with too many missing values
                cleaned_df = cleaned_df.drop(columns=[col])
                cleaning_log.append(f"Dropped column '{{col}}' ({{missing_pct:.1f}}% missing)")
            elif cleaned_df[col].dtype in ['int64', 'float64']:
                # Fill numeric with median
                cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())
                cleaning_log.append(f"Filled missing values in '{{col}}' with median")
            else:
                # Fill categorical with mode
                mode_val = cleaned_df[col].mode()
                if not mode_val.empty:
                    cleaned_df[col] = cleaned_df[col].fillna(mode_val[0])
                    cleaning_log.append(f"Filled missing values in '{{col}}' with mode")

    return cleaned_df, cleaning_log

# Apply cleaning
cleaned_df, cleaning_operations = clean_dataset(df)

print("\\nCleaning Operations Applied:")
for operation in cleaning_operations:
    print(f"  ✅ {{operation}}")

# ===== ANALYSIS FUNCTIONS =====

def analyze_correlations(df):
    """Analyze correlations between numeric variables"""
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()

        # Find strongest correlations
        correlations = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.3:
                    correlations.append({{
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': corr_val,
                        'strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate'
                    }})

        return correlations
    return []

def detect_outliers(df, column):
    """Detect outliers using IQR method"""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# ===== EXECUTE ANALYSIS =====

print("\\n" + "=" * 50)
print("ANALYSIS RESULTS")
print("=" * 50)

# Correlation analysis
correlations = analyze_correlations(cleaned_df)
if correlations:
    print("\\nKey Correlations:")
    for corr in correlations[:5]:
        print(f"  {{corr['strength']}}: {{corr['var1']}} ↔ {{corr['var2']}} (r={{corr['correlation']:.3f}})")

# Outlier analysis for numeric columns
numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
print("\\nOutlier Analysis:")
for col in numeric_cols:
    outliers, lower, upper = detect_outliers(cleaned_df, col)
    if len(outliers) > 0:
        outlier_pct = (len(outliers) / len(cleaned_df)) * 100
        print(f"  ⚠️ {{col}}: {{len(outliers)}} outliers ({{outlier_pct:.1f}}%)")
    else:
        print(f"  ✅ {{col}}: No outliers detected")

# Final quality assessment
final_quality = calculate_quality_score(cleaned_df)
print(f"\\nFinal Quality Score: {{final_quality['score']:.0f}}/100")

print("\\n🎉 Analysis Complete! Use the cleaned dataset for your business analysis.")

# ===== VISUALIZATION EXAMPLES =====

def create_quality_dashboard(df):
    """Create quality visualization dashboard"""

    # Missing values heatmap
    if df.isnull().sum().sum() > 0:
        missing_matrix = df.isnull().head(100)  # First 100 rows
        fig_missing = px.imshow(
            missing_matrix.T,
            title="Missing Values Pattern",
            color_continuous_scale='Reds'
        )
        fig_missing.show()

    # Correlation heatmap
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        fig_corr = px.imshow(
            corr_matrix,
            text_auto=True,
            title="Correlation Matrix",
            color_continuous_scale='RdBu_r'
        )
        fig_corr.show()

# Uncomment to generate visualizations
# create_quality_dashboard(cleaned_df)

print("\\n📊 Visualization functions available:")
print("  - create_quality_dashboard(df): Generate quality visualizations")
print("  - Use plotly.express for interactive charts")
print("  - All analysis functions are ready to use")
'''

    return code
1415
def get_paginated_data(self, page: int = 0) -> pd.DataFrame:
    """Return one page of rows for display.

    Args:
        page: Zero-based page index. Negative values are clamped to 0
            (the original produced a nonsensical negative slice).

    Returns:
        A slice of ``self.df`` with at most ``self.page_size`` rows;
        empty past the last page.
    """
    page = max(0, page)  # guard against negative page indices
    start_idx = page * self.page_size
    return self.df.iloc[start_idx:start_idx + self.page_size]