entropy25 commited on
Commit
e879f17
·
verified ·
1 Parent(s): 9dd7d77

Update analyzer.py

Browse files
Files changed (1) hide show
  1. analyzer.py +277 -1115
analyzer.py CHANGED
@@ -1,1166 +1,328 @@
1
- import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- import plotly.express as px
5
- import plotly.graph_objects as go
6
- import plotly.figure_factory as ff
7
- from plotly.subplots import make_subplots
8
- from typing import Dict, List, Any, Optional
9
- import os
10
- from dotenv import load_dotenv
11
- from data_handler import *
12
 
13
- # ML imports
14
  try:
15
  from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
16
- from sklearn.linear_model import LogisticRegression, LinearRegression
17
- from sklearn.model_selection import train_test_split
18
- from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
19
- from sklearn.preprocessing import LabelEncoder
20
  ML_AVAILABLE = True
21
  except ImportError:
22
  ML_AVAILABLE = False
 
23
 
24
- # Load environment variables
25
- load_dotenv()
26
-
27
- class AIAssistant:
28
- """AI-powered analysis assistant"""
29
 
30
- def __init__(self):
31
- self.openai_key = os.getenv('OPENAI_API_KEY')
32
- self.gemini_key = os.getenv('GOOGLE_API_KEY')
33
-
34
- try:
35
- import google.generativeai as genai
36
- if self.gemini_key:
37
- genai.configure(api_key=self.gemini_key)
38
- self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
39
- except ImportError:
40
- pass
41
 
42
- def analyze_insights(self, df: pd.DataFrame, insights: List[Dict], model: str = "Google Gemini") -> str:
43
- """Get AI analysis of insights"""
44
- summary = f"""
45
- Dataset Summary:
46
- - Shape: {df.shape}
47
- - Columns: {list(df.columns)}
48
-
49
- Key Insights:
50
- """
51
-
52
- for insight in insights:
53
- summary += f"\n- {insight['insight']}"
54
-
55
- prompt = f"""
56
- As a senior data scientist, analyze this dataset and provide:
57
- 1. Business implications
58
- 2. Key opportunities and risks
59
- 3. Actionable recommendations
60
- 4. Suggestions for further analysis
61
-
62
- {summary}
63
- """
64
-
65
  try:
66
- if hasattr(self, 'gemini_model'):
67
- response = self.gemini_model.generate_content(prompt)
68
- return response.text
69
- else:
70
- return "AI analysis not available. Please configure API keys in .env file."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  except Exception as e:
72
- return f"AI Analysis Error: {str(e)}"
73
-
74
- class DataAnalysisWorkflow:
75
- """Enhanced data analysis workflow with ML capabilities"""
76
 
77
- def __init__(self, df: pd.DataFrame):
78
- self.df = df
79
- self.original_df = df.copy() # Keep original for reference
80
- self.stats = calculate_basic_stats(df)
81
- self.column_types = get_column_types(df)
82
- self.insights = []
83
- self.ml_results = {}
84
-
85
- def add_insight(self, insight: str, stage: int):
86
- """Add insight to analysis report"""
87
- self.insights.append({
88
- 'stage': stage,
89
- 'insight': insight,
90
- 'timestamp': pd.Timestamp.now()
91
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- def stage_1_overview(self):
94
- """Stage 1: Enhanced Data Overview"""
95
- st.subheader("📊 Data Overview")
96
-
97
- # Key metrics with better formatting
98
- col1, col2, col3, col4 = st.columns(4)
99
- with col1:
100
- st.metric("Total Rows", f"{self.stats['shape'][0]:,}")
101
- with col2:
102
- st.metric("Total Columns", f"{self.stats['shape'][1]:,}")
103
- with col3:
104
- missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
105
- st.metric("Missing Values", f"{self.stats['missing_values']:,}", f"{missing_pct:.1f}%")
106
- with col4:
107
- st.metric("Memory Usage", f"{self.stats['memory_usage']:.1f} MB")
108
 
109
- # Enhanced data types visualization
110
- if self.stats['dtypes']:
111
- col1, col2 = st.columns(2)
 
 
112
 
113
- with col1:
114
- fig = px.pie(
115
- values=list(self.stats['dtypes'].values()),
116
- names=list(self.stats['dtypes'].keys()),
117
- title="Data Types Distribution",
118
- color_discrete_sequence=px.colors.qualitative.Set3
119
- )
120
- st.plotly_chart(fig, use_container_width=True)
121
 
122
- with col2:
123
- # Column overview table
124
- column_info = []
125
- for col in self.df.columns:
126
- column_info.append({
127
- 'Column': col,
128
- 'Type': str(self.df[col].dtype),
129
- 'Non-Null': self.df[col].notna().sum(),
130
- 'Unique': self.df[col].nunique()
131
- })
132
-
133
- info_df = pd.DataFrame(column_info)
134
- st.subheader("Column Details")
135
- st.dataframe(info_df, use_container_width=True, height=300)
136
-
137
- # Enhanced data preview
138
- st.subheader("Data Preview")
139
- preview_option = st.radio(
140
- "Preview type:",
141
- ["First 10 rows", "Last 10 rows", "Random sample", "Custom range"],
142
- horizontal=True
143
- )
144
-
145
- if preview_option == "First 10 rows":
146
- st.dataframe(self.df.head(10), use_container_width=True)
147
- elif preview_option == "Last 10 rows":
148
- st.dataframe(self.df.tail(10), use_container_width=True)
149
- elif preview_option == "Random sample":
150
- sample_size = min(10, len(self.df))
151
- st.dataframe(self.df.sample(n=sample_size), use_container_width=True)
152
- else:
153
- col1, col2 = st.columns(2)
154
- with col1:
155
- start_row = st.number_input("Start row", 0, len(self.df)-1, 0)
156
- with col2:
157
- end_row = st.number_input("End row", start_row+1, len(self.df), min(start_row+10, len(self.df)))
158
- st.dataframe(self.df.iloc[start_row:end_row], use_container_width=True)
159
-
160
- # Missing values analysis
161
- missing_df = calculate_missing_data(self.df)
162
- if not missing_df.empty:
163
- st.subheader("Missing Values Analysis")
164
 
165
- # Visualize missing values
166
- fig = px.bar(
167
- missing_df,
168
- x='Column',
169
- y='Missing %',
170
- title="Missing Values by Column",
171
- color='Missing %',
172
- color_continuous_scale='Reds'
173
- )
174
- st.plotly_chart(fig, use_container_width=True)
175
 
176
- st.dataframe(missing_df, use_container_width=True)
 
 
 
177
 
178
- worst_column = missing_df.iloc[0]['Column']
179
- worst_percentage = missing_df.iloc[0]['Missing %']
180
- self.add_insight(f"Column '{worst_column}' has highest missing data: {worst_percentage:.1f}%", 1)
181
- else:
182
- st.success("✅ No missing values found - Excellent data quality!")
183
- self.add_insight("Dataset has perfect completeness with no missing values", 1)
184
-
185
- def stage_2_exploration(self):
186
- """Stage 2: Enhanced Exploratory Data Analysis"""
187
- st.subheader("🔍 Exploratory Data Analysis")
188
-
189
- numeric_cols = self.column_types['numeric']
190
- categorical_cols = self.column_types['categorical']
191
-
192
- # Numeric analysis with enhanced visualizations
193
- if numeric_cols:
194
- st.subheader("Numeric Variables Analysis")
195
 
196
- # Multi-column selection
197
- selected_numerics = st.multiselect(
198
- "Select numeric columns for analysis:",
199
- numeric_cols,
200
- default=numeric_cols[:3] if len(numeric_cols) >= 3 else numeric_cols
201
- )
202
 
203
- if selected_numerics:
204
- # Distribution plots
205
- st.subheader("Distribution Analysis")
 
 
 
206
 
207
- if len(selected_numerics) == 1:
208
- col = selected_numerics[0]
209
- col1, col2 = st.columns(2)
210
-
211
- with col1:
212
- fig = px.histogram(
213
- self.df,
214
- x=col,
215
- marginal="box",
216
- title=f"Distribution of {col}",
217
- nbins=50
218
- )
219
- st.plotly_chart(fig, use_container_width=True)
220
-
221
- with col2:
222
- # Q-Q plot
223
- from scipy import stats
224
- fig = go.Figure()
225
 
226
- # Remove NaN values for Q-Q plot
227
- clean_data = self.df[col].dropna()
228
- if len(clean_data) > 0:
229
- qq = stats.probplot(clean_data, dist="norm")
230
- fig.add_trace(go.Scatter(
231
- x=qq[0][0],
232
- y=qq[0][1],
233
- mode='markers',
234
- name='Data points'
235
- ))
236
- fig.add_trace(go.Scatter(
237
- x=qq[0][0],
238
- y=qq[1][1] + qq[1][0] * qq[0][0],
239
- mode='lines',
240
- name='Normal distribution line',
241
- line=dict(color='red')
242
- ))
243
- fig.update_layout(
244
- title=f"Q-Q Plot: {col}",
245
- xaxis_title="Theoretical Quantiles",
246
- yaxis_title="Sample Quantiles"
247
- )
248
- st.plotly_chart(fig, use_container_width=True)
249
-
250
- else:
251
- # Multiple distributions
252
- fig = make_subplots(
253
- rows=len(selected_numerics),
254
- cols=1,
255
- subplot_titles=selected_numerics,
256
- vertical_spacing=0.05
257
- )
258
-
259
- for i, col in enumerate(selected_numerics, 1):
260
- fig.add_trace(
261
- go.Histogram(x=self.df[col], name=col, nbinsx=30),
262
- row=i, col=1
263
- )
264
-
265
- fig.update_layout(height=200 * len(selected_numerics), showlegend=False)
266
- st.plotly_chart(fig, use_container_width=True)
267
-
268
- # Statistical summary
269
- st.subheader("Statistical Summary")
270
- summary_stats = self.df[selected_numerics].describe()
271
- st.dataframe(summary_stats, use_container_width=True)
272
-
273
- # Correlation analysis
274
- if len(selected_numerics) > 1:
275
- st.subheader("Correlation Analysis")
276
- corr_matrix = self.df[selected_numerics].corr()
277
-
278
- # Enhanced correlation heatmap
279
- fig = px.imshow(
280
- corr_matrix,
281
- text_auto=True,
282
- aspect="auto",
283
- title="Correlation Matrix",
284
- color_continuous_scale='RdBu',
285
- zmin=-1, zmax=1
286
- )
287
- fig.update_layout(height=500)
288
- st.plotly_chart(fig, use_container_width=True)
289
-
290
- # Find strongest correlations
291
- corr_pairs = []
292
- for i in range(len(corr_matrix.columns)):
293
- for j in range(i+1, len(corr_matrix.columns)):
294
- corr_val = corr_matrix.iloc[i, j]
295
- if abs(corr_val) > 0.1: # Only show meaningful correlations
296
- corr_pairs.append({
297
- 'Variable 1': corr_matrix.columns[i],
298
- 'Variable 2': corr_matrix.columns[j],
299
- 'Correlation': corr_val,
300
- 'Strength': 'Strong' if abs(corr_val) > 0.7 else 'Moderate' if abs(corr_val) > 0.3 else 'Weak'
301
- })
302
-
303
- if corr_pairs:
304
- corr_df = pd.DataFrame(corr_pairs).sort_values('Correlation', key=abs, ascending=False)
305
- st.subheader("Top Correlations")
306
- st.dataframe(corr_df, use_container_width=True)
307
 
308
- strongest = corr_df.iloc[0]
309
- self.add_insight(
310
- f"Strongest correlation: {strongest['Variable 1']} vs {strongest['Variable 2']} ({strongest['Correlation']:.3f})",
311
- 2
312
- )
313
-
314
- # Enhanced categorical analysis
315
- if categorical_cols:
316
- st.subheader("Categorical Variables Analysis")
317
- selected_categorical = st.selectbox("Select categorical column:", categorical_cols)
318
-
319
- value_counts = get_value_counts(self.df, selected_categorical, 15) # Top 15
320
-
321
- col1, col2 = st.columns(2)
322
-
323
- with col1:
324
- # Bar chart
325
- fig = px.bar(
326
- x=value_counts.values,
327
- y=value_counts.index,
328
- orientation='h',
329
- title=f"Top Categories in {selected_categorical}",
330
- color=value_counts.values,
331
- color_continuous_scale='viridis'
332
- )
333
- fig.update_layout(height=400, yaxis={'categoryorder':'total ascending'})
334
- st.plotly_chart(fig, use_container_width=True)
335
 
336
- with col2:
337
- # Pie chart for top categories
338
- top_5 = value_counts.head(5)
339
- others = value_counts.iloc[5:].sum() if len(value_counts) > 5 else 0
340
-
341
- if others > 0:
342
- pie_data = list(top_5.values) + [others]
343
- pie_labels = list(top_5.index) + ['Others']
344
- else:
345
- pie_data = list(top_5.values)
346
- pie_labels = list(top_5.index)
347
 
348
- fig = px.pie(
349
- values=pie_data,
350
- names=pie_labels,
351
- title=f"Distribution of {selected_categorical}",
352
- color_discrete_sequence=px.colors.qualitative.Set3
353
- )
354
- st.plotly_chart(fig, use_container_width=True)
355
-
356
- # Category statistics
357
- total_categories = self.df[selected_categorical].nunique()
358
- most_common = value_counts.index[0]
359
- most_common_pct = (value_counts.iloc[0] / len(self.df)) * 100
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- st.metric("Total Unique Categories", total_categories)
362
- st.metric("Most Common Category", f"{most_common} ({most_common_pct:.1f}%)")
363
 
364
- self.add_insight(f"Column '{selected_categorical}' has {total_categories} categories, dominated by '{most_common}' ({most_common_pct:.1f}%)", 2)
 
 
365
 
366
- def stage_3_quality_check(self):
367
- """Stage 3: Enhanced Data Quality Assessment"""
368
- st.subheader("🧹 Data Quality Assessment")
369
-
370
- quality_score = 100
371
- issues = []
372
-
373
- # Missing values check
374
- if self.stats['missing_values'] > 0:
375
- missing_pct = (self.stats['missing_values'] / (self.stats['shape'][0] * self.stats['shape'][1])) * 100
376
- st.warning(f"⚠️ Found {self.stats['missing_values']:,} missing values ({missing_pct:.2f}%)")
377
- quality_score -= min(missing_pct * 2, 30)
378
- issues.append("Missing values detected")
379
- else:
380
- st.success("✅ No missing values")
381
-
382
- # Duplicates check
383
- if self.stats['duplicates'] > 0:
384
- dup_pct = (self.stats['duplicates'] / self.stats['shape'][0]) * 100
385
- st.warning(f"⚠️ Found {self.stats['duplicates']:,} duplicate rows ({dup_pct:.2f}%)")
386
- quality_score -= min(dup_pct * 3, 25)
387
- issues.append("Duplicate rows found")
388
- else:
389
- st.success("✅ No duplicate rows")
390
-
391
- # Outlier detection with enhanced visualization
392
- numeric_cols = self.column_types['numeric']
393
- if numeric_cols:
394
- st.subheader("Outlier Detection")
395
-
396
- outlier_summary = []
397
- for col in numeric_cols:
398
- outliers = calculate_outliers(self.df, col)
399
- outlier_pct = (len(outliers) / len(self.df)) * 100
400
- outlier_summary.append({
401
- 'Column': col,
402
- 'Outliers': len(outliers),
403
- 'Percentage': outlier_pct,
404
- 'Status': '⚠️ High' if outlier_pct > 10 else '⚡ Medium' if outlier_pct > 5 else '✅ Low'
405
- })
406
-
407
- outlier_df = pd.DataFrame(outlier_summary)
408
- st.dataframe(outlier_df, use_container_width=True)
409
-
410
- # Visualize outliers
411
- selected_col = st.selectbox("Select column for detailed outlier analysis:", numeric_cols)
412
-
413
- col1, col2 = st.columns(2)
414
-
415
- with col1:
416
- fig = px.box(
417
- self.df,
418
- y=selected_col,
419
- title=f"Box Plot: {selected_col}",
420
- points="outliers"
421
- )
422
- st.plotly_chart(fig, use_container_width=True)
423
-
424
- with col2:
425
- # Outlier details
426
- outliers = calculate_outliers(self.df, selected_col)
427
- if len(outliers) > 0:
428
- st.metric("Outliers Found", len(outliers))
429
- st.metric("Outlier Percentage", f"{len(outliers)/len(self.df)*100:.2f}%")
430
-
431
- if len(outliers) <= 100: # Show outlier values if not too many
432
- st.subheader("Outlier Values")
433
- st.dataframe(outliers[[selected_col]].head(20), use_container_width=True)
434
  else:
435
- st.success("✅ No outliers detected")
436
 
437
- # Adjust quality score based on outliers
438
- total_outlier_pct = sum([row['Percentage'] for row in outlier_summary]) / len(outlier_summary)
439
- quality_score -= min(total_outlier_pct, 20)
440
-
441
- # Data consistency checks
442
- st.subheader("Data Consistency Analysis")
443
-
444
- consistency_issues = []
445
-
446
- # Check for mixed data types in object columns
447
- for col in self.column_types['categorical']:
448
- unique_types = set(type(x).__name__ for x in self.df[col].dropna().head(100))
449
- if len(unique_types) > 1:
450
- consistency_issues.append(f"Mixed data types in column '{col}': {unique_types}")
451
-
452
- # Check for unusual string patterns
453
- for col in self.column_types['categorical']:
454
- sample_values = self.df[col].dropna().head(50).astype(str)
455
- if sample_values.str.contains(r'^[0-9]+$').any() and sample_values.str.contains(r'[a-zA-Z]').any():
456
- consistency_issues.append(f"Mixed numeric/text patterns in column '{col}'")
457
-
458
- if consistency_issues:
459
- for issue in consistency_issues:
460
- st.warning(f"⚠️ {issue}")
461
- quality_score -= len(consistency_issues) * 5
462
- else:
463
- st.success("✅ Data types are consistent")
464
-
465
- # Overall quality score
466
- st.subheader("Overall Data Quality Score")
467
- quality_score = max(0, min(100, quality_score)) # Ensure 0-100 range
468
-
469
- col1, col2, col3 = st.columns(3)
470
- with col2:
471
- if quality_score >= 90:
472
- st.success(f"🏆 Excellent Quality: {quality_score:.0f}/100")
473
- quality_level = "Excellent"
474
- elif quality_score >= 75:
475
- st.info(f"👍 Good Quality: {quality_score:.0f}/100")
476
- quality_level = "Good"
477
- elif quality_score >= 60:
478
- st.warning(f"⚠️ Fair Quality: {quality_score:.0f}/100")
479
- quality_level = "Fair"
480
- else:
481
- st.error(f"❌ Poor Quality: {quality_score:.0f}/100")
482
- quality_level = "Poor"
483
-
484
- # Action recommendations
485
- if issues:
486
- st.subheader("📋 Recommended Actions")
487
- for i, issue in enumerate(issues, 1):
488
- st.write(f"{i}. Address {issue}")
489
-
490
- self.add_insight(f"Data quality: {quality_level} ({quality_score:.0f}/100) - {len(issues)} issues identified", 3)
491
- else:
492
- st.success("🎉 No major data quality issues found!")
493
- self.add_insight(f"Excellent data quality ({quality_score:.0f}/100) with no major issues", 3)
494
-
495
- def stage_4_advanced_analysis(self):
496
- """Stage 4: Advanced Statistical Analysis"""
497
- st.subheader("🔬 Advanced Analysis")
498
-
499
- numeric_cols = self.column_types['numeric']
500
- categorical_cols = self.column_types['categorical']
501
-
502
- # Advanced relationship analysis
503
- if len(numeric_cols) >= 2:
504
- st.subheader("🔗 Advanced Relationship Analysis")
505
 
506
- # Scatter plot matrix for multiple variables
507
- if len(numeric_cols) >= 3:
508
- st.subheader("Scatter Plot Matrix")
509
- selected_vars = st.multiselect(
510
- "Select variables for scatter plot matrix:",
511
- numeric_cols,
512
- default=numeric_cols[:4] if len(numeric_cols) >= 4 else numeric_cols
513
- )
514
-
515
- if len(selected_vars) >= 2:
516
- # Sample data for performance
517
- sample_size = min(1000, len(self.df))
518
- sample_df = self.df[selected_vars].sample(n=sample_size) if len(self.df) > sample_size else self.df[selected_vars]
519
-
520
- fig = px.scatter_matrix(
521
- sample_df,
522
- dimensions=selected_vars,
523
- title="Scatter Plot Matrix"
524
- )
525
- fig.update_layout(height=600)
526
- st.plotly_chart(fig, use_container_width=True)
527
 
528
- # Pairwise analysis
529
- st.subheader("Detailed Pairwise Analysis")
530
- col1, col2 = st.columns(2)
531
- with col1:
532
- x_var = st.selectbox("X Variable:", numeric_cols, key="x_var_advanced")
533
- with col2:
534
- y_var = st.selectbox("Y Variable:", [col for col in numeric_cols if col != x_var], key="y_var_advanced")
535
 
536
- # Color by categorical variable option
537
- color_var = None
538
- if categorical_cols:
539
- use_color = st.checkbox("Color by categorical variable")
540
- if use_color:
541
- color_var = st.selectbox("Color variable:", categorical_cols)
542
 
543
- # Create enhanced scatter plot
544
- sample_size = min(5000, len(self.df))
545
- plot_df = self.df.sample(n=sample_size) if len(self.df) > sample_size else self.df
546
 
547
- fig = px.scatter(
548
- plot_df,
549
- x=x_var,
550
- y=y_var,
551
- color=color_var,
552
- title=f"Advanced Analysis: {x_var} vs {y_var}",
553
- trendline="ols",
554
- marginal_x="histogram",
555
- marginal_y="histogram"
556
- )
557
- st.plotly_chart(fig, use_container_width=True)
558
 
559
- # Statistical analysis
560
- correlation = self.df[x_var].corr(self.df[y_var])
561
 
562
- col1, col2, col3 = st.columns(3)
563
- with col1:
564
- st.metric("Correlation", f"{correlation:.3f}")
565
- with col2:
566
- r_squared = correlation ** 2
567
- st.metric("R²", f"{r_squared:.3f}")
568
- with col3:
569
- if abs(correlation) > 0.7:
570
- strength = "Strong"
571
- elif abs(correlation) > 0.3:
572
- strength = "Moderate"
573
- else:
574
- strength = "Weak"
575
- st.metric("Relationship", strength)
576
 
577
- self.add_insight(f"Advanced analysis: {strength} relationship between {x_var} and {y_var} (r={correlation:.3f})", 4)
578
-
579
- # Group comparison analysis
580
- if categorical_cols and numeric_cols:
581
- st.subheader("📊 Group Comparison Analysis")
582
 
583
- col1, col2 = st.columns(2)
584
- with col1:
585
- group_var = st.selectbox("Group by:", categorical_cols, key="group_var_advanced")
586
- with col2:
587
- metric_var = st.selectbox("Analyze metric:", numeric_cols, key="metric_var_advanced")
588
 
589
- # Calculate group statistics
590
- group_stats = calculate_group_stats(self.df, group_var, metric_var)
591
 
592
- # Enhanced group visualization
593
- unique_groups = self.df[group_var].nunique()
594
 
595
- if unique_groups <= 20:
596
- col1, col2 = st.columns(2)
597
-
598
- with col1:
599
- # Box plot
600
- fig = px.box(
601
- self.df,
602
- x=group_var,
603
- y=metric_var,
604
- title=f"{metric_var} Distribution by {group_var}",
605
- points="outliers"
606
- )
607
- fig.update_xaxes(tickangle=45)
608
- st.plotly_chart(fig, use_container_width=True)
609
-
610
- with col2:
611
- # Violin plot
612
- fig = px.violin(
613
- self.df,
614
- x=group_var,
615
- y=metric_var,
616
- title=f"{metric_var} Density by {group_var}",
617
- box=True
618
- )
619
- fig.update_xaxes(tickangle=45)
620
- st.plotly_chart(fig, use_container_width=True)
621
-
622
- # Statistical comparison
623
- st.subheader("Statistical Comparison")
624
- st.dataframe(group_stats, use_container_width=True)
625
-
626
- # Identify best performing group
627
- best_group = group_stats['mean'].idxmax()
628
- best_value = group_stats.loc[best_group, 'mean']
629
- worst_group = group_stats['mean'].idxmin()
630
- worst_value = group_stats.loc[worst_group, 'mean']
631
-
632
- col1, col2 = st.columns(2)
633
- with col1:
634
- st.metric("Best Performing Group", best_group, f"Avg: {best_value:.2f}")
635
- with col2:
636
- st.metric("Lowest Performing Group", worst_group, f"Avg: {worst_value:.2f}")
637
-
638
- self.add_insight(f"Group analysis: '{best_group}' performs best with average {metric_var} of {best_value:.2f}", 4)
639
- else:
640
- st.info(f"Too many groups ({unique_groups}) for detailed visualization. Showing summary statistics only.")
641
- st.dataframe(group_stats.head(15), use_container_width=True)
642
-
643
- def stage_5_ml_modeling(self):
644
- """Stage 5: Machine Learning Modeling"""
645
- st.subheader("🤖 Machine Learning Modeling")
646
-
647
- if not ML_AVAILABLE:
648
- st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn to use this feature.")
649
- st.code("pip install scikit-learn")
650
- return
651
-
652
- numeric_cols = self.column_types['numeric']
653
- categorical_cols = self.column_types['categorical']
654
-
655
- if len(numeric_cols) < 2:
656
- st.warning("⚠️ Need at least 2 numeric columns for ML modeling.")
657
- return
658
-
659
- st.info("🎯 Automated machine learning model training and evaluation")
660
-
661
- # Model configuration
662
- st.subheader("Model Configuration")
663
-
664
- col1, col2 = st.columns(2)
665
- with col1:
666
- target_column = st.selectbox(
667
- "Select target variable (what to predict):",
668
- numeric_cols + categorical_cols
669
- )
670
-
671
- with col2:
672
- model_type = st.radio(
673
- "Problem type:",
674
- ["Auto-detect", "Regression", "Classification"]
675
- )
676
-
677
- # Feature selection
678
- available_features = [col for col in numeric_cols if col != target_column]
679
- if len(available_features) == 0:
680
- st.error("❌ No suitable features available for modeling.")
681
- return
682
-
683
- selected_features = st.multiselect(
684
- "Select features (leave empty for auto-selection):",
685
- available_features,
686
- default=available_features[:5] if len(available_features) >= 5 else available_features
687
- )
688
-
689
- if not selected_features:
690
- selected_features = available_features[:10] # Auto-select top 10
691
-
692
- if st.button("🚀 Train Models", type="primary"):
693
- try:
694
- with st.spinner("Training machine learning models..."):
695
- self._train_ml_models(target_column, selected_features, model_type)
696
-
697
- st.success("✅ Models trained successfully!")
698
-
699
- except Exception as e:
700
- st.error(f"❌ Model training failed: {str(e)}")
701
-
702
- # Display results if available
703
- if hasattr(self, 'ml_results') and self.ml_results:
704
- self._display_ml_results()
705
 
706
- def _train_ml_models(self, target_col: str, feature_cols: List[str], model_type: str):
707
- """Train ML models"""
708
- # Prepare data
709
- X = self.df[feature_cols].copy()
710
- y = self.df[target_col].copy()
711
-
712
- # Handle missing values
713
- X = X.fillna(X.mean())
714
- y = y.fillna(y.mean() if y.dtype in ['int64', 'float64'] else y.mode()[0])
715
-
716
- # Auto-detect problem type
717
- if model_type == "Auto-detect":
718
- if y.dtype == 'object' or y.nunique() < 10:
719
- model_type = "Classification"
720
- else:
721
- model_type = "Regression"
722
-
723
- # Encode categorical target if needed
724
- label_encoder = None
725
- if model_type == "Classification" and y.dtype == 'object':
726
- label_encoder = LabelEncoder()
727
- y = label_encoder.fit_transform(y)
728
-
729
- # Split data
730
- X_train, X_test, y_train, y_test = train_test_split(
731
- X, y, test_size=0.2, random_state=42, stratify=y if model_type == "Classification" else None
732
- )
733
-
734
- # Train models
735
- models = {}
736
- results = {}
737
-
738
- if model_type == "Regression":
739
- models = {
740
- "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
741
- "Linear Regression": LinearRegression()
742
- }
743
- else:
744
- models = {
745
- "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
746
- "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000)
747
  }
748
-
749
- for name, model in models.items():
750
- # Train model
751
- model.fit(X_train, y_train)
752
-
753
- # Make predictions
754
- y_pred = model.predict(X_test)
755
-
756
- # Calculate metrics
757
- if model_type == "Regression":
758
- r2 = r2_score(y_test, y_pred)
759
- mse = mean_squared_error(y_test, y_pred)
760
- results[name] = {
761
- "R² Score": r2,
762
- "MSE": mse,
763
- "RMSE": np.sqrt(mse)
764
- }
765
- else:
766
- accuracy = accuracy_score(y_test, y_pred)
767
- results[name] = {
768
- "Accuracy": accuracy
769
- }
770
 
771
- # Feature importance
772
- if hasattr(model, 'feature_importances_'):
773
- feature_importance = pd.DataFrame({
774
- 'feature': feature_cols,
775
- 'importance': model.feature_importances_
776
- }).sort_values('importance', ascending=False)
777
- results[name]['feature_importance'] = feature_importance
778
-
779
- # Store results
780
- self.ml_results = {
781
- 'model_type': model_type,
782
- 'target_column': target_col,
783
- 'feature_columns': feature_cols,
784
- 'results': results,
785
- 'label_encoder': label_encoder,
786
- 'test_size': len(X_test)
787
- }
788
-
789
- # Add insight
790
- best_model = max(results.keys(), key=lambda x:
791
- results[x]['R² Score'] if model_type == "Regression" else results[x]['Accuracy']
792
- )
793
- best_score = (results[best_model]['R² Score'] if model_type == "Regression"
794
- else results[best_model]['Accuracy'])
795
-
796
- self.add_insight(f"ML modeling: Best {model_type.lower()} model is {best_model} with score {best_score:.3f}", 5)
797
-
798
- def _display_ml_results(self):
799
- """Display ML modeling results"""
800
- st.subheader("🎯 Model Performance Results")
801
-
802
- results = self.ml_results['results']
803
- model_type = self.ml_results['model_type']
804
-
805
- # Performance comparison
806
- performance_data = []
807
- for model_name, metrics in results.items():
808
- row = {'Model': model_name}
809
- for metric, value in metrics.items():
810
- if metric != 'feature_importance':
811
- row[metric] = value
812
- performance_data.append(row)
813
-
814
- performance_df = pd.DataFrame(performance_data)
815
- st.dataframe(performance_df, use_container_width=True)
816
-
817
- # Visualize performance
818
- if model_type == "Regression":
819
- metric_to_plot = "R² Score"
820
- else:
821
- metric_to_plot = "Accuracy"
822
-
823
- fig = px.bar(
824
- performance_df,
825
- x='Model',
826
- y=metric_to_plot,
827
- title=f"Model Performance Comparison ({metric_to_plot})",
828
- color=metric_to_plot,
829
- color_continuous_scale='viridis'
830
- )
831
- st.plotly_chart(fig, use_container_width=True)
832
-
833
- # Feature importance analysis
834
- st.subheader("🔍 Feature Importance Analysis")
835
-
836
- # Get feature importance from best model
837
- best_model = max(results.keys(), key=lambda x:
838
- results[x][metric_to_plot]
839
- )
840
-
841
- if 'feature_importance' in results[best_model]:
842
- importance_df = results[best_model]['feature_importance']
843
-
844
- col1, col2 = st.columns(2)
845
 
846
- with col1:
847
- # Bar plot
848
- fig = px.bar(
849
- importance_df.head(10),
850
- x='importance',
851
- y='feature',
852
- orientation='h',
853
- title=f"Top 10 Feature Importance ({best_model})",
854
- color='importance',
855
- color_continuous_scale='plasma'
856
- )
857
- fig.update_layout(yaxis={'categoryorder':'total ascending'})
858
- st.plotly_chart(fig, use_container_width=True)
859
 
860
- with col2:
861
- # Show importance table
862
- st.subheader("Feature Rankings")
863
- st.dataframe(importance_df.head(10), use_container_width=True)
864
 
865
- # Top features insight
866
- top_feature = importance_df.iloc[0]['feature']
867
- top_importance = importance_df.iloc[0]['importance']
868
- self.add_insight(f"Most important feature: '{top_feature}' (importance: {top_importance:.3f})", 5)
869
-
870
- # Model recommendations
871
- st.subheader("📋 Model Recommendations")
872
-
873
- best_score = results[best_model][metric_to_plot]
874
-
875
- if model_type == "Regression":
876
- if best_score > 0.8:
877
- st.success(f"🏆 Excellent model performance! {best_model} explains {best_score*100:.1f}% of the variance.")
878
- elif best_score > 0.6:
879
- st.info(f"👍 Good model performance. {best_model} explains {best_score*100:.1f}% of the variance.")
880
- else:
881
- st.warning(f"⚠️ Model performance could be improved. Consider feature engineering or more advanced models.")
882
- else:
883
- if best_score > 0.9:
884
- st.success(f"🏆 Excellent classification accuracy: {best_score*100:.1f}%")
885
- elif best_score > 0.8:
886
- st.info(f"👍 Good classification accuracy: {best_score*100:.1f}%")
887
- else:
888
- st.warning(f"⚠️ Classification accuracy could be improved: {best_score*100:.1f}%")
889
-
890
def stage_6_summary(self):
    """Stage 6: render the closing summary and export screen.

    Displays four headline metrics, the recorded insights grouped by the
    stage that produced them, an on-demand AI executive summary, and
    download buttons for the text report, the per-column CSV summary and
    (when ML results exist) the ML report.
    """
    st.subheader("📈 Analysis Summary & Export")

    # Key metrics overview
    metric_cols = st.columns(4)
    with metric_cols[0]:
        st.metric("Total Insights Generated", len(self.insights))
    with metric_cols[1]:
        is_clean = self.stats['missing_values'] == 0 and self.stats['duplicates'] == 0
        st.metric("Data Quality", "High" if is_clean else "Medium")
    with metric_cols[2]:
        # Each insight counts for 20% until five or more have been recorded.
        if len(self.insights) >= 5:
            completeness = "100%"
        else:
            completeness = f"{len(self.insights)*20}%"
        st.metric("Analysis Complete", completeness)
    with metric_cols[3]:
        has_models = hasattr(self, 'ml_results') and self.ml_results
        st.metric("ML Models", "✅" if has_models else "➖")

    # Insights timeline
    st.subheader("🔍 Key Insights Timeline")

    grouped = {}
    for entry in self.insights:
        grouped.setdefault(entry['stage'], []).append(entry)

    stage_names = {
        1: "📊 Data Overview",
        2: "🔍 Exploration",
        3: "🧹 Quality Check",
        4: "🔬 Advanced Analysis",
        5: "🤖 ML Modeling",
        6: "📈 Summary"
    }

    for stage_num in sorted(grouped):
        stage_entries = grouped[stage_num]
        label = stage_names.get(stage_num, f'Stage {stage_num}')
        with st.expander(f"{label} - {len(stage_entries)} insights"):
            for position, entry in enumerate(stage_entries, 1):
                st.write(f"{position}. {entry['insight']}")
                st.caption(f"Generated: {entry['timestamp'].strftime('%H:%M:%S')}")

    # Executive summary with AI
    st.subheader("🤖 AI-Powered Executive Summary")

    ai_assistant = AIAssistant()

    if st.button("Generate AI Summary", type="primary"):
        with st.spinner("Generating AI-powered analysis summary..."):
            ai_summary = ai_assistant.analyze_insights(self.df, self.insights)

            st.markdown("### 📋 Executive Summary")
            st.markdown(ai_summary)

            # Kept on the instance so the text report can embed it.
            self.ai_summary = ai_summary

    # Export options
    st.subheader("📥 Export Results")

    report_col, data_col, ml_col = st.columns(3)

    with report_col:
        if st.button("📄 Generate Report"):
            report_text = self._generate_comprehensive_report()
            st.download_button(
                label="📥 Download Analysis Report",
                data=report_text,
                file_name=f"analysis_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                mime="text/plain"
            )

    with data_col:
        if st.button("📊 Export Data Summary"):
            summary_frame = self._generate_data_summary()
            st.download_button(
                label="📥 Download Data Summary (CSV)",
                data=summary_frame.to_csv(index=False),
                file_name=f"data_summary_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv"
            )

    with ml_col:
        if hasattr(self, 'ml_results') and self.ml_results:
            if st.button("🤖 Export ML Results"):
                ml_text = self._generate_ml_report()
                st.download_button(
                    label="📥 Download ML Report",
                    data=ml_text,
                    file_name=f"ml_report_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.txt",
                    mime="text/plain"
                )

    # Analysis completion celebration
    if len(self.insights) >= 5:
        st.balloons()
        st.success("🎉 Comprehensive analysis completed successfully!")
988
-
989
def _generate_comprehensive_report(self) -> str:
    """Build the plain-text analysis report offered for download.

    Reads ``self.stats`` (keys ``shape``, ``memory_usage``,
    ``missing_values``, ``duplicates``, ``dtypes``) and ``self.insights``
    (dicts with ``stage`` and ``insight``). The ML section is added only
    when ``self.ml_results`` exists and is non-empty, and the AI section
    only when ``self.ai_summary`` has been set.

    Returns:
        The whole report as one newline-joined string.
    """
    rows, cols = self.stats['shape'][0], self.stats['shape'][1]
    missing = self.stats['missing_values']
    total_cells = rows * cols
    # An empty dataset has no cells; report 0% instead of dividing by zero.
    missing_pct = missing / total_cells * 100 if total_cells else 0.0

    # The report is assembled line by line so that its text does not pick up
    # whatever indentation this method happens to have in the source file.
    parts = [
        "",
        "COMPREHENSIVE DATA ANALYSIS REPORT",
        "=" * 50,
        "",
        "DATASET OVERVIEW",
        "-" * 20,
        f"• Dataset Shape: {rows:,} rows × {cols:,} columns",
        f"• Memory Usage: {self.stats['memory_usage']:.2f} MB",
        f"• Missing Values: {missing:,} ({missing_pct:.2f}%)",
        f"• Duplicate Rows: {self.stats['duplicates']:,}",
        "",
        "DATA TYPES DISTRIBUTION",
        "-" * 25,
    ]
    parts.extend(
        f"• {dtype}: {count} columns"
        for dtype, count in self.stats['dtypes'].items()
    )

    parts += ["", "KEY INSIGHTS BY ANALYSIS STAGE", "-" * 35, ""]

    stage_names = {
        1: "Data Overview",
        2: "Exploratory Analysis",
        3: "Quality Assessment",
        4: "Advanced Analysis",
        5: "Machine Learning",
        6: "Summary"
    }

    for i, insight in enumerate(self.insights, 1):
        stage_name = stage_names.get(insight['stage'], f"Stage {insight['stage']}")
        parts.append(f"{i}. [{stage_name}] {insight['insight']}")

    # Add ML results if available
    if hasattr(self, 'ml_results') and self.ml_results:
        ml = self.ml_results
        parts += [
            "",
            "",
            "MACHINE LEARNING RESULTS",
            "-" * 25,
            f"• Problem Type: {ml['model_type']}",
            f"• Target Variable: {ml['target_column']}",
            f"• Features Used: {len(ml['feature_columns'])}",
            f"• Test Set Size: {ml['test_size']} samples",
            "",
            "Model Performance:",
        ]
        for model_name, metrics in ml['results'].items():
            parts += ["", f"{model_name}:"]
            for metric, value in metrics.items():
                if metric == 'feature_importance':
                    continue
                # Non-numeric entries (e.g. an error string) are printed
                # verbatim rather than aborting the whole report.
                try:
                    rendered = f"{value:.4f}"
                except (TypeError, ValueError):
                    rendered = str(value)
                parts.append(f"  • {metric}: {rendered}")

    # Add AI summary if available
    if hasattr(self, 'ai_summary'):
        parts += [
            "",
            "",
            "AI-POWERED EXECUTIVE SUMMARY",
            "-" * 30,
            f"{self.ai_summary}",
        ]

    parts += [
        "",
        "",
        "ANALYSIS METADATA",
        "-" * 18,
        f"• Total Insights Generated: {len(self.insights)}",
        f"• Analysis Completion Time: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "• Platform: Enhanced Data Analysis Platform v2.0",
        "",
        "-" * 50,
        "Report generated automatically by Enhanced Data Analysis Platform",
        "",
    ]

    return "\n".join(parts)
1067
-
1068
def _generate_data_summary(self) -> pd.DataFrame:
    """Build a one-row-per-column summary of ``self.df`` for CSV export.

    Every column gets its dtype, null/non-null counts, missing percentage,
    unique count and most common value. Numeric columns additionally get
    mean, median, standard deviation, min and max; for other columns those
    cells are left empty (NaN) in the resulting frame.

    Returns:
        A DataFrame with one row per column of ``self.df``.
    """
    row_count = len(self.df)
    records = []

    for col in self.df.columns:
        series = self.df[col]
        missing = int(series.isna().sum())
        modes = series.mode()  # computed once; empty when the column has no values

        record = {
            'Column': col,
            'Data_Type': str(series.dtype),
            'Non_Null_Count': row_count - missing,
            'Missing_Count': missing,
            # A zero-row frame has nothing missing; avoid a NaN from 0/0.
            'Missing_Percentage': (missing / row_count) * 100 if row_count else 0.0,
            'Unique_Values': series.nunique(),
            'Most_Common_Value': str(modes.iloc[0]) if not modes.empty else 'N/A'
        }

        # Accept every numeric width (int32, float32, nullable Int64, ...),
        # not just int64/float64. Booleans are left out, as before.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            record.update({
                'Mean': series.mean(),
                'Median': series.median(),
                'Std_Dev': series.std(),
                'Min_Value': series.min(),
                'Max_Value': series.max()
            })

        records.append(record)

    return pd.DataFrame(records)
1095
-
1096
def _generate_ml_report(self) -> str:
    """Build the plain-text machine-learning report offered for download.

    Reads ``self.ml_results`` (keys ``model_type``, ``target_column``,
    ``feature_columns``, ``test_size`` and ``results``, the latter mapping
    model name to a metrics dict). The best model is the one with the
    highest ``'R² Score'`` for regression or ``'Accuracy'`` otherwise; if
    its metrics carry a ``'feature_importance'`` frame, the top ten rows
    are listed.

    Returns:
        The report text, or ``"No ML results available."`` when no ML
        results have been stored.
    """
    if not hasattr(self, 'ml_results') or not self.ml_results:
        return "No ML results available."

    ml = self.ml_results
    model_results = ml['results']
    is_regression = ml['model_type'] == "Regression"
    # Rank models on the same metric the recommendations below rely on,
    # instead of on whichever metric happens to come first in each dict.
    score_key = 'R² Score' if is_regression else 'Accuracy'

    def _score(metrics, missing):
        """Return the headline score as a float, or ``missing`` if unusable."""
        try:
            return float(metrics.get(score_key, missing))
        except (TypeError, ValueError):
            return missing

    parts = [
        "",
        "MACHINE LEARNING ANALYSIS REPORT",
        "=" * 40,
        "",
        "MODEL CONFIGURATION",
        "-" * 20,
        f"• Problem Type: {ml['model_type']}",
        f"• Target Variable: {ml['target_column']}",
        f"• Number of Features: {len(ml['feature_columns'])}",
        f"• Features Used: {', '.join(ml['feature_columns'])}",
        f"• Test Set Size: {ml['test_size']} samples",
        "",
        "MODEL PERFORMANCE RESULTS",
        "-" * 27,
    ]

    for model_name, metrics in model_results.items():
        parts += ["", f"{model_name}:"]
        for metric, value in metrics.items():
            if metric == 'feature_importance':
                continue
            # Print non-numeric entries verbatim rather than failing.
            try:
                rendered = f"{value:.6f}"
            except (TypeError, ValueError):
                rendered = str(value)
            parts.append(f"  • {metric}: {rendered}")

    # Add feature importance for best model
    if model_results:
        best_model = max(
            model_results,
            key=lambda name: _score(model_results[name], float('-inf')),
        )
        if 'feature_importance' in model_results[best_model]:
            parts += [
                "",
                f"FEATURE IMPORTANCE ANALYSIS ({best_model})",
                "-" * 35,
            ]
            importance_df = model_results[best_model]['feature_importance']
            for _, row in importance_df.head(10).iterrows():
                parts.append(f"• {row['feature']}: {row['importance']:.6f}")

    parts += ["", "", "RECOMMENDATIONS", "-" * 15]

    # A model without the headline metric counts as 0, as before.
    best_score = max(
        (_score(metrics, 0) for metrics in model_results.values()),
        default=0,
    )
    if is_regression:
        if best_score > 0.8:
            parts.append("• Excellent model performance - ready for production use")
        elif best_score > 0.6:
            parts.append("• Good model performance - consider feature engineering for improvement")
        else:
            parts.append("• Model performance needs improvement - try advanced algorithms or more features")
    else:
        if best_score > 0.9:
            parts.append("• Excellent classification accuracy - model ready for deployment")
        elif best_score > 0.8:
            parts.append("• Good classification performance - minor optimizations recommended")
        else:
            parts.append("• Classification accuracy needs improvement - consider ensemble methods")

    parts += [
        "",
        "",
        "-" * 40,
        f"ML Report generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    return "\n".join(parts)
 
 
1
  import pandas as pd
2
  import numpy as np
3
+ import streamlit as st
4
+ from typing import Dict, List, Any, Optional, Tuple
5
+ import warnings
6
+ warnings.filterwarnings('ignore')
 
 
 
 
7
 
8
+ # Machine Learning imports
9
  try:
10
  from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
11
+ from sklearn.linear_model import LinearRegression, LogisticRegression
12
+ from sklearn.model_selection import train_test_split, cross_val_score
13
+ from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
14
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
15
  ML_AVAILABLE = True
16
  except ImportError:
17
  ML_AVAILABLE = False
18
+ st.warning("⚠️ Machine Learning libraries not available. Please install scikit-learn for ML features.")
19
 
20
class DataAnalyzer:
    """Descriptive, correlation and optional ML analysis of a DataFrame.

    The analyzer works on a private copy of the frame passed in. Each
    ``run_*`` method returns a plain dict and also stores it in
    ``self.results`` under ``'basic_analysis'``, ``'correlation_analysis'``
    or ``'ml_analysis'``. Unexpected failures are reported through
    ``st.error`` and turned into an empty or ``{'error': ...}`` result
    instead of propagating.
    """

    # Models that are trained on standardized features; the tree ensembles
    # are fitted on the raw values.
    _SCALED_MODELS = ('Logistic Regression', 'Linear Regression')

    def __init__(self, df: pd.DataFrame):
        """Copy ``df`` and record which columns are numeric / object-typed."""
        self.df = df.copy()
        self.numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = self.df.select_dtypes(include=['object']).columns.tolist()
        self.results = {}

    def _numeric_stats(self, col) -> Dict[str, Any]:
        """Return location/spread/shape statistics for one numeric column."""
        try:
            series = self.df[col]
            return {
                'mean': float(series.mean()),
                'median': float(series.median()),
                'std': float(series.std()),
                'min': float(series.min()),
                'max': float(series.max()),
                'skewness': float(series.skew()),
                'kurtosis': float(series.kurtosis())
            }
        except Exception:
            # Best effort per column: one bad column must not sink the rest.
            return {'error': 'Could not calculate statistics'}

    def _categorical_stats(self, col) -> Dict[str, Any]:
        """Return cardinality and the most frequent value for one column."""
        try:
            series = self.df[col]
            modes = series.mode()
            counts = series.value_counts()
            return {
                'unique_values': int(series.nunique()),
                'most_frequent': str(modes.iloc[0]) if not modes.empty else 'None',
                'most_frequent_count': int(counts.iloc[0]) if len(counts) > 0 else 0
            }
        except Exception:
            return {'error': 'Could not calculate statistics'}

    def run_basic_analysis(self) -> Dict[str, Any]:
        """Run basic statistical analysis.

        Returns:
            A dict with ``dataset_info``, ``missing_data`` and
            ``data_types``, plus ``numeric_analysis`` /
            ``categorical_analysis`` when such columns exist. An empty dict
            is returned (after ``st.error``) if the analysis fails.
        """
        try:
            row_count, col_count = self.df.shape
            missing_per_column = self.df.isnull().sum()
            total_missing = int(missing_per_column.sum())
            total_cells = row_count * col_count

            analysis = {
                # Shape and basic info
                'dataset_info': {
                    'rows': row_count,
                    'columns': col_count,
                    'memory_usage_mb': self.df.memory_usage(deep=True).sum() / (1024**2)
                },
                # Missing data summary
                'missing_data': {
                    'total_missing': total_missing,
                    # A frame with no cells has nothing missing; avoid 0/0.
                    'missing_percentage': (
                        float(total_missing / total_cells * 100) if total_cells else 0.0
                    ),
                    'columns_with_missing': {
                        col: int(count)
                        for col, count in missing_per_column[missing_per_column > 0].items()
                    }
                },
                # Data types summary
                'data_types': {
                    str(dtype): int(count)
                    for dtype, count in self.df.dtypes.value_counts().items()
                }
            }

            if self.numeric_cols:
                analysis['numeric_analysis'] = {
                    col: self._numeric_stats(col) for col in self.numeric_cols
                }

            if self.categorical_cols:
                analysis['categorical_analysis'] = {
                    col: self._categorical_stats(col) for col in self.categorical_cols
                }

            self.results['basic_analysis'] = analysis
            return analysis

        except Exception as e:
            st.error(f"Error in basic analysis: {str(e)}")
            return {}

    def run_correlation_analysis(self, threshold: float = 0.7) -> Dict[str, Any]:
        """Run correlation analysis for numeric columns.

        Args:
            threshold: Absolute correlation a pair must exceed to be
                reported as strong. Defaults to 0.7.

        Returns:
            A dict with ``correlation_matrix``, ``strong_correlations`` (one
            entry per column pair above the threshold) and ``total_pairs``
            (the number of such entries). With fewer than two numeric
            columns a ``{'message': ...}`` dict is returned instead, and an
            empty dict (after ``st.error``) on failure.
        """
        try:
            if len(self.numeric_cols) < 2:
                return {'message': 'Need at least 2 numeric columns for correlation analysis'}

            correlation_matrix = self.df[self.numeric_cols].corr()
            names = correlation_matrix.columns

            # Walk the upper triangle so each pair is visited exactly once.
            strong_correlations = []
            for i, first in enumerate(names):
                for j in range(i + 1, len(names)):
                    corr_value = correlation_matrix.iloc[i, j]
                    if pd.isna(corr_value) or abs(corr_value) <= threshold:
                        continue
                    strong_correlations.append({
                        'variable_1': first,
                        'variable_2': names[j],
                        'correlation': float(corr_value),
                        'strength': 'Strong Positive' if corr_value > 0 else 'Strong Negative'
                    })

            analysis = {
                'correlation_matrix': correlation_matrix.to_dict(),
                'strong_correlations': strong_correlations,
                'total_pairs': len(strong_correlations)
            }

            self.results['correlation_analysis'] = analysis
            return analysis

        except Exception as e:
            st.error(f"Error in correlation analysis: {str(e)}")
            return {}

    def run_ml_analysis(self, target_column: str) -> Dict[str, Any]:
        """Run machine learning analysis.

        Trains a random forest and a linear model on the numeric columns
        (excluding the target) using an 80/20 split with a fixed seed, and
        scores them on the held-out part.

        Args:
            target_column: Name of the column to predict.

        Returns:
            A dict mapping model name to its metrics (``accuracy`` for
            classification; ``r2_score``, ``mse``, ``rmse`` for regression)
            or to ``{'error': ...}`` if that model failed. A top-level
            ``{'error': ...}`` dict is returned when the analysis cannot
            run at all.
        """
        if not ML_AVAILABLE:
            return {'error': 'Machine learning libraries not available'}

        try:
            if target_column not in self.df.columns:
                return {'error': f"Target column '{target_column}' not found in dataset"}

            # Prepare data
            features = [col for col in self.numeric_cols if col != target_column]
            if len(features) < 1:
                return {'error': 'Not enough features for ML analysis'}

            # Get clean data (no missing values)
            ml_data = self.df[features + [target_column]].dropna()
            if len(ml_data) < 10:
                return {'error': 'Not enough data points for ML analysis'}

            X = ml_data[features]
            y = ml_data[target_column]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # A non-numeric target can only be classified; a numeric target
            # is treated as class labels when it has at most 5 distinct values.
            n_distinct = y.nunique()
            is_classification = (
                not pd.api.types.is_numeric_dtype(y) or n_distinct <= 5
            )

            if is_classification:
                models = {
                    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
                    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
                }
            else:
                models = {
                    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
                    'Linear Regression': LinearRegression()
                }

            results = {}
            for name, model in models.items():
                try:
                    if name in self._SCALED_MODELS:
                        model.fit(X_train_scaled, y_train)
                        y_pred = model.predict(X_test_scaled)
                    else:
                        model.fit(X_train, y_train)
                        y_pred = model.predict(X_test)

                    if is_classification:
                        results[name] = {
                            'accuracy': float(accuracy_score(y_test, y_pred)),
                            'type': 'classification'
                        }
                    else:
                        mse = mean_squared_error(y_test, y_pred)
                        results[name] = {
                            'r2_score': float(r2_score(y_test, y_pred)),
                            'mse': float(mse),
                            'rmse': float(np.sqrt(mse)),
                            'type': 'regression'
                        }

                except Exception as e:
                    # Record the failure for this model and keep going.
                    results[name] = {'error': str(e)}

            self.results['ml_analysis'] = results
            return results

        except Exception as e:
            st.error(f"Error in ML analysis: {str(e)}")
            return {'error': str(e)}

    def generate_insights(self) -> Dict[str, Any]:
        """Generate comprehensive insights from all analyses.

        Re-runs the basic and correlation analyses and turns them into
        human-readable sentences.

        Returns:
            A dict with ``data_summary`` (when the basic analysis
            succeeded), ``correlation_insights`` (when at least two numeric
            columns exist), ``data_quality`` and ``recommendations``; or
            ``{'error': ...}`` (after ``st.error``) on failure.
        """
        try:
            insights = {}

            # Basic insights
            basic = self.run_basic_analysis()
            if basic:
                info = basic['dataset_info']
                insights['data_summary'] = [
                    f"Dataset contains {info['rows']:,} rows and {info['columns']} columns",
                    f"Memory usage: {info['memory_usage_mb']:.1f} MB",
                    f"Missing data: {basic['missing_data']['missing_percentage']:.1f}% of total cells"
                ]

            # Correlation insights
            correlation = self.run_correlation_analysis()
            if correlation and 'strong_correlations' in correlation:
                if correlation['strong_correlations']:
                    # Rank by strength so the five reported really are the top five.
                    strongest = sorted(
                        correlation['strong_correlations'],
                        key=lambda pair: abs(pair['correlation']),
                        reverse=True,
                    )[:5]
                    insights['correlation_insights'] = [
                        f"{pair['variable_1']} and {pair['variable_2']} are strongly correlated (r={pair['correlation']:.3f})"
                        for pair in strongest
                    ]
                else:
                    insights['correlation_insights'] = ["No strong correlations found between numeric variables"]

            # Data quality insights
            quality_insights = []

            if basic and basic['missing_data']['total_missing'] > 0:
                quality_insights.append(f"Found {basic['missing_data']['total_missing']} missing values")
                if basic['missing_data']['missing_percentage'] > 10:
                    quality_insights.append("⚠️ High percentage of missing data may affect analysis quality")

            duplicates = self.df.duplicated().sum()
            if duplicates > 0:
                quality_insights.append(f"Found {duplicates} duplicate rows")

            if not quality_insights:
                quality_insights.append("✅ Data quality looks good - no major issues detected")

            insights['data_quality'] = quality_insights

            # Recommendations
            recommendations = []

            if basic and basic['missing_data']['missing_percentage'] > 5:
                recommendations.append("Consider handling missing values before analysis")

            if len(self.numeric_cols) < 2:
                recommendations.append("Add more numeric columns for better analysis capabilities")

            if self.df.shape[0] < 100:
                recommendations.append("Consider collecting more data points for robust analysis")

            if not recommendations:
                recommendations.append("Dataset is ready for comprehensive analysis")

            insights['recommendations'] = recommendations

            return insights

        except Exception as e:
            st.error(f"Error generating insights: {str(e)}")
            return {'error': str(e)}

    def get_summary_statistics(self) -> Dict[str, Any]:
        """Get comprehensive summary statistics.

        Returns:
            A dict with ``shape``, ``columns``, ``dtypes``,
            ``missing_values`` and ``memory_usage`` (in MB), plus
            ``numeric_stats`` (``describe()`` output) and
            ``categorical_stats`` (unique count and five most common
            values) when such columns exist. An empty dict is returned
            (after ``st.error``) on failure.
        """
        try:
            summary = {
                'shape': self.df.shape,
                'columns': self.df.columns.tolist(),
                'dtypes': self.df.dtypes.to_dict(),
                'missing_values': self.df.isnull().sum().to_dict(),
                'memory_usage': self.df.memory_usage(deep=True).sum() / (1024**2)  # MB
            }

            if self.numeric_cols:
                summary['numeric_stats'] = self.df[self.numeric_cols].describe().to_dict()

            if self.categorical_cols:
                summary['categorical_stats'] = {
                    col: {
                        'unique_count': self.df[col].nunique(),
                        'top_values': self.df[col].value_counts().head(5).to_dict()
                    }
                    for col in self.categorical_cols
                }

            return summary

        except Exception as e:
            st.error(f"Error getting summary statistics: {str(e)}")
            return {}